### Data loading is straight from Ramon's Exploration1 notebook

In [1]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from os import listdir
from os.path import isfile, join

# Directory that holds the raw order files read by the next cell.
path = 'ignored_assets/public_data/training_data/order_data'

# os.listdir() returns entries in arbitrary, filesystem-dependent order. Sorting the
# names makes the row order of the concatenated frame reproducible across machines
# and re-runs. The macOS '.DS_Store' metadata file is not data and is skipped.
onlyfiles = sorted(
    f for f in listdir(path)
    if isfile(join(path, f)) and f != '.DS_Store'
)

In [3]:
# Read each order file into its own DataFrame. The files are tab-separated and have
# no header row, so the columns carry integer labels (0..6) until they are renamed.
orderdf = [
    pd.read_csv(join(path, fname), delimiter='\t', header=None)
    for fname in onlyfiles
]

In [4]:
# Stack the per-file frames row-wise into a single DataFrame.
# NOTE: this rebinds `orderdf` from a list to a DataFrame, so the cell cannot be
# re-run on its own; re-run the loading cell first.
orderdf = pd.concat(orderdf, axis=0)

In [5]:
# Each file contributed its own 0-based index; replace the repeated labels with a
# single running 0..n-1 index.
orderdf.index = pd.RangeIndex(len(orderdf))

In [6]:
# Inexplicably, there are 65ish start districts but 793 end districts.
# We will restrict our analysis to the start districts later.

In [7]:
# Column 6 (named 'timestamp' in the next cell) is read as text; parse it to
# datetimes so the .dt accessor can be used on it later.
orderdf[6] = pd.to_datetime(arg=orderdf[6])

In [8]:
# Give the seven positional columns descriptive names, in file order.
orderdf.columns = [
    'order_id',
    'driver_id',
    'passenger_id',
    'start_district_hash',
    'dest_district_hash',
    'price',
    'timestamp',
]

In [9]:
# Peek at the first five orders to sanity-check the column names and dtypes.
orderdf.head(n=5)

Unnamed: 0,order_id,driver_id,passenger_id,start_district_hash,dest_district_hash,price,timestamp
0,97ebd0c6680f7c0535dbfdead6e51b4b,dd65fa250fca2833a3a8c16d2cf0457c,ed180d7daf639d936f1aeae4f7fb482f,4725c39a5e5f4c188d382da3910b3f3f,3e12208dd0be281c92a6ab57d9a6fb32,24.0,2016-01-01 13:37:23
1,92c3ac9251cc9b5aab90b114a1e363be,c077e0297639edcb1df6189e8cda2c3d,191a180f0a262aff3267775c4fac8972,82cc4851f9e4faa4e54309f8bb73fd7c,b05379ac3f9b7d99370d443cfd5dcc28,2.0,2016-01-01 09:47:54
2,abeefc3e2aec952468e2fd42a1649640,86dbc1b68de435957c61b5a523854b69,7029e813bb3de8cc73a8615e2785070c,fff4e8465d1e12621bc361276b6217cf,fff4e8465d1e12621bc361276b6217cf,9.0,2016-01-01 18:24:02
3,cb31d0be64cda3cc66b46617bf49a05c,4fadfa6eeaa694742de036dddf02b0c4,21dc133ac68e4c07803d1c2f48988a83,4b7f6f4e2bf237b6cc58f57142bea5c0,4b7f6f4e2bf237b6cc58f57142bea5c0,11.0,2016-01-01 22:13:27
4,139d492189ae5a933122c098f63252b3,,26963cc76da2d8450d8f23fc357db987,fc34648599753c9e74ab238e9a4a07ad,87285a66236346350541b8815c5fae94,4.0,2016-01-01 17:00:06


In [10]:
# Bucket every order into a 15-minute slot. Note that .dt.round() snaps to the
# NEAREST quarter hour (e.g. 13:37:23 -> 13:30:00, 13:38:00 -> 13:45:00), not down.
orderdf['quarter_hour'] = orderdf['timestamp'].dt.round(freq='15min')

In [11]:
# Number of orders per quarter-hour slot (rows) and start district (columns).
# aggfunc=len counts rows in each group.
tb = pd.pivot_table(
    orderdf,
    values=['order_id'],
    index=['quarter_hour'],
    columns=['start_district_hash'],
    aggfunc=len,
)

## Let's now explore the distance between clusters, as proxied by the price

In [None]:
# Median price for every (start district, destination district) pair; the result is
# a one-column ('price') frame indexed by the two district hashes.
# Trip-count variant kept for reference:
#orderdf_startdest=orderdf[['order_id','start_district_hash','dest_district_hash']].groupby(['start_district_hash','dest_district_hash']).count()
od_pair_groups = orderdf.groupby(['start_district_hash', 'dest_district_hash'])
orderdf_price = od_pair_groups[['price']].median()

In [None]:
# First five origin-destination pairs with their median price.
orderdf_price.head(n=5)

In [None]:
# Origin-destination trip counts: one row per start district, one column per
# destination district, each cell holding the number of orders for that pair.
od_ct = pd.crosstab(
    index=orderdf['start_district_hash'],
    columns=orderdf['dest_district_hash'],
)

In [None]:
# Restrict the destination columns to districts that also occur as start districts.
# This loses us 1mil trips out of 8.
#
# crosstab sorts its row labels, whereas unique() returns values in order of first
# appearance. Sorting the start hashes keeps the retained columns in the same order
# as the rows, so (provided every start district also occurs as a destination) the
# matrix diagonal corresponds to trips that start and end in the same district.
# dropna() keeps a missing hash from breaking the sort.
start_hash = np.sort(orderdf['start_district_hash'].dropna().unique())
#dest_hash=orderdf['dest_district_hash'].unique()

# filter() only keeps labels that actually exist; axis=1 selects on the columns.
od_ct = od_ct.filter(items=start_hash, axis=1)



In [None]:
#deptct=orderdf[['order_id','start_district_hash']].groupby(['start_district_hash']).count()
#destct=orderdf[['order_id','dest_district_hash']].groupby(['dest_district_hash']).count()

In [None]:
# Display the filtered trip-count table (rows: start district, columns: destination district).
od_ct

In [None]:
# Plot the o-d matrix.
# imshow draws the rows of the table along the y-axis and its columns along the
# x-axis. od_ct has start districts as rows and destination districts as columns,
# so origin belongs on the y-axis and destination on the x-axis.
plt.imshow(od_ct, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Number of trips by origin and destination')
plt.xlabel('Destination cluster')
plt.ylabel('Origin cluster')
plt.colorbar()

In [None]:
# Median trip price for every origin (rows) / destination (columns) pair; pairs with
# no trips are NaN.
od_price = pd.crosstab(
    index=orderdf['start_district_hash'],
    columns=orderdf['dest_district_hash'],
    values=orderdf['price'],
    aggfunc=np.median,
)
# Keep only the destination columns whose hash also occurs as a start district.
od_price = od_price.filter(items=start_hash)



In [None]:
# Display the filtered median-price table (rows: start district, columns: destination district).
od_price

In [None]:
# Heat map of the median trip price per origin-destination pair.
# od_price has start districts as rows (drawn along the y-axis by imshow) and
# destination districts as columns (x-axis). The title and axis labels say so; the
# previous title was copied from the trip-count plot.
# interpolation='nearest' matches the trip-count plot and keeps one sharp cell per
# district pair instead of smoothing between neighbours.
plt.imshow(od_price, interpolation='nearest')
plt.title('Median price by origin and destination')
plt.xlabel('Destination cluster')
plt.ylabel('Origin cluster')
plt.colorbar()