In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import cPickle as pickle

plt.style.use('ggplot')

In [2]:
cd ..

/Users/odatakuma/cmu/FleetAI


In [3]:
data_path = 'data/nyc_taxi/trips_2016-05.csv'

In [4]:
cols = ['dayofweek', 'hour', 'minute', 'plat', 'plon', 'dlat', 'dlon', 'trip_distance', 'trip_time']
df = pd.read_csv(data_path, usecols=cols, dtype={key : np.float32 for key in cols})
df.head()

Unnamed: 0,dayofweek,dlat,dlon,hour,minute,plat,plon,trip_distance,trip_time
0,6,40.729668,-73.983788,0,0,40.76825,-73.985474,3.6,17.516666
1,6,40.744732,-73.980942,0,0,40.742039,-73.993431,1.09,7.016667
2,6,40.732506,-74.001831,0,0,40.684368,-73.992065,4.21,19.783333
3,6,40.737793,-73.997871,0,0,40.740585,-74.005615,0.56,6.65
4,6,40.758312,-73.988319,0,0,40.755634,-73.980026,0.63,5.316667


In [5]:
# from engine.mapper.geohelper import distance_in_meters

df['hour'] = df.hour + df.minute/60
df = df.drop('minute', axis=1)
df['trip_distance'] *= 1609.34
# df['d'] = distance_in_meters(df.plat, df.plon, df.dlat, df.dlon).astype(int)

In [6]:
from sklearn.cross_validation import train_test_split

X = df.drop('trip_time', axis=1).values
y = df['trip_time'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print X.shape



(12806249, 7)


In [7]:
%%time
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor(n_estimators=10, min_samples_split=5e-5, n_jobs=-1)
forest.fit(X_train, y_train)
rmse_train = np.sqrt(((y_train - forest.predict(X_train))**2).mean())
rmse_test = np.sqrt(((y_test - forest.predict(X_test))**2).mean())
print "RMSE train/val: %.1f / %.1f" % (rmse_train, rmse_test)

RMSE train/val: 4.4 / 4.5
CPU times: user 24min 36s, sys: 10.6 s, total: 24min 47s
Wall time: 7min 46s


In [8]:
forest = RandomForestRegressor(n_estimators=10, min_samples_split=5e-5, n_jobs=-1)
forest.fit(X, y)
rmse = np.sqrt(((y - forest.predict(X))**2).mean())

In [9]:
rmse

4.4777028541641029

In [10]:
path = 'data/pickle/triptime_predictor.pkl'

In [11]:
with open(path, 'wb') as f:
    pickle.dump(forest, f)

In [12]:
with open(path, 'r') as f:
    model = pickle.load(f)

In [13]:
geohash_table = pd.read_csv('data/table/zones.csv', index_col='geohash')
geohash_table.head()

Unnamed: 0_level_0,lat,lon,taxi_zone,x,y,road_density,intxn_density
geohash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dr5qgxx,40.603358,-74.015682,10,2,0,32,35
dr5qgxy,40.604832,-74.017029,10,2,0,34,8
dr5qgxz,40.604637,-74.015335,10,2,0,0,0
dr5qgz5,40.600984,-74.009745,10,3,0,13,5
dr5qgz7,40.602178,-74.010322,10,3,0,28,9


In [14]:
df = geohash_table.groupby('taxi_zone')[['lat', 'lon']].mean()
taxi_zones = np.array(df.index).astype(int)
df.index = taxi_zones
df.head()

Unnamed: 0,lat,lon
2,40.865157,-73.849539
3,40.724038,-73.977351
6,40.761465,-73.91969
7,40.778308,-73.923262
8,40.753034,-73.788619


In [21]:
from engine.mapper.pathgenerator import PathGenerator

GRAPH_PATH = 'data/pickle/nyc_network_graph.pkl'
with open(GRAPH_PATH, 'r') as f:
    G = pickle.load(f)
    
path_generator = PathGenerator(G)

In [43]:
locations = []
for loc in df.values:
    locations.append(path_generator.mm_convert(loc, georange=0.005))

In [46]:
df.loc[:, :] = locations
df.head()

Unnamed: 0,lat,lon
2,40.865198,-73.849627
3,40.724197,-73.977234
6,40.761291,-73.919315
7,40.778923,-73.924125
8,40.753017,-73.788949


In [64]:
data = pd.DataFrame(index=[(p, d) for p in taxi_zones for d in taxi_zones], columns=['plat', 'plon', 'dlat', 'dlon'])
data.loc[[(p, d) for p in taxi_zones for d in taxi_zones], :] = [(plat, plon, dlat, dlon) for plat, plon in df.values
                                                           for dlat, dlon in df.values]
data.head()

Unnamed: 0,plat,plon,dlat,dlon
"(2, 2)",40.865198,-73.849627,40.865198,-73.849627
"(2, 3)",40.865198,-73.849627,40.724197,-73.977234
"(2, 6)",40.865198,-73.849627,40.761291,-73.919315
"(2, 7)",40.865198,-73.849627,40.778923,-73.924125
"(2, 8)",40.865198,-73.849627,40.753017,-73.788949


In [66]:
data['distance'] = 0
n = 0
for (s, t), (plat, plon, dlat, dlon) in data[['plat', 'plon', 'dlat', 'dlon']].iterrows():
    if s < t:
        if n % 1000 == 0:
            print n
        _, d, _, _ = path_generator.map_matching_shortest_path((plat, plon), (dlat, dlon))
        data.loc[(s, t), 'distance'] = d
        data.loc[(t, s), 'distance'] = d    
        n += 1

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000


In [68]:
data['dayofweek'] = 0
data['hour'] = 0

In [74]:
data = data[[u'plat', u'plon', u'dlat', u'dlon', u'dayofweek', u'hour', u'distance']]

In [75]:
taxi_zones = df.index
nzones = len(taxi_zones)
df_od = pd.DataFrame(index=[(d, h, z) for d in range(7) for h in range(24) for z in taxi_zones],
                    columns=taxi_zones)

for d in range(7):
    for h in range(24):
        df_od.loc[[(d, h, z) for z in taxi_zones], :] = model.predict(data.values).reshape((nzones, nzones))
df_od.head()

Unnamed: 0,2,3,6,7,8,9,10,11,12,13,...,253,254,255,256,257,258,259,260,261,262
"(0, 0, 2)",2.38667,24.7271,19.2035,19.2035,19.2035,25.3212,39.0924,26.9773,25.3212,35.198,...,6.08344,24.7271,24.7271,26.9773,24.7271,9.00868,22.7476,25.3212,18.5406,17.617
"(0, 0, 3)",24.7271,2.38667,14.4238,16.2234,23.1294,24.4811,19.2035,15.2586,9.47956,17.617,...,24.7271,10.383,9.00868,15.2586,17.617,24.7271,15.2586,9.00868,12.1138,13.1729
"(0, 0, 6)",19.2035,14.4238,2.38667,7.96909,16.4948,19.2035,24.7271,19.2035,16.4948,24.7271,...,19.2035,13.1729,13.7035,19.2035,16.2234,22.7476,7.01523,16.4948,13.652,13.652
"(0, 0, 7)",19.2035,16.2234,7.96909,2.38667,17.0932,21.2532,24.7271,22.7476,17.617,24.7271,...,19.2035,14.4238,14.4238,19.2035,17.3959,22.7476,10.383,17.0932,14.4238,14.4238
"(0, 0, 8)",19.2035,23.1294,16.4948,17.0932,2.38667,15.2586,26.9773,31.3433,24.7271,26.1974,...,19.2035,19.2035,19.2035,25.3212,16.2234,22.7476,15.2586,24.7271,22.7476,22.7476


In [76]:
for key in ['dayofweek', 'hour', 'pickup_zone']:
    df_od[key] = 0
df_od.loc[:, ['dayofweek', 'hour', 'pickup_zone']] = list(df_od.index)
df_od = df_od.reset_index().drop('index', axis=1)
df_od = df_od.set_index(['dayofweek', 'hour', 'pickup_zone'])
df_od.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,2,3,6,7,8,9,10,11,12,13,...,253,254,255,256,257,258,259,260,261,262
dayofweek,hour,pickup_zone,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,0,2,2.38667,24.7271,19.2035,19.2035,19.2035,25.3212,39.0924,26.9773,25.3212,35.198,...,6.08344,24.7271,24.7271,26.9773,24.7271,9.00868,22.7476,25.3212,18.5406,17.617
0,0,3,24.7271,2.38667,14.4238,16.2234,23.1294,24.4811,19.2035,15.2586,9.47956,17.617,...,24.7271,10.383,9.00868,15.2586,17.617,24.7271,15.2586,9.00868,12.1138,13.1729
0,0,6,19.2035,14.4238,2.38667,7.96909,16.4948,19.2035,24.7271,19.2035,16.4948,24.7271,...,19.2035,13.1729,13.7035,19.2035,16.2234,22.7476,7.01523,16.4948,13.652,13.652
0,0,7,19.2035,16.2234,7.96909,2.38667,17.0932,21.2532,24.7271,22.7476,17.617,24.7271,...,19.2035,14.4238,14.4238,19.2035,17.3959,22.7476,10.383,17.0932,14.4238,14.4238
0,0,8,19.2035,23.1294,16.4948,17.0932,2.38667,15.2586,26.9773,31.3433,24.7271,26.1974,...,19.2035,19.2035,19.2035,25.3212,16.2234,22.7476,15.2586,24.7271,22.7476,22.7476


In [77]:
df_od.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 37968 entries, (0, 0, 2) to (6, 23, 262)
Columns: 226 entries, 2 to 262
dtypes: object(226)
memory usage: 65.8+ MB


In [78]:
df_od.to_csv('data/table/eta.csv')