In [1]:
import pandas as pd
import numpy as np
import pytz as tz

In [2]:
# Load the raw training trips from the local data directory.
train = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [3]:
# Parse both timestamp columns into datetime values.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (it only emits a warning there) and the parsed result is the same
# without it.
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'])

# Show the time span covered by pickups, then by drop-offs.
print(train['pickup_datetime'].min())
print(train['pickup_datetime'].max())
print(train['dropoff_datetime'].min())
print(train['dropoff_datetime'].max())

2016-01-01 00:00:17
2016-06-30 23:59:39
2016-01-01 00:03:31
2016-07-01 23:02:03


In [4]:
def to_epoch(ts):
    """Return ``ts`` as whole seconds elapsed since 1970-01-01 00:00:00.

    Parameters
    ----------
    ts : pandas.Timestamp, or any value ``pd.Timestamp`` accepts
        A timezone-naive value is measured directly against a naive epoch, so
        no timezone shift is applied to it. A timezone-aware value is converted
        to UTC first, so the result refers to the actual instant.

    Returns
    -------
    int
        Seconds since the epoch; any fractional part is discarded by ``int``.
    """
    ts = pd.Timestamp(ts)
    if ts.tzinfo is not None:
        # Subtracting a naive epoch from an aware timestamp raises TypeError,
        # so normalise aware inputs to naive UTC before the subtraction.
        ts = ts.tz_convert(None)
    return int((ts - pd.Timestamp('1970-01-01')) / np.timedelta64(1, 's'))

In [5]:
# Derive integer epoch-second columns with vectorised datetime arithmetic
# instead of calling a Python function once per row. Naive timestamps are
# measured against a naive 1970-01-01 epoch, and floor division by one second
# yields whole seconds as integers.
train['pickup_datetime_epoch'] = (
    (train['pickup_datetime'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
)
train['dropoff_datetime_epoch'] = (
    (train['dropoff_datetime'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
)

In [6]:
# Preview the first rows to confirm the two *_epoch columns were added.
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_datetime_epoch,dropoff_datetime_epoch
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1457976295,1457976750
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1465692215,1465692878
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,1453203324,1453205448
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1459971151,1459971580
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1458999055,1458999490


In [8]:
# Read the two route files; they are combined into one frame in a later cell.
routes1, routes2 = [
    pd.read_csv(path)
    for path in ('./data/routes_1.csv', './data/routes_2.csv')
]

In [12]:
# Summarise the first routes file: row count, column dtypes and non-null counts.
routes1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700000 entries, 0 to 699999
Data columns (total 6 columns):
ID                      700000 non-null object
Origin_start_name       700000 non-null object
Destination_end_name    700000 non-null object
Total_distance          700000 non-null float64
Total_time              700000 non-null float64
Number_of_steps         700000 non-null int64
dtypes: float64(2), int64(1), object(3)
memory usage: 32.0+ MB


In [13]:
# Summarise the second routes file: row count, column dtypes and non-null counts.
routes2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 758643 entries, 0 to 758642
Data columns (total 6 columns):
ID                      758643 non-null object
Origin_start_name       758643 non-null object
Destination_end_name    758643 non-null object
Total_distance          758643 non-null float64
Total_time              758643 non-null float64
Number_of_steps         758643 non-null int64
dtypes: float64(2), int64(1), object(3)
memory usage: 34.7+ MB


In [19]:
# Stack the two route files into one frame and rename the key column to `id`
# so it can be joined with `train` on that name. Row labels from each file are
# kept as-is (the index is not reset).
routes = pd.concat([routes1, routes2]).rename(columns={'ID': 'id'})

In [20]:
# Check the combined routes frame: total row count and the renamed `id` column.
routes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1458643 entries, 0 to 758642
Data columns (total 6 columns):
id                      1458643 non-null object
Origin_start_name       1458643 non-null object
Destination_end_name    1458643 non-null object
Total_distance          1458643 non-null float64
Total_time              1458643 non-null float64
Number_of_steps         1458643 non-null int64
dtypes: float64(2), int64(1), object(3)
memory usage: 77.9+ MB


In [21]:
# Check the training frame's row count and dtypes before joining it with routes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 13 columns):
id                        1458644 non-null object
vendor_id                 1458644 non-null int64
pickup_datetime           1458644 non-null datetime64[ns]
dropoff_datetime          1458644 non-null datetime64[ns]
passenger_count           1458644 non-null int64
pickup_longitude          1458644 non-null float64
pickup_latitude           1458644 non-null float64
dropoff_longitude         1458644 non-null float64
dropoff_latitude          1458644 non-null float64
store_and_fwd_flag        1458644 non-null object
trip_duration             1458644 non-null int64
pickup_datetime_epoch     1458644 non-null int64
dropoff_datetime_epoch    1458644 non-null int64
dtypes: datetime64[ns](2), float64(4), int64(5), object(2)
memory usage: 144.7+ MB


In [24]:
# Attach the route features to each trip by `id`. This is an inner join (the
# default), so only ids present in both frames are kept; the recorded outputs
# show 1,458,644 trips before the join and 1,458,643 rows after it.
train_with_routes = train.merge(routes, how='inner', on='id')

In [25]:
# Verify the merged frame: row count after the join and the added route columns.
train_with_routes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1458643 entries, 0 to 1458642
Data columns (total 18 columns):
id                        1458643 non-null object
vendor_id                 1458643 non-null int64
pickup_datetime           1458643 non-null datetime64[ns]
dropoff_datetime          1458643 non-null datetime64[ns]
passenger_count           1458643 non-null int64
pickup_longitude          1458643 non-null float64
pickup_latitude           1458643 non-null float64
dropoff_longitude         1458643 non-null float64
dropoff_latitude          1458643 non-null float64
store_and_fwd_flag        1458643 non-null object
trip_duration             1458643 non-null int64
pickup_datetime_epoch     1458643 non-null int64
dropoff_datetime_epoch    1458643 non-null int64
Origin_start_name         1458643 non-null object
Destination_end_name      1458643 non-null object
Total_distance            1458643 non-null float64
Total_time                1458643 non-null float64
Number_of_steps     

In [29]:
# Preview the first rows of the merged frame (trip columns followed by route columns).
train_with_routes.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_datetime_epoch,dropoff_datetime_epoch,Origin_start_name,Destination_end_name,Total_distance,Total_time,Number_of_steps
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1457976295,1457976750,Columbus Circle,East 65th Street,2009.1,164.9,5
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1465692215,1465692878,2nd Avenue,Washington Square West,2513.2,332.0,6
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,1453203324,1453205448,West 56th Street,Beekman Street,11060.8,767.6,16
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1459971151,1459971580,Greenwich Street,Broadway,1779.4,235.8,4
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1458999055,1458999490,Broadway,West 81st Street,1614.9,140.1,5


In [28]:
# Persist the merged frame. The row index is written as well (pandas default).
# NOTE(review): consider index=False if no downstream reader needs that column - confirm.
train_with_routes.to_csv(path_or_buf='./data/train_with_routes.csv')