In [46]:
# imports
import numpy as np
import pandas as pd
import datetime
import time

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

first_datetime = datetime.datetime.strptime('2018-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
train_valid_split_datetime = datetime.datetime.strptime('2018-01-21 00:00:00', '%Y-%m-%d %H:%M:%S')
print('first_datetime:', first_datetime)
print('train_valid_split_datetime:', train_valid_split_datetime)

first_datetime: 2018-01-01 00:00:00
train_valid_split_datetime: 2018-01-21 00:00:00


In [47]:
taxi_zone_lookup = pd.read_csv('nyc-tlc/misc/taxi _zone_lookup.csv')
print('taxi_zone_lookup:', taxi_zone_lookup.shape)
print(taxi_zone_lookup.head())
manhattan_location_ids = taxi_zone_lookup[taxi_zone_lookup['Borough']=='Manhattan']['LocationID'].values
print('manhattan_location_ids:', manhattan_location_ids.shape)
print(manhattan_location_ids)

taxi_zone_lookup: (265, 4)
   LocationID        Borough                     Zone service_zone
0           1            EWR           Newark Airport          EWR
1           2         Queens              Jamaica Bay    Boro Zone
2           3          Bronx  Allerton/Pelham Gardens    Boro Zone
3           4      Manhattan            Alphabet City  Yellow Zone
4           5  Staten Island            Arden Heights    Boro Zone
manhattan_location_ids: (69,)
[  4  12  13  24  41  42  43  45  48  50  68  74  75  79  87  88  90 100
 103 104 105 107 113 114 116 120 125 127 128 137 140 141 142 143 144 148
 151 152 153 158 161 162 163 164 166 170 186 194 202 209 211 224 229 230
 231 232 233 234 236 237 238 239 243 244 246 249 261 262 263]


In [48]:
def get_5min_id(x):
    return (x-first_datetime).total_seconds()//(5*60)

def get_15min_id(x):
    return (x-first_datetime).total_seconds()//(15*60)

def get_30min_id(x):
    return (x-first_datetime).total_seconds()//(30*60)

In [50]:
start = time.time()
# sample = pd.read_csv('nyc-tlc/trip data/sample.csv')
sample = pd.read_csv('nyc-tlc/trip data/yellow_tripdata_2018-01.csv')
print('read_csv:', time.time()-start)
print('sample:', sample.shape)
#print(sample.head())
sample_manhattan = sample[sample['PULocationID'].isin(manhattan_location_ids)].copy()
print('sample_manhattan:', sample_manhattan.shape)
#print(sample_manhattan.head())
print('manhattan_location_ids:', time.time()-start)

sample_manhattan['store_and_fwd_flag'] = sample_manhattan['store_and_fwd_flag'].map(lambda x: x=='N' and 0 or 1)
print('store_and_fwd_flag:', time.time()-start)
sample_manhattan['tpep_pickup_datetime'] = pd.to_datetime(sample_manhattan['tpep_pickup_datetime'])
print('tpep_pickup_datetime:', time.time()-start)
sample_manhattan['tpep_dropoff_datetime'] = pd.to_datetime(sample_manhattan['tpep_dropoff_datetime'])
print('tpep_dropoff_datetime:', time.time()-start)
sample_manhattan['trip_duration'] = sample_manhattan['tpep_dropoff_datetime']-sample_manhattan['tpep_pickup_datetime']
print('trip_duration:', time.time()-start)
sample_manhattan['trip_duration'] = sample_manhattan['trip_duration'].map(lambda x: x.total_seconds())
sample_manhattan['trip_speed'] = sample_manhattan['trip_distance']/sample_manhattan['trip_duration']
print('trip_speed:', time.time()-start)
sample_manhattan['tpep_pickup_year'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: x.year)
print('tpep_pickup_year:', time.time()-start)
sample_manhattan['tpep_pickup_month'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: x.month)
print('tpep_pickup_month:', time.time()-start)
sample_manhattan['tpep_pickup_day'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: x.day)
print('tpep_pickup_day:', time.time()-start)
sample_manhattan['tpep_pickup_hour'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: x.hour)
print('tpep_pickup_hour:', time.time()-start)
sample_manhattan['tpep_pickup_weekday'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: x.weekday())
print('tpep_pickup_weekday:', time.time()-start)
sample_manhattan['is_weekend'] = sample_manhattan['tpep_pickup_weekday'].map(lambda x: x >= 5 and 1 or 0)
print('is_weekend:', time.time()-start)
sample_manhattan['is_morning_peak'] = sample_manhattan['tpep_pickup_hour'].map(lambda x: 7 <= x <= 9 and 1 or 0)
print('is_morning_peak:', time.time()-start)
sample_manhattan['is_evening_peak'] = sample_manhattan['tpep_pickup_hour'].map(lambda x: 17 <= x <= 19 and 1 or 0)
print('is_evening_peak:', time.time()-start)

sample_manhattan['tpep_pickup_5min_id'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: get_5min_id(x))
print('tpep_pickup_5min_id:', time.time()-start)
sample_manhattan['tpep_pickup_15min_id'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: get_15min_id(x))
print('tpep_pickup_15min_id:', time.time()-start)
sample_manhattan['tpep_pickup_30min_id'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: get_30min_id(x))
print('tpep_pickup_30min_id:', time.time()-start)

sample_manhattan = sample_manhattan[sample_manhattan['tpep_pickup_5min_id'] >= 0]
print('filter tpep_pickup_5min_id:', time.time()-start)

sample_manhattan.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1, inplace=True)

print(sample_manhattan.head())

read_csv: 24.77973484992981
sample: (8759874, 17)
sample_manhattan: (7965703, 17)
manhattan_location_ids: 29.243255853652954
store_and_fwd_flag: 32.8868727684021
tpep_pickup_datetime: 34.80854868888855
tpep_dropoff_datetime: 36.67246389389038
trip_duration: 36.78250074386597
trip_duration: 109.39003682136536
trip_speed: 109.60747385025024
tpep_pickup_year: 141.46104764938354
tpep_pickup_month: 173.77325987815857
tpep_pickup_day: 204.80911207199097
tpep_pickup_hour: 237.49810194969177
tpep_pickup_weekday: 269.04331493377686
is_weekend: 272.2670168876648
is_morning_peak: 275.75064277648926
is_evening_peak: 279.17119884490967
tpep_pickup_5min_id: 438.8752498626709
tpep_pickup_15min_id: 598.274386882782
tpep_pickup_30min_id: 759.2338318824768
   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         1  2018-01-01 00:21:05   2018-01-01 00:24:23                1   
1         1  2018-01-01 00:44:55   2018-01-01 01:03:05                1   
2         1  2018-01-01 00

In [53]:
def stat(x):
    return pd.Series(
        [x.count(), x.min(), x.idxmin(), x.quantile(.25), x.median(), x.quantile(.75), x.mean(), x.max(), x.idxmax(),
         x.mad(), x.var(), x.std(), x.skew(), x.kurt()],
        index=['总数', '最小值', '最小值位置', '25%分位数', '中位数', '75%分位数', '均值', '最大值', '最大值位数', '平均绝对偏差', '方差', '标准差', '偏度',
               '峰度'])

In [57]:
sample_manhattan = sample_manhattan[sample_manhattan['tpep_pickup_5min_id'] >= 0]
sample_manhattan_stat = sample_manhattan.apply(stat)
print('sample_manhattan_stat:')
print(sample_manhattan_stat)

sample_manhattan_stat:
          VendorID  passenger_count  trip_distance  RatecodeID  \
总数     7965469.000      7965469.000    7965469.000 7965469.000   
最小值          1.000            0.000          0.000       1.000   
最小值位置        0.000          505.000         63.000       0.000   
25%分位数       1.000            1.000          0.900       1.000   
中位数          2.000            1.000          1.480       1.000   
75%分位数       2.000            2.000          2.500       1.000   
均值           1.563            1.610          2.233       1.018   
最大值          2.000            9.000        830.800      99.000   
最大值位数       12.000       204469.000    1858065.000 2280156.000   
平均绝对偏差       0.492            0.890          1.510       0.035   
方差           0.246            1.590          6.487       0.062   
标准差          0.496            1.261          2.547       0.249   
偏度          -0.256            2.230          8.282     145.490   
峰度          -1.934            4.068       1433.739   

In [59]:
train_valid_split_5min_id = get_5min_id(train_valid_split_datetime)
train_valid_split_15min_id = get_15min_id(train_valid_split_datetime)
train_valid_split_30min_id = get_30min_id(train_valid_split_datetime)
print('train_valid_split_5min_id:', train_valid_split_5min_id)
print('train_valid_split_15min_id:', train_valid_split_15min_id)
print('train_valid_split_30min_id:', train_valid_split_30min_id)

train_valid_split_5min_id: 5760.0
train_valid_split_15min_id: 1920.0
train_valid_split_30min_id: 960.0


In [63]:
train_sample_5min_count = sample_manhattan[sample_manhattan['tpep_pickup_5min_id'] < train_valid_split_5min_id].groupby(['tpep_pickup_5min_id', 'PULocationID']).count()['VendorID']
print('train_sample_5min_count:', train_sample_5min_count.shape)
print(train_sample_5min_count.head())
test_sample_5min_count = sample_manhattan[sample_manhattan['tpep_pickup_5min_id'] >= train_valid_split_5min_id].groupby(['tpep_pickup_5min_id', 'PULocationID']).count()['VendorID']
print('test_sample_5min_count:', test_sample_5min_count.shape)
print(test_sample_5min_count.head())

train_sample_5min_count: (307202,)
tpep_pickup_5min_id  PULocationID
0.000                4               3
                     13              4
                     24              2
                     41              3
                     42              3
Name: VendorID, dtype: int64
test_sample_5min_count: (169531,)
tpep_pickup_5min_id  PULocationID
5760.000             4               14
                     13               4
                     24               2
                     41               7
                     42               2
Name: VendorID, dtype: int64


In [65]:
print('test_sample_5min_count:', type(test_sample_5min_count))
print(test_sample_5min_count[0, 4])

test_sample_5min_count: <class 'pandas.core.series.Series'>


KeyError: (0, 4)

In [None]:
tripdata_201801 = pd.read_csv('nyc-tlc/trip data/yellow_tripdata_2018-01.csv')
print('tripdata_201801:', tripdata_201801.shape, tripdata_201801.head())