In [106]:
# imports
import numpy as np
import pandas as pd
import datetime
import time
import xgboost as xgb

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

first_datetime = datetime.datetime.strptime('2018-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')
last_datetime = datetime.datetime.strptime('2018-07-01 00:00:00', '%Y-%m-%d %H:%M:%S')
train_valid_split_datetime = datetime.datetime.strptime('2018-06-01 00:00:00', '%Y-%m-%d %H:%M:%S')
print('first_datetime:', first_datetime)
print('last_datetime:', last_datetime)
print('train_valid_split_datetime:', train_valid_split_datetime)

first_datetime: 2018-01-01 00:00:00
last_datetime: 2018-07-01 00:00:00
train_valid_split_datetime: 2018-06-01 00:00:00


In [107]:
taxi_zone_lookup = pd.read_csv('nyc-tlc/misc/taxi _zone_lookup.csv')
print('taxi_zone_lookup:', taxi_zone_lookup.shape)
print(taxi_zone_lookup.head())
manhattan_location_ids = taxi_zone_lookup[taxi_zone_lookup['Borough']=='Manhattan']['LocationID'].values
print('manhattan_location_ids:', manhattan_location_ids.shape)
print(manhattan_location_ids)

taxi_zone_lookup: (265, 4)
   LocationID        Borough                     Zone service_zone
0           1            EWR           Newark Airport          EWR
1           2         Queens              Jamaica Bay    Boro Zone
2           3          Bronx  Allerton/Pelham Gardens    Boro Zone
3           4      Manhattan            Alphabet City  Yellow Zone
4           5  Staten Island            Arden Heights    Boro Zone
manhattan_location_ids: (69,)
[  4  12  13  24  41  42  43  45  48  50  68  74  75  79  87  88  90 100
 103 104 105 107 113 114 116 120 125 127 128 137 140 141 142 143 144 148
 151 152 153 158 161 162 163 164 166 170 186 194 202 209 211 224 229 230
 231 232 233 234 236 237 238 239 243 244 246 249 261 262 263]


In [108]:
def get_5min_id(x):
    return (x-first_datetime).total_seconds()//(5*60)

def get_15min_id(x):
    return (x-first_datetime).total_seconds()//(15*60)

def get_30min_id(x):
    return (x-first_datetime).total_seconds()//(30*60)

In [109]:
train_valid_split_5min_id = get_5min_id(train_valid_split_datetime)
train_valid_split_15min_id = get_15min_id(train_valid_split_datetime)
train_valid_split_30min_id = get_30min_id(train_valid_split_datetime)
print('train_valid_split_5min_id:', train_valid_split_5min_id)
print('train_valid_split_15min_id:', train_valid_split_15min_id)
print('train_valid_split_30min_id:', train_valid_split_30min_id)

train_valid_split_5min_id: 43488.0
train_valid_split_15min_id: 14496.0
train_valid_split_30min_id: 7248.0


In [None]:
start = time.time()
# sample = pd.read_csv('nyc-tlc/trip data/sample.csv')
sample_1 = pd.read_csv('nyc-tlc/trip data/yellow_tripdata_2018-01.csv')
print('read_csv 2018-01:', time.time()-start)
sample_2 = pd.read_csv('nyc-tlc/trip data/yellow_tripdata_2018-02.csv')
print('read_csv 2018-02:', time.time()-start)
sample_3 = pd.read_csv('nyc-tlc/trip data/yellow_tripdata_2018-03.csv')
print('read_csv 2018-03:', time.time()-start)
sample_4 = pd.read_csv('nyc-tlc/trip data/yellow_tripdata_2018-04.csv')
print('read_csv 2018-04:', time.time()-start)
sample_5 = pd.read_csv('nyc-tlc/trip data/yellow_tripdata_2018-05.csv')
print('read_csv 2018-05:', time.time()-start)
sample_6 = pd.read_csv('nyc-tlc/trip data/yellow_tripdata_2018-06.csv')
print('read_csv 2018-06:', time.time()-start)
sample = pd.concat([sample_1, sample_2, sample_3, sample_4, sample_5, sample_6], axis=0)
print('concat:', time.time()-start)
print('sample:', sample.shape)
#print(sample.head())
sample_manhattan = sample[sample['PULocationID'].isin(manhattan_location_ids)].copy()
print('sample_manhattan:', sample_manhattan.shape)
#print(sample_manhattan.head())
print('manhattan_location_ids:', time.time()-start)

sample_manhattan['store_and_fwd_flag'] = sample_manhattan['store_and_fwd_flag'].map(lambda x: x=='N' and 0 or 1)
print('store_and_fwd_flag:', time.time()-start)
sample_manhattan['tpep_pickup_datetime'] = pd.to_datetime(sample_manhattan['tpep_pickup_datetime'])
print('tpep_pickup_datetime:', time.time()-start)
sample_manhattan['tpep_dropoff_datetime'] = pd.to_datetime(sample_manhattan['tpep_dropoff_datetime'])
print('tpep_dropoff_datetime:', time.time()-start)
sample_manhattan['trip_duration'] = sample_manhattan['tpep_dropoff_datetime']-sample_manhattan['tpep_pickup_datetime']
print('trip_duration:', time.time()-start)
sample_manhattan['trip_duration'] = sample_manhattan['trip_duration'].map(lambda x: x.total_seconds())
sample_manhattan['trip_speed'] = sample_manhattan['trip_distance']/sample_manhattan['trip_duration']
print('trip_speed:', time.time()-start)
sample_manhattan['tpep_pickup_year'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: x.year)
print('tpep_pickup_year:', time.time()-start)
sample_manhattan['tpep_pickup_month'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: x.month)
print('tpep_pickup_month:', time.time()-start)
sample_manhattan['tpep_pickup_day'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: x.day)
print('tpep_pickup_day:', time.time()-start)
sample_manhattan['tpep_pickup_hour'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: x.hour)
print('tpep_pickup_hour:', time.time()-start)
sample_manhattan['tpep_pickup_weekday'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: x.weekday())
print('tpep_pickup_weekday:', time.time()-start)
sample_manhattan['is_weekend'] = sample_manhattan['tpep_pickup_weekday'].map(lambda x: x >= 5 and 1 or 0)
print('is_weekend:', time.time()-start)
sample_manhattan['is_morning_peak'] = sample_manhattan['tpep_pickup_hour'].map(lambda x: 7 <= x <= 9 and 1 or 0)
print('is_morning_peak:', time.time()-start)
sample_manhattan['is_evening_peak'] = sample_manhattan['tpep_pickup_hour'].map(lambda x: 17 <= x <= 19 and 1 or 0)
print('is_evening_peak:', time.time()-start)

sample_manhattan['tpep_pickup_5min_id'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: get_5min_id(x))
print('tpep_pickup_5min_id:', time.time()-start)
sample_manhattan['tpep_pickup_15min_id'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: get_15min_id(x))
print('tpep_pickup_15min_id:', time.time()-start)
sample_manhattan['tpep_pickup_30min_id'] = sample_manhattan['tpep_pickup_datetime'].map(lambda x: get_30min_id(x))
print('tpep_pickup_30min_id:', time.time()-start)

sample_manhattan = sample_manhattan[sample_manhattan['trip_duration'] > 0]
print('filter trip_duration:', time.time()-start)
sample_manhattan = sample_manhattan[sample_manhattan['tpep_pickup_datetime'] >= first_datetime]
print('filter tpep_pickup_datetime first_datetime:', time.time()-start)
sample_manhattan = sample_manhattan[sample_manhattan['tpep_pickup_datetime'] < last_datetime]
print('filter tpep_pickup_datetime last_datetime:', time.time()-start)

sample_manhattan.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1, inplace=True)

print(sample_manhattan.head())

read_csv 2018-01: 26.00922417640686
read_csv 2018-02: 48.23054885864258
read_csv 2018-03: 75.53822207450867
read_csv 2018-04: 101.48924684524536
read_csv 2018-05: 127.66417193412781
read_csv 2018-06: 152.3633451461792
concat: 222.19246411323547
sample: (53925735, 17)
sample_manhattan: (48959840, 17)
manhattan_location_ids: 489.9099760055542
store_and_fwd_flag: 520.4687070846558
tpep_pickup_datetime: 534.4488830566406
tpep_dropoff_datetime: 548.3604698181152
trip_duration: 551.1335899829865
trip_speed: 1178.8778040409088
tpep_pickup_year: 1461.6116058826447
tpep_pickup_month: 1723.655357837677
tpep_pickup_day: 2035.2782599925995
tpep_pickup_hour: 2345.657704114914
tpep_pickup_weekday: 2595.4595999717712
is_weekend: 2616.74267411232
is_morning_peak: 2640.3090789318085
is_evening_peak: 2662.765875816345
tpep_pickup_5min_id: 3780.9651639461517
tpep_pickup_15min_id: 5007.410728931427
tpep_pickup_30min_id: 6187.078573942184
filter trip_duration: 7024.000195026398


In [74]:
def stat(x):
    return pd.Series(
        [x.count(), x.min(), x.idxmin(), x.quantile(.25), x.median(), x.quantile(.75), x.mean(), x.max(), x.idxmax(),
         x.mad(), x.var(), x.std(), x.skew(), x.kurt()],
        index=['总数', '最小值', '最小值位置', '25%分位数', '中位数', '75%分位数', '均值', '最大值', '最大值位数', '平均绝对偏差', '方差', '标准差', '偏度',
               '峰度'])

In [76]:
sample_manhattan_stat = sample_manhattan.apply(stat)
print('sample_manhattan_stat:')
print(sample_manhattan_stat)

sample_manhattan_stat:
          VendorID  passenger_count  trip_distance  RatecodeID  \
总数     7959723.000      7959723.000    7959723.000 7959723.000   
最小值          1.000            0.000          0.000       1.000   
最小值位置        0.000          505.000        305.000       0.000   
25%分位数       1.000            1.000          0.900       1.000   
中位数          2.000            1.000          1.480       1.000   
75%分位数       2.000            2.000          2.500       1.000   
均值           1.564            1.610          2.234       1.017   
最大值          2.000            9.000        830.800      99.000   
最大值位数       12.000       204469.000    1858065.000 4387623.000   
平均绝对偏差       0.492            0.890          1.510       0.034   
方差           0.246            1.591          6.487       0.045   
标准差          0.496            1.261          2.547       0.211   
偏度          -0.258            2.230          8.284      62.873   
峰度          -1.934            4.065       1434.769   

In [79]:
train_sample_5min_count = sample_manhattan[sample_manhattan['tpep_pickup_5min_id'] < train_valid_split_5min_id].groupby(['tpep_pickup_5min_id', 'PULocationID']).count()['VendorID']
print('train_sample_5min_count:', train_sample_5min_count.shape)
print(train_sample_5min_count.head())
valid_sample_5min_count = sample_manhattan[sample_manhattan['tpep_pickup_5min_id'] >= train_valid_split_5min_id].groupby(['tpep_pickup_5min_id', 'PULocationID']).count()['VendorID']
print('valid_sample_5min_count:', valid_sample_5min_count.shape)
print(valid_sample_5min_count.head())
print('valid_sample_5min_count:', type(valid_sample_5min_count))
print(valid_sample_5min_count[5760][4])

train_sample_5min_count: (307167,)
tpep_pickup_5min_id  PULocationID
0.000                4               3
                     13              4
                     24              2
                     41              3
                     42              3
Name: VendorID, dtype: int64
valid_sample_5min_count: (169459,)
tpep_pickup_5min_id  PULocationID
5760.000             4               14
                     13               4
                     24               2
                     41               7
                     42               2
Name: VendorID, dtype: int64


In [95]:
sample_manhattan_5min_count = sample_manhattan.groupby(['tpep_pickup_5min_id', 'PULocationID']).count()['VendorID']

def generate_features(data_X):
    start = time.time()
    ago_5min = []
    ago_1day_now = []
    ago_7day_now = []
    ago_14day_now = []
    ago_21day_now = []
    ago_28day_now = []
    for index,row in data_X.iterrows():
        #print('index:', index)
        #print('row:', row)
        try:
            ago_5min.append(sample_manhattan_5min_count[index[0]-1][index[1]])
        except:
            ago_5min.append(0)
        try:
            ago_1day_now.append(sample_manhattan_5min_count[index[0]-288][index[1]])
        except:
            ago_1day_now.append(0)
        try:
            ago_7day_now.append(sample_manhattan_5min_count[index[0]-288*7][index[1]])
        except:
            ago_7day_now.append(0)
        try:
            ago_14day_now.append(sample_manhattan_5min_count[index[0]-288*14][index[1]])
        except:
            ago_14day_now.append(0)
        try:
            ago_21day_now.append(sample_manhattan_5min_count[index[0]-288*21][index[1]])
        except:
            ago_21day_now.append(0)
        try:
            ago_28day_now.append(sample_manhattan_5min_count[index[0]-288*28][index[1]])
        except:
            ago_28day_now.append(0)
    data_X['5min_ago'] = np.array(ago_5min)
    data_X['1day_ago_now'] = np.array(ago_1day_now)
    data_X['7day_ago_now'] = np.array(ago_7day_now)
    data_X['14day_ago_now'] = np.array(ago_14day_now)
    data_X['21day_ago_now'] = np.array(ago_21day_now)
    data_X['28day_ago_now'] = np.array(ago_28day_now)
    print('data_X time:', time.time()-start)
    return data_X

train_X = pd.DataFrame(train_sample_5min_count)
train_X = generate_features(train_X)
print('train_X:', train_X.shape)
print(train_X.head())
valid_X = pd.DataFrame(valid_sample_5min_count)
valid_X = generate_features(valid_X)
print('valid_X:', valid_X.shape)
print(valid_X.head())
train_Y = train_sample_5min_count.values
print('train_Y:', len(train_Y))
valid_Y = valid_sample_5min_count.values
print('valid_Y:', len(valid_Y))

train_X time: 357.75938081741333
train_X: (307167, 7)
                                  VendorID  5min_ago  1day_ago_now  \
tpep_pickup_5min_id PULocationID                                     
0.000               4                    3         0             0   
                    13                   4         0             0   
                    24                   2         0             0   
                    41                   3         0             0   
                    42                   3         0             0   

                                  7day_ago_now  14day_ago_now  21day_ago_now  \
tpep_pickup_5min_id PULocationID                                               
0.000               4                        0              0              0   
                    13                       0              0              0   
                    24                       0              0              0   
                    41                       0         

In [103]:
def xgb_train_validate(train_X, train_Y, test_X, test_Y):
    xg_train = xgb.DMatrix(train_X.drop('VendorID', axis=1), label=train_Y)
    xg_test = xgb.DMatrix(test_X.drop('VendorID', axis=1), label=test_Y)
    # setup parameters for xgboost
    param = {}
    # scale weight of positive examples
    param['eta'] = 0.1  # default
    # param['eta'] = 0.02
    param['max_depth'] = 20  # default: 6
    param['silent'] = 1  # default
    param['nthread'] = 4  # default
    param['gamma'] = 1
    param['subsample'] = 0.9
    param['min_child_weight'] = 1
    param['colsample_bytree'] = 0.9
    param['lambda'] = 1
    param['booster'] = 'gbtree'
    param['eval_metric'] = 'mae'

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    # num_round = 5
    num_round = 400

    # param['objective'] = 'reg:linear'
    param['objective'] = 'reg:gamma'
    bst = xgb.train(param, xg_train, num_round, watchlist)

    # xgb.plot_tree(bst)
    # plt.savefig('xgboost_tree.png')
    # xgb.plot_importance(bst)
    # plt.savefig('xgboost_importance.png')
    imp = bst.get_fscore()
    print(sorted(imp.items(), key=lambda d: d[1], reverse=True))

In [104]:
xgb_train_validate(train_X, train_Y, valid_X, valid_Y)

[0]	train-mae:15.7128	test-mae:16.9487
[1]	train-mae:15.6607	test-mae:16.8964
[2]	train-mae:15.6042	test-mae:16.8395
[3]	train-mae:15.5428	test-mae:16.7775
[4]	train-mae:15.4764	test-mae:16.7104
[5]	train-mae:15.4045	test-mae:16.6376
[6]	train-mae:15.3267	test-mae:16.5588
[7]	train-mae:15.243	test-mae:16.4739
[8]	train-mae:15.1545	test-mae:16.3839
[9]	train-mae:15.0625	test-mae:16.29
[10]	train-mae:14.9643	test-mae:16.1906
[11]	train-mae:14.8627	test-mae:16.0861
[12]	train-mae:14.7547	test-mae:15.9747
[13]	train-mae:14.6387	test-mae:15.855
[14]	train-mae:14.5139	test-mae:15.7261
[15]	train-mae:14.38	test-mae:15.5876
[16]	train-mae:14.2374	test-mae:15.4396
[17]	train-mae:14.0861	test-mae:15.2818
[18]	train-mae:13.9259	test-mae:15.1146
[19]	train-mae:13.7562	test-mae:14.9365
[20]	train-mae:13.5765	test-mae:14.748
[21]	train-mae:13.3871	test-mae:14.5485
[22]	train-mae:13.1876	test-mae:14.3381
[23]	train-mae:12.9808	test-mae:14.1165
[24]	train-mae:12.7613	test-mae:13.8829
[25]	train-mae:12

[204]	train-mae:3.55259	test-mae:3.63189
[205]	train-mae:3.55252	test-mae:3.63202
[206]	train-mae:3.55255	test-mae:3.63209
[207]	train-mae:3.55245	test-mae:3.63212
[208]	train-mae:3.55224	test-mae:3.63221
[209]	train-mae:3.5521	test-mae:3.63231
[210]	train-mae:3.5519	test-mae:3.63263
[211]	train-mae:3.55186	test-mae:3.63264
[212]	train-mae:3.55187	test-mae:3.63277
[213]	train-mae:3.55179	test-mae:3.63263
[214]	train-mae:3.55178	test-mae:3.63262
[215]	train-mae:3.5517	test-mae:3.63267
[216]	train-mae:3.55156	test-mae:3.6326
[217]	train-mae:3.55151	test-mae:3.63257
[218]	train-mae:3.55152	test-mae:3.63261
[219]	train-mae:3.55137	test-mae:3.63251
[220]	train-mae:3.55139	test-mae:3.63255
[221]	train-mae:3.55139	test-mae:3.63259
[222]	train-mae:3.5514	test-mae:3.63261
[223]	train-mae:3.55117	test-mae:3.63263
[224]	train-mae:3.55116	test-mae:3.63262
[225]	train-mae:3.55117	test-mae:3.63267
[226]	train-mae:3.55113	test-mae:3.63272
[227]	train-mae:3.55105	test-mae:3.63253
[228]	train-mae:3.551

In [105]:
train_X_stat = train_X.apply(stat)
print('train_X_stat:')
print(train_X_stat)
valid_X_stat = valid_X.apply(stat)
print('valid_X_stat:')
print(valid_X_stat)

train_X_stat:
            VendorID      5min_ago  1day_ago_now   7day_ago_now  \
总数            307167        307167        307167         307167   
最小值                1             0             0              0   
最小值位置      (0.0, 45)      (0.0, 4)      (0.0, 4)       (0.0, 4)   
25%分位数         3.000         3.000         2.000          0.000   
中位数           10.000        10.000         9.000          3.000   
75%分位数        24.000        24.000        23.000         14.000   
均值            16.261        16.149        15.204          9.962   
最大值              151           151           151            130   
最大值位数   (3768.0, 79)  (3769.0, 79)  (4056.0, 79)  (2564.0, 230)   
平均绝对偏差        13.045        13.124        13.073         11.125   
方差           290.346       293.540       292.399        229.002   
标准差           17.040        17.133        17.100         15.133   
偏度             1.697         1.674         1.712          2.145   
峰度             3.367         3.287         3.400