In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
import xgboost as xgb
%matplotlib inline

In [3]:
train = pd.read_csv('./train.csv')
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


### Extract features from datetime(8)

In [4]:
train['pickup_datetime'] = pd.to_datetime(train.pickup_datetime)
train['dropoff_datetime'] = pd.to_datetime(train.dropoff_datetime)
train.loc[:, 'pickup_date'] = train['pickup_datetime'].dt.date
train.loc[:, 'pickup_weekday'] = train['pickup_datetime'].dt.weekday
train.loc[:, 'pickup_hour_weekofyear'] = train['pickup_datetime'].dt.weekofyear
train.loc[:, 'pickup_hour'] = train['pickup_datetime'].dt.hour
train.loc[:, 'pickup_minute'] = train['pickup_datetime'].dt.minute
train.loc[:, 'pickup_dt'] = (train['pickup_datetime'] - train['pickup_datetime'].min()).dt.total_seconds()
train.loc[:, 'pickup_week_hour'] = train['pickup_weekday'] * 24 + train['pickup_hour']
train.loc[:, 'pickup_dt_bin'] = (train['pickup_dt'] // (3 * 3600))

### Extract features from longitude and latitude(16)

In [5]:
def haversine_array(lat1, lng1, lat2, lng2):
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    AVG_EARTH_RADIUS = 6371  # in km
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return h

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    a = haversine_array(lat1, lng1, lat1, lng2)
    b = haversine_array(lat1, lng1, lat2, lng1)
    return a + b

def bearing_array(lat1, lng1, lat2, lng2):
    AVG_EARTH_RADIUS = 6371  # in km
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))
coords = np.vstack((train[['pickup_latitude', 'pickup_longitude']].values,
                    train[['dropoff_latitude', 'dropoff_longitude']].values))

def rotate_distance(lat1,lng1,lat2,lng2):
    phi = (36.1/180)*np.pi
    lat1_rot = lat1*np.sin(phi)+lng1*np.cos(phi)
    lng1_rot = lat1*np.cos(phi)-lng1*np.sin(phi)
    lat2_rot = lat2*np.sin(phi)+lng2*np.cos(phi)
    lng2_rot = lat2*np.cos(phi)-lng2*np.sin(phi)
    rotate_distance = np.abs(lat1_rot-lat2_rot)+np.abs(lng1_rot-lng2_rot)
    return rotate_distance

In [6]:
pca = PCA().fit(coords)
train['pickup_pca0'] = pca.transform(train[['pickup_latitude', 'pickup_longitude']])[:, 0]
train['pickup_pca1'] = pca.transform(train[['pickup_latitude', 'pickup_longitude']])[:, 1]
train['dropoff_pca0'] = pca.transform(train[['dropoff_latitude', 'dropoff_longitude']])[:, 0]
train['dropoff_pca1'] = pca.transform(train[['dropoff_latitude', 'dropoff_longitude']])[:, 1]
train.loc[:, 'distance_haversine'] = haversine_array(train['pickup_latitude'].values, train['pickup_longitude'].values, train['dropoff_latitude'].values, train['dropoff_longitude'].values)
train.loc[:, 'distance_dummy_manhattan'] = dummy_manhattan_distance(train['pickup_latitude'].values, train['pickup_longitude'].values, train['dropoff_latitude'].values, train['dropoff_longitude'].values)
train.loc[:, 'direction'] = bearing_array(train['pickup_latitude'].values, train['pickup_longitude'].values, train['dropoff_latitude'].values, train['dropoff_longitude'].values)
train.loc[:, 'pca_manhattan'] = np.abs(train['dropoff_pca1'] - train['pickup_pca1']) + np.abs(train['dropoff_pca0'] - train['pickup_pca0'])
train.loc[:, 'rotate_distance'] = rotate_distance(train['pickup_latitude'].values,train['pickup_longitude'].values,train['dropoff_latitude'].values,train['dropoff_longitude'].values)
train.loc[:, 'pickup_lat_bin'] = np.round(train['pickup_latitude'], 2)
train.loc[:, 'pickup_long_bin'] = np.round(train['pickup_longitude'], 2)
train.loc[:, 'center_latitude'] = (train['pickup_latitude'].values + train['dropoff_latitude'].values) / 2
train.loc[:, 'center_longitude'] = (train['pickup_longitude'].values + train['dropoff_longitude'].values) / 2
train.loc[:, 'center_lat_bin'] = np.round(train['center_latitude'], 2)
train.loc[:, 'center_long_bin'] = np.round(train['center_longitude'], 2)
sample_ind = np.random.permutation(len(coords))[:500000]
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(coords[sample_ind])
train.loc[:, 'pickup_cluster'] = kmeans.predict(train[['pickup_latitude', 'pickup_longitude']])
train.loc[:, 'dropoff_cluster'] = kmeans.predict(train[['dropoff_latitude', 'dropoff_longitude']])

### Speed features

In [7]:
train.loc[:, 'avg_speed_h'] = 1000 * train['distance_haversine'] / train['trip_duration']
train.loc[:, 'avg_speed_m'] = 1000 * train['distance_dummy_manhattan'] / train['trip_duration']

### Log_trip_duration

In [8]:
train['log_trip_duration'] = np.log(train['trip_duration'].values + 1)

### Aggregate features

In [9]:
for gby_col in ['pickup_hour', 'pickup_date', 'pickup_dt_bin',
               'pickup_week_hour', 'pickup_cluster', 'dropoff_cluster']:
    gby = train.groupby(gby_col).mean()[['avg_speed_h', 'avg_speed_m', 'log_trip_duration']]
    gby.columns = ['%s_gby_%s' % (col, gby_col) for col in gby.columns]
    train = pd.merge(train, gby, how='left', left_on=gby_col, right_index=True)

for gby_cols in [['center_lat_bin', 'center_long_bin'],
                 ['pickup_hour', 'center_lat_bin', 'center_long_bin'],
                 ['pickup_hour', 'pickup_cluster'],  ['pickup_hour', 'dropoff_cluster'],
                 ['pickup_cluster', 'dropoff_cluster']]:
    coord_speed = train.groupby(gby_cols).mean()[['avg_speed_h']].reset_index()
    coord_count = train.groupby(gby_cols).count()[['id']].reset_index()
    coord_stats = pd.merge(coord_speed, coord_count, on=gby_cols)
    coord_stats = coord_stats[coord_stats['id'] > 100]
    coord_stats.columns = gby_cols + ['avg_speed_h_%s' % '_'.join(gby_cols), 'cnt_%s' %  '_'.join(gby_cols)]
    train = pd.merge(train, coord_stats, how='left', on=gby_cols)

In [10]:
group_freq = '60min'
df_all = train[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
train.loc[:, 'pickup_datetime_group'] = train['pickup_datetime'].dt.round(group_freq)

# Count trips over 60min
df_counts = df_all.set_index('pickup_datetime')[['id']].sort_index()
df_counts['count_60min'] = df_counts.isnull().rolling(group_freq).count()['id']
train = train.merge(df_counts, on='id', how='left')

# Count how many trips are going to each cluster over time
dropoff_counts = df_all \
    .set_index('pickup_datetime') \
    .groupby([pd.TimeGrouper(group_freq), 'dropoff_cluster']) \
    .agg({'id': 'count'}) \
    .reset_index().set_index('pickup_datetime') \
    .groupby('dropoff_cluster').rolling('240min').mean() \
    .drop('dropoff_cluster', axis=1) \
    .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
    .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': 'dropoff_cluster_count'})

train['dropoff_cluster_count'] = train[['pickup_datetime_group', 'dropoff_cluster']].merge(dropoff_counts, on=['pickup_datetime_group', 'dropoff_cluster'], how='left')['dropoff_cluster_count'].fillna(0)

  # This is added back by InteractiveShellApp.init_path()


In [11]:
# Count how many trips are going from each cluster over time
df_all = train[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
pickup_counts = df_all \
    .set_index('pickup_datetime') \
    .groupby([pd.TimeGrouper(group_freq), 'pickup_cluster']) \
    .agg({'id': 'count'}) \
    .reset_index().set_index('pickup_datetime') \
    .groupby('pickup_cluster').rolling('240min').mean() \
    .drop('pickup_cluster', axis=1) \
    .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
    .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': 'pickup_cluster_count'})

train['pickup_cluster_count'] = train[['pickup_datetime_group', 'pickup_cluster']].merge(pickup_counts, on=['pickup_datetime_group', 'pickup_cluster'], how='left')['pickup_cluster_count'].fillna(0)

  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,avg_speed_h_pickup_hour_pickup_cluster,cnt_pickup_hour_pickup_cluster,avg_speed_h_pickup_hour_dropoff_cluster,cnt_pickup_hour_dropoff_cluster,avg_speed_h_pickup_cluster_dropoff_cluster,cnt_pickup_cluster_dropoff_cluster,pickup_datetime_group,count_60min,dropoff_cluster_count,pickup_cluster_count
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,2.998304,2020.0,2.95178,1498.0,2.793804,921.0,2016-03-14 17:00:00,391.0,10.5,18.5
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,...,4.794508,425.0,3.791164,1461.0,2.829591,313.0,2016-06-12 01:00:00,461.0,11.75,2.25
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,2.989169,1119.0,3.485523,576.0,4.205389,115.0,2016-01-19 12:00:00,380.0,3.75,10.0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,...,4.056109,1095.0,4.541152,532.0,3.762733,330.0,2016-04-06 20:00:00,563.0,3.0,7.25
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,4.128996,1527.0,3.583851,676.0,4.658722,726.0,2016-03-26 14:00:00,432.0,4.75,9.25


### Weather features

In [13]:
weather = pd.read_csv('./weather.csv')
weather.head()

Unnamed: 0,date,maximum temperature,minimum temperature,average temperature,precipitation,snow fall,snow depth
0,1-1-2016,42,34,38.0,0.0,0.0,0
1,2-1-2016,40,32,36.0,0.0,0.0,0
2,3-1-2016,45,35,40.0,0.0,0.0,0
3,4-1-2016,36,14,25.0,0.0,0.0,0
4,5-1-2016,29,11,20.0,0.0,0.0,0


In [14]:
weather['precipitation'][weather['precipitation']=='T']=0
weather['snow fall'][weather['snow fall']=='T']=0
weather['snow depth'][weather['snow depth']=='T']=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
weather.head()

Unnamed: 0,date,maximum temperature,minimum temperature,average temperature,precipitation,snow fall,snow depth
0,1-1-2016,42,34,38.0,0.0,0.0,0
1,2-1-2016,40,32,36.0,0.0,0.0,0
2,3-1-2016,45,35,40.0,0.0,0.0,0
3,4-1-2016,36,14,25.0,0.0,0.0,0
4,5-1-2016,29,11,20.0,0.0,0.0,0


In [16]:
weather['date'] = pd.to_datetime(weather['date'])
weather['precipitation'] = pd.to_numeric(weather['precipitation'])
weather['snow fall']= pd.to_numeric(weather['snow fall'])
weather['snow depth']= pd.to_numeric(weather['snow depth'])
weather.rename(columns={'date':'pickup_date'}, inplace=True)

In [17]:
train.pickup_date = pd.to_datetime(train.pickup_date)
train['store_and_fwd_flag'] = 1 * (train.store_and_fwd_flag.values == 'Y')
train_merge_weather = pd.merge(train,weather,on='pickup_date')

In [18]:
train_merge_weather['pickup_date'] = train_merge_weather['pickup_datetime'].dt.date
train_merge_weather.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,pickup_datetime_group,count_60min,dropoff_cluster_count,pickup_cluster_count,maximum temperature,minimum temperature,average temperature,precipitation,snow fall,snow depth
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,2016-03-14 17:00:00,391.0,10.5,18.5,51,40,45.5,0.29,0.0,0
1,id2129090,1,2016-03-14 14:05:39,2016-03-14 14:28:05,1,-73.97509,40.758766,-73.953201,40.765068,0,...,2016-03-14 14:00:00,414.0,5.25,15.75,51,40,45.5,0.29,0.0,0
2,id0256505,1,2016-03-14 15:04:38,2016-03-14 15:16:13,1,-73.994484,40.745087,-73.998993,40.72271,0,...,2016-03-14 15:00:00,476.0,4.0,8.5,51,40,45.5,0.29,0.0,0
3,id3863815,2,2016-03-14 04:24:36,2016-03-14 04:37:11,3,-73.944359,40.714489,-73.91053,40.709492,0,...,2016-03-14 04:00:00,37.0,0.0,0.0,51,40,45.5,0.29,0.0,0
4,id3817493,2,2016-03-14 14:57:56,2016-03-14 15:15:26,1,-73.952881,40.766468,-73.97863,40.761921,0,...,2016-03-14 15:00:00,478.0,7.75,9.5,51,40,45.5,0.29,0.0,0


In [19]:
feature_names = list(train.columns)
do_not_use_for_training = ['id', 'log_trip_duration', 'pickup_datetime', 'dropoff_datetime',
                           'trip_duration', 'pickup_date', 'avg_speed_h', 'avg_speed_m',
                           'pickup_lat_bin', 'pickup_long_bin',
                           'center_lat_bin', 'center_long_bin',
                           'pickup_dt_bin', 'pickup_datetime_group']
feature_names = [f for f in train.columns if f not in do_not_use_for_training]
y = np.log(train['trip_duration'].values + 1)
Xtr, Xv, ytr, yv = train_test_split(train[feature_names].values, y, test_size=0.2, random_state=1987)
dtrain = xgb.DMatrix(Xtr, label=ytr)
dvalid = xgb.DMatrix(Xv, label=yv)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

xgb_pars = {'min_child_weight': 50, 'eta': 0.3, 'colsample_bytree': 0.3, 'max_depth': 12,
            'subsample': 0.8, 'lambda': 1., 'nthread': 4, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:linear'}

In [20]:
model = xgb.train(xgb_pars, dtrain, 60, watchlist, early_stopping_rounds=50,
                  maximize=False, verbose_eval=10)

[0]	train-rmse:4.23233	valid-rmse:4.23351
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[10]	train-rmse:0.405926	valid-rmse:0.42601
[20]	train-rmse:0.365703	valid-rmse:0.397499
[30]	train-rmse:0.357206	valid-rmse:0.395381
[40]	train-rmse:0.350104	valid-rmse:0.394734
[50]	train-rmse:0.344619	valid-rmse:0.394276
[59]	train-rmse:0.34003	valid-rmse:0.393948


### Add weather features(6)

In [21]:
feature_names = list(train_merge_weather.columns)
do_not_use_for_training = ['id', 'log_trip_duration', 'pickup_datetime', 'dropoff_datetime',
                           'trip_duration', 'pickup_date', 'avg_speed_h', 'avg_speed_m',
                           'pickup_lat_bin', 'pickup_long_bin',
                           'center_lat_bin', 'center_long_bin',
                           'pickup_dt_bin', 'pickup_datetime_group']
feature_names = [f for f in train_merge_weather.columns if f not in do_not_use_for_training]
y = np.log(train_merge_weather['trip_duration'].values + 1)
Xtr, Xv, ytr, yv = train_test_split(train_merge_weather[feature_names].values, y, test_size=0.2, random_state=1987)
dtrain = xgb.DMatrix(Xtr, label=ytr)
dvalid = xgb.DMatrix(Xv, label=yv)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

xgb_pars = {'min_child_weight': 50, 'eta': 0.3, 'colsample_bytree': 0.3, 'max_depth': 12,
            'subsample': 0.8, 'lambda': 1., 'nthread': 4, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:linear'}
model = xgb.train(xgb_pars, dtrain, 80, watchlist, early_stopping_rounds=50,
                  maximize=False, verbose_eval=10)

[0]	train-rmse:4.23254	valid-rmse:4.23378
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[10]	train-rmse:0.403017	valid-rmse:0.41942
[20]	train-rmse:0.368721	valid-rmse:0.395131
[30]	train-rmse:0.357988	valid-rmse:0.392021
[40]	train-rmse:0.35045	valid-rmse:0.390913
[50]	train-rmse:0.342965	valid-rmse:0.390495
[60]	train-rmse:0.33843	valid-rmse:0.390402
[70]	train-rmse:0.334798	valid-rmse:0.390729
[79]	train-rmse:0.33122	valid-rmse:0.390839


### Add bike count features

In [22]:
bike_df = pd.read_csv('./City Bike.csv')
bike_df.head()

Unnamed: 0,id,gender_id,pickup_datetime,dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
0,25242,2,2016-05-30 15:23:05,2016-05-30 15:52:59,-73.976485,40.759923,-74.00215,40.730386,1794
1,20900,1,2016-04-24 19:56:47,2016-04-24 20:02:17,-74.003664,40.743174,-74.00215,40.730386,329
2,18792,1,2016-06-25 15:08:38,2016-06-25 15:15:57,-73.991908,40.716059,-74.005524,40.711464,438
3,17420,1,2016-06-28 18:14:17,2016-06-28 18:35:25,-73.986569,40.701485,-73.9899,40.714275,1268
4,22403,1,2016-06-11 17:10:33,2016-06-11 17:14:44,-73.989551,40.740343,-73.990093,40.73705,251


In [23]:
bike_df['pickup_datetime'] = pd.to_datetime(bike_df['pickup_datetime'])
bike_df.loc[:,'pickup_date'] = bike_df['pickup_datetime'].dt.date
bike_df_date = bike_df.groupby('pickup_date').count()[['id']]
bike_df_date.rename(columns={'id':'count'}, inplace=True)

In [24]:
bike_df_date.reset_index(inplace=True)

In [25]:
train_merge_weather_bike = pd.merge(train_merge_weather,bike_df_date,on='pickup_date')

In [26]:
feature_names = list(train_merge_weather_bike.columns)
do_not_use_for_training = ['id', 'log_trip_duration', 'pickup_datetime', 'dropoff_datetime',
                           'trip_duration', 'pickup_date', 'avg_speed_h', 'avg_speed_m',
                           'pickup_lat_bin', 'pickup_long_bin',
                           'center_lat_bin', 'center_long_bin',
                           'pickup_dt_bin', 'pickup_datetime_group']
feature_names = [f for f in train_merge_weather_bike.columns if f not in do_not_use_for_training]
y = np.log(train_merge_weather_bike['trip_duration'].values + 1)
Xtr, Xv, ytr, yv = train_test_split(train_merge_weather_bike[feature_names].values, y, test_size=0.2, random_state=1987)
dtrain = xgb.DMatrix(Xtr, label=ytr)
dvalid = xgb.DMatrix(Xv, label=yv)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

xgb_pars = {'min_child_weight': 50, 'eta': 0.3, 'colsample_bytree': 0.3, 'max_depth': 12,
            'subsample': 0.8, 'lambda': 1., 'nthread': 4, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:linear'}
model = xgb.train(xgb_pars, dtrain, 80, watchlist, early_stopping_rounds=50,
                  maximize=False, verbose_eval=10)

[0]	train-rmse:4.23475	valid-rmse:4.23697
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[10]	train-rmse:0.405009	valid-rmse:0.420151
[20]	train-rmse:0.366503	valid-rmse:0.392258
[30]	train-rmse:0.357009	valid-rmse:0.390775
[40]	train-rmse:0.349038	valid-rmse:0.389274
[50]	train-rmse:0.344041	valid-rmse:0.388875
[60]	train-rmse:0.340667	valid-rmse:0.388936
[70]	train-rmse:0.337429	valid-rmse:0.389097
[79]	train-rmse:0.334928	valid-rmse:0.388912


### Add hoilday data

In [37]:
import holidays
from datetime import date

In [31]:
ny_holidays = holidays.CountryHoliday('US',state='NY')

In [45]:
is_holidays=pd.Series([each in ny_holidays for each in train_merge_weather_bike['pickup_date']])

In [46]:
train_merge_weather_bike['is_holidays'] = is_holidays

In [52]:
feature_names = list(train_merge_weather_bike.columns)
do_not_use_for_training = ['id', 'log_trip_duration', 'pickup_datetime', 'dropoff_datetime',
                           'trip_duration', 'pickup_date', 'avg_speed_h', 'avg_speed_m',
                           'pickup_lat_bin', 'pickup_long_bin',
                           'center_lat_bin', 'center_long_bin',
                           'pickup_dt_bin', 'pickup_datetime_group']
feature_names = [f for f in train_merge_weather_bike.columns if f not in do_not_use_for_training]
y = np.log(train_merge_weather_bike['trip_duration'].values + 1)
Xtr, Xv, ytr, yv = train_test_split(train_merge_weather_bike[feature_names].values, y, test_size=0.2, random_state=1987)
dtrain = xgb.DMatrix(Xtr, label=ytr)
dvalid = xgb.DMatrix(Xv, label=yv)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

xgb_pars = {'min_child_weight': 50, 'eta': 0.3, 'colsample_bytree': 0.3, 'max_depth': 12,
            'subsample': 0.8, 'lambda': 1., 'nthread': 4, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:linear'}
model = xgb.train(xgb_pars, dtrain, 80, watchlist, early_stopping_rounds=50,
                  maximize=False, verbose_eval=10)

[0]	train-rmse:4.23475	valid-rmse:4.23697
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[10]	train-rmse:0.409759	valid-rmse:0.424811
[20]	train-rmse:0.368623	valid-rmse:0.395862
[30]	train-rmse:0.357766	valid-rmse:0.39196
[40]	train-rmse:0.351049	valid-rmse:0.390454
[50]	train-rmse:0.345748	valid-rmse:0.389685
[60]	train-rmse:0.341157	valid-rmse:0.389745
[70]	train-rmse:0.338642	valid-rmse:0.389795
[79]	train-rmse:0.335583	valid-rmse:0.389911
