In [1]:
import datetime
import lightgbm as lgb
import numpy as np
import os
import pandas as pd
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import haversine

In [2]:
random_seed = 1234
random.seed(random_seed)
np.random.seed(random_seed)

In [3]:
# Load data
train = pd.read_csv('data/taxi/train.csv')
test = pd.read_csv('data/taxi/test.csv')
ss = pd.read_csv('data/taxi/sample_submission.csv')

In [4]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [5]:
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.sqrt(np.mean(np.power(np.log1p(y_true + 1) - np.log1p(y_pred + 1), 2)))

In [6]:
def extract_features(df):
    df['hdistance'] = df.apply(lambda r: haversine.haversine((r['pickup_latitude'],r['pickup_longitude']),(r['dropoff_latitude'], r['dropoff_longitude'])), axis=1)
    df['distance'] = np.sqrt(np.power(df['dropoff_longitude'] - df['pickup_longitude'], 2) + np.power(df['dropoff_latitude'] - df['pickup_latitude'], 2))
    df['log_distance'] = np.log(df['distance'] + 1)
    df['month'] = df['pickup_datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[1]))
    df['day'] = df['pickup_datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[2]))
    df['hour'] = df['pickup_datetime'].apply(lambda x: int(x.split(' ')[1].split(':')[0]))
    df['minutes'] = df['pickup_datetime'].apply(lambda x: int(x.split(' ')[1].split(':')[1]))
    df['is_weekend'] = ((df.pickup_datetime.astype('datetime64[ns]').dt.dayofweek) // 4 == 1).astype(float)
    df['weekday'] = df.pickup_datetime.astype('datetime64[ns]').dt.dayofweek
    df['is_holyday'] = df.apply(lambda row: 1 if (row['month']==1 and row['day']==1) or (row['month']==7 and row['day']==4) or (row['month']==11 and row['day']==11) or (row['month']==12 and row['day']==25) or (row['month']==1 and row['day'] >= 15 and row['day'] <= 21 and row['weekday'] == 0) or (row['month']==2 and row['day'] >= 15 and row['day'] <= 21 and row['weekday'] == 0) or (row['month']==5 and row['day'] >= 25 and row['day'] <= 31 and row['weekday'] == 0) or (row['month']==9 and row['day'] >= 1 and row['day'] <= 7 and row['weekday'] == 0) or (row['month']==10 and row['day'] >= 8 and row['day'] <= 14 and row['weekday'] == 0) or (row['month']==11 and row['day'] >= 22 and row['day'] <= 28 and row['weekday'] == 3) else 0, axis=1)
    df['is_day_before_holyday'] = df.apply(lambda row: 1 if (row['month']==12 and row['day']==31) or (row['month']==7 and row['day']==3) or (row['month']==11 and row['day']==10) or (row['month']==12 and row['day']==24) or (row['month']==1 and row['day'] >= 14 and row['day'] <= 20 and row['weekday'] == 6) or (row['month']==2 and row['day'] >= 14 and row['day'] <= 20 and row['weekday'] == 6) or (row['month']==5 and row['day'] >= 24 and row['day'] <= 30 and row['weekday'] == 6) or ((row['month']==9 and row['day'] >= 1 and row['day'] <= 6) or (row['month']==8 and row['day'] == 31) and row['weekday'] == 6) or (row['month']==10 and row['day'] >= 7 and row['day'] <= 13 and row['weekday'] == 6) or (row['month']==11 and row['day'] >= 21 and row['day'] <= 27 and row['weekday'] == 2) else 0, axis=1)
    df['store_and_fwd_flag'] = df['store_and_fwd_flag'].map(lambda x: 0 if x =='N' else 1)
    df.drop('day', axis=1, inplace=True)

In [7]:
# Extract features
print('Extracting train features')
extract_features(train)
print('Extracting test features')
extract_features(test)

train.head()

Extracting train features
Extracting test features


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,hdistance,distance,log_distance,month,hour,minutes,is_weekend,weekday,is_holyday,is_day_before_holyday
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,...,1.498521,0.01768,0.017525,3,17,24,0.0,0,0,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,...,1.805507,0.020456,0.020249,6,0,43,1.0,6,0,0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,...,6.385098,0.059934,0.058206,1,11,35,0.0,1,0,0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,...,1.485498,0.013438,0.013349,4,19,32,0.0,2,0,0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,...,1.188588,0.01069,0.010633,3,13,30,1.0,5,0,0


In [8]:
# Prepare data
X = np.array(train.drop(['id', 'pickup_datetime', 'dropoff_datetime', 'store_and_fwd_flag', 'trip_duration'], axis=1))
y = np.log(train['trip_duration'].values)
median_trip_duration = np.median(train['trip_duration'].values)

print('X.shape = ' + str(X.shape))
print('y.shape = ' + str(y.shape))

X_test = np.array(test.drop(['id', 'pickup_datetime', 'store_and_fwd_flag'], axis=1))

print('X_test.shape = ' + str(X_test.shape))

X.shape = (1458644, 16)
y.shape = (1458644,)
X_test.shape = (625134, 16)


In [11]:
print('Training and making predictions')

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmsle',
    'max_depth': 6, 
    'learning_rate': 0.4,
    'num_leaves': 45,
    'max_bin': 250
}

n_estimators = 300
n_iters = 200
preds_buf = []
err_buf = []
for i in range(n_iters): 
    x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=i)
    d_train = lgb.Dataset(x_train, label=y_train)
    d_valid = lgb.Dataset(x_valid, label=y_valid)
    watchlist = [d_valid]

    model = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=1)

    preds = model.predict(x_valid)
    preds = np.exp(preds)
    preds[preds < 0] = median_trip_duration
    err = rmsle(np.exp(y_valid), preds)
    err_buf.append(err)
    print(str(i) + ' random_state, ' + ' RMSLE = ' + str(err))
    
    preds = model.predict(X_test)
    preds = np.exp(preds)
    preds[preds < 0] = median_trip_duration
    preds_buf.append(preds)

print('Mean RMSLE = ' + str(np.mean(err_buf)) + ' +/- ' + str(np.std(err_buf)))
# Average predictions
preds = np.mean(preds_buf, axis=0)

Training and making predictions
0 random_state,  RMSLE = 0.395705200886
1 random_state,  RMSLE = 0.392638777689
2 random_state,  RMSLE = 0.391998816362
3 random_state,  RMSLE = 0.397788099767
4 random_state,  RMSLE = 0.394443899708
5 random_state,  RMSLE = 0.395280342863
6 random_state,  RMSLE = 0.392742930419
7 random_state,  RMSLE = 0.397975374369
8 random_state,  RMSLE = 0.395817026157
9 random_state,  RMSLE = 0.389823933912
10 random_state,  RMSLE = 0.395156035938
11 random_state,  RMSLE = 0.395625169674
12 random_state,  RMSLE = 0.398561782446
13 random_state,  RMSLE = 0.393545292952
14 random_state,  RMSLE = 0.400839602608
15 random_state,  RMSLE = 0.399973420601
16 random_state,  RMSLE = 0.399625877822
17 random_state,  RMSLE = 0.394149878499
18 random_state,  RMSLE = 0.398465112735
19 random_state,  RMSLE = 0.393720034681
20 random_state,  RMSLE = 0.399842939114
21 random_state,  RMSLE = 0.395015609851
22 random_state,  RMSLE = 0.399472052979
23 random_state,  RMSLE = 0.3960540

197 random_state,  RMSLE = 0.398617460083
198 random_state,  RMSLE = 0.395410730384
199 random_state,  RMSLE = 0.401277552949
Mean RMSLE = 0.396006867879 +/- 0.00304585108137


In [63]:
# Prepare submission
subm = pd.DataFrame()
subm['id'] = test.id.values
subm['trip_duration'] = preds
subm.to_csv('submission_taxi_lgbm.csv', index=False)