In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set(style='whitegrid')


In [3]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [4]:
# Peek at the first five training rows to check the columns that were loaded.
train.head(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [5]:
# Peek at the first five test rows to check the columns that were loaded.
test.head(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [6]:
# Preview the 0/1 encoding of store_and_fwd_flag: 1 where the flag is 'Y', else 0.
train['store_and_fwd_flag'].eq('Y').values.astype('int')

array([0, 0, 0, ..., 0, 0, 0])

In [7]:
# Preview the 0/1 encoding of vendor_id: 1 where the vendor is 1, else 0.
train['vendor_id'].eq(1).values.astype('int')

array([0, 1, 0, ..., 0, 1, 1])

In [8]:
def _build_feature_matrix(frame):
    """Return an ``(n_rows, 7)`` float array of model features for ``frame``.

    Column layout:
      0-4  passenger_count, pickup_longitude, pickup_latitude,
           dropoff_longitude, dropoff_latitude (copied as-is)
      5    1.0 where store_and_fwd_flag == 'Y', else 0.0
      6    1.0 where vendor_id == 1, else 0.0

    The row count is taken from ``frame`` itself rather than hard-coded, so
    the same code works for the train set, the test set, or any subsample.
    """
    numeric_cols = ['passenger_count', 'pickup_longitude',
                    'pickup_latitude', 'dropoff_longitude',
                    'dropoff_latitude']
    features = np.zeros((frame.shape[0], 7))
    features[:, :5] = frame.loc[:, numeric_cols].values
    features[:, 5] = (frame.store_and_fwd_flag == 'Y').values.astype('int')
    features[:, 6] = (frame.vendor_id == 1).values.astype('int')
    return features


# Train and test go through the same function so their columns always line up.
dtrain = _build_feature_matrix(train)
dtest = _build_feature_matrix(test)

In [9]:
# Regression target: the trip_duration column as a plain NumPy array.
target = train['trip_duration'].values

In [10]:
# Display the target array as a quick sanity check.
target

array([ 455,  663, 2124, ...,  764,  373,  198])

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [36]:
# 5-fold cross-validation of ordinary least squares on the log-transformed target.
#
# KFold is used without shuffling. `random_state` only has an effect when
# shuffle=True, and current scikit-learn raises ValueError if it is passed
# otherwise, so it is dropped; the folds are the same contiguous splits as before.
kf = KFold(n_splits=5)
res = []  # per-fold RMSLE, read by the following cell
for train_index, test_index in kf.split(dtrain, target):
    X_train, X_test = dtrain[train_index], dtrain[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # log1p(y) == log(y + 1); RMSE on these values is the RMSLE.
    y_ltrain = np.log1p(y_train)
    y_ltest = np.log1p(y_test)

    # Fit the scaler on the training fold only, then apply it to the held-out fold.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # The inputs are already standardized, so the `normalize` option (removed
    # from recent scikit-learn releases) is unnecessary and is not passed.
    clf = LinearRegression()
    clf.fit(X_train, y_ltrain)
    preds = clf.predict(X_test)

    res.append(np.sqrt(mean_squared_error(y_ltest, preds)))
    print('RMSLE: {:.5f}'.format(res[-1]))

RMSLE: 0.77986
RMSLE: 0.77463
RMSLE: 0.78293
RMSLE: 0.78670
RMSLE: 0.78442


In [37]:
# Report the average of the per-fold scores collected in `res`.
print('Mean RMSLE: {:.5f}'.format(np.asarray(res).mean()))

Mean RMSLE: 0.78171


In [12]:
# 5-fold cross-validation of a linear-kernel SVR on the log-transformed target.
#
# KFold is used without shuffling. `random_state` only has an effect when
# shuffle=True, and current scikit-learn raises ValueError if it is passed
# otherwise, so it is dropped; the folds are the same contiguous splits as before.
kf = KFold(n_splits=5)
res = []  # per-fold RMSLE, read by the following cell
for train_index, test_index in kf.split(dtrain, target):
    X_train, X_test = dtrain[train_index], dtrain[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # log1p(y) == log(y + 1); RMSE on these values is the RMSLE.
    y_ltrain = np.log1p(y_train)
    y_ltest = np.log1p(y_test)

    # Fit the scaler on the training fold only, then apply it to the held-out fold.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # NOTE(review): max_iter=1000 caps the solver; the scores recorded for this
    # cell are far worse than the other models, so the fit has likely not
    # converged -- treat these numbers with caution.
    clf = SVR(kernel='linear', max_iter=1000)
    clf.fit(X_train, y_ltrain)
    preds = clf.predict(X_test)

    res.append(np.sqrt(mean_squared_error(y_ltest, preds)))
    print('RMSLE: {:.5f}'.format(res[-1]))



RMSLE: 2.46085
RMSLE: 2.13221
RMSLE: 1.18304
RMSLE: 3.01022
RMSLE: 3.44467


In [13]:
# Report the average of the per-fold scores collected in `res`.
print('Mean RMSLE: {:.5f}'.format(np.asarray(res).mean()))

Mean RMSLE: 2.44620


In [16]:
# 5-fold cross-validation of a random forest on the log-transformed target.
#
# KFold is used without shuffling. `random_state` only has an effect when
# shuffle=True, and current scikit-learn raises ValueError if it is passed
# otherwise, so it is dropped; the folds are the same contiguous splits as before.
kf = KFold(n_splits=5)
res = []  # per-fold RMSLE, read by the following cell
for train_index, test_index in kf.split(dtrain, target):
    X_train, X_test = dtrain[train_index], dtrain[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # log1p(y) == log(y + 1); RMSE on these values is the RMSLE.
    y_ltrain = np.log1p(y_train)
    y_ltest = np.log1p(y_test)

    # Fit the scaler on the training fold only, then apply it to the held-out
    # fold (kept for parity with the other model cells).
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # random_state fixes the forest's own randomness so reruns are repeatable.
    clf = RandomForestRegressor(n_estimators=10,
                                random_state=0,
                                max_depth=15)
    clf.fit(X_train, y_ltrain)
    preds = clf.predict(X_test)

    res.append(np.sqrt(mean_squared_error(y_ltest, preds)))
    print('RMSLE: {:.5f}'.format(res[-1]))

RMSLE: 0.52760
RMSLE: 0.52970
RMSLE: 0.52841
RMSLE: 0.52889
RMSLE: 0.52739


In [17]:
# Report the average of the per-fold scores collected in `res`.
print('Mean RMSLE: {:.5f}'.format(np.asarray(res).mean()))

Mean RMSLE: 0.52840


In [18]:
# Stacking experiment: three base models (random forest, linear regression,
# linear SVR) are fitted on the first half of the training fold; a linear
# meta-model (`lr_over`) is fitted on their predictions for the second half;
# everything is then scored on the held-out fold.
#
# KFold is used without shuffling. `random_state` only has an effect when
# shuffle=True, and current scikit-learn raises ValueError if it is passed
# otherwise, so it is dropped; the folds are the same contiguous splits as before.
kf = KFold(n_splits=5)
res = []
for train_index, test_index in kf.split(dtrain, target):
    X_train, X_test = dtrain[train_index], dtrain[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # log1p(y) == log(y + 1); RMSE on these values is the RMSLE.
    y_ltrain = np.log1p(y_train)
    y_ltest = np.log1p(y_test)

    # Split the training fold in two: the first half trains the base models,
    # the second half (the "*_test_sm" arrays) trains the meta-model.
    half = len(X_train) // 2
    X_train_sm, X_test_sm = X_train[:half], X_train[half:]
    y_ltrain_sm, y_ltest_sm = y_ltrain[:half], y_ltrain[half:]

    # The scaler is fitted on the base-model half only and reused everywhere else.
    sc = StandardScaler()
    X_train_sm = sc.fit_transform(X_train_sm)
    X_test_sm = sc.transform(X_test_sm)
    X_test = sc.transform(X_test)
    print('Scaled')

    rf = RandomForestRegressor(n_estimators=10,
                               random_state=0,
                               max_depth=15)
    rf.fit(X_train_sm, y_ltrain_sm)
    print('Fitted RF')
    lr = LinearRegression()
    lr.fit(X_train_sm, y_ltrain_sm)
    print('Fitted LR')
    svr = SVR(kernel='linear', max_iter=500)
    svr.fit(X_train_sm, y_ltrain_sm)
    print('Fitted SVR')

    # Meta-features: one column per base model, in the order RF, LR, SVR.
    preds = np.column_stack([rf.predict(X_test_sm),
                             lr.predict(X_test_sm),
                             svr.predict(X_test_sm)])
    lr_over = LinearRegression()
    lr_over.fit(preds, y_ltest_sm)

    # Score each base model on the held-out fold.
    pred_rf = rf.predict(X_test)
    pred_lr = lr.predict(X_test)
    pred_svr = svr.predict(X_test)

    print('RMSLE RF : {:.5f}'.format(
        np.sqrt(mean_squared_error(y_ltest, pred_rf))))
    print('RMSLE LR : {:.5f}'.format(
        np.sqrt(mean_squared_error(y_ltest, pred_lr))))
    print('RMSLE SVR: {:.5f}'.format(
        np.sqrt(mean_squared_error(y_ltest, pred_svr))))

    # Same column order as used to fit the meta-model above.
    preds = np.column_stack([pred_rf, pred_lr, pred_svr])
    fin_preds = lr_over.predict(preds)

    res.append(np.sqrt(mean_squared_error(y_ltest, fin_preds)))
    print('RMSLE: {:.5f}'.format(res[-1]))
    # NOTE(review): only the first fold is evaluated -- presumably to limit
    # runtime; remove the break to score all five folds.
    break

Scaled
Fitted RF
Fitted LR




Fitted SVR
RMSLE RF : 0.52958
RMSLE LR : 0.76953
RMSLE SVR: 3.66454
RMSLE: 0.52916


In [20]:
# Weights the stacking meta-model assigns to the base-model predictions,
# in the column order they were stacked: RF, LR, SVR.
lr_over.coef_

array([ 1.03440532, -0.00843699, -0.06825399])