In [32]:
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
import seaborn as sns
import ipyleaflet
from math import sin, cos, sqrt, atan2, radians
import folium
import folium.plugins as plugins
import os
from folium.plugins import MarkerCluster     # Map
from geographiclib.geodesic import Geodesic  # Map
import time, datetime                        # time data
import calendar
import scipy

%matplotlib inline

import statsmodels.api as sm
from sklearn.datasets import make_blobs
from sklearn.preprocessing import scale, robust_scale, minmax_scale, maxabs_scale
from sklearn.preprocessing import normalize
import statsmodels


In [33]:
train = pd.read_csv("~/Documents/data/taxi_data/train.csv")
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [34]:
test = pd.read_csv("~/Documents/data/taxi_data/test.csv")
test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [35]:
sample_submission = pd.read_csv("~/Documents/data/taxi_data/sample_submission.csv")
sample_submission.head()

Unnamed: 0,id,trip_duration
0,id3004672,959
1,id3505355,959
2,id1217141,959
3,id2150126,959
4,id1598245,959


In [36]:
train = train.drop("dropoff_datetime", axis=1)

In [37]:
#data type convert to datetime from object
train["pickup_datetime"] =  pd.to_datetime(train["pickup_datetime"])
test["pickup_datetime"] =  pd.to_datetime(test["pickup_datetime"])

In [38]:
#day of week
#Monday=0, Sunday=6
train["pick_dayofweek"] = train["pickup_datetime"].dt.dayofweek
# train["drop_dayofweek"] = train["dropoff_datetime"].dt.dayofweek.astype("int")

#date by month
train["pick_dayofmonth"] = train["pickup_datetime"].dt.month
# train["drop_dayofmonth"] = train["dropoff_datetime"].dt.month.astype("int")

# #date by hour
train["pick_datehour"] = train["pickup_datetime"].dt.hour
# # train["drop_datehour"] = train["dropoff_datetime"].dt.hour.astype("int")

In [39]:
#day of week
#Monday=0, Sunday=6
test["pick_dayofweek"] = test["pickup_datetime"].dt.dayofweek

#date by month
test["pick_dayofmonth"] = test["pickup_datetime"].dt.month

#date by hour
test["pick_datehour"] = test["pickup_datetime"].dt.hour

In [40]:
# approximate radius of earth in km
# train
R = 6371.0

dist = []

for i in range(len(train)):
    lat1 = radians(train.iloc[i,5])
    lon1 = radians(train.iloc[i,4])
    lat2 = radians(train.iloc[i,7])
    lon2 = radians(train.iloc[i,6])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    dist.append(distance)
    
train['distance'] = dist

In [41]:
# approximate radius of earth in km
# test
R = 6371.0

dist = []

for i in range(len(test)):
    lat1 = radians(test.iloc[i,5])
    lon1 = radians(test.iloc[i,4])
    lat2 = radians(test.iloc[i,7])
    lon2 = radians(test.iloc[i,6])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    dist.append(distance)
    
test['distance'] = dist

In [42]:
train['manhattan_distance'] = (abs(train.dropoff_longitude - train.pickup_longitude) +
                            abs(train.dropoff_latitude - train.pickup_latitude))

In [43]:
test['manhattan_distance'] = (abs(test.dropoff_longitude - test.pickup_longitude) +
                            abs(test.dropoff_latitude - test.pickup_latitude))

## 2.2 Direction

In [44]:
def calculate_bearing(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    '''Calculate the direction of travel in degrees'''
    pickup_lat_rads = np.radians(pickup_lat)
    pickup_long_rads = np.radians(pickup_long)
    dropoff_lat_rads = np.radians(dropoff_lat)
    dropoff_long_rads = np.radians(dropoff_long)
    long_delta_rads = np.radians(dropoff_long_rads - pickup_long_rads)
    
    y = np.sin(long_delta_rads) * np.cos(dropoff_lat_rads)
    x = (np.cos(pickup_lat_rads) * 
         np.sin(dropoff_lat_rads) - 
         np.sin(pickup_lat_rads) * 
         np.cos(dropoff_lat_rads) * 
         np.cos(long_delta_rads))
    
    return np.degrees(np.arctan2(y, x))

In [45]:
train['bearing'] = calculate_bearing(train.pickup_latitude,
                                     train.pickup_longitude,
                                     train.dropoff_latitude,
                                     train.dropoff_longitude)


In [46]:
test['bearing'] = calculate_bearing(test.pickup_latitude,
                                     test.pickup_longitude,
                                     test.dropoff_latitude,
                                     test.dropoff_longitude)

In [47]:
train.loc[train.distance > 200] = np.nan ##200km 넘는 데이터 제거
train.loc[train.trip_duration > 40000] = np.nan ##40000초(약 11시간)가 넘는 데이터 제거
train.loc[train.passenger_count == 0] = np.NAN   ### passenger 수가 0인 데이터 제거
train.dropna(inplace=True)

In [48]:
train['store_and_fwd_flag'] = 1 * (train.store_and_fwd_flag.values == 'Y')
test['store_and_fwd_flag'] = 1 * (test.store_and_fwd_flag.values == 'Y')

In [49]:
# train = pd.get_dummies(train, columns=["store_and_fwd_flag"], prefix='store_and_fwd_flag')
# test = pd.get_dummies(test, columns=["store_and_fwd_flag"], prefix='store_and_fwd_flag')

In [50]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1456576 entries, 0 to 1458643
Data columns (total 16 columns):
id                    1456576 non-null object
vendor_id             1456576 non-null float64
pickup_datetime       1456576 non-null datetime64[ns]
passenger_count       1456576 non-null float64
pickup_longitude      1456576 non-null float64
pickup_latitude       1456576 non-null float64
dropoff_longitude     1456576 non-null float64
dropoff_latitude      1456576 non-null float64
store_and_fwd_flag    1456576 non-null int64
trip_duration         1456576 non-null float64
pick_dayofweek        1456576 non-null float64
pick_dayofmonth       1456576 non-null float64
pick_datehour         1456576 non-null float64
distance              1456576 non-null float64
manhattan_distance    1456576 non-null float64
bearing               1456576 non-null float64
dtypes: datetime64[ns](1), float64(13), int64(1), object(1)
memory usage: 188.9+ MB


In [51]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 15 columns):
id                    625134 non-null object
vendor_id             625134 non-null int64
pickup_datetime       625134 non-null datetime64[ns]
passenger_count       625134 non-null int64
pickup_longitude      625134 non-null float64
pickup_latitude       625134 non-null float64
dropoff_longitude     625134 non-null float64
dropoff_latitude      625134 non-null float64
store_and_fwd_flag    625134 non-null int64
pick_dayofweek        625134 non-null int64
pick_dayofmonth       625134 non-null int64
pick_datehour         625134 non-null int64
distance              625134 non-null float64
manhattan_distance    625134 non-null float64
bearing               625134 non-null float64
dtypes: datetime64[ns](1), float64(7), int64(6), object(1)
memory usage: 71.5+ MB


In [66]:
X_train = train.drop(labels = ["id","vendor_id", 'pick_dayofweek', 'pick_dayofmonth', 'pick_datehour', "pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude","store_and_fwd_flag", "trip_duration","pickup_datetime", "manhattan_distance"], axis=1)
Y_train = train["trip_duration"]
X_test  = test.drop(labels = ["id", "vendor_id", 'pick_dayofweek', 'pick_dayofmonth', 'pick_datehour',"pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude", "store_and_fwd_flag","pickup_datetime", "manhattan_distance"], axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((1456576, 3), (1456576,), (625134, 3))

In [67]:
import statsmodels.api as sm

OLS_model = sm.OLS(Y_train, X_train).fit()
print(OLS_model.summary())

                            OLS Regression Results                            
Dep. Variable:          trip_duration   R-squared:                       0.774
Model:                            OLS   Adj. R-squared:                  0.774
Method:                 Least Squares   F-statistic:                 1.665e+06
Date:                Thu, 15 Mar 2018   Prob (F-statistic):               0.00
Time:                        21:20:53   Log-Likelihood:            -1.1167e+07
No. Observations:             1456576   AIC:                         2.233e+07
Df Residuals:                 1456573   BIC:                         2.233e+07
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
passenger_count   113.5582      0.238    4

In [68]:
Y_test = OLS_model.predict(X_test)

sub = pd.DataFrame()
sub['Id'] = test["id"]
sub['trip_duration'] = Y_test
sub.to_csv('submission_OLS_selected.csv',index=False)

# decision tree

In [55]:
from sklearn.tree import DecisionTreeRegressor

In [56]:

# Regression
import scipy
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.cross_validation import cross_val_score
# Decission Tree regressor
from sklearn.tree import DecisionTreeRegressor




In [168]:
X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(X_train, Y_train, test_size=0.20)

In [171]:
model_dt=DecisionTreeRegressor(criterion='mse', splitter='best', max_depth=None, 
                              min_samples_split=2, min_samples_leaf=1, 
                              min_weight_fraction_leaf=0.0, max_features=None, 
                              random_state=0, max_leaf_nodes=None, presort=False)
model_dt.fit(X_train_tree, y_train_tree)
print(cross_val_score(model_dt,X_train_tree, y_train_tree,cv=5))

[0.36454361 0.20395866 0.42521454 0.37353782 0.3194221 ]


# model importance

In [None]:
importances = model_dt.feature_importances_
indices = np.argsort(importances)[::-1]
# std = np.std([model_dt.feature_importances_ for treet in model_dt.estimators_],
#             axis=0)


# # Print the feature ranking
print("Feature ranking:")

for f in range(train_tree.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# # Plot the feature importances of the forest
# plt.figure()
# plt.title("Feature importances")
# plt.bar(range(dfx.shape[1]), importances[indices],
#        color="r", yerr=std[indices], align="center")
# plt.xticks(range(dfx.shape[1]), indices)
# plt.xlim([-1, dfx.shape[1]])
# plt.show()

In [None]:
y_test = model_dt.predict(dfx_test)

In [None]:
y_test = np.exp(y_test) - 1

In [None]:
y_test = pd.DataFrame(y_test)

In [None]:
submission = pd.concat([test['id'], y_test], axis=1)
submission.columns = ['id','trip_duration']
submission['trip_duration'] = submission.apply(lambda x : 1 if (x['trip_duration'] <= 0) else x['trip_duration'], axis = 1)
submission.to_csv("submission_dt.csv", index=False)

# random forest

In [172]:
from sklearn.ensemble import RandomForestRegressor

In [173]:
#Random Forest Regressor
model_rnd_frst=RandomForestRegressor(n_estimators=10, criterion='mse', 
                                    min_samples_split=2, min_samples_leaf=1, 
                                    min_weight_fraction_leaf=0.0, max_features='auto', 
                                    max_leaf_nodes=None, min_impurity_decrease=1e-07, 
                                    bootstrap=True, oob_score=False, n_jobs=-1, 
                                    random_state=0, verbose=1, warm_start=False)
model_rnd_frst.fit(X_train_tree, y_train_tree)
print(cross_val_score(model_rnd_frst,X_train_tree,y_train,cv=5))

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


[-0.21126335 -0.1676509  -0.13659318 -0.43931837 -0.21436379]


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


In [None]:
y_test1 = model_rnd_frst.predict(dfx_test)

In [None]:
y_test1 = np.exp(y_test1) - 1

In [None]:
y_test1 = pd.DataFrame(y_test1)

In [None]:
y_test1.shpae()

In [None]:
submission = pd.concat([test['id'], y_test1], axis=1)
submission.columns = ['id','trip_duration']
submission['trip_duration'] = submission.apply(lambda x : 1 if (x['trip_duration'] <= 0) else x['trip_duration'], axis = 1)
submission.to_csv("submission_rf.csv", index=False)

# gradient boost

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
model_gb=GradientBoostingRegressor(loss='ls', learning_rate=0.05, n_estimators=400, subsample=1.0,
                                  criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, 
                                 min_weight_fraction_leaf=0.0, max_depth=5,
                                  init=None, random_state=None, max_features=None, alpha=0.9, 
                                  verbose=0, 
                                  max_leaf_nodes=None, warm_start=False, presort='auto')
model_gb.fit(X_train, y_train)
print(cross_val_score(model_gb,dfx,y,cv=3))

In [None]:
y_test2 = model_rnd_frst.predict(dfx_test)

In [None]:
y_test2 = np.exp(y_test2) - 1

In [None]:
y_test2 = pd.DataFrame(y_test2)

In [None]:
y_test2.shpae()

In [None]:
submission = pd.concat([test['id'], y_test2], axis=1)
submission.columns = ['id','trip_duration']
submission['trip_duration'] = submission.apply(lambda x : 1 if (x['trip_duration'] <= 0) else x['trip_duration'], axis = 1)
submission.to_csv("submission_gb.csv", index=False)