# Kaggle : New York City Taxi Trip Duration

<img src="taxi.png">

# 1 EDA (Exploratory Data Analysis)

# purpose of  EDA

- Suggest hypotheses about the causes of observed phenomena
- Assess assumptions on which statistical inference will be based
- Support the selection of appropriate statistical tools and techniques
- Provide a basis for further data collection through surveys or experiments

# EDA methods
- Graphical techniques used in EDA are:
    - boxplot 
        - detailed feature (datetime by month, day of week, hours)
    - historgram or barplot (distribution) # bin = range of value
        - origin feature (pick lat,long, drop lat, long, duration, passenger count, flag)
        - detailed feature (datetime by month, day of week, hours)
    - scatter plot
        - duration vs distance = to check odd data
    - Parallel Coordinates vs Colormaps vs Andrews curves charts
    - odd ratio????

- Quantative methods:
    - Trimean == tukey method?

# 1.1 Understanding data 

In [26]:
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
import seaborn as sns
import ipyleaflet
from math import sin, cos, sqrt, atan2, radians
import folium
import folium.plugins as plugins
import os
from folium.plugins import MarkerCluster     # Map
from geographiclib.geodesic import Geodesic  # Map
import time, datetime                        # time data
import calendar
import scipy

%matplotlib inline

import statsmodels.api as sm
from sklearn.datasets import make_blobs
from sklearn.preprocessing import scale, robust_scale, minmax_scale, maxabs_scale
from sklearn.preprocessing import normalize
import statsmodels


In [27]:
train = pd.read_csv("data/train.csv")
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [28]:
test = pd.read_csv("data/test.csv")
test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [29]:
sample_submission = pd.read_csv("data/sample_submission.csv")
sample_submission.head()

Unnamed: 0,id,trip_duration
0,id3004672,959
1,id3505355,959
2,id1217141,959
3,id2150126,959
4,id1598245,959


In [30]:
# np.random.seed(2)
# train = train.sample(frac=0.001, replace=True)
# train.info()
# np.random.seed(2)
# test = test.sample(frac=0.001, replace=True)
# test.info()

# 1.1.a Data type and unit

# unit

### 1. latitude / longtitude = decimal degree 
- 111.32mm per 0.000001° / 11.132 m per 0.0001° / 1.1132 km per 0.01° / 111.32 km per 1.0°
- 14 demical degree
- ex) 40.767937 , -73.982155

### 2. datetime = year-month-day: hour-minute-second

### 3. vendor_id = 1, 2

### 4. passenger_count = 0,,,,9

### 4. store_and_fwd_flag = N, Y

### 6. duration = second
- ex) 455 sec = 7min 35sec


In [31]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null object
dropoff_datetime      1458644 non-null object
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
trip_duration         1458644 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


In [32]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 9 columns):
id                    625134 non-null object
vendor_id             625134 non-null int64
pickup_datetime       625134 non-null object
passenger_count       625134 non-null int64
pickup_longitude      625134 non-null float64
pickup_latitude       625134 non-null float64
dropoff_longitude     625134 non-null float64
dropoff_latitude      625134 non-null float64
store_and_fwd_flag    625134 non-null object
dtypes: float64(4), int64(2), object(3)
memory usage: 42.9+ MB


In [33]:
sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 2 columns):
id               625134 non-null object
trip_duration    625134 non-null int64
dtypes: int64(1), object(1)
memory usage: 9.5+ MB


# train data
-  1.4M data, 11 columns

# test data
-  0.6M data, 9 columns (no dropoff_datetime, trip_duration)

# sample_submission
-  0.6M data, 2 columns (id, trip_duration)

# 1.1.b Missing Data check

In [34]:
#none of missing data
train2 = train.dropna(how = 'any')
test2 = test.dropna(how = 'any')
len(train) == len(train2), len(test) == len(test2)

(True, True)

# 1.1.c Trip duration check

In [35]:
train["pickup_datetime"] =  pd.to_datetime(train["pickup_datetime"])
train["dropoff_datetime"] =  pd.to_datetime(train["dropoff_datetime"])
sample_duration = train["dropoff_datetime"] - train["pickup_datetime"]
sample_duration_sec = sample_duration.dt.total_seconds().astype('int')
train['trip_sec'] =  sample_duration_sec

In [36]:
train_d = train[train["trip_duration"] != train["trip_sec"]]
len(train_d)

0

### NYC Taxi Trip Duration [Train data]는

### 총 1,458,644 Row와 11 Column으로 구성되어 있으며,

### Missing Data는 존재하지 않습니다.

# 1.1.c Column information

- id : 개별 Taxi에 부여된 고유 id (이건 그냥 쓴거예요...)
- verdor_id : Taxi Company id >>>  1, 2로 구성되어 있는걸로 봐서 2개의 회사를 대상
- pickup/dropoff datetime : 출발/도착 시간정보 >> 년, 월, 일, 시각 정보가 포함
- passenger_count : 승객수 >>> 0~9명까지 존재
- pickup/dropoff_longitude & latitude : 출발/도착 지리정보
- store_and_fwd_flag : whether the trip data was sent immediately to the vendor (“N”) or held in the memory of the taxi because there was no connection to the server (“Y”)
- trip_duration : 탑승시간 >>> 단위는 Seconds

# 1.2 Feature Engineering & Data Cleaning

### 1.2.a Add columns with detailed informations


- duration per min
- datetime per hour
- datetime per day of week
- datetime per month

# drop "dropoff_datetime" column

In [37]:
train = train.drop("dropoff_datetime", axis=1)

In [38]:
#data type convert to datetime from object
train["pickup_datetime"] =  pd.to_datetime(train["pickup_datetime"])
test["pickup_datetime"] =  pd.to_datetime(test["pickup_datetime"])

In [39]:
#day of week
#Monday=0, Sunday=6
train["pick_dayofweek"] = train["pickup_datetime"].dt.dayofweek
# train["drop_dayofweek"] = train["dropoff_datetime"].dt.dayofweek.astype("int")

#date by month
train["pick_dayofmonth"] = train["pickup_datetime"].dt.month
# train["drop_dayofmonth"] = train["dropoff_datetime"].dt.month.astype("int")

# #date by hour
train["pick_datehour"] = train["pickup_datetime"].dt.hour
# # train["drop_datehour"] = train["dropoff_datetime"].dt.hour.astype("int")

In [40]:
#day of week
#Monday=0, Sunday=6
test["pick_dayofweek"] = test["pickup_datetime"].dt.dayofweek

#date by month
test["pick_dayofmonth"] = test["pickup_datetime"].dt.month

#date by hour
test["pick_datehour"] = test["pickup_datetime"].dt.hour

# 1.2.b Distance between pickup and dropoff location

# Geographic space
   - Manhattan distance vs Euclidean distance

### Euclidean distance
- unit = km

# New York border coordinate

In [41]:
# new york city coordinate = (41.145495, −73.994901)
city_long_border = (-74.03, -73.75)
city_lat_border = (40.63, 40.85)

In [42]:
# approximate radius of earth in km
# train
R = 6371.0

dist = []

for i in range(len(train)):
    lat1 = radians(train.iloc[i,5])
    lon1 = radians(train.iloc[i,4])
    lat2 = radians(train.iloc[i,7])
    lon2 = radians(train.iloc[i,6])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    dist.append(distance)
    
train['distance'] = dist

In [43]:
# approximate radius of earth in km
# test
R = 6371.0

dist = []

for i in range(len(test)):
    lat1 = radians(test.iloc[i,5])
    lon1 = radians(test.iloc[i,4])
    lat2 = radians(test.iloc[i,7])
    lon2 = radians(test.iloc[i,6])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    dist.append(distance)
    
test['distance'] = dist

In [44]:
train.distance.head(5)

0    1.498521
1    1.805507
2    6.385098
3    1.485498
4    1.188588
Name: distance, dtype: float64

# [Taxicab geometry](https://en.wikipedia.org/wiki/Taxicab_geometry)

### green line : Euclidean distance
### purple line : Manhattan distance
![alt text](distance.jpeg "distance")
![alt text](HoULH.jpg "distance")
![alt text](sphere2.png "distance")
![alt text](main-qimg-3de1e2c9c9b93ffd501985fb36171388-c.jpeg "distance")

In [45]:
# train[train.trip_duration > 100000]

In [46]:
train['manhattan_distance'] = (abs(train.dropoff_longitude - train.pickup_longitude) +
                            abs(train.dropoff_latitude - train.pickup_latitude))

In [47]:
test['manhattan_distance'] = (abs(test.dropoff_longitude - test.pickup_longitude) +
                            abs(test.dropoff_latitude - test.pickup_latitude))

## 2.2 Direction

In [48]:
def calculate_bearing(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    '''Calculate the direction of travel in degrees'''
    pickup_lat_rads = np.radians(pickup_lat)
    pickup_long_rads = np.radians(pickup_long)
    dropoff_lat_rads = np.radians(dropoff_lat)
    dropoff_long_rads = np.radians(dropoff_long)
    long_delta_rads = np.radians(dropoff_long_rads - pickup_long_rads)
    
    y = np.sin(long_delta_rads) * np.cos(dropoff_lat_rads)
    x = (np.cos(pickup_lat_rads) * 
         np.sin(dropoff_lat_rads) - 
         np.sin(pickup_lat_rads) * 
         np.cos(dropoff_lat_rads) * 
         np.cos(long_delta_rads))
    
    return np.degrees(np.arctan2(y, x))

In [49]:
train['bearing'] = calculate_bearing(train.pickup_latitude,
                                     train.pickup_longitude,
                                     train.dropoff_latitude,
                                     train.dropoff_longitude)


In [50]:
test['bearing'] = calculate_bearing(test.pickup_latitude,
                                     test.pickup_longitude,
                                     test.dropoff_latitude,
                                     test.dropoff_longitude)

# 1.2.c Outlier Removal

### qualitative analysis
- 
- 
- 

### quantitative analysis
- Peirce's criterion
- Tukey's fences
- In anomaly detection
- Modified Thompson Tau test

# qualitative analysis

In [51]:
train.loc[train.distance > 200] = np.nan ##200km 넘는 데이터 제거
train.loc[train.trip_duration > 36000] = np.nan ##40000초(약 11시간)가 넘는 데이터 제거
train.loc[train.passenger_count == 0] = np.NAN   ### passenger 수가 0인 데이터 제거
train.dropna(inplace=True)

### Tukey's fences

### location coordinates mean, median

In [52]:
print(train["pickup_latitude"].mean(), train["dropoff_latitude"].mean())
print(train["pickup_latitude"].median(), train["dropoff_latitude"].median())

40.7509225231536 40.75180790130403
40.75410461425781 40.7545280456543


In [53]:
print(train["pickup_longitude"].mean(), train["dropoff_longitude"].mean())
print(train["pickup_longitude"].median(), train["dropoff_longitude"].median())

-73.97349490065326 -73.97340319536892
-73.98174285888672 -73.97975158691406


In [54]:
print(train["distance"].mean(), train["distance"].median())

3.4359716178035837 2.093386518701692


# 1.2.d.2 Spatial Data Analysis

### Types of spatial analysis
- FA(factor analysis)
    - Euclidean metric = > PCA(principal component analysis)
    - Chi-Square distance => Correspondence Analysis (similar to PCA, but better for categrorical data)
    - Generalized Mahalanobis distance => Discriminant Analysis 

- Spatial autocorrelation

- Spatial stratified heterogeneity
    - geographical detector q-statistic

### Spatial dependency or auto-correlation

### Scaling

### Common errors in spatial analysis
- Length
- Locational fallacy
- Ecological fallacy
    - Modifiable areal unit problem
        - statistical bias

### stack-up coordinates data

In [55]:
coords = np.vstack((train[['pickup_latitude', 'pickup_longitude']].values,
                    train[['dropoff_latitude', 'dropoff_longitude']].values,
                    test[['pickup_latitude', 'pickup_longitude']].values,
                    test[['dropoff_latitude', 'dropoff_longitude']].values))

In [56]:
#PCA
from sklearn.decomposition import PCA


pca = PCA().fit(coords)
train['pickup_pca0'] = pca.transform(train[['pickup_latitude', 'pickup_longitude']])[:, 0]
train['pickup_pca1'] = pca.transform(train[['pickup_latitude', 'pickup_longitude']])[:, 1]
train['dropoff_pca0'] = pca.transform(train[['dropoff_latitude', 'dropoff_longitude']])[:, 0]
train['dropoff_pca1'] = pca.transform(train[['dropoff_latitude', 'dropoff_longitude']])[:, 1]
test['pickup_pca0'] = pca.transform(test[['pickup_latitude', 'pickup_longitude']])[:, 0]
test['pickup_pca1'] = pca.transform(test[['pickup_latitude', 'pickup_longitude']])[:, 1]
test['dropoff_pca0'] = pca.transform(test[['dropoff_latitude', 'dropoff_longitude']])[:, 0]
test['dropoff_pca1'] = pca.transform(test[['dropoff_latitude', 'dropoff_longitude']])[:, 1]

In [57]:
len(coords)

4163392

In [58]:
pca

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [59]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,...,pick_dayofweek,pick_dayofmonth,pick_datehour,distance,manhattan_distance,bearing,pickup_pca0,pickup_pca1,dropoff_pca0,dropoff_pca1
0,id2875421,2.0,2016-03-14 17:24:55,1.0,-73.982155,40.767937,-73.96463,40.765602,N,455.0,...,0.0,3.0,17.0,1.498521,0.019859,174.333195,0.007707,0.017044,-0.009654,0.013699
1,id2377394,1.0,2016-06-12 00:43:35,1.0,-73.980415,40.738564,-73.999481,40.731152,N,663.0,...,6.0,6.0,0.0,1.805507,0.026478,-178.051506,0.007669,-0.012381,0.027132,-0.018677
2,id3858529,2.0,2016-01-19 11:35:24,1.0,-73.979027,40.763939,-74.005333,40.710087,N,2124.0,...,1.0,1.0,11.0,6.385098,0.080158,-179.629721,0.004815,0.012872,0.034192,-0.039369
3,id3504673,2.0,2016-04-06 19:32:31,1.0,-74.01004,40.719971,-74.012268,40.706718,N,429.0,...,2.0,4.0,19.0,1.485498,0.01548,-179.872566,0.03832,-0.029229,0.04131,-0.04233
4,id2181028,2.0,2016-03-26 13:30:55,1.0,-73.973053,40.793209,-73.972923,40.78252,N,435.0,...,5.0,3.0,13.0,1.188588,0.010818,179.990812,-0.002842,0.041747,-0.002353,0.031069


# 1.2.d.3 Clustering
- K-means algorithms
- EM(Expectation Maximization) algorithms
- K-medoid algorithms

### k-means clustering

In [60]:
from sklearn.cluster import KMeans

In [61]:
kmeans = KMeans(n_jobs=4).fit(coords)

In [62]:
train.loc[:, 'pickup_cluster'] = kmeans.predict(train[['pickup_latitude', 'pickup_longitude']])
train.loc[:, 'dropoff_cluster'] = kmeans.predict(train[['dropoff_latitude', 'dropoff_longitude']])

In [63]:
test.loc[:, 'pickup_cluster'] = kmeans.predict(test[['pickup_latitude', 'pickup_longitude']])
test.loc[:, 'dropoff_cluster'] = kmeans.predict(test[['dropoff_latitude', 'dropoff_longitude']])

In [64]:
train.shape, test.shape, 

((1456562, 23), (625134, 21))

In [65]:
train.info() #trip_duration, trip_sec, avg_speed_h

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1456562 entries, 0 to 1458643
Data columns (total 23 columns):
id                    1456562 non-null object
vendor_id             1456562 non-null float64
pickup_datetime       1456562 non-null datetime64[ns]
passenger_count       1456562 non-null float64
pickup_longitude      1456562 non-null float64
pickup_latitude       1456562 non-null float64
dropoff_longitude     1456562 non-null float64
dropoff_latitude      1456562 non-null float64
store_and_fwd_flag    1456562 non-null object
trip_duration         1456562 non-null float64
trip_sec              1456562 non-null float64
pick_dayofweek        1456562 non-null float64
pick_dayofmonth       1456562 non-null float64
pick_datehour         1456562 non-null float64
distance              1456562 non-null float64
manhattan_distance    1456562 non-null float64
bearing               1456562 non-null float64
pickup_pca0           1456562 non-null float64
pickup_pca1           1456562 non-nu

In [66]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 21 columns):
id                    625134 non-null object
vendor_id             625134 non-null int64
pickup_datetime       625134 non-null datetime64[ns]
passenger_count       625134 non-null int64
pickup_longitude      625134 non-null float64
pickup_latitude       625134 non-null float64
dropoff_longitude     625134 non-null float64
dropoff_latitude      625134 non-null float64
store_and_fwd_flag    625134 non-null object
pick_dayofweek        625134 non-null int64
pick_dayofmonth       625134 non-null int64
pick_datehour         625134 non-null int64
distance              625134 non-null float64
manhattan_distance    625134 non-null float64
bearing               625134 non-null float64
pickup_pca0           625134 non-null float64
pickup_pca1           625134 non-null float64
dropoff_pca0          625134 non-null float64
dropoff_pca1          625134 non-null float64
pickup_cluster      

# 3. Modeling

# evaluation metric

[Root Mean Squared Logarithmic Error](https://www.kaggle.com/wiki/RootMeanSquaredLogarithmicError)

$\epsilon = \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 }$

Where:
- ϵ is the RMSLE value (score)

- n is the total number of observations in the (public/private) data set,

- pi is your prediction of trip duration, and
- ai is the actual trip duration for i. 
- log(x) is the natural logarithm of x

### data type manipulation
- categorical data convert encoding

In [70]:
train['store_and_fwd_flag'] = 1 * (train.store_and_fwd_flag.values == 'Y')
test['store_and_fwd_flag'] = 1 * (test.store_and_fwd_flag.values == 'Y')

In [77]:
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [82]:
# train = pd.get_dummies(train)
# test = pd.get_dummies(test)

In [83]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1456562 entries, 0 to 1458643
Data columns (total 22 columns):
vendor_id             1456562 non-null float64
pickup_datetime       1456562 non-null datetime64[ns]
passenger_count       1456562 non-null float64
pickup_longitude      1456562 non-null float64
pickup_latitude       1456562 non-null float64
dropoff_longitude     1456562 non-null float64
dropoff_latitude      1456562 non-null float64
store_and_fwd_flag    1456562 non-null int64
trip_duration         1456562 non-null float64
trip_sec              1456562 non-null float64
pick_dayofweek        1456562 non-null float64
pick_dayofmonth       1456562 non-null float64
pick_datehour         1456562 non-null float64
distance              1456562 non-null float64
manhattan_distance    1456562 non-null float64
bearing               1456562 non-null float64
pickup_pca0           1456562 non-null float64
pickup_pca1           1456562 non-null float64
dropoff_pca0          1456562 non-nu

In [84]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 20 columns):
vendor_id             625134 non-null int64
pickup_datetime       625134 non-null datetime64[ns]
passenger_count       625134 non-null int64
pickup_longitude      625134 non-null float64
pickup_latitude       625134 non-null float64
dropoff_longitude     625134 non-null float64
dropoff_latitude      625134 non-null float64
store_and_fwd_flag    625134 non-null int64
pick_dayofweek        625134 non-null int64
pick_dayofmonth       625134 non-null int64
pick_datehour         625134 non-null int64
distance              625134 non-null float64
manhattan_distance    625134 non-null float64
bearing               625134 non-null float64
pickup_pca0           625134 non-null float64
pickup_pca1           625134 non-null float64
dropoff_pca0          625134 non-null float64
dropoff_pca1          625134 non-null float64
pickup_cluster        625134 non-null int32
dropoff_cluster       

In [86]:
X_train = train.drop(labels = ["trip_duration","trip_sec", "pickup_datetime"], axis=1)
Y_train = train["trip_duration"]
X_test  = test.drop(labels = ["pickup_datetime"], axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((1456562, 19), (1456562,), (625134, 19))

In [100]:
import statsmodels.api as sm

In [101]:
OLS_model = sm.OLS(Y_train, X_train).fit()
print(OLS_model.summary())

                            OLS Regression Results                            
Dep. Variable:          trip_duration   R-squared:                       0.567
Model:                            OLS   Adj. R-squared:                  0.567
Method:                 Least Squares   F-statistic:                 1.273e+05
Date:                Tue, 03 Apr 2018   Prob (F-statistic):               0.00
Time:                        16:29:08   Log-Likelihood:            -1.0963e+07
No. Observations:             1456562   AIC:                         2.193e+07
Df Residuals:                 1456546   BIC:                         2.193e+07
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
vendor_id              2.4160      0

In [96]:
Y_test = OLS_model.predict(X_test)
Y_test.head(), len(Y_test)

sub = pd.DataFrame(columns= ['id', 'trip_duration'])
sub['id'] = sample_submission["id"]
sub['trip_duration'] = Y_test
sub.to_csv('submission_OLS.csv',index=False)

# Appendix

### 1. degree of decimal
- 0.000001 = 1.11mm

### 2. spatial data analysis
- PCA
- discriminant analysis

### 3. clustering
- K means
- K nearest neighbor
- Expectation Maximization

### 4. ensemble methods
- aggregation
- boosting

# decision tree

In [92]:
from sklearn.tree import DecisionTreeRegressor

In [93]:

# Regression
import scipy
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.cross_validation import cross_val_score
# Decission Tree regressor
from sklearn.tree import DecisionTreeRegressor




In [99]:
model_dt=DecisionTreeRegressor(max_depth=4).fit(X_train,Y_train)

In [104]:
y_tree = model_dt.predict(X_test)

sub = pd.DataFrame(columns= ['id', 'trip_duration'])
sub['id'] = sample_submission["id"]
sub['trip_duration'] = y_tree
sub.to_csv('submission_tree.csv',index=False)

# random forest

In [105]:
from sklearn.ensemble import RandomForestRegressor

In [107]:
model_rnd_frst=RandomForestRegressor(max_depth=4, n_jobs=4)
model_rnd_frst.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [109]:
y_random = model_rnd_frst.predict(X_test)

sub = pd.DataFrame(columns= ['id', 'trip_duration'])
sub['id'] = sample_submission["id"]
sub['trip_duration'] = y_random
sub.to_csv('submission_random.csv',index=False)

# XGBoost

In [111]:
import xgboost as xgb

In [116]:
model_xgb = xgb.XGBRegressor(max_depth=15, n_jobs=4, reg_alpha=0.5, reg_lambda=0.5, random_state=0).fit(X_train, Y_train)

In [117]:
y_xgb = model_xgb.predict(X_test)

sub = pd.DataFrame(columns= ['id', 'trip_duration'])
sub['id'] = sample_submission["id"]
sub['trip_duration'] = y_xgb
sub.to_csv('submission_xgb.csv',index=False)
#0.42123