# 2. New York Taxi Trip EDA and Baseline Model

출처 : [뉴욕 택시 여행 EDA and Model](https://www.kaggle.com/aiswaryaramachandran/eda-baseline-model-0-40-rmse)

In [16]:
# 라이브러리
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib as mpl
import seaborn as sns

from datetime import datetime, timedelta
import calendar
from math import sin, cos, sqrt, atan2 , radians

import folium
from folium import FeatureGroup, LayerControl, Map, Marker
from folium.plugins import HeatMap

from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
import pickle

import warnings
warnings.filterwarnings('ignore')

In [17]:
# 데이터 불러오기
train = pd.read_csv('data/newyork_train.csv')
test = pd.read_csv('data/newyork_test.csv')
print(test.shape)

(625134, 9)


In [18]:
# 데이터 타입 변경
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'], format = '%Y-%m-%d %H:%M:%S')
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'], format = '%Y-%m-%d %H:%M:%S')
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


### 결측치 확인

In [19]:
train.isnull().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

Missing value는 없다.

In [20]:
print('Min pick up time: ', min(train['pickup_datetime']))
print('Max pick up time: ', max(train['pickup_datetime']))

Min pick up time:  2016-01-01 00:00:17
Max pick up time:  2016-06-30 23:59:39


1월부터 6월까지의 데이터로 보인다.

## 파생변수 생성

In [21]:
# 승차 변수를 통해 새로운 feature 생성
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
train['pickup_day'] = train['pickup_datetime'].dt.day
train['pickup_hour'] = train['pickup_datetime'].dt.hour
train['pickup_day_of_week'] = train['pickup_datetime'].dt.weekday

# 하차 변수를 통해 새로운 feature 생성
train['dropoff_date'] = pd.to_datetime(train['dropoff_datetime'])
train['dropoff_day'] = train['dropoff_datetime'].dt.day
train['dropoff_hour'] = train['dropoff_datetime'].dt.hour
train['dropoff_day_of_week'] = train['dropoff_datetime'].dt.weekday

In [22]:
# 라운딩
train['pickup_latitude_round3'] = train['pickup_latitude'].apply(lambda x : round(x, 3))
train['pickup_longitude_round3'] = train['pickup_longitude'].apply(lambda x : round(x ,3))

train['dropoff_latitude_round3'] = train['dropoff_latitude'].apply(lambda x: round(x, 3))
train['dropoff_longitude'] = train['dropoff_longitude'].apply(lambda x : round(x , 3))

In [23]:
# 거리를 구하는 함수
def calculateDistance(row):
    R = 6373.0
    pickup_lat = radians(row['pickup_latitude'])
    pickup_lon = radians(row['pickup_longitude'])
    
    dropoff_lat = radians(row['dropoff_latitude'])
    dropoff_lon = radians(row['dropoff_longitude'])
    
    dlon = dropoff_lon - pickup_lon
    dlat = dropoff_lat - pickup_lat
    
    a = sin(dlat / 2) ** 2 + cos(pickup_lat) * cos(dropoff_lat) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R  * c
    
    return distance

In [24]:
# 위에서 만든 함수를 적용함
train['trip_distance'] = train.apply(lambda row : calculateDistance(row), axis = 1)
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,pickup_hour,pickup_day_of_week,dropoff_date,dropoff_day,dropoff_hour,dropoff_day_of_week,pickup_latitude_round3,pickup_longitude_round3,dropoff_latitude_round3,trip_distance
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.965,40.765602,N,...,17,0,2016-03-14 17:32:30,14,17,0,40.768,-73.982,40.766,1.468313
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999,40.731152,N,...,0,6,2016-06-12 00:54:38,12,0,6,40.739,-73.98,40.731,1.770086
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005,40.710087,N,...,11,1,2016-01-19 12:10:48,19,12,1,40.764,-73.979,40.71,6.377417
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012,40.706718,N,...,19,2,2016-04-06 19:39:40,6,19,2,40.72,-74.01,40.707,1.483277
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.973,40.78252,N,...,13,5,2016-03-26 13:38:10,26,13,5,40.793,-73.973,40.783,1.18892


In [25]:
train['trip_duration_in_hour'] = train['trip_duration'].apply(lambda x : x/3600)
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,pickup_day_of_week,dropoff_date,dropoff_day,dropoff_hour,dropoff_day_of_week,pickup_latitude_round3,pickup_longitude_round3,dropoff_latitude_round3,trip_distance,trip_duration_in_hour
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.965,40.765602,N,...,0,2016-03-14 17:32:30,14,17,0,40.768,-73.982,40.766,1.468313,0.126389
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999,40.731152,N,...,6,2016-06-12 00:54:38,12,0,6,40.739,-73.98,40.731,1.770086,0.184167
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005,40.710087,N,...,1,2016-01-19 12:10:48,19,12,1,40.764,-73.979,40.71,6.377417,0.59
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012,40.706718,N,...,2,2016-04-06 19:39:40,6,19,2,40.72,-74.01,40.707,1.483277,0.119167
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.973,40.78252,N,...,5,2016-03-26 13:38:10,26,13,5,40.793,-73.973,40.783,1.18892,0.120833


## EDA

In [None]:
plt.figure(figsize = (8,5))
sns.distplot(train['trip_duration_in_hour'])
plt.title('Distribution of Trip Duration', size = 14)
plt.xlabel('Trip Duration (in hour)')
plt.show()

In [None]:
print('trip duration in hour의 중앙값' ,round(train['trip_duration_in_hour'].median(), 3))
print('trip distance의 중앙값', round(train['trip_distance'].median(), 3))

trip_duration_in_hour의 중앙값은 0.184인데 그래프를 보면 미약하게 이상치가 보인다. 자세히 살펴보겠다.

In [None]:
outlier_trip_duration = train.loc[train['trip_duration_in_hour'] > 24]
outlier_trip_duration

다음 4개의 데이터들이 24 시간을 넘는 trip duration을 보였다. 시간은 매우 높지만 주행거리(trip_distance)가 높지는 않은 데이터도 보인다. 위 4개의 데이터를 이상치로 취급하자. 하지만  위 데이터들이 특정 지역에서 출발하거나 끝나는지 궁금하다. 

그전에 trip distance가 매우 skew 되어있기 때문에 로그 변환을 취해 정규성을 갖도록 해주자.

In [None]:
plt.figure(figsize = (8,5))
sns.distplot(np.log(train['trip_duration'].values))
plt.title('Distribution of Trip Duration in Log scale', size = 14)
plt.show()

In [None]:
print('Trip Duration (log scale)의 평균: ',np.log(train['trip_duration']).mean())
print('New York Taxi의 평균 trip duration은 {}분이다.'.format(round(np.exp(6.49) / 60, 3)))

로그 변환을 해주니 정규성을 갖는 분포를 보인다. 대부분의 데이터가 4~6 사이에 분포하고 있다. 이를 exponential을 취해주어 확인해보면 54초 ~ 50분 안에 trip을 마친다는 뜻이다.  하지만 분포의 끝에 있는 데이터들처럼 100시간이 넘게 택시를 탄다는 것은 말이 되지 않는다.