# Kaggle : New York City Taxi Trip Duration
### Share code and data to improve ride time predictions

https://www.kaggle.com/c/nyc-taxi-trip-duration/kernels

## 1. EDA (Exploratory Data Analysis)

### a. understanding data 
### b. data visualization

In [1]:
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
import seaborn as sns
from ipyleaflet import *
import time
from math import sin, cos, sqrt, atan2, radians
import folium
import folium.plugins as plugins



%matplotlib inline

In [2]:
# Location of the competition's training CSV on the local machine.
train_csv_path = "~/Documents/taxi_data/train.csv"
train = pd.read_csv(train_csv_path)

# Preview the first five trips.
train.head(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


# Units

### latitude / longitude = decimal degrees (0.000001° of latitude ≈ 111.32 mm)
- 40.767937° / -73.982155°

### duration = sec
- 455 sec = 7min 35sec

In [3]:
# train.info()

In [4]:
# train.describe()

In [5]:
# Scratch cell: evaluates a scientific-notation literal so it is displayed in plain decimal form.
# NOTE(review): presumably the mean trip_duration (seconds) copied from train.describe() — confirm.
9.594923e+02

959.4923

In [6]:
# Seed NumPy's global RNG so the same rows are drawn on every run.
np.random.seed(2)

# Work on a 0.5% random subsample of the training data, drawn with replacement.
train_sample = train.sample(replace=True, frac=0.005)

# Summarise the subsample's columns, dtypes and non-null counts.
train_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7293 entries, 100879 to 1074490
Data columns (total 11 columns):
id                    7293 non-null object
vendor_id             7293 non-null int64
pickup_datetime       7293 non-null object
dropoff_datetime      7293 non-null object
passenger_count       7293 non-null int64
pickup_longitude      7293 non-null float64
pickup_latitude       7293 non-null float64
dropoff_longitude     7293 non-null float64
dropoff_latitude      7293 non-null float64
store_and_fwd_flag    7293 non-null object
trip_duration         7293 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 683.7+ KB


# Feature Engineering / Data Cleaning

- trips with an average speed above 100 mile/h (160 km/h) are suspicious data
- trips covering more than 100 miles need to be checked
- trips lasting less than 5 minutes need to be checked

### Add columns with more detailed information
- trip duration in minutes
- hour of day of the pickup/dropoff datetime
- day of week of the pickup/dropoff datetime
- month of the pickup/dropoff datetime

In [11]:
# Parse the pickup/dropoff timestamp columns into datetime values.
for ts_col in ("pickup_datetime", "dropoff_datetime"):
    train_sample[ts_col] = pd.to_datetime(train_sample[ts_col])

# Trip length as a timedelta, then as whole seconds.
sample_duration = train_sample["dropoff_datetime"] - train_sample["pickup_datetime"]
sample_duration_sec = sample_duration.dt.total_seconds().astype('int')

# Convert seconds to minutes; the int cast drops the fractional minute.
train_sample["dur_min"] = (sample_duration_sec / 60).astype('int')
print(train_sample["dur_min"].iloc[0:2])

# Show the first parsed timestamps and their difference as a sanity check.
for ts_col in ("pickup_datetime", "dropoff_datetime"):
    print(train_sample[ts_col].iloc[0:3])
print(train_sample["dropoff_datetime"].iloc[0:3] - train_sample["pickup_datetime"].iloc[0:3])

100879    24
203245     5
Name: dur_min, dtype: int64
100879    2016-03-19 20:53:44
203245    2016-04-25 14:31:34
1133010   2016-05-04 16:35:10
Name: pickup_datetime, dtype: datetime64[ns]
100879    2016-03-19 21:18:11
203245    2016-04-25 14:37:05
1133010   2016-05-04 16:41:07
Name: dropoff_datetime, dtype: datetime64[ns]
100879    00:24:27
203245    00:05:31
1133010   00:05:57
dtype: timedelta64[ns]


In [12]:
# Weekday index of each pickup/dropoff timestamp: Monday=0, Sunday=6.
for new_col, ts_col in (
    ("pick_dayofweek", "pickup_datetime"),
    ("drop_dayofweek", "dropoff_datetime"),
):
    train_sample[new_col] = train_sample[ts_col].dt.dayofweek

# Show the first three values of each new column.
for new_col in ("pick_dayofweek", "drop_dayofweek"):
    print(train_sample[new_col].iloc[:3])

100879     5
203245     0
1133010    2
Name: pick_dayofweek, dtype: int64
100879     5
203245     0
1133010    2
Name: drop_dayofweek, dtype: int64


In [13]:
# Month number of each pickup/dropoff timestamp.
# Note: despite the "*_dayofmonth" column names, the stored value is `.dt.month`.
for new_col, ts_col in (
    ("pick_dayofmonth", "pickup_datetime"),
    ("drop_dayofmonth", "dropoff_datetime"),
):
    train_sample[new_col] = train_sample[ts_col].dt.month

# Show the first three values of each new column.
for new_col in ("pick_dayofmonth", "drop_dayofmonth"):
    print(train_sample[new_col].iloc[:3])

100879     3
203245     4
1133010    5
Name: pick_dayofmonth, dtype: int64
100879     3
203245     4
1133010    5
Name: drop_dayofmonth, dtype: int64


In [14]:
# Hour of day of each pickup/dropoff timestamp.
for new_col, ts_col in (
    ("pick_datehour", "pickup_datetime"),
    ("drop_datehour", "dropoff_datetime"),
):
    train_sample[new_col] = train_sample[ts_col].dt.hour

# Show the first three values of each new column.
for new_col in ("pick_datehour", "drop_datehour"):
    print(train_sample[new_col].iloc[:3])

100879     20
203245     14
1133010    16
Name: pick_datehour, dtype: int64
100879     21
203245     14
1133010    16
Name: drop_datehour, dtype: int64


# Distance between pickup and dropoff location

In [2]:
# approximate radius of earth in km
R = 6371.0


def haversine_km(lat1, lon1, lat2, lon2, radius_km=R):
    """Great-circle (haversine) distance between two sets of points.

    Parameters
    ----------
    lat1, lon1 : array-like of float
        Latitude / longitude of the start points, in decimal degrees.
    lat2, lon2 : array-like of float
        Latitude / longitude of the end points, in decimal degrees.
    radius_km : float
        Sphere radius used to scale the central angle (default: ``R``).

    Returns
    -------
    numpy.ndarray
        Distance for each pair of points, in the unit of ``radius_km``.
    """
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(v, dtype=float)) for v in (lat1, lon1, lat2, lon2)
    )

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Guard against rounding pushing `a` marginally outside [0, 1],
    # which would make the square roots below produce NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius_km * c


# Vectorised over the whole sample; columns are addressed by name rather than
# by position so the result does not depend on the column order of the frame.
dis = haversine_km(
    train_sample["pickup_latitude"].values,
    train_sample["pickup_longitude"].values,
    train_sample["dropoff_latitude"].values,
    train_sample["dropoff_longitude"].values,
)

train_sample['distance'] = dis

NameError: name 'train_sample' is not defined

In [None]:
# Preview the first five rows of the sample, including the engineered columns.
train_sample.head(n=5)

# Map Visualization

- plot the pickup locations as a heatmap on a map of NYC

In [147]:
#heatmap w/o time

# [latitude, longitude] pair for every pickup, selected by column name so the
# result does not depend on the column order of the frame.
pickup_coords = train_sample[["pickup_latitude", "pickup_longitude"]]
data1 = pickup_coords.values.tolist()

# Centre the map on the mean pickup location. `center` was not defined in any
# earlier cell, so this cell failed with a NameError on a fresh kernel.
center = pickup_coords.mean().tolist()

m = folium.Map(center,  zoom_start=12, tiles = "Cartodb Positron")

# Overlay the pickup density as a heat layer.
hm = plugins.HeatMap(data1)

hm.add_to(m)

# Last expression of the cell: render the map inline.
m