# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

# [作業目標]
- 試著模仿範例寫法, 使用程車費率預測競賽練習時間欄位處理

# [作業重點]
- 新增星期幾(day of week)與第幾周(week of year)這兩項特徵, 觀察有什麼影響 (In[4], Out[4], In[5], Out[5])
- 新增加上年週期與周周期特徵 , 觀察有什麼影響 (In[8], Out[8], In[9], Out[9]) 

In [58]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

data_path = './data/'
df = pd.read_csv(data_path + 'taxi_data1.csv')

train_Y = df['fare_amount']
df = df.drop(['fare_amount'] , axis=1)
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [59]:
# 時間特徵分解方式:使用datetime
df['pickup_datetime'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S UTC'))
df['pickup_year'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%Y')).astype('int64')
df['pickup_month'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%m')).astype('int64')
df['pickup_day'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%d')).astype('int64')
df['pickup_hour'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%H')).astype('int64')
df['pickup_minute'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%M')).astype('int64')
df['pickup_second'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%S')).astype('int64')
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [60]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)

# standardize the model
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df_temp)

# test the models
Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
GDBT = GradientBoostingRegressor()
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.02687687147563449
Gradient Boosting Reg Score : 0.7110732737096732


# 作業1
* 對照範例，試著加入星期幾 (day of week) 與第幾周 (week of year) 這兩項特徵，  
看看結果會比原本只有時間特徵分解的結果更好或更差?

In [61]:
from datetime import datetime # import datetime class
from datetime import date # import date class

# add day of week attribute
df['day_of_week'] = df.pickup_datetime.apply(lambda x: datetime.weekday(x))

In [62]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)

# standardize the model
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df_temp)

# test the models
Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
GDBT = GradientBoostingRegressor()
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.02657206231178666
Gradient Boosting Reg Score : 0.711331467102927


In [68]:
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day_of_week
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3


In [71]:
# add week of year attribute
df['week_of_year'] = None
for i in range(len(df)):
    df.week_of_year[i] = date(df.pickup_year[i], df.pickup_month[i], df.pickup_day[i]).isocalendar()[1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [75]:
# 加入星期幾與第幾周兩個特徵
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day_of_week,week_of_year
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,6
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,24
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,24


In [76]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果 after adding day of week, week of year.
df_temp = df.drop(['pickup_datetime'] , axis=1)
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.026481023863714738
Gradient Boosting Reg Score : 0.7106389191204594


In [77]:
# 加上"日週期"特徵 (參考講義"週期循環特徵")
import math
df['day_cycle'] = df['pickup_hour']/12 + df['pickup_minute']/720 + df['pickup_second']/43200
df['day_cycle'] = df['day_cycle'].map(lambda x:math.sin(x*math.pi))
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day_of_week,week_of_year,day_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42,-0.02545
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,6,0.333601
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11,-0.967083
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,24,-0.888817
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,24,0.782427


In [78]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.0260803522937872
Gradient Boosting Reg Score : 0.7154002561767846


# 作業2
* 對照範例的日週期效果，試著參考投影片完成年週期與周週期的特徵 (也可以用你自己想到的方式)，  
看看結果會比範例中的結果更好或更差?

In [87]:
# before adding cyclic features
print(f'the max of month is: {df.pickup_month.max()}')
print(f'the max of week is: {df.day_of_week.max()}')
print(f'the max of day is: {df.pickup_day.max()}')
print(f'the max of hour is: {df.pickup_hour.max()}')
print(f'the max of minute is: {df.pickup_minute.max()}')
print(f'the max of second is: {df.pickup_second.max()}')

the max of month is: 12
the max of week is: 6
the max of day is: 31
the max of hour is: 23
the max of minute is: 59
the max of second is: 59


In [89]:
# 加上cyclic特徵

# month
df['mo_sin'] = np.sin(df.pickup_month*(2.*np.pi/df.pickup_month.max()))
df['mo_cos'] = np.cos(df.pickup_month*(2.*np.pi/df.pickup_month.max()))

# day of week
df['dow_sin'] = np.sin(df.day_of_week*(2.*np.pi/df.day_of_week.max()))
df['dow_cos'] = np.cos(df.day_of_week*(2.*np.pi/df.day_of_week.max()))

# day
df['day_sin'] = np.sin(df.pickup_day*(2.*np.pi/df.pickup_day.max()))
df['day_cos'] = np.cos(df.pickup_day*(2.*np.pi/df.pickup_day.max()))

# hour
df['hr_sin'] = np.sin(df.pickup_hour*(2.*np.pi/df.pickup_hour.max()))
df['hr_cos'] = np.cos(df.pickup_hour*(2.*np.pi/df.pickup_hour.max()))

# minute
df['min_sin'] = np.sin(df.pickup_minute*(2.*np.pi/df.pickup_minute.max()))
df['min_cos'] = np.cos(df.pickup_minute*(2.*np.pi/df.pickup_minute.max()))

# second
df['se_sin'] = np.sin(df.pickup_second*(2.*np.pi/df.pickup_second.max()))
df['se_cos'] = np.cos(df.pickup_second*(2.*np.pi/df.pickup_second.max()))

df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,...,dow_sin,dow_cos,day_sin,day_cos,hr_sin,hr_cos,min_sin,min_cos,se_sin,se_cos
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,...,-0.8660254,-0.5,-0.897805,-0.440394,-2.449294e-16,1.0,-0.507666,0.861554,0.874763,0.484551
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,...,0.8660254,0.5,0.571268,0.820763,0.3984011,-0.917211,-0.97143,-0.237327,0.314077,0.949398
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,...,-2.449294e-16,1.0,-0.101168,-0.994869,-0.9790841,0.203456,-0.106293,0.994335,-0.106293,0.994335
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,...,-0.8660254,0.5,0.485302,-0.874347,-0.9422609,-0.33488,0.874763,0.484551,-0.507666,0.861554
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,...,1.224647e-16,-1.0,0.651372,-0.758758,0.730836,0.682553,0.461093,-0.887352,-0.314077,0.949398


In [90]:
# dropping redudant
df_temp = df.drop(['pickup_datetime', 'pickup_month', 'pickup_day', 'pickup_hour','pickup_minute','pickup_second','day_of_week'] , axis=1)

In [93]:
df_temp = df_temp.drop(['day_cycle'], axis=1)

In [94]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.027119878317386225
Gradient Boosting Reg Score : 0.7061927602451687


In [105]:
# check for the correlation
pd.concat([df_temp, train_Y], axis=1).corr().fare_amount

pickup_longitude     0.024800
pickup_latitude     -0.022621
dropoff_longitude    0.023686
dropoff_latitude    -0.022447
passenger_count      0.015710
pickup_year          0.125113
mo_sin              -0.038868
mo_cos              -0.019453
dow_sin             -0.001805
dow_cos             -0.002291
day_sin              0.004903
day_cos             -0.011694
hr_sin               0.016970
hr_cos               0.014841
min_sin              0.000059
min_cos             -0.034471
se_sin              -0.037762
se_cos               0.009722
fare_amount          1.000000
Name: fare_amount, dtype: float64

In [107]:
# drop the ones with low coefficient
df_temp = df_temp.drop(['dow_sin','dow_cos','min_sin','min_cos'], axis=1)

In [108]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.028132921322306492
Gradient Boosting Reg Score : 0.7043612409383869
