# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

# [作業目標]
- 試著模仿範例寫法, 使用計程車費率預測競賽練習時間欄位處理

# [作業重點]
- 新增星期幾(day of week)與第幾周(week of year)這兩項特徵, 觀察有什麼影響 (In[4], Out[4], In[5], Out[5])
- 新增年週期與週週期特徵, 觀察有什麼影響 (In[8], Out[8], In[9], Out[9])

In [26]:
# Everything needed before feature engineering starts:
# imports, loading the data, and separating the target column.
import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

data_path = '../Part02/'
df = pd.read_csv(data_path + 'taxi_data1.csv')

# Keep the regression target aside; what remains in df is the feature table.
train_Y = df['fare_amount']
df = df.drop(columns=['fare_amount'])
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [28]:
# Convert the raw pickup_datetime values with pandas' own parser so the
# column can be handled as datetimes in the cells that follow.
# An alternative is to parse each value with datetime.datetime.strptime
# and the format '%Y-%m-%d %H:%M:%S UTC'.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])


In [24]:
# Time-feature decomposition: split the pickup timestamp into its calendar
# parts (year, month, day, hour, minute, second), one integer column each.
#
# The vectorized `.dt` accessors return the same integers as formatting every
# timestamp with strftime and parsing the text back, but without a Python-level
# call per row. The explicit cast keeps the columns int64, as before.
# Requires pickup_datetime to already hold datetimes (see pd.to_datetime).
pickup_dt = df['pickup_datetime'].dt
df['pickup_year'] = pickup_dt.year.astype('int64')
df['pickup_month'] = pickup_dt.month.astype('int64')
df['pickup_day'] = pickup_dt.day.astype('int64')
df['pickup_hour'] = pickup_dt.hour.astype('int64')
df['pickup_minute'] = pickup_dt.minute.astype('int64')
df['pickup_second'] = pickup_dt.second.astype('int64')

df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [25]:
# Score the current feature set with linear regression and with gradient
# boosted trees (mean of the 5-fold cross-validation scores for each).
# The raw datetime column is dropped; only its derived columns are used.
df_temp = df.drop(['pickup_datetime'], axis=1)

# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# every validation fold contributes to the min/max used for scaling. Wrap the
# scaler and model in a pipeline if strict fold isolation is required.
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df_temp)

Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')

# Seed the booster: it permutes the candidate features at each split, so an
# unseeded model can give slightly different scores from run to run. The later
# cells reuse this estimator to compare feature sets, and a fixed seed keeps
# those comparisons from being blurred by that randomness.
GDBT = GradientBoostingRegressor(random_state=0)
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.026876871475641616
Gradient Boosting Reg Score : 0.7101528888867124


# 作業1
* 對照範例，試著加入星期幾 (day of week) 與第幾周 (week of year) 這兩項特徵，  
看看結果會比原本只有時間特徵分解的結果更好或更差?

In [29]:
# Add two more features, day of week and week of year, alongside the
# calendar parts (which are assigned again here, as in the original cell).
# All of them come from the vectorized `.dt` accessors; the casts keep the
# calendar parts int64.
pickup_dt = df['pickup_datetime'].dt
df['pickup_year'] = pickup_dt.year.astype('int64')
df['pickup_month'] = pickup_dt.month.astype('int64')
df['pickup_day'] = pickup_dt.day.astype('int64')
df['pickup_hour'] = pickup_dt.hour.astype('int64')
df['pickup_minute'] = pickup_dt.minute.astype('int64')
df['pickup_second'] = pickup_dt.second.astype('int64')

# Monday is 0 and Sunday is 6.
df['pickup_dayofweek'] = pickup_dt.dayofweek

# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0.
# `dt.isocalendar().week` is its replacement (ISO 8601 week number) and needs
# pandas >= 1.1. It returns a nullable unsigned dtype, so cast to plain int64
# to keep the column an ordinary integer column like the others.
df['pickup_weekofyear'] = pickup_dt.isocalendar().week.astype('int64')

df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_dayofweek,pickup_weekofyear
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,6
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,24
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,24


In [30]:
# Evaluate both models again on the enlarged feature table
# (mean 5-fold cross-validation score; raw datetime column excluded).
df_temp = df.drop(columns=['pickup_datetime'])
train_X = scaler.fit_transform(df_temp)

linear_score = cross_val_score(Linear, train_X, train_Y, cv=5).mean()
print(f'Linear Reg Score : {linear_score}')

gdbt_score = cross_val_score(GDBT, train_X, train_Y, cv=5).mean()
print(f'Gradient Boosting Reg Score : {gdbt_score}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.026481023863712937
Gradient Boosting Reg Score : 0.7097419134518033


In [31]:
# Add a "day cycle" feature (see the course notes on cyclical features).
import math

# Time of day expressed in units of 12 hours: hours/12 + minutes/720 +
# seconds/43200, so one full day spans 2 units.
half_days = df['pickup_hour'] / 12 + df['pickup_minute'] / 720 + df['pickup_second'] / 43200

# Multiplying by pi before taking the sine gives one full period per day.
df['day_cycle'] = half_days.map(lambda t: math.sin(t * math.pi))
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_dayofweek,pickup_weekofyear,day_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42,-0.02545
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,6,0.333601
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11,-0.967083
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,24,-0.888817
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,24,0.782427


In [32]:
# Same evaluation as before, now including the day-cycle column:
# scale the features, then report the mean 5-fold CV score of each model.
df_temp = df.drop(columns=['pickup_datetime'])
train_X = scaler.fit_transform(df_temp)

linear_cv = cross_val_score(Linear, train_X, train_Y, cv=5)
print(f'Linear Reg Score : {linear_cv.mean()}')

gdbt_cv = cross_val_score(GDBT, train_X, train_Y, cv=5)
print(f'Gradient Boosting Reg Score : {gdbt_cv.mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.026080352293789643
Gradient Boosting Reg Score : 0.7176168179540552


# 作業2
* 對照範例的日週期效果，試著參考投影片完成年週期與周週期的特徵 (也可以用你自己想到的方式)，  
看看結果會比範例中的結果更好或更差?

In [None]:
# 加上"年週期"與"周週期"特徵


In [36]:
# Cyclical (sine/cosine) encodings for week of year and day of week.
# Each value is mapped to an angle of 2*pi * value / period.
week_in_year = 52
day_in_week = 7

week_angle = 2*np.pi*df['pickup_weekofyear']/week_in_year
dayofweek_angle = 2*np.pi*df['pickup_dayofweek']/day_in_week

df['sin_week'] = np.sin(week_angle)
df['cos_week'] = np.cos(week_angle)
df['sin_dayofweek'] = np.sin(dayofweek_angle)
df['cos_dayofweek'] = np.cos(dayofweek_angle)

df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_dayofweek,pickup_weekofyear,day_cycle,sin_week,cos_week,sin_dayofweek,cos_dayofweek
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,4,42,-0.02545,-0.935016,0.354605,-0.433884,-0.900969
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,1,6,0.333601,0.663123,0.748511,0.781831,0.62349
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,6,11,-0.967083,0.970942,0.239316,-0.781831,0.62349
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,5,24,-0.888817,0.239316,-0.970942,-0.974928,-0.222521
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,3,24,0.782427,0.239316,-0.970942,0.433884,-0.900969


In [44]:
# Display the rows in chronological order to eyeball the new columns.
# The sorted frame is not assigned, so df itself keeps its original order.
df.sort_values(by='pickup_datetime')

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_dayofweek,pickup_weekofyear,day_cycle,sin_week,cos_week,sin_dayofweek,cos_dayofweek
2743,2009-01-01 19:58:00,-74.003380,40.706528,-73.999935,40.741720,1,2009,1,1,19,58,0,3,1,-0.870356,1.205367e-01,0.992709,0.433884,-0.900969
2251,2009-01-02 13:04:05,-73.989430,40.753172,-73.981222,40.741460,1,2009,1,2,13,4,5,4,1,-0.275987,1.205367e-01,0.992709,-0.433884,-0.900969
594,2009-01-02 19:52:00,-73.976487,40.751042,-73.989805,40.735645,5,2009,1,2,19,52,0,4,1,-0.882948,1.205367e-01,0.992709,-0.433884,-0.900969
4905,2009-01-02 21:22:00,-73.991363,40.750758,-73.943285,40.675778,1,2009,1,2,21,22,0,4,1,-0.636078,1.205367e-01,0.992709,-0.433884,-0.900969
2535,2009-01-04 12:36:00,-73.980695,40.733843,-73.989718,40.756655,2,2009,1,4,12,36,0,6,1,-0.156434,1.205367e-01,0.992709,-0.781831,0.623490
2868,2009-01-07 22:48:00,-73.991362,40.750560,-73.967860,40.798890,1,2009,1,7,22,48,0,2,2,-0.309017,2.393157e-01,0.970942,0.974928,-0.222521
2179,2009-01-08 10:27:00,-73.954883,40.767520,-73.971298,40.764492,1,2009,1,8,10,27,0,3,2,0.394744,2.393157e-01,0.970942,0.433884,-0.900969
955,2009-01-08 12:07:37,-74.005767,40.750963,-73.984377,40.759232,1,2009,1,8,12,7,37,3,2,-0.033228,2.393157e-01,0.970942,0.433884,-0.900969
4849,2009-01-09 19:33:46,-73.953283,40.778685,-73.966444,40.772178,2,2009,1,9,19,33,46,4,2,-0.917466,2.393157e-01,0.970942,-0.433884,-0.900969
786,2009-01-10 00:06:10,0.000000,0.000000,0.000000,0.000000,1,2009,1,10,0,6,10,5,2,0.026904,2.393157e-01,0.970942,-0.974928,-0.222521


In [47]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.025491028187643948
Gradient Boosting Reg Score : 0.713060730252072
