# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

# [作業目標]
- 試著模仿範例寫法, 使用程車費率預測競賽練習時間欄位處理

# [作業重點]
- 新增星期幾(day of week)與第幾周(week of year)這兩項特徵, 觀察有什麼影響 (In[4], Out[4], In[5], Out[5])
- 新增加上年週期與周周期特徵 , 觀察有什麼影響 (In[8], Out[8], In[9], Out[9]) 

In [1]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

df = pd.read_csv('Tutorials/Data/taxi_train.csv')

train_Y = df['fare_amount']
df = df.drop(['fare_amount'] , axis=1)
df.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [2]:
df = df[:50000]

In [6]:
train_Y = train_Y[:50000]

In [3]:
# 時間特徵分解方式:使用datetime
df['pickup_datetime'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S UTC'))
df['pickup_year'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%Y')).astype('int64')
df['pickup_month'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%m')).astype('int64')
df['pickup_day'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%d')).astype('int64')
df['pickup_hour'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%H')).astype('int64')
df['pickup_minute'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%M')).astype('int64')
df['pickup_second'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%S')).astype('int64')
df.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2009-06-15 17:26:21.0000001,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,2009,6,15,17,26,21
1,2010-01-05 16:52:16.0000002,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,16,52,16
2,2011-08-18 00:35:00.00000049,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,2011,8,18,0,35,0
3,2012-04-21 04:30:42.0000001,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,2012,4,21,4,30,42
4,2010-03-09 07:51:00.000000135,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,7,51,0


In [7]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime', 'key'] , axis=1)
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df_temp)
Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
GDBT = GradientBoostingRegressor()
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.01667836212397993
Gradient Boosting Reg Score : 0.7079035849559746


# 作業1
* 對照範例，試著加入星期幾 (day of week) 與第幾周 (week of year) 這兩項特徵，  
看看結果會比原本只有時間特徵分解的結果更好或更差?

In [95]:
# 加入星期幾與第幾周兩個特徵
"""
Your Code Here
"""
df['pickup_datetime_1'] = df.apply(lambda x: datetime.datetime(x.pickup_year,x.pickup_month,x.pickup_day), axis =1)
df['week'] = df['pickup_datetime_1'].apply(lambda x:datetime.datetime.isocalendar(x)[1])
df['weekday'] = df['pickup_datetime_1'].apply(lambda x:datetime.datetime.weekday(x))
df.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_datetime_1,week,weekday
0,2009-06-15 17:26:21.0000001,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,2009,6,15,17,26,21,2009-06-15,25,0
1,2010-01-05 16:52:16.0000002,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,16,52,16,2010-01-05,1,1
2,2011-08-18 00:35:00.00000049,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,2011,8,18,0,35,0,2011-08-18,33,3
3,2012-04-21 04:30:42.0000001,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,2012,4,21,4,30,42,2012-04-21,16,5
4,2010-03-09 07:51:00.000000135,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,7,51,0,2010-03-09,10,1


In [101]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime' , 'key','pickup_datetime_1'] , axis=1)
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.016586558906373283
Gradient Boosting Reg Score : 0.7081956765032834


In [102]:
# 加上"日週期"特徵 (參考講義"週期循環特徵")
import math
df['day_cycle'] = df['pickup_hour']/12 + df['pickup_minute']/720 + df['pickup_second']/43200
df['day_cycle'] = df['day_cycle'].map(lambda x:math.sin(x*math.pi))
df.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_datetime_1,week,weekday,day_cycle
0,2009-06-15 17:26:21.0000001,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,2009,6,15,17,26,21,2009-06-15,25,0,-0.98924
1,2010-01-05 16:52:16.0000002,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,16,52,16,2010-01-05,1,1,-0.956644
2,2011-08-18 00:35:00.00000049,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,2011,8,18,0,35,0,2011-08-18,33,3,0.152123
3,2012-04-21 04:30:42.0000001,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,2012,4,21,4,30,42,2012-04-21,16,5,0.925044
4,2010-03-09 07:51:00.000000135,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,7,51,0,2010-03-09,10,1,0.884988


In [103]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime','key','pickup_datetime_1'] , axis=1)
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.016751837211588638
Gradient Boosting Reg Score : 0.7061027653149117


# 作業2
* 對照範例的日週期效果，試著參考投影片完成年週期與周週期的特徵 (也可以用你自己想到的方式)，  
看看結果會比範例中的結果更好或更差?

In [104]:
# 加上"年週期"與"周週期"特徵
"""
Your Code Here
"""
df['week_cycle'] = df['week']/26 + df['weekday']/3.5
df['week_cycle'] = df['week_cycle'].map(lambda x:math.sin(x*math.pi))
df.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_datetime_1,week,weekday,day_cycle,week_cycle
0,2009-06-15 17:26:21.0000001,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,2009,6,15,17,26,21,2009-06-15,25,0,-0.98924,0.120537
1,2010-01-05 16:52:16.0000002,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,16,52,16,2010-01-05,1,1,-0.956644,0.851284
2,2011-08-18 00:35:00.00000049,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,2011,8,18,0,35,0,2011-08-18,33,3,0.152123,0.386667
3,2012-04-21 04:30:42.0000001,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,2012,4,21,4,30,42,2012-04-21,16,5,0.925044,0.137654
4,2010-03-09 07:51:00.000000135,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,7,51,0,2010-03-09,10,1,0.884988,0.860214


In [105]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime','key','pickup_datetime_1'] , axis=1)
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.016694235757030773
Gradient Boosting Reg Score : 0.7077774597417598


In [None]:
pd.to_