# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

# [作業目標]
- 試著模仿範例寫法, 使用計程車費率預測競賽練習時間欄位處理

# [作業重點]
- 新增星期幾(day of week)與第幾周(week of year)這兩項特徵, 觀察有什麼影響 (In[4], Out[4], In[5], Out[5])
- 新增年週期與周週期特徵, 觀察有什麼影響 (In[8], Out[8], In[9], Out[9])

In [19]:
import os
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

# Directory that holds the assignment data.
# Written as a raw string: the Windows path contains backslash pairs such as
# \D, \A, \M and \d, which are invalid escape sequences in a normal string
# literal (Python warns about them). The resulting path value is unchanged.
dir_data = r'D:\Document\AI\Marathon100D\Assignment\Day_025\data'

# Full path of the taxi fare CSV file
f_app_train = os.path.join(dir_data, 'taxi_data1.csv')

# Load the CSV into a data frame
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a
# saved row index; the later cells keep it as a model feature. Consider
# reading with index_col=0 if that is not intended.
df = pd.read_csv(f_app_train)

# Keep the prediction target (the fare) separately
train_Y = df['fare_amount']

# Remove the target column so that only feature columns remain in df
df = df.drop(['fare_amount'] , axis=1)

# Preview the first few rows
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [20]:
# Time feature decomposition: parse the timestamp column, then split it into parts.
# Parse the 'YYYY-MM-DD HH:MM:SS UTC' strings in one vectorized call instead of
# calling strptime once per row. The literal ' UTC' suffix is matched by the
# format string, so the parsed timestamps carry no timezone, as before.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')

# Read every component directly from the .dt accessor rather than formatting
# each value to a string and converting it back; the explicit cast keeps the
# new columns as int64.
# Year of the pickup time
df['pickup_year'] = df['pickup_datetime'].dt.year.astype('int64')
# Month of the pickup time
df['pickup_month'] = df['pickup_datetime'].dt.month.astype('int64')
# Day of month of the pickup time
df['pickup_day'] = df['pickup_datetime'].dt.day.astype('int64')
# Hour of the pickup time
df['pickup_hour'] = df['pickup_datetime'].dt.hour.astype('int64')
# Minute of the pickup time
df['pickup_minute'] = df['pickup_datetime'].dt.minute.astype('int64')
# Second of the pickup time
df['pickup_second'] = df['pickup_datetime'].dt.second.astype('int64')
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [21]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
# Leave out the raw timestamp column; its parts already exist as separate columns
df_temp = df.drop(columns=['pickup_datetime'])

# The two models that are compared in every evaluation cell
Linear = LinearRegression()
GDBT = GradientBoostingRegressor()

# Rescale all feature columns with a min-max scaler
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df_temp)

# Report the mean 5-fold cross-validation score of the linear regression model
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')

# Report the mean 5-fold cross-validation score of the gradient boosting model
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.026876871475641616
Gradient Boosting Reg Score : 0.7120997355110542


# 作業1
* 對照範例，試著加入星期幾 (day of week) 與第幾周 (week of year) 這兩項特徵，  
看看結果會比原本只有時間特徵分解的結果更好或更差?

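The effect is mixed and very small: the linear regression score improves slightly (0.0269 → 0.0286), while the gradient boosting score gets marginally worse (0.7121 → 0.7120).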

In [22]:
# Add the day-of-week and week-of-year features
"""
Your Code Here
"""
import math
# Day of week, formatted with the '%w' code (Sunday is 0, Saturday is 6)
df['pickup_weekday'] = df['pickup_datetime'].dt.strftime('%w').astype('int64')

# Week number of the year, formatted with the '%W' code (weeks start on Monday)
df['pickup_week'] = df['pickup_datetime'].dt.strftime('%W').astype('int64')

# Preview the first few rows
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_weekday,pickup_week
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,5,42
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,2,5
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,0,10
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,6,23
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,4,23


In [23]:
# Evaluate the features with linear regression and gradient boosting
# Feature table: every column except the raw timestamp
df_temp = df.drop(columns=['pickup_datetime'])

# Refit the min-max scaler on the current feature set
train_X = scaler.fit_transform(df_temp)

# Mean 5-fold cross-validation score of the linear regression model
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')

# Mean 5-fold cross-validation score of the gradient boosting model
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.028649006962518264
Gradient Boosting Reg Score : 0.7119547010504187


In [24]:
# Add the "daily cycle" feature (see the lecture notes on cyclical features)
import math

# Time of day in units of half a day (12 h = 720 min = 43200 s), so a full day
# covers the range 0..2; multiplying by pi makes the sine complete exactly one
# period per day.
df['day_cycle'] = (
    df['pickup_hour'] / 12 + df['pickup_minute'] / 720 + df['pickup_second'] / 43200
).map(lambda half_days: math.sin(half_days * math.pi))

# Preview the first few rows
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_weekday,pickup_week,day_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,5,42,-0.02545
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,2,5,0.333601
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,0,10,-0.967083
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,6,23,-0.888817
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,4,23,0.782427


In [25]:
# Evaluate the features with linear regression and gradient boosting
# Feature table: every column except the raw timestamp
df_temp = df.drop(columns=['pickup_datetime'])

# Refit the min-max scaler on the current feature set
train_X = scaler.fit_transform(df_temp)

# Mean 5-fold cross-validation score of the linear regression model
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')

# Mean 5-fold cross-validation score of the gradient boosting model
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.02827400966881355
Gradient Boosting Reg Score : 0.7135379955622179


# 作業2
* 對照範例的日週期效果，試著參考投影片完成年週期與周週期的特徵 (也可以用你自己想到的方式)，  
看看結果會比範例中的結果更好或更差?

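Compared with the daily-cycle result, the linear regression score is essentially unchanged (0.02827 → 0.02833) and the gradient boosting score gets slightly worse (0.7135 → 0.7129), so adding weekly and yearly cycle features does not appear to help on this data.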

In [26]:
# Add the "yearly cycle" and "weekly cycle" features
"""
Your Code Here
"""
import math

# Yearly cycle: month/6 + day/180 puts 12 months (counting 30 days per month)
# on a range of width 2, so the cosine of pi times that value repeats once a year.
df['year_cycle'] = (
    df['pickup_month'] / 6 + df['pickup_day'] / 180
).map(lambda year_pos: math.cos(year_pos * math.pi))

# Weekly cycle: weekday/3.5 + hour/84 puts 7 days (24 hours each) on a range
# of width 2, so the sine of pi times that value repeats once a week.
df['weekday_cycle'] = (
    df['pickup_weekday'] / 3.5 + df['pickup_hour'] / 84
).map(lambda week_pos: math.sin(week_pos * math.pi))

# Preview the first few rows
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_weekday,pickup_week,day_cycle,year_cycle,weekday_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,5,42,-0.02545,0.777146,-0.804598
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,2,5,0.333601,0.45399,0.826239
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,0,10,-0.967083,-0.275637,0.62349
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,6,23,-0.888817,-0.97437,-0.294755
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,4,23,0.782427,-0.978148,-0.532032


In [27]:
# Evaluate the features with linear regression and gradient boosting
# Feature table: every column except the raw timestamp
df_temp = df.drop(columns=['pickup_datetime'])

# Refit the min-max scaler on the current feature set
train_X = scaler.fit_transform(df_temp)

# Mean 5-fold cross-validation score of the linear regression model
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')

# Mean 5-fold cross-validation score of the gradient boosting model
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.02832659986669268
Gradient Boosting Reg Score : 0.712932902341161
