In [141]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor


from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

%matplotlib inline
warnings.filterwarnings("ignore")

In [142]:
df = pd.read_csv('uber_expl.csv')

In [143]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,distance,cab_type,destination,source,price,surge_multiplier,name,date,hour,minute,temp,clouds,pressure,rain,humidity,wind
0,0,1.11,Uber,West End,North End,12.0,1.0,UberXL,2018-11-30,22,13,38.65,0.96,1017.23,0.0,0.65,3.13
1,1,1.11,Uber,West End,North End,16.0,1.0,Black,2018-12-13,10,50,20.12,0.43,1031.07,0.0,0.64,3.88
2,2,1.11,Uber,West End,North End,7.5,1.0,UberX,2018-12-13,19,15,32.75,0.51,1033.75,0.0,0.57,1.9


In [144]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329140 entries, 0 to 329139
Data columns (total 17 columns):
Unnamed: 0          329140 non-null int64
distance            329140 non-null float64
cab_type            329140 non-null object
destination         329140 non-null object
source              329140 non-null object
price               329140 non-null float64
surge_multiplier    329140 non-null float64
name                329140 non-null object
date                329140 non-null object
hour                329140 non-null int64
minute              329140 non-null int64
temp                329140 non-null float64
clouds              329140 non-null float64
pressure            329140 non-null float64
rain                329140 non-null float64
humidity            329140 non-null float64
wind                329140 non-null float64
dtypes: float64(9), int64(3), object(5)
memory usage: 42.7+ MB


In [145]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df.drop('cab_type', axis=1, inplace=True)
df.drop('surge_multiplier', axis=1, inplace=True)

In [146]:
df.head(3)

Unnamed: 0,distance,destination,source,price,name,date,hour,minute,temp,clouds,pressure,rain,humidity,wind
0,1.11,West End,North End,12.0,UberXL,2018-11-30,22,13,38.65,0.96,1017.23,0.0,0.65,3.13
1,1.11,West End,North End,16.0,Black,2018-12-13,10,50,20.12,0.43,1031.07,0.0,0.64,3.88
2,1.11,West End,North End,7.5,UberX,2018-12-13,19,15,32.75,0.51,1033.75,0.0,0.57,1.9


In [147]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329140 entries, 0 to 329139
Data columns (total 14 columns):
distance       329140 non-null float64
destination    329140 non-null object
source         329140 non-null object
price          329140 non-null float64
name           329140 non-null object
date           329140 non-null object
hour           329140 non-null int64
minute         329140 non-null int64
temp           329140 non-null float64
clouds         329140 non-null float64
pressure       329140 non-null float64
rain           329140 non-null float64
humidity       329140 non-null float64
wind           329140 non-null float64
dtypes: float64(8), int64(2), object(4)
memory usage: 35.2+ MB


In [148]:
df['date'] = pd.to_datetime(df['date'])

In [149]:
df['month'] = df['date'].dt.month
df['weekdays'] =[pd.to_datetime(df.date[i]).weekday() for i in range(len(df.date))]

In [150]:
df.head(3)

Unnamed: 0,distance,destination,source,price,name,date,hour,minute,temp,clouds,pressure,rain,humidity,wind,month,weekdays
0,1.11,West End,North End,12.0,UberXL,2018-11-30,22,13,38.65,0.96,1017.23,0.0,0.65,3.13,11,4
1,1.11,West End,North End,16.0,Black,2018-12-13,10,50,20.12,0.43,1031.07,0.0,0.64,3.88,12,3
2,1.11,West End,North End,7.5,UberX,2018-12-13,19,15,32.75,0.51,1033.75,0.0,0.57,1.9,12,3


In [151]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329140 entries, 0 to 329139
Data columns (total 16 columns):
distance       329140 non-null float64
destination    329140 non-null object
source         329140 non-null object
price          329140 non-null float64
name           329140 non-null object
date           329140 non-null datetime64[ns]
hour           329140 non-null int64
minute         329140 non-null int64
temp           329140 non-null float64
clouds         329140 non-null float64
pressure       329140 non-null float64
rain           329140 non-null float64
humidity       329140 non-null float64
wind           329140 non-null float64
month          329140 non-null int64
weekdays       329140 non-null int64
dtypes: datetime64[ns](1), float64(8), int64(4), object(3)
memory usage: 40.2+ MB


In [152]:
df.drop('date', axis=1, inplace=True)
df.drop('minute', axis=1, inplace=True)

In [153]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329140 entries, 0 to 329139
Data columns (total 14 columns):
distance       329140 non-null float64
destination    329140 non-null object
source         329140 non-null object
price          329140 non-null float64
name           329140 non-null object
hour           329140 non-null int64
temp           329140 non-null float64
clouds         329140 non-null float64
pressure       329140 non-null float64
rain           329140 non-null float64
humidity       329140 non-null float64
wind           329140 non-null float64
month          329140 non-null int64
weekdays       329140 non-null int64
dtypes: float64(8), int64(3), object(3)
memory usage: 35.2+ MB


In [154]:
ohe = OneHotEncoder()

destination = pd.DataFrame(ohe.fit_transform(df[['destination']]).toarray(), 
                           columns=['dest_'+loc for loc in sorted(list(df['destination'].unique()))])

source = pd.DataFrame(ohe.fit_transform(df[['source']]).toarray(), 
                       columns=['src_'+loc for loc in sorted(list(df['source'].unique()))])

car_type = pd.DataFrame(ohe.fit_transform(df[['name']]).toarray(),
                        columns=sorted(list(df['name'].unique())))

weekdays = pd.DataFrame(ohe.fit_transform(df[['weekdays']]).toarray(),
                        columns=sorted(list(df['weekdays'].unique())))

month = pd.DataFrame(ohe.fit_transform(df[['month']]).toarray(),
                        columns=sorted(list(df['month'].unique())))

In [155]:
destination.head()

Unnamed: 0,dest_Back Bay,dest_Beacon Hill,dest_Boston University,dest_Fenway,dest_Financial District,dest_Haymarket Square,dest_North End,dest_North Station,dest_Northeastern University,dest_South Station,dest_Theatre District,dest_West End
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [156]:
source.head()

Unnamed: 0,src_Back Bay,src_Beacon Hill,src_Boston University,src_Fenway,src_Financial District,src_Haymarket Square,src_North End,src_North Station,src_Northeastern University,src_South Station,src_Theatre District,src_West End
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [157]:
car_type.head()

Unnamed: 0,Black,Black SUV,UberPool,UberX,UberXL,WAV
0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0


In [158]:
weekdays.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [159]:
month.head()

Unnamed: 0,11,12
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


In [160]:
df_onehot = pd.concat([df, destination, source, car_type, weekdays, month ], 
                      axis=1)
df_onehot.drop(['source', 'destination', 'name', 'weekdays', 'month'], axis=1, inplace=True)

In [161]:
df_onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329140 entries, 0 to 329139
Data columns (total 48 columns):
distance                        329140 non-null float64
price                           329140 non-null float64
hour                            329140 non-null int64
temp                            329140 non-null float64
clouds                          329140 non-null float64
pressure                        329140 non-null float64
rain                            329140 non-null float64
humidity                        329140 non-null float64
wind                            329140 non-null float64
dest_Back Bay                   329140 non-null float64
dest_Beacon Hill                329140 non-null float64
dest_Boston University          329140 non-null float64
dest_Fenway                     329140 non-null float64
dest_Financial District         329140 non-null float64
dest_Haymarket Square           329140 non-null float64
dest_North End                  329140 non-null flo

In [162]:
df_onehot.head()

Unnamed: 0,distance,price,hour,temp,clouds,pressure,rain,humidity,wind,dest_Back Bay,dest_Beacon Hill,dest_Boston University,dest_Fenway,dest_Financial District,dest_Haymarket Square,dest_North End,dest_North Station,dest_Northeastern University,dest_South Station,dest_Theatre District,dest_West End,src_Back Bay,src_Beacon Hill,src_Boston University,src_Fenway,src_Financial District,src_Haymarket Square,src_North End,src_North Station,src_Northeastern University,src_South Station,src_Theatre District,src_West End,Black,Black SUV,UberPool,UberX,UberXL,WAV,0,1,2,3,4,5,6,11,12
0,1.11,12.0,22,38.65,0.96,1017.23,0.0,0.65,3.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.11,16.0,10,20.12,0.43,1031.07,0.0,0.64,3.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.11,7.5,19,32.75,0.51,1033.75,0.0,0.57,1.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.11,7.5,23,42.36,1.0,1012.15,0.2088,0.77,11.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,1.11,26.0,0,30.07,0.69,1034.97,0.0,0.68,1.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [163]:
x = df_onehot.drop(['price'], axis=1)
y = df_onehot['price']

In [164]:
x.shape

(329140, 47)

In [165]:
y.shape

(329140,)

In [166]:
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.3,
                                                    random_state=0)

In [167]:
seed = 0

In [168]:
# 선형회귀

lin_reg = LinearRegression()

lin_reg.fit(X_train, y_train)

train_pred = lin_reg.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
print('Train score : ', train_rmse)


predicted = lin_reg.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, predicted))
print('Test score : ', test_rmse)

lin_reg = {}
lin_reg['Train'] = round(train_rmse,4)
lin_reg['Test'] = round(test_rmse, 4)
lin_reg

Train score :  2.40192775105956
Test score :  2.4058717267197824


{'Train': 2.4019, 'Test': 2.4059}

In [169]:
# 결정트리

dtree = DecisionTreeRegressor()

dtree.fit(X_train, y_train)

train_pred = dtree.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
print('Train score : ', train_rmse)

predicted = dtree.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, predicted))
print('Test score : ', test_rmse)

dt_reg = {}
dt_reg['Train'] = round(train_rmse,4)
dt_reg['Test'] = round(test_rmse, 4)
dt_reg

Train score :  0.9903781631671267
Test score :  2.513293194805477


{'Train': 0.9904, 'Test': 2.5133}

In [171]:
%%time
# 랜덤 포레스트

rf = RandomForestRegressor()

rf.fit(X_train, y_train)

train_pred = rf.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
print('Train score : ', train_rmse)

predicted = rf.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, predicted))
print('Test score : ', test_rmse)

rf_reg = {}
rf_reg['Train'] = round(train_rmse,4)
rf_reg['Test'] = round(test_rmse, 4)
rf_reg

Train score :  1.1307132570708192
Test score :  2.079081151424976
Wall time: 5min 1s


{'Train': 1.1307, 'Test': 2.0791}

In [172]:
# 아다 부스트

adar = AdaBoostRegressor(random_state=seed)

adar.fit(X_train, y_train)

train_pred = adar.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
print('Train score : ', train_rmse)

predicted = adar.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, predicted))
print('Test score : ', test_rmse)

ada_reg = {}
ada_reg['Train'] = round(train_rmse,4)
ada_reg['Test'] = round(test_rmse, 4)
ada_reg

Train score :  6.07900651678313
Test score :  6.085266687988645


{'Train': 6.079, 'Test': 6.0853}

In [173]:
# 그래디언트 부스트

gbr = GradientBoostingRegressor(random_state=seed)

gbr.fit(X_train, y_train)

train_pred = gbr.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
print('Train score : ',train_rmse)

predicted = gbr.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, predicted))
print('Test score : ', test_rmse)

gbr_reg = {}
gbr_reg['Train'] = round(train_rmse,4)
gbr_reg['Test'] = round(test_rmse, 4)
gbr_reg

Train score :  1.935562767281448
Test score :  1.9457845139493224


{'Train': 1.9356, 'Test': 1.9458}

In [174]:
# XG부스트

xbr = XGBRegressor(random_state=seed)

xbr.fit(X_train, y_train)

train_pred = xbr.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
print('Train score : ', tr_rmse)

predicted = xbr.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, predicted))
print('Test score : ', test_rmse)

xbr_reg = {}
xbr_reg['Train'] = round(train_rmse,4)
xbr_reg['Test'] = round(test_rmse, 4)
xbr_reg

Train score :  2.095913478054979
Test score :  1.9468776409001278


{'Train': 1.9369, 'Test': 1.9469}

In [175]:
# Cat부스트

cab=CatBoostRegressor(iterations=100, verbose=400, random_state=seed)

cab.fit(X_train, y_train,
          eval_set=(X_test, y_test))

train_pred = cab.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
print('Train score : ', train_rmse)

predicted = cab.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, predicted))
print('Test score : ', test_rmse)

cat_reg = {}
cat_reg['Train'] = round(train_rmse,4)
cat_reg['Test'] = round(test_rmse, 4)
cat_reg

Learning rate set to 0.5
0:	learn: 4.7409889	test: 4.7305542	best: 4.7305542 (0)	total: 37.8ms	remaining: 3.74s
99:	learn: 1.8337417	test: 1.8776660	best: 1.8776660 (99)	total: 3.14s	remaining: 0us

bestTest = 1.877665957
bestIteration = 99

Train score :  1.8337416871499304
Test score :  1.8776659567793055


{'Train': 1.8337, 'Test': 1.8777}

In [176]:
final_results = pd.DataFrame([lin_reg,
                              dt_reg,
                              rf_reg,
                              ada_reg,
                              gbr_reg,
                              xbr_reg,
                              cat_reg],                             
                            index=['Linear Regression',
                                   'Decision Tree',
                                   'Random Forest',
                                   'Ada Boost',
                                   'Gradient Boost',
                                   'Xg Boost',
                                   'Cat Boost'])
final_results

Unnamed: 0,Train,Test
Linear Regression,2.4019,2.4059
Decision Tree,0.9904,2.5133
Random Forest,1.1307,2.0791
Ada Boost,6.079,6.0853
Gradient Boost,1.9356,1.9458
Xg Boost,1.9369,1.9469
Cat Boost,1.8337,1.8777
