# Compare

# Linear regression vs. Decision tree regressor w/ various depths

In [1]:
import pandas as pd
import time

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.linear_model import LinearRegression

from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the raw vehicle listings; the relative path means the CSV must sit in
# the notebook's working directory.
df = pd.read_csv("vehicles_maindata.csv")

In [3]:
# Keep the target ('price') plus the candidate predictor columns.
selected_columns = [
    'price', 'year', 'manufacturer', 'condition', 'cylinders', 'fuel',
    'odometer', 'title_status', 'transmission', 'drive', 'type', 'area',
]
df_1 = df.loc[:, selected_columns]

In [4]:
# Inspect the dtype of each selected column; the 'object' columns are the ones
# that get one-hot encoded in the next cell.
df_1.dtypes

price             int64
year             object
manufacturer     object
condition        object
cylinders         int64
fuel             object
odometer        float64
title_status     object
transmission     object
drive            object
type             object
area             object
dtype: object

In [5]:
# Perform data transformation - one-hot encode every 'object' dtype predictor.
# The dtypes are read from df_1 itself (not from the raw `df`), so re-running
# this cell after the encoding has been applied is a harmless no-op instead of
# a KeyError on the already-encoded column names.
# The first column ('price', the target) is skipped, as before.
object_cols = [col for col in df_1.columns[1:] if df_1[col].dtype == object]
if object_cols:
    # get_dummies keeps the untouched columns first and appends the indicator
    # columns in the order of `columns`, each prefixed with its source column
    # name - the same layout the per-column concat/del loop produced.
    df_1 = pd.get_dummies(df_1, columns=object_cols)

In [6]:
# Display the encoded frame: numeric columns first, then the indicator columns.
df_1

Unnamed: 0,price,cylinders,odometer,year_2001-01-01,year_2002-01-01,year_2003-01-01,year_2004-01-01,year_2005-01-01,year_2006-01-01,year_2007-01-01,...,type_other,type_pickup,type_sedan,type_truck,type_van,type_wagon,area_mw,area_ne,area_south,area_west
0,15000,6,128000.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,27990,8,68696.0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,34590,6,29499.0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,35000,6,43000.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,29990,6,17302.0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103572,39990,8,41664.0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
103573,32990,8,55612.0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
103574,33590,6,30814.0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
103575,23590,6,32226.0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [7]:
# Separate the target ('price') from the predictors (every other column).
# DataFrame.drop already returns a new frame, so df_1 itself is left untouched.
y = df_1['price']
X = df_1.drop(columns='price')

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

# Linear Regression

In [9]:
# Create the linear regression model with scikit-learn's default settings;
# it is fitted on the training split in the next cell.
lin_reg_model = LinearRegression()

In [10]:
# Fit the linear model on the training split and report the wall-clock
# fitting time in seconds.
start_time = time.perf_counter()
lin_reg_model.fit(X_train, y_train)
elapsed_time = time.perf_counter() - start_time
print(elapsed_time)

3.043480600004841


In [11]:
# Prediction: price estimates of the fitted linear model for the held-out test rows
y_pred_lin = lin_reg_model.predict(X_test)

In [24]:
# R^2 (coefficient of determination) of the linear model on the test split.
# The built-in round() is used instead of the NumPy-only `.round()` method so
# the cell works whether r2_score hands back a NumPy scalar or a plain Python
# float; the printed text is unchanged.
error_score = metrics.r2_score(y_test, y_pred_lin)
print("R Squared Error_score : ", round(error_score, 4), "with linear model")

R Squared Error_score :  0.7539 with linear model


# Decision tree regressor

In [12]:
# Base decision tree that the pipeline / grid search below tunes.
# scikit-learn permutes the features at every split, so equally good splits
# can be chosen differently from run to run; pinning random_state (same seed
# as the train/test split) makes the fitted trees and the selected depth
# repeatable.
dt_reg_model = DecisionTreeRegressor(random_state=2)

In [13]:
# Find optimal depth: wrap the tree in a single-step pipeline so the grid
# search can address its hyper-parameters through the 'regr__' prefix.
pipe = Pipeline([('regr', dt_reg_model)])

In [14]:
# Candidate tree depths (1 through 50 inclusive) from which GridSearchCV
# selects the best value. range() already yields ints, so no per-item cast
# is needed.
max_depth = list(range(1, 51))

In [16]:
# Create dictionary to set parameters: the 'regr__' prefix routes max_depth to
# the pipeline step named 'regr'.
# A literal is used rather than calling dict(...) because a later cell rebinds
# the name `dict` to the results mapping; once that cell has run, re-executing
# this one would fail with "TypeError: 'dict' object is not callable".
parameters = {'regr__max_depth': max_depth}

In [17]:
# Grid-search the tree depth over `parameters` with GridSearchCV's default
# cross-validation and scoring, and report the wall-clock time in seconds.
# Note: despite its name, `y_pred_dt` holds the fitted search object, not
# predictions (fit() returns the search object itself).
start_time = time.perf_counter()
y_pred_dt = GridSearchCV(estimator=pipe, param_grid=parameters).fit(X_train, y_train)
elapsed_time = time.perf_counter() - start_time
print(elapsed_time)

407.0405559999999


In [18]:
# Print result: a blank line, then the 'regr' step (the tuned tree) of the
# best pipeline found by the grid search.
best_tree = y_pred_dt.best_estimator_.get_params()['regr']
print()
print(best_tree)


DecisionTreeRegressor(max_depth=27)


In [19]:
# Prepare for result DataFrame: start the column mapping with the true test
# prices and the linear model's predictions; the per-depth tree predictions
# are added to it later.
# NOTE(review): the name `dict` shadows the built-in; it is kept here because
# later cells read and extend this mapping under that name.
dict = {}
dict['price'] = y_test
dict['liner_model'] = y_pred_lin

In [25]:
# Run decision tree with depth 1 to 50: fit one tree per candidate depth,
# keep its test-set predictions and report its R^2 on the test split.
for i in max_depth:
    # Column name under which this depth's predictions are stored
    prediction = 'prediction_{}'.format(i)

    # Fixed random_state (same seed as the train/test split) so repeated runs
    # build the same trees and therefore produce the same predictions
    tree_model = DecisionTreeRegressor(max_depth=i, random_state=2)
    tree_model.fit(X_train, y_train)
    y_pred_dp = tree_model.predict(X_test)

    # Add result to 'dict'
    dict[prediction] = y_pred_dp

    # R square error; the built-in round() works for both NumPy scalars and
    # plain Python floats, unlike the NumPy-only `.round()` method
    error_score = metrics.r2_score(y_test, y_pred_dp)
    print("R Squared Error_score : ", round(error_score, 4), "(max_depth = ", '{}'.format(i), ")")

R Squared Error_score :  0.3083 (max_depth =  1 )
R Squared Error_score :  0.4444 (max_depth =  2 )
R Squared Error_score :  0.5274 (max_depth =  3 )
R Squared Error_score :  0.5807 (max_depth =  4 )
R Squared Error_score :  0.618 (max_depth =  5 )
R Squared Error_score :  0.6556 (max_depth =  6 )
R Squared Error_score :  0.6825 (max_depth =  7 )
R Squared Error_score :  0.7084 (max_depth =  8 )
R Squared Error_score :  0.7316 (max_depth =  9 )
R Squared Error_score :  0.7521 (max_depth =  10 )
R Squared Error_score :  0.772 (max_depth =  11 )
R Squared Error_score :  0.7866 (max_depth =  12 )
R Squared Error_score :  0.8049 (max_depth =  13 )
R Squared Error_score :  0.8167 (max_depth =  14 )
R Squared Error_score :  0.8245 (max_depth =  15 )
R Squared Error_score :  0.8294 (max_depth =  16 )
R Squared Error_score :  0.8354 (max_depth =  17 )
R Squared Error_score :  0.8396 (max_depth =  18 )
R Squared Error_score :  0.8414 (max_depth =  19 )
R Squared Error_score :  0.8425 (max_depth

In [26]:
# Assemble the true prices and every model's predictions into one frame
# (one column per key of the mapping), rounded to 2 decimal places.
df_result = pd.DataFrame(dict).round(2)

In [27]:
# Display the side-by-side comparison of actual prices and predictions.
df_result

Unnamed: 0,price,liner_model,prediction_1,prediction_2,prediction_3,prediction_4,prediction_5,prediction_6,prediction_7,prediction_8,...,prediction_41,prediction_42,prediction_43,prediction_44,prediction_45,prediction_46,prediction_47,prediction_48,prediction_49,prediction_50
3692,19900,18670.22,12645.71,15918.34,14471.75,11866.24,17256.47,13230.85,12784.62,12514.45,...,17995.0,17995.0,17999.0,17995.0,17999.0,17999.0,26995.0,17999.0,17999.0,23000.0
92885,4998,2060.73,12645.71,7760.90,5832.31,5103.04,5252.37,5164.39,5048.46,4895.45,...,6555.0,4995.0,6555.0,6555.0,6555.0,4995.0,4995.0,4995.0,6555.0,6555.0
7937,29990,32140.71,29069.42,32796.95,31944.79,34134.73,34750.79,37821.47,37046.80,37266.24,...,29990.0,29990.0,29990.0,29990.0,29990.0,29990.0,29990.0,29990.0,29990.0,29990.0
45589,51900,37387.15,29069.42,32796.95,31944.79,27407.92,29245.36,31848.28,32538.03,35499.90,...,51900.0,51900.0,51900.0,51900.0,51900.0,51900.0,51900.0,51900.0,51900.0,51900.0
92790,39590,40308.52,29069.42,32796.95,31944.79,34134.73,34750.79,37821.47,37046.80,37266.24,...,39590.0,39590.0,39590.0,39590.0,39590.0,39590.0,39590.0,39590.0,39590.0,39590.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33496,46000,48573.29,12645.71,15918.34,29823.68,31971.15,33367.82,36687.18,35234.55,33528.19,...,53900.0,53900.0,54990.0,54990.0,53900.0,53900.0,53900.0,54990.0,53900.0,53900.0
71214,28318,28789.04,29069.42,32796.95,31944.79,34134.73,34750.79,32523.09,37748.05,38543.09,...,28318.0,28318.0,28318.0,28318.0,28318.0,28318.0,28318.0,28318.0,28318.0,28318.0
15994,37088,33905.52,29069.42,32796.95,31944.79,27407.92,29245.36,25815.00,29183.23,26365.41,...,37088.0,37088.0,37088.0,37088.0,37088.0,37088.0,37088.0,37088.0,37088.0,37088.0
44136,7995,4778.81,12645.71,7760.90,9666.54,8768.67,8354.42,8171.98,7909.93,7644.39,...,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0


In [28]:
# Save the comparison table next to the notebook; index=False leaves the
# frame's row labels out of the file.
df_result.to_csv("pred_result.csv", index=False)