# Decision Tree Regression

In [109]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [132]:
# Load the raw dataset; 'mpg' is the regression target.
data = pd.read_csv('auto-mpg.csv')
y = data['mpg']

# 'origin' is a region code (1 = US, 2 = EU, 3 = Asia). Each lookup table
# below turns that code into a 0/1 indicator for a single region.
mapUS = {1: 1, 2: 0, 3: 0}
mapEU = {1: 0, 2: 1, 3: 0}
mapAsia = {1: 0, 2: 0, 3: 1}

# Build the feature matrix in one pass:
#   1. start from every column except the target,
#   2. append the indicator columns 'origin_US', 'origin_Europe' and
#      'origin_Asia',
#   3. drop the columns that are not used as features ('cylinders',
#      'model year', 'horsepower', 'car name') together with the raw
#      'origin' column, which is no longer needed once it has been encoded.
X = (
    data.drop(columns='mpg')
    .assign(
        origin_US=data['origin'].map(mapUS),
        origin_Europe=data['origin'].map(mapEU),
        origin_Asia=data['origin'].map(mapAsia),
    )
    .drop(columns=['cylinders', 'model year', 'car name', 'origin', 'horsepower'])
)


In [124]:
# Keep the seed as a plain integer so it can be passed as `random_state`.
# np.random.seed() returns None, so assigning its result would leave SEED
# set to None instead of the intended seed value.
SEED = 343

# Still seed NumPy's global RNG, as the original cell did.
np.random.seed(SEED)

In [125]:
# Hold out 20% of the rows as a test set; the rest is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

In [126]:
# dtr: the Decision Tree Regressor.
# The tree is limited to depth 8. min_samples_leaf is given as a float,
# which scikit-learn interprets as a fraction of the training samples
# (here 13%) rather than an absolute count.
dtr = DecisionTreeRegressor(
    max_depth=8,
    min_samples_leaf=0.13,
)

# Train on the training split; left as the last expression so the fitted
# estimator is displayed as the cell output.
dtr.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=8, min_samples_leaf=0.13)

## Decision Tree Regressor Error vs Linear Regression Error

In [130]:
from sklearn.metrics import mean_squared_error as MSE


# Evaluate the fitted decision tree on the held-out test set.
y_pred_dtr = dtr.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. MSE itself is
# symmetric, but keeping the documented order avoids surprises if the
# metric is ever swapped for a non-symmetric one.
mse_dtr = MSE(y_test, y_pred_dtr)

# RMSE is the square root of the MSE, in the same units as the target.
rmse_dtr = mse_dtr ** 0.5
print("Test set RMSE of dt: {:.2f}".format(rmse_dtr))

Test set RMSE of dt: 3.69


In [138]:
from sklearn import linear_model

# Baseline for comparison: ordinary least-squares linear regression trained
# and evaluated on the same train/test split as the decision tree.
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
mse_lr = MSE(y_test, y_pred_lr)
rmse_lr = mse_lr ** 0.5

# Label the result as the linear model ("lr"); the original message reused
# the decision-tree label "dt", making the two printed results ambiguous.
print("Test set RMSE of lr: {:.2f}".format(rmse_lr))

Test set RMSE of dt: 3.87
