# Decision Tree - Scikit Learn

## Import Required Libraries

In [90]:
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

## Load Dataset

> The seaborn library has many example datasets you can work with. Use the following code to view all seaborn datasets: sb.get_dataset_names()

In [71]:
# Load seaborn's bundled 'mpg' example dataset.
df = sb.load_dataset(name='mpg')

In [72]:
# describe() summarises only the numeric columns by default;
# include='all' adds the non-numeric columns (origin, name) to the summary.
df.describe(include="all")

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0,398,398
unique,,,,,,,,3,305
top,,,,,,,,usa,ford pinto
freq,,,,,,,,249,6
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,,
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627,,
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,,
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0,,
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,,
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0,,


In [73]:
# Preview the first 10 rows of the dataset.
df.head(n=10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
5,15.0,8,429.0,198.0,4341,10.0,70,usa,ford galaxie 500
6,14.0,8,454.0,220.0,4354,9.0,70,usa,chevrolet impala
7,14.0,8,440.0,215.0,4312,8.5,70,usa,plymouth fury iii
8,14.0,8,455.0,225.0,4425,10.0,70,usa,pontiac catalina
9,15.0,8,390.0,190.0,3850,8.5,70,usa,amc ambassador dpl


## Data Preprocessing

### Selecting columns and handling nulls

In [74]:
# Keep only the numeric predictors and place the target ('mpg') last,
# so predictors and target can later be separated by position.
df = df[[
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model_year',
    'mpg',
]]

In [75]:
# Count the missing values in each column.
df.isnull().sum()

cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
mpg             0
dtype: int64

In [76]:
# Drop every row that contains a null value (axis=0 drops rows, axis=1 would drop columns).
# The result is reassigned instead of using inplace=True: df was produced by a column
# selection, and mutating such a frame in place can raise pandas' SettingWithCopyWarning.
# Reassignment also keeps the cell safe to re-run.
df = df.dropna(axis=0)

### Splitting the data between predictors and target

> The code below shows 2 different ways that this split can be done. If the target variable is in the last column, the 2nd option is faster in the sense that you don't have to explicitly type all column names.

In [77]:
# Option 1: select the predictor columns and the target column by name.
X = df[[
    'cylinders', 'displacement', 'horsepower',
    'weight', 'acceleration', 'model_year',
]]
y = df['mpg']

# Option 2: rely on the target being the last column and slice by position.
# These assignments overwrite the ones from option 1, so the X and y used
# downstream are NumPy arrays rather than a DataFrame / Series.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

### Train/Test Split

> Reserves a certain percentage of the data for testing your model. The test size parameter takes a value between 0 and 1.

In [78]:
# train_test_split comes from sklearn.model_selection (imported at the top of the notebook).
# test_size=0.33 holds out 33% of the rows for testing.
# random_state fixes the shuffle so the split, and every score computed from it,
# is the same each time the notebook is run.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Decision Tree

> Details on the scikit-learn DecisionTreeRegressor model can be found here: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor

In [85]:
# The tree randomly permutes the features at each split, so ties between equally
# good splits can be broken differently from run to run. Fixing random_state makes
# the fitted tree (and therefore the scores below) reproducible.
regressor = DecisionTreeRegressor(random_state=42)

### Model Fit

In [86]:
# Train the tree on the training split; the return value of fit() is kept as model_fit.
model_fit = regressor.fit(X=X_train, y=y_train)

### Model Predictions

In [87]:
# Predict the target for the held-out test rows.
predictions = model_fit.predict(X=X_test)

### Model Scoring

In [88]:
# R-Squared values: the first line printed is the score on the training set,
# the second is the score on the test set. A large gap between the two
# (training score much higher) suggests the tree is overfitting the training data.

print(model_fit.score(X_train, y_train))
print(model_fit.score(X_test, y_test))

1.0
0.7503739436265764


In [91]:
# Same test-set R-Squared, computed from the stored predictions with r2_score.
r2_score(y_true=y_test, y_pred=predictions)

0.7503739436265764

In [92]:
# RMSE (Root Mean Squared Error): the square root of the mean squared error.
# The square root is taken explicitly because the squared=False argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on both old and new versions.

np.sqrt(mean_squared_error(y_test, predictions))

3.814436141413706