<a href="https://colab.research.google.com/github/lindseyvanosky/Ensemble-Trees-Exercise/blob/main/LV_Ensemble_Trees_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Ensemble Trees Exercise
- Lindsey Vanosky 
- 06.29.22

##Import Libraries & Inspect Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

filename = "/content/drive/MyDrive/Coding Dojo/02 Stack 2: Intro to Machine Learning/Data Sets/Boston_Housing_from_Sklearn.csv"
df = pd.read_csv(filename)
df.head()

Unnamed: 0,CRIM,NOX,RM,AGE,PTRATIO,LSTAT,PRICE
0,0.00632,0.538,6.575,65.2,15.3,4.98,24.0
1,0.02731,0.469,6.421,78.9,17.8,9.14,21.6
2,0.02729,0.469,7.185,61.1,17.8,4.03,34.7
3,0.03237,0.458,6.998,45.8,18.7,2.94,33.4
4,0.06905,0.458,7.147,54.2,18.7,5.33,36.2


##Defining targets, features and train test split

In [None]:
y = df["PRICE"]
x = df[["RM", "LSTAT", "PTRATIO"]]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)

## 1) Decision Tree

In [None]:
dec_tree = DecisionTreeRegressor(random_state = 42)
dec_tree.fit(x_train, y_train)

DecisionTreeRegressor(random_state=42)

######Scoring (r2)

In [None]:
train_score = dec_tree.score(x_train, y_train)
test_score = dec_tree.score(x_test, y_test)
print(train_score)
print(test_score)
#We can see this is overfitting

1.0
0.45752194086453823


######Hyperparameter Tuning

In [None]:
dec_tree.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 42,
 'splitter': 'best'}

In [None]:
dec_tree.get_depth()

20

In [None]:
dec_tree.get_n_leaves()

368

In [None]:
depths = list(range(10, 50))
scores = pd.DataFrame(index=depths, columns=['Test Score','Train Score'])
for depth in depths:
    dec_tree = DecisionTreeRegressor(max_depth=depth, random_state=42)
    dec_tree.fit(x_train, y_train)
    train_score = dec_tree.score(x_test, y_test)
    test_score = dec_tree.score(x_test, y_test)
    scores.loc[depth, 'Train Score'] = train_score
    scores.loc[depth, 'Test Score'] = test_score
sort_scores = scores.sort_values(by = "Test Score", ascending = False)
sort_scores.head()
#it looks like 11 is the optimal number for depth

Unnamed: 0,Test Score,Train Score
11,0.615357,0.615357
19,0.463279,0.463279
16,0.463274,0.463274
10,0.459877,0.459877
13,0.458512,0.458512


#####Final Scores

In [None]:
#r2
dec_tree_new = DecisionTreeRegressor(max_depth = 11, random_state = 42)
dec_tree_new.fit(x_train, y_train)

new_train_score = dec_tree_new.score(x_train, y_train)
new_test_score = dec_tree_new.score(x_test, y_test)

print(new_train_score)
print(new_test_score)

0.9928919828140517
0.6153566129025645


######Adding Predictions

In [None]:
dec_train_pred = dec_tree_new.predict(x_train)
dec_test_pred = dec_tree_new.predict(x_test)

In [None]:
#MAE
dec_train_MAE = mean_absolute_error(y_train, dec_train_pred)
dec_test_MAE = mean_absolute_error(y_test, dec_test_pred)
print(dec_train_MAE)
print(dec_test_MAE)

0.38953294726381793
3.28498159320994


In [None]:
#MSE
dec_train_MSE = mean_squared_error(y_train, dec_train_pred)
dec_test_MSE = mean_squared_error(y_test, dec_test_pred)
print(dec_train_MSE)
print(dec_test_MSE)

0.6303512718591873
26.935477723943755


In [None]:
#RMSE
dec_train_RMSE = np.sqrt(dec_train_MSE)
dec_test_RMSE = np.sqrt(dec_test_MSE)
print(dec_train_RMSE)
print(dec_test_RMSE)

0.7939466429547941
5.189940050130035


## 2) Bagged Tree

In [None]:
bag_tree = BaggingRegressor(random_state = 42)
bag_tree.fit(x_train, y_train)
bag_tree.predict(x_test)

array([22.45, 31.37, 17.96, 23.76, 15.4 , 21.23, 20.26, 13.3 , 21.38,
       21.62, 21.71, 17.95, 11.62, 21.31, 17.49, 27.59, 21.38,  8.79,
       47.69, 15.96, 23.35, 23.25, 12.59, 23.5 , 14.82, 14.69, 32.2 ,
       15.7 , 19.6 , 20.59, 19.55, 22.79, 30.32, 20.85, 11.7 , 17.56,
       34.61, 29.36, 18.27, 23.56, 17.39, 28.  , 47.69, 23.51, 25.15,
       13.98, 15.86, 22.88, 15.44, 29.14, 23.53, 36.07, 18.85, 26.28,
       43.54, 21.58, 14.9 , 29.18, 24.26, 19.79, 26.64, 35.35, 28.87,
       19.41, 26.06, 19.5 , 15.52, 23.05, 26.92, 22.38, 20.29, 26.22,
       10.01, 21.91, 21.26,  8.1 , 21.66, 49.54, 12.94, 11.84, 29.01,
        8.34, 33.2 ,  8.78, 20.45, 26.43, 13.47, 23.46, 23.32, 16.94,
       22.12,  7.9 , 19.79, 20.59, 35.35, 19.53, 21.98, 10.13, 15.02,
       13.31, 19.51, 25.44, 12.  , 23.54, 18.94, 11.64, 18.95, 25.94,
       18.62, 23.25,  6.17, 13.72, 22.5 , 22.16, 34.87, 14.49, 42.49,
       15.41, 17.75, 24.1 , 23.7 , 24.45,  9.39, 20.83, 23.18, 19.57,
       24.34])

######Scoring (r2)

In [None]:
bt_train_score = bag_tree.score(x_train, y_train)
bt_test_score = bag_tree.score(x_test, y_test)
print(bt_train_score)
print(bt_test_score)

0.9567769581318673
0.6716295807636126


In [None]:
bag_tree.get_params()

{'base_estimator': None,
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

######Hyperparameter Tuning

In [None]:
estimators = [10, 20, 30, 40, 50, 100]
scores = pd.DataFrame(index=estimators, columns=['Train Score', 'Test Score'])
for num_estimators in estimators:
   bag_reg = BaggingRegressor(n_estimators=num_estimators, random_state=42)
   bag_reg.fit(x_train, y_train)
   train_score = bag_reg.score(x_train, y_train)
   test_score = bag_reg.score(x_test, y_test)
   scores.loc[num_estimators, 'Train Score'] = train_score
   scores.loc[num_estimators, 'Test Score'] = test_score
   scores = scores.sort_values(by = "Test Score", ascending = False)
scores
#40 is the optimal number for n estimators

Unnamed: 0,Train Score,Test Score
40,0.969834,0.720751
100,0.973266,0.718568
50,0.970541,0.717904
30,0.967693,0.717179
20,0.9664,0.704722
10,0.956777,0.67163


#####Final Scores - Bagged Tree

In [None]:
#r2
new_bag_tree = BaggingRegressor(n_estimators = 40, random_state = 42)
new_bag_tree.fit(x_train, y_train)

new_train_score = new_bag_tree.score(x_train, y_train)
new_test_score = new_bag_tree.score(x_test, y_test)

print(new_train_score)
print(new_test_score)

0.9698341115000012
0.7207506295830213


######Adding Predictions

In [None]:
bt_train_pred = new_bag_tree.predict(x_train)
bt_test_pred = new_bag_tree.predict(x_test)

In [None]:
#MAE
bt_train_MAE = mean_absolute_error(y_train, bt_train_pred)
bt_test_MAE = mean_absolute_error(y_test, bt_test_pred)
print(bt_train_MAE)
print(bt_test_MAE)

1.0839709762532985
2.793956692913386


In [None]:
#MSE
bt_train_MSE = mean_squared_error(y_train, bt_train_pred)
bt_test_MSE = mean_squared_error(y_test, bt_test_pred)
print(bt_train_MSE)
print(bt_test_MSE)

2.675163225593668
19.555035777559052


In [None]:
#RMSE
bt_train_RMSE = np.sqrt(bt_train_MSE)
bt_test_RMSE = np.sqrt(bt_test_MSE)
print(bt_train_RMSE)
print(bt_test_RMSE)

1.635592622138431
4.422107617139033


## 3) Random Forest

In [None]:
rf = RandomForestRegressor(random_state = 42)
rf.fit(x_train, y_train)

RandomForestRegressor(random_state=42)

######Scoring (r2)

In [None]:
rf_train_score = rf.score(x_train, y_train)
rf_test_score = rf.score(x_test, y_test)
print(rf_train_score)
print(rf_test_score)


0.9741933966325389
0.7154130948850422


######Hyperparameter Tuning

In [None]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [None]:
est_depths = [estimator.get_depth() for estimator in rf.estimators_]
depths = range(1, max(est_depths))
scores = pd.DataFrame(index=depths, columns=['Test Score'])
for depth in depths:    
   model = RandomForestRegressor(max_depth=depth)
   model.fit(x_train, y_train)
   scores.loc[depth, 'Train Score'] = model.score(x_train, y_train)
   scores.loc[depth, 'Test Score'] = model.score(x_test, y_test)
scores.head()
sort_scores = scores.sort_values(by = "Test Score", ascending = False)
sort_scores.head()
#It seems 6 is the optimal number for max depth

Unnamed: 0,Test Score,Train Score
5,0.750405,0.91365
4,0.747854,0.890948
6,0.73878,0.935811
7,0.737526,0.945936
17,0.736205,0.971824


In [None]:
n_ests = [50, 100, 150, 200, 250]
scores2 = pd.DataFrame(index=n_ests, columns=['Test Score', 'Train Score'])
for n in n_ests:
   model = RandomForestRegressor(max_depth=29, n_estimators=n)
   model.fit(x_train, y_train)
   scores2.loc[n, 'Train Score'] = model.score(x_train, y_train)
   scores2.loc[n, 'Test Score'] = model.score(x_test, y_test)
scores2.head()
sort_scores2 = scores2.sort_values(by = "Test Score", ascending = False)
sort_scores2.head()
#It seems that 100 is the optimal number of n estimators

Unnamed: 0,Test Score,Train Score
50,0.73303,0.974699
250,0.726123,0.972775
150,0.722964,0.972409
100,0.718939,0.974627
200,0.718407,0.972468


#####Final Scores - Random Forest

In [None]:
#r2
new_rf = RandomForestRegressor(max_depth = 6, n_estimators = 100, random_state = 42)
new_rf.fit(x_train, y_train)

newrf_train_scores = new_rf.score(x_train, y_train)
newrf_test_scores = new_rf.score(x_test, y_test)

print(newrf_train_scores)
print(newrf_test_scores)

0.9351330666079332
0.7460864504521865


######Adding Predictions

In [None]:
rf_train_pred = new_rf.predict(x_train)
rf_test_pred = new_rf.predict(x_test)

In [None]:
#MAE
rf_train_MAE = mean_absolute_error(y_train, rf_train_pred)
rf_test_MAE = mean_absolute_error(y_test, rf_test_pred)
print(rf_train_MAE)
print(rf_test_MAE)

1.7804889683780467
2.6409048427937893


In [None]:
#MSE
rf_train_MSE = mean_squared_error(y_train, train_pred)
rf_test_MSE = mean_squared_error(y_test, test_pred)
print(rf_train_MSE)
print(rf_test_MSE)

0.0
37.98818897637794


In [None]:
#RMSE
rf_train_RMSE = np.sqrt(rf_train_MSE)
rf_test_RMSE = np.sqrt(rf_test_MSE)
print(rf_train_RMSE)
print(rf_test_RMSE)

0.0
6.163455927998345


#Conclusion

Decision Tree Metrics     
- r2 Train = 0.992 
- r2 Test = 0.615
- MAE Train = 0.389
- MAE Test = 3.284
- MSE Train = 0.630
- MSE Test = 26.935
- RMSE Train = 0.793
- RMSE Test = 5.189

---
Bagged Tree Metrics
- r2 Train = 0.969
- r2 Test = 0.720
- MAE Train = 1.083
- MAE Test = 2.793
- MSE Train = 2.675
- MSE Test = 19.555
- RMSE Train = 1.635
- RMSE Test = 4.422

---
Random Forest Metrics
- r2 Train = 0.935
- r2 Test = 0.746
- MAE Train = 1.780
- MAE Test = 2.640
- MSE Train = 0
- MSE Test = 37.988
- RMSE Train = 0
- RMSE Test = 6.163


Based off of these results, we can see that although our Random Forest model has the best test r2 score, by looking at the MAE and RMSE we can see that the model is overfitting. The model with the most consistent metrics is our Bagged Tree Model. It can account for 72% of the data and the difference between the train and test set are the most minimal. 