# LAB | Ensemble Methods

**Load the data**

In this challenge, we will be working with the same Spaceship Titanic data, like the previous Lab. The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

In this Lab, you should try different ensemble methods in order to see if can obtain a better model than before. In order to do a fair comparison, you should perform the same feature scaling, engineering applied in previous Lab.

In [18]:
#Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor


In [2]:
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Now perform the same as before:
- Feature Scaling
- Feature Selection


In [3]:
#your code here
spaceship = spaceship.dropna()
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
#Preparing the dataframe: booleans into 0/1, creating dummies, removing the non numerical values
spaceship['Transported'] = spaceship['Transported'].apply(lambda x : 0 if x == False else 1)
spaceclean = spaceship.copy()
spaceclean['Cabin']=spaceclean['Cabin'].apply(lambda x: x.split("/")[0])
spaceclean['Cabin']
spaceclean.drop(columns=['PassengerId', 'Name'], inplace=True)

spaceship_trans = pd.merge(left=spaceclean,
                              right=pd.get_dummies(spaceclean['Destination'],prefix='Destination',drop_first=True),
                              left_index=True,
                              right_index=True)
spaceship_trans = pd.merge(left=spaceship_trans,
                              right=pd.get_dummies(spaceship_trans['HomePlanet'],prefix='HomePlanet',drop_first=True),
                              left_index=True,
                              right_index=True)
spaceship_trans = pd.merge(left=spaceship_trans,
                              right=pd.get_dummies(spaceship_trans['Cabin'],prefix='Cabin',drop_first=True),
                              left_index=True,
                              right_index=True)
spaceship_trans

bool_columns = ['CryoSleep', 'VIP', "HomePlanet_Europa", "HomePlanet_Mars", "Destination_PSO J318.5-22", "Destination_TRAPPIST-1e", "Cabin_B", "Cabin_C","Cabin_D", "Cabin_E","Cabin_F", "Cabin_G", "Cabin_T"]
for col in bool_columns:
    spaceship_trans[col] = spaceship_trans[col].apply(lambda x : 0 if x == False else 1)
spaceship_trans
spaceship_trans.drop(columns=['HomePlanet', 'Destination', 'Cabin'], inplace=True) #removing the non-numerical

In [5]:
spaceship_trans


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,HomePlanet_Europa,HomePlanet_Mars,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0,1,1,0,1,0,0,0,0,0,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,0,1,0,0,0,0,0,0,1,0,0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,1,1,0,0,0,0,0,0,0,0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0,1,1,0,0,0,0,0,0,0,0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,0,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,0,0,0,1,0,0,0,0,0,0,0,0
8689,1,18.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,1,0
8690,0,26.0,0,0.0,0.0,1872.0,1.0,0.0,1,0,1,0,0,0,0,0,0,0,1,0
8691,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,0,0,0,1,0,0,0,0,1,0,0,0


**Perform Train Test Split**

In [59]:
#Target and Train
features = spaceship_trans.drop(columns=["Transported"])
target = spaceship_trans["Transported"]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

spaceship_trans.columns

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'Transported', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G',
       'Cabin_T'],
      dtype='object')

In [60]:
#Scaling

normalizer = MinMaxScaler()
normalizer.fit(X_train)

X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

X_train_norm = pd.DataFrame(X_train_norm, columns = X_train.columns)
X_test_norm = pd.DataFrame(X_test_norm, columns = X_test.columns)

In [76]:
# Possible games with linear regression
lin_reg = LinearRegression()
lin_reg.fit(X_train_norm, y_train)
pred = lin_reg.predict(X_test_norm)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", lin_reg.score(X_test_norm, y_test))


MAE 0.33888289012933115
RMSE 0.39895007334409044
R2 score 0.36335535591497936


**Model Selection** - now you will try to apply different ensemble methods in order to get a better model

- Bagging and Pasting

In [62]:
bagging_reg = BaggingRegressor(DecisionTreeRegressor(max_depth=20),
                               n_estimators=100,
                               max_samples = 1000)

bagging_reg.fit(X_train_norm, y_train)

pred = bagging_reg.predict(X_test_norm)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", bagging_reg.score(X_test_norm, y_test))

MAE 0.2773395011485939
RMSE 0.37788570441866
R2 score 0.4288095775840526


In [63]:
#max dept= 20; n= 100, max 1000 => 0.4322669 ; 

In [64]:
#Decision Tree

tree = DecisionTreeRegressor(max_depth=10)
# max_depth = defines how many decision nodes we want

tree.fit(X_train_norm, y_train)

pred = tree.predict(X_test_norm)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", tree.score(X_test_norm, y_test))


MAE 0.2750271084115478
RMSE 0.4118537740108114
R2 score 0.32150587533220587


In [65]:
#Checking for the importance of the features 

tree_importance = {feature : importance for feature, importance in zip(X_train_norm.columns, tree.feature_importances_)}
tree_importance 

{'CryoSleep': 0.36874851023888405,
 'Age': 0.06370463789748562,
 'VIP': 0.0001638430866164443,
 'RoomService': 0.08918158339429563,
 'FoodCourt': 0.0948353428488228,
 'ShoppingMall': 0.036346252536492075,
 'Spa': 0.09480309387620485,
 'VRDeck': 0.12424544110036195,
 'Destination_PSO J318.5-22': 0.003895703606442955,
 'Destination_TRAPPIST-1e': 0.010674866930115587,
 'HomePlanet_Europa': 0.014469303363844273,
 'HomePlanet_Mars': 0.002245521221971631,
 'Cabin_B': 0.0027707868093734276,
 'Cabin_C': 0.00525961954250735,
 'Cabin_D': 0.006568399215557765,
 'Cabin_E': 0.01888709555805599,
 'Cabin_F': 0.002900326946536292,
 'Cabin_G': 0.06029967182643128,
 'Cabin_T': 0.0}

In [66]:
#bof not super results so far

In [72]:
# Attempt to reduce the features to what counts
features_2 = spaceship_trans[["CryoSleep", "RoomService", "FoodCourt", "Spa", "VRDeck"]]
target_2 = spaceship_trans["Transported"]

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(features_2, target_2, test_size=0.20, random_state=0)

In [73]:
#Scaling

normalizer_2 = MinMaxScaler()
normalizer_2.fit(X_train_2)

X_train_norm_2 = normalizer_2.transform(X_train_2)
X_test_norm_2 = normalizer_2.transform(X_test_2)

X_train_norm_2 = pd.DataFrame(X_train_norm_2, columns = X_train_2.columns)
X_test_norm_2 = pd.DataFrame(X_test_norm_2, columns = X_test_2.columns)

In [75]:
#Decision Tree

tree_2 = DecisionTreeRegressor(max_depth=10)
# max_depth = defines how many decision nodes we want

tree_2.fit(X_train_norm_2, y_train_2)

pred_2 = tree_2.predict(X_test_norm_2)

print("MAE", mean_absolute_error(pred_2, y_test_2))
print("RMSE", mean_squared_error(pred_2, y_test_2, squared=False))
print("R2 score", tree_2.score(X_test_norm_2.values, y_test_2))

MAE 0.30835354615612603
RMSE 0.4224175362012083
R2 score 0.2862537004388035




In [None]:
#much worse...

- Random Forests

In [71]:
#Initialise Ramdom forest 
forest = RandomForestRegressor(n_estimators=30,
                             max_depth=8)
forest.fit(X_train_norm, y_train)

pred = forest.predict(X_test_norm)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", forest.score(X_test_norm, y_test))


MAE 0.2757953326009281
RMSE 0.3778909913392543
R2 score 0.4287935946585425


In [None]:
# n = 100 & Max depth 20 => R2 0.41011

- Gradient Boosting

In [21]:
#Initialise gradient
gb_reg = GradientBoostingRegressor(max_depth=20,
                                   n_estimators=100)

#Training
gb_reg.fit(X_train_norm, y_train)

# Evaluating
print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", gb_reg.score(X_test_norm, y_test))




MAE 0.26830245463106117
RMSE 0.38402077292116066
R2 score 0.3037596808147328


- Adaptive Boosting

In [23]:
#Initialise ada booster
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=20),
                            n_estimators=100)

#Training
ada_reg.fit(X_train_norm, y_train)

# Evaluating
print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", ada_reg.score(X_test_norm, y_test))


MAE 0.26830245463106117
RMSE 0.38402077292116066
R2 score 0.25121453003215555


Which model is the best and why?

In [None]:
#comment here