## Packages

In [60]:
# Import packages

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

## Data

In [2]:
# Read both Train and Test csv files

test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")

In [3]:
train_df.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


## Dealing with NaN

In [45]:
# Create a copy of the dataframe to work with
train_df_2 = train_df
# Replace "HomePlanet" null values with "Earth"
train_df_2["HomePlanet"].fillna("Earth", inplace=True)
# Replace "CryoSleep" null values with False
train_df_2["CryoSleep"].fillna(False, inplace=True)
# Drop rows with "Cabin" null values
train_df_2 = train_df_2[train_df_2["Cabin"].notna()]
# Replace "Destination" null values with "TRAPPIST-1e"
train_df_2.loc[train_df_2["Destination"].isnull(), "Destination"] = "TRAPPIST-1e"
# Drop rows with "Age" null values
train_df_2 = train_df_2.loc[train_df_2["Age"].notna(), :]
# Replace "VIP" null values with False, then Trues to 1 and Falses to 0
train_df_2.loc[train_df_2["VIP"].isnull(), "VIP"] = False
# Replace "RoomService" null values with 0
train_df_2.loc[train_df_2["RoomService"].isnull(), "RoomService"] = 0
# Replace "FoodCourt" null values with 0
train_df_2.loc[train_df_2["FoodCourt"].isnull(), "FoodCourt"] = 0
# Replace "ShoppingMall" null values with 0
train_df_2.loc[train_df_2["ShoppingMall"].isnull(), "ShoppingMall"] = 0
# Replace "Spa" null values with 0
train_df_2.loc[train_df_2["Spa"].isnull(), "Spa"] = 0
# Replace "VRDeck" null values with 0
train_df_2.loc[train_df_2["VRDeck"].isnull(), "VRDeck"] = 0


## Encoding, scaling...

In [46]:
# Drop PassengerId and Name columns
train_df_2.drop(["PassengerId", "Name"], axis=1, inplace= True)
# Replace "VIP" Trues with 1 and Falses with 0
train_df_2["VIP"] = train_df_2["VIP"].astype(int)
# Split "Cabin" column in 3 columns and drop the original one
train_df_2[['Cabin_1', 'Cabin_2', 'Cabin_3']] = train_df_2['Cabin'].str.split('/',expand=True)
train_df_2.drop(columns="Cabin", inplace=True)
# Replace "CryoSleep" Trues with 1 and Falses with 0
train_df_2["CryoSleep"] = train_df_2["CryoSleep"].astype(int)
# Replace "Transported" Trues with 1 and Falses with 0
train_df_2["Transported"] = train_df_2["Transported"].astype(int)

In [47]:
# Check unique values for HomePlanet
print(f"The unique values for 'HomePlanet' are {train_df_2.HomePlanet.unique()}") 
# Instantiate the OneHotEncoder
ohe_1 = OneHotEncoder(sparse = False) 
# Fit encoder
ohe_1.fit(train_df_2[['HomePlanet']])
# Display the detected categories
print(f"The categories detected by the OneHotEncoder are {ohe_1.categories_}")

The unique values for 'HomePlanet' are ['Europa' 'Earth' 'Mars']
The categories detected by the OneHotEncoder are [array(['Earth', 'Europa', 'Mars'], dtype=object)]


In [48]:
# Display the generated names
print(f"The column names for the encoded values are {ohe_1.get_feature_names_out()}") 
# Transform the current "HomePlanet" column
train_df_2[ohe_1.get_feature_names_out()] = ohe_1.transform(train_df_2[['HomePlanet']]) 
# Drop the column "HomePlanet" which has been encoded
train_df_2.drop(columns = ["HomePlanet"], inplace = True)
# Show the dataset
train_df_2.head(3)

The column names for the encoded values are ['HomePlanet_Earth' 'HomePlanet_Europa' 'HomePlanet_Mars']


Unnamed: 0,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin_1,Cabin_2,Cabin_3,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars
0,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,B,0,P,0.0,1.0,0.0
1,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,F,0,S,1.0,0.0,0.0
2,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S,0.0,1.0,0.0


In [49]:
# Check unique values for Destination
print(f"The unique values for 'Destination' are {train_df_2.Destination.unique()}") 
# Instantiate the OneHotEncoder
ohe_2 = OneHotEncoder(sparse = False) 
# Fit encoder
ohe_2.fit(train_df_2[['Destination']])
# Display the detected categories
print(f"The categories detected by the OneHotEncoder are {ohe_2.categories_}")
# Display the generated names
print(f"The column names for the encoded values are {ohe_2.get_feature_names_out()}") 
# Transform the current "Destination" column
train_df_2[ohe_2.get_feature_names_out()] = ohe_2.transform(train_df_2[['Destination']]) 
# Drop the column "Destination" which has been encoded
train_df_2.drop(columns = ["Destination"], inplace = True)
# Show the dataset
train_df_2.head(3)

The unique values for 'Destination' are ['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e']
The categories detected by the OneHotEncoder are [array(['55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e'], dtype=object)]
The column names for the encoded values are ['Destination_55 Cancri e' 'Destination_PSO J318.5-22'
 'Destination_TRAPPIST-1e']


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin_1,Cabin_2,Cabin_3,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,B,0,P,0.0,1.0,0.0,0.0,0.0,1.0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,F,0,S,1.0,0.0,0.0,0.0,0.0,1.0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S,0.0,1.0,0.0,0.0,0.0,1.0


In [50]:
# Check unique values for Cabin_1
print(f"The unique values for 'Cabin_1' are {train_df_2.Cabin_1.unique()}") 
# Instantiate the OneHotEncoder
ohe_3 = OneHotEncoder(sparse = False) 
# Fit encoder
ohe_3.fit(train_df_2[['Cabin_1']])
# Display the detected categories
print(f"The categories detected by the OneHotEncoder are {ohe_3.categories_}")
# Display the generated names
print(f"The column names for the encoded values are {ohe_3.get_feature_names_out()}") 
# Transform the current "Cabin_1" column
train_df_2[ohe_3.get_feature_names_out()] = ohe_3.transform(train_df_2[['Cabin_1']]) 
# Drop the column "Cabin_1" which has been encoded
train_df_2.drop(columns = ["Cabin_1"], inplace = True)
# Show the dataset
train_df_2.head(3)

The unique values for 'Cabin_1' are ['B' 'F' 'A' 'G' 'E' 'D' 'C' 'T']
The categories detected by the OneHotEncoder are [array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'], dtype=object)]
The column names for the encoded values are ['Cabin_1_A' 'Cabin_1_B' 'Cabin_1_C' 'Cabin_1_D' 'Cabin_1_E' 'Cabin_1_F'
 'Cabin_1_G' 'Cabin_1_T']


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin_2,...,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Cabin_1_A,Cabin_1_B,Cabin_1_C,Cabin_1_D,Cabin_1_E,Cabin_1_F,Cabin_1_G,Cabin_1_T
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# Check unique values for Cabin_3
print(f"The unique values for 'Cabin_3' are {train_df_2.Cabin_3.unique()}") 
# Instantiate the OneHotEncoder
ohe_4 = OneHotEncoder(sparse = False) 
# Fit encoder
ohe_4.fit(train_df_2[['Cabin_3']])
# Display the detected categories
print(f"The categories detected by the OneHotEncoder are {ohe_4.categories_}")
# Display the generated names
print(f"The column names for the encoded values are {ohe_4.get_feature_names_out()}") 
# Transform the current "Cabin_3" column
train_df_2[ohe_4.get_feature_names_out()] = ohe_4.transform(train_df_2[['Cabin_3']]) 
# Drop the column "Cabin_3" which has been encoded
train_df_2.drop(columns = ["Cabin_3"], inplace = True)
# Show the dataset
train_df_2.head(3)

The unique values for 'Cabin_3' are ['P' 'S']
The categories detected by the OneHotEncoder are [array(['P', 'S'], dtype=object)]
The column names for the encoded values are ['Cabin_3_P' 'Cabin_3_S']


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin_2,...,Cabin_1_A,Cabin_1_B,Cabin_1_C,Cabin_1_D,Cabin_1_E,Cabin_1_F,Cabin_1_G,Cabin_1_T,Cabin_3_P,Cabin_3_S
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [64]:
# Step 0 - Instanciate Standard Scaler
std_scaler = StandardScaler()
# Step 1- Fit the scaler to the `GrLiveArea`
# to "learn" the median value and the IQR
std_scaler.fit(train_df_2[['Age']]) 
# 2-Scale/Transform
# <-> apply the transformation (value - median) / IQR for every house
train_df_2['Age'] = std_scaler.transform(train_df_2[['Age']])
train_df_2.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin_2,...,Cabin_1_A,Cabin_1_B,Cabin_1_C,Cabin_1_D,Cabin_1_E,Cabin_1_F,Cabin_1_G,Cabin_1_T,Cabin_3_P,Cabin_3_S
0,0,0.70266,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,-0.332735,0,109.0,9.0,25.0,549.0,44.0,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0,2.014161,1,43.0,3576.0,0.0,6715.0,49.0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,0.288502,0,0.0,1283.0,371.0,3329.0,193.0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,-0.884946,0,303.0,70.0,151.0,565.0,2.0,1,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [65]:
# Step 1- Fit the scaler to the `GrLiveArea`
# to "learn" the median value and the IQR
std_scaler.fit(train_df_2[['RoomService']]) 
# 2-Scale/Transform
# <-> apply the transformation (value - median) / IQR for every house
train_df_2['RoomService'] = std_scaler.transform(train_df_2[['RoomService']])

# Step 1- Fit the scaler to the `GrLiveArea`
# to "learn" the median value and the IQR
std_scaler.fit(train_df_2[['FoodCourt']]) 
# 2-Scale/Transform
# <-> apply the transformation (value - median) / IQR for every house
train_df_2['FoodCourt'] = std_scaler.transform(train_df_2[['FoodCourt']])

# Step 1- Fit the scaler to the `GrLiveArea`
# to "learn" the median value and the IQR
std_scaler.fit(train_df_2[['ShoppingMall']]) 
# 2-Scale/Transform
# <-> apply the transformation (value - median) / IQR for every house
train_df_2['ShoppingMall'] = std_scaler.transform(train_df_2[['ShoppingMall']])

# Step 1- Fit the scaler to the `GrLiveArea`
# to "learn" the median value and the IQR
std_scaler.fit(train_df_2[['Spa']]) 
# 2-Scale/Transform
# <-> apply the transformation (value - median) / IQR for every house
train_df_2['Spa'] = std_scaler.transform(train_df_2[['Spa']])

# Step 1- Fit the scaler to the `GrLiveArea`
# to "learn" the median value and the IQR
std_scaler.fit(train_df_2[['VRDeck']]) 
# 2-Scale/Transform
# <-> apply the transformation (value - median) / IQR for every house
train_df_2['VRDeck'] = std_scaler.transform(train_df_2[['VRDeck']])

train_df_2.head()


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin_2,...,Cabin_1_A,Cabin_1_B,Cabin_1_C,Cabin_1_D,Cabin_1_E,Cabin_1_F,Cabin_1_G,Cabin_1_T,Cabin_3_P,Cabin_3_S
0,0,0.70266,0,-0.342489,-0.280146,-0.283584,-0.271149,-0.262874,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,-0.332735,0,-0.171575,-0.274532,-0.242343,0.226923,-0.224324,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0,2.014161,1,-0.275064,1.950414,-0.283584,5.82094,-0.219943,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,0.288502,0,-0.342489,0.520136,0.328429,2.749039,-0.093781,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,-0.884946,0,0.13262,-0.236483,-0.03449,0.241439,-0.261121,1,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [6]:
# Lines to make calculations about columns

# train_df_2["Cabin"].value_counts()
# train_df_2["VIP"].nunique()
# train_df_2[train_df_2["VRDeck"].isnull()]

In [67]:
train_df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8319 entries, 0 to 8692
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  8319 non-null   int64  
 1   Age                        8319 non-null   float64
 2   VIP                        8319 non-null   int64  
 3   RoomService                8319 non-null   float64
 4   FoodCourt                  8319 non-null   float64
 5   ShoppingMall               8319 non-null   float64
 6   Spa                        8319 non-null   float64
 7   VRDeck                     8319 non-null   float64
 8   Transported                8319 non-null   int64  
 9   Cabin_2                    8319 non-null   object 
 10  HomePlanet_Earth           8319 non-null   float64
 11  HomePlanet_Europa          8319 non-null   float64
 12  HomePlanet_Mars            8319 non-null   float64
 13  Destination_55 Cancri e    8319 non-null   float

In [68]:
X = train_df_2.drop(columns = "Transported")
y = train_df_2["Transported"]

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [76]:
RFC_model_1 = RandomForestClassifier()

In [77]:
cv_results = cross_validate(RFC_model_1, X, y, cv=5,
                            scoring=['max_error',
                                     'r2',
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error']
)
pd.DataFrame(cv_results) # Cross validation output

Unnamed: 0,fit_time,score_time,test_max_error,test_r2,test_neg_mean_absolute_error,test_neg_mean_squared_error
0,0.440941,0.019885,-1,0.038412,-0.240385,-0.240385
1,0.420311,0.021077,-1,0.0312,-0.242188,-0.242188
2,0.421843,0.021359,-1,0.209093,-0.197716,-0.197716
3,0.417628,0.021412,-1,0.266789,-0.183293,-0.183293
4,0.415766,0.019934,-1,0.170138,-0.207456,-0.207456


In [78]:
RFC_model_1.fit(X, y)

## Preprocess X_test

In [79]:
test_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [87]:
test_df[test_df["Age"].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
42,0100_01,Earth,False,G/13/P,TRAPPIST-1e,,,0.0,17.0,997.0,0.0,0.0,Dary Cochrisons
68,0156_01,Mars,,F/34/P,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,Tures Upead
146,0319_02,Mars,True,F/69/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Pigars Prie
157,0339_05,Earth,True,G/56/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Davisy Colleruces
211,0467_02,Mars,True,D/17/P,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,,Cings Keen
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4180,9065_01,Mars,True,F/1755/S,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,Jet Bart
4216,9147_01,Earth,True,G/1490/P,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,Gabrin Meyersones
4229,9177_02,Europa,True,C/306/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Zedarga Vablug
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
