## Packages

In [335]:
# Import packages

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

## Data

In [336]:
# Read both Train and Test csv files

test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")

In [337]:
test_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [338]:
train_df.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [339]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [340]:
# Lines to make calculations about columns

# train_df["Cabin"].value_counts()
# train_df["VIP"].nunique()
# train_df[train_df_2["VRDeck"].isnull()]

## Dealing with NaN

In [341]:
# Split Cabin column
def split_cabins (df):
    
    # Split "Cabin" column in 3 columns and drop the original one
    df[['Cabin_1', 'Cabin_2', 'Cabin_3']] = df['Cabin'].str.split('/',expand=True)
    df.drop(columns="Cabin", inplace=True)
    return df

In [342]:
# Replace nan values
def nan_values(df):
    
    # Replace "HomePlanet" null values with "Earth"
    df["HomePlanet"].fillna("Earth", inplace=True)
    # Replace "CryoSleep" null values with False
    df["CryoSleep"].fillna(False, inplace=True)
    # Replace "Destination" null values with "TRAPPIST-1e"
    df.loc[df["Destination"].isnull(), "Destination"] = "TRAPPIST-1e"

    # Replace rows with "Age" null values with mean
    imputer = SimpleImputer(strategy="mean") 
    imputer.fit(df[['Age']])
    df['Age'] = imputer.transform(df[['Age']])

    # Replace "VIP" null values with False, then Trues to 1 and Falses to 0
    df.loc[df["VIP"].isnull(), "VIP"] = False
    # Replace "RoomService" null values with 0
    df.loc[df["RoomService"].isnull(), "RoomService"] = 0
    # Replace "FoodCourt" null values with 0
    df.loc[df["FoodCourt"].isnull(), "FoodCourt"] = 0
    # Replace "ShoppingMall" null values with 0
    df.loc[df["ShoppingMall"].isnull(), "ShoppingMall"] = 0
    # Replace "Spa" null values with 0
    df.loc[df["Spa"].isnull(), "Spa"] = 0
    # Replace "VRDeck" null values with 0
    df.loc[df["VRDeck"].isnull(), "VRDeck"] = 0
    
    # Replace Cabin_1, 2, and 3 rows with null values with most frequent value
    # Instantiate a SimpleImputer
    imputer = SimpleImputer(strategy="most_frequent") 
    imputer.fit(df[['Cabin_1']])
    df['Cabin_1'] = imputer.transform(df[['Cabin_1']])

    imputer = SimpleImputer(strategy="most_frequent") 
    imputer.fit(df[['Cabin_2']])
    df['Cabin_2'] = imputer.transform(df[['Cabin_2']])
    df['Cabin_2'] = df['Cabin_2'].astype(int)

    imputer = SimpleImputer(strategy="most_frequent") 
    imputer.fit(df[['Cabin_3']])
    df['Cabin_3'] = imputer.transform(df[['Cabin_3']])
    
    return df


In [343]:
train_df_2 = split_cabins(train_df)
train_df_2 = nan_values(train_df_2)

In [344]:
train_df_2

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Cabin_1,Cabin_2,Cabin_3
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,A,98,P
8689,9278_01,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,G,1499,S
8690,9279_01,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,G,1500,S
8691,9280_01,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,E,608,S


## Encoding, scaling...

In [345]:
def encoding(df, set):
    
    # Drop PassengerId and Name columns
    df.drop(["PassengerId", "Name"], axis=1, inplace= True)
    # Replace "VIP" Trues with 1 and Falses with 0
    df["VIP"] = df["VIP"].astype(int)

    # Replace "CryoSleep" Trues with 1 and Falses with 0
    df["CryoSleep"] = df["CryoSleep"].astype(int)
    
    if set == "train":
        # Replace "Transported" Trues with 1 and Falses with 0
        df["Transported"] = df["Transported"].astype(int)
    
    # Instantiate the OneHotEncoder
    ohe_1 = OneHotEncoder(sparse = False) 
    # Fit encoder
    ohe_1.fit(df[['HomePlanet']])
    # Transform the current "HomePlanet" column
    df[ohe_1.get_feature_names_out()] = ohe_1.transform(df[['HomePlanet']]) 
    # Drop the column "HomePlanet" which has been encoded
    df.drop(columns = ["HomePlanet"], inplace = True)
    
    # Instantiate the OneHotEncoder
    ohe_2 = OneHotEncoder(sparse = False) 
    # Fit encoder
    ohe_2.fit(df[['Destination']])
    # Transform the current "Destination" column
    df[ohe_2.get_feature_names_out()] = ohe_2.transform(df[['Destination']]) 
    # Drop the column "Destination" which has been encoded
    df.drop(columns = ["Destination"], inplace = True)
    
    # Encoding Cabin_1 with order
    # Instantiate the Ordinal Encoder
    ordinal_encoder = OrdinalEncoder(categories = [["A", "B", "C", "D", "E", "F", "G", "T"]]) # Fit it
    ordinal_encoder.fit(df[["Cabin_1"]]) 
    # Transforming categories into ordered numbers
    df["encoded_Cabin_1"] = ordinal_encoder.transform(df[["Cabin_1"]])
    # Showing the transformed classes
    df.drop(columns= "Cabin_1", inplace=True)
    
    # Instantiate the OneHotEncoder
    ohe_4 = OneHotEncoder(sparse = False) 
    # Fit encoder
    ohe_4.fit(df[['Cabin_3']])
    # Transform the current "Cabin_3" column
    df[ohe_4.get_feature_names_out()] = ohe_4.transform(df[['Cabin_3']]) 
    # Drop the column "Cabin_3" which has been encoded
    df.drop(columns = ["Cabin_3"], inplace = True)
    
    return df
    

In [346]:
# Encoding Cabin_1 without order
'''

# Check unique values for Cabin_1
print(f"The unique values for 'Cabin_1' are {train_df_2.Cabin_1.unique()}") 
# Instantiate the OneHotEncoder
ohe_3 = OneHotEncoder(sparse = False) 
# Fit encoder
ohe_3.fit(train_df_2[['Cabin_1']])
# Display the detected categories
print(f"The categories detected by the OneHotEncoder are {ohe_3.categories_}")
# Display the generated names
print(f"The column names for the encoded values are {ohe_3.get_feature_names_out()}") 
# Transform the current "Cabin_1" column
train_df_2[ohe_3.get_feature_names_out()] = ohe_3.transform(train_df_2[['Cabin_1']]) 
# Drop the column "Cabin_1" which has been encoded
train_df_2.drop(columns = ["Cabin_1"], inplace = True)
# Show the dataset
train_df_2.head(3)

'''

'\n\n# Check unique values for Cabin_1\nprint(f"The unique values for \'Cabin_1\' are {train_df_2.Cabin_1.unique()}") \n# Instantiate the OneHotEncoder\nohe_3 = OneHotEncoder(sparse = False) \n# Fit encoder\nohe_3.fit(train_df_2[[\'Cabin_1\']])\n# Display the detected categories\nprint(f"The categories detected by the OneHotEncoder are {ohe_3.categories_}")\n# Display the generated names\nprint(f"The column names for the encoded values are {ohe_3.get_feature_names_out()}") \n# Transform the current "Cabin_1" column\ntrain_df_2[ohe_3.get_feature_names_out()] = ohe_3.transform(train_df_2[[\'Cabin_1\']]) \n# Drop the column "Cabin_1" which has been encoded\ntrain_df_2.drop(columns = ["Cabin_1"], inplace = True)\n# Show the dataset\ntrain_df_2.head(3)\n\n'

In [347]:
train_df_2 = encoding(train_df_2, "train")

In [348]:
def scaling(df):
    
    # Step 0 - Instanciate Standard Scaler
    std_scaler = StandardScaler()
    # Step 1- Fit the scaler
    std_scaler.fit(df[['Age']]) 
    # 2-Scale/Transform
    # <-> apply the transformation (value - median) / IQR for every house
    df['Age'] = std_scaler.transform(df[['Age']])
    
    # Step 1- Fit the scaler
    std_scaler.fit(df[['RoomService']]) 
    # 2-Scale/Transform
    df['RoomService'] = std_scaler.transform(df[['RoomService']])

    # Step 1- Fit the scaler
    std_scaler.fit(df[['FoodCourt']]) 
    # 2-Scale/Transform
    df['FoodCourt'] = std_scaler.transform(df[['FoodCourt']])

    # Step 1- Fit the scaler
    std_scaler.fit(df[['ShoppingMall']]) 
    # 2-Scale/Transform
    df['ShoppingMall'] = std_scaler.transform(df[['ShoppingMall']])

    # Step 1- Fit the scaler
    std_scaler.fit(df[['Spa']]) 
    # 2-Scale/Transform
    df['Spa'] = std_scaler.transform(df[['Spa']])

    # Step 1- Fit the scaler
    std_scaler.fit(df[['VRDeck']]) 
    # 2-Scale/Transform
    df['VRDeck'] = std_scaler.transform(df[['VRDeck']])

    return df

In [349]:
train_df_2 = scaling(train_df_2)


In [350]:
train_df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  8693 non-null   int64  
 1   Age                        8693 non-null   float64
 2   VIP                        8693 non-null   int64  
 3   RoomService                8693 non-null   float64
 4   FoodCourt                  8693 non-null   float64
 5   ShoppingMall               8693 non-null   float64
 6   Spa                        8693 non-null   float64
 7   VRDeck                     8693 non-null   float64
 8   Transported                8693 non-null   int64  
 9   Cabin_2                    8693 non-null   int64  
 10  HomePlanet_Earth           8693 non-null   float64
 11  HomePlanet_Europa          8693 non-null   float64
 12  HomePlanet_Mars            8693 non-null   float64
 13  Destination_55 Cancri e    8693 non-null   float

## Cross validation

In [351]:
X = train_df_2.drop(columns = "Transported")
y = train_df_2["Transported"]

In [352]:
RFC_model_1 = RandomForestClassifier()

In [353]:
cv_results = cross_validate(RFC_model_1, X, y, cv=5,
                            scoring=['max_error',
                                     'r2',
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error']
)
pd.DataFrame(cv_results) # Cross validation output

Unnamed: 0,fit_time,score_time,test_max_error,test_r2,test_neg_mean_absolute_error,test_neg_mean_squared_error
0,0.456655,0.019771,-1,0.047675,-0.238068,-0.238068
1,0.434721,0.020924,-1,0.045375,-0.238643,-0.238643
2,0.439081,0.021204,-1,0.234,-0.191489,-0.191489
3,0.442192,0.021129,-1,0.302613,-0.174338,-0.174338
4,0.423913,0.019513,-1,0.157612,-0.210587,-0.210587


## Fit the model with the full train test

In [354]:
RFC_model_1.fit(X, y)

## Preprocess prediction

In [355]:
test_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [356]:
test_df_2 = test_df.copy()
test_df_2 = split_cabins(test_df_2)
test_df_2 = nan_values(test_df_2)
test_df_2 = encoding(test_df_2, "test")
test_df_2 = scaling(test_df_2)

In [357]:
test_df_2

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_2,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,encoded_Cabin_1,Cabin_3_P,Cabin_3_S
0,1,-1.182216e-01,0,-0.357339,-0.283840,-0.312173,-0.267841,-0.246712,3,1.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,1.0
1,0,-6.886014e-01,0,-0.357339,-0.277879,-0.312173,2.287504,-0.246712,4,1.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,1.0
2,1,1.669682e-01,0,-0.357339,-0.283840,-0.312173,-0.267841,-0.246712,0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0
3,0,6.660505e-01,0,-0.357339,4.121518,-0.312173,-0.104002,0.226648,1,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0
4,0,-6.173039e-01,0,-0.340723,-0.283840,0.832122,-0.267841,-0.246712,5,1.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1,3.808606e-01,0,-0.357339,-0.283840,-0.312173,-0.267841,-0.246712,1496,1.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,1.0
4273,0,9.512404e-01,0,-0.357339,0.277095,-0.281538,-0.258790,-0.130193,4,1.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,1.0
4274,1,-2.532995e-16,0,-0.357339,-0.283840,-0.312173,-0.267841,-0.246712,296,0.0,0.0,1.0,1.0,0.0,0.0,3.0,1.0,0.0
4275,0,-2.532995e-16,0,-0.357339,1.491019,-0.312173,-0.267841,0.176479,297,0.0,1.0,0.0,0.0,0.0,1.0,3.0,1.0,0.0


## Prediction

In [358]:
X_pred = test_df_2
prediction = RFC_model_1.predict(X_pred)

In [359]:
prediction

array([1, 0, 1, ..., 1, 1, 0])

## Create the submission file

In [330]:
prediction_df = pd.DataFrame(columns=("PassengerId", "Transported"))

In [331]:
prediction_df["PassengerId"] = test_df["PassengerId"]
prediction_df["Transported"] = prediction.astype(bool)
prediction_df.to_csv("submission.csv", index=False)

## Another method dropping the Cabin number column

In [360]:
train_df_3 = train_df_2.drop(columns="Cabin_2")
X_2 = train_df_3.drop(columns = "Transported")
y_2 = train_df_3["Transported"]
RFC_model_2 = RandomForestClassifier()
cv_results_2 = cross_validate(RFC_model_2, X_2, y_2, cv=5,
                            scoring=['max_error',
                                     'r2',
                                     'neg_mean_absolute_error',
                                     'neg_mean_squared_error']
)
pd.DataFrame(cv_results_2)

Unnamed: 0,fit_time,score_time,test_max_error,test_r2,test_neg_mean_absolute_error,test_neg_mean_squared_error
0,0.408148,0.021935,-1,0.109783,-0.222542,-0.222542
1,0.382625,0.021914,-1,0.109783,-0.222542,-0.222542
2,0.390575,0.021583,-1,0.194895,-0.201265,-0.201265
3,0.389147,0.021966,-1,0.16912,-0.20771,-0.20771
4,0.386591,0.021519,-1,0.102374,-0.224396,-0.224396


In [361]:
RFC_model_2.fit(X_2, y_2)

In [364]:
X_pred_2 = test_df_2.drop(columns="Cabin_2")
prediction_2 = RFC_model_2.predict(X_pred_2)
prediction_df_2 = pd.DataFrame(columns=("PassengerId", "Transported"))
prediction_df_2["PassengerId"] = test_df["PassengerId"]
prediction_df_2["Transported"] = prediction_2.astype(bool)
prediction_df_2.to_csv("submission.csv", index=False)

## --> This second trial is not valid, it gives worse scoring.

## Another trial without scaling