## Packages

In [561]:
# Import packages

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

## Data

In [562]:
# Read both Train and Test csv files

original_test_df = pd.read_csv("test.csv")
original_train_df = pd.read_csv("train.csv")

In [563]:
original_test_df.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [564]:
original_train_df.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [565]:
# Lines to make calculations about columns

# train_df["Cabin"].value_counts()
# train_df["VIP"].nunique()
# train_df[train_df_2["VRDeck"].isnull()]

## Dealing with NaN

In [566]:
# Split Cabin column
def split_cabins (df):
    
    # Split "Cabin" column in 3 columns and drop the original one
    df[['Cabin_1', 'Cabin_2', 'Cabin_3']] = df['Cabin'].str.split('/',expand=True)
    df.drop(columns="Cabin", inplace=True)
    return df

In [567]:
# Replace nan values
def nan_values(df):
    
    # Replace "HomePlanet" null values with "Earth"
    df["HomePlanet"].fillna("Earth", inplace=True)
    # Replace "CryoSleep" null values with False
    df["CryoSleep"].fillna(False, inplace=True)
    # Replace "Destination" null values with "TRAPPIST-1e"
    df.loc[df["Destination"].isnull(), "Destination"] = "TRAPPIST-1e"

    # Replace rows with "Age" null values with mean
    imputer = SimpleImputer(strategy="mean") 
    imputer.fit(df[['Age']])
    df['Age'] = imputer.transform(df[['Age']])

    # Replace "VIP" null values with False, then Trues to 1 and Falses to 0
    df.loc[df["VIP"].isnull(), "VIP"] = False
    # Replace "RoomService" null values with 0
    df.loc[df["RoomService"].isnull(), "RoomService"] = 0
    # Replace "FoodCourt" null values with 0
    df.loc[df["FoodCourt"].isnull(), "FoodCourt"] = 0
    # Replace "ShoppingMall" null values with 0
    df.loc[df["ShoppingMall"].isnull(), "ShoppingMall"] = 0
    # Replace "Spa" null values with 0
    df.loc[df["Spa"].isnull(), "Spa"] = 0
    # Replace "VRDeck" null values with 0
    df.loc[df["VRDeck"].isnull(), "VRDeck"] = 0
    
    # Replace Cabin_1, 2, and 3 rows with null values with most frequent value
    # Instantiate a SimpleImputer
    imputer = SimpleImputer(strategy="most_frequent") 
    imputer.fit(df[['Cabin_1']])
    df['Cabin_1'] = imputer.transform(df[['Cabin_1']])

    imputer = SimpleImputer(strategy="most_frequent") 
    imputer.fit(df[['Cabin_2']])
    df['Cabin_2'] = imputer.transform(df[['Cabin_2']])
    df['Cabin_2'] = df['Cabin_2'].astype(int)

    imputer = SimpleImputer(strategy="most_frequent") 
    imputer.fit(df[['Cabin_3']])
    df['Cabin_3'] = imputer.transform(df[['Cabin_3']])
    
    return df


## Encoding, scaling...

In [568]:
def encoding(df, set):
    
    # Drop PassengerId and Name columns
    df.drop(["PassengerId", "Name"], axis=1, inplace= True)
    # Replace "VIP" Trues with 1 and Falses with 0
    df["VIP"] = df["VIP"].astype(int)

    # Replace "CryoSleep" Trues with 1 and Falses with 0
    df["CryoSleep"] = df["CryoSleep"].astype(int)
    
    if set == "train":
        # Replace "Transported" Trues with 1 and Falses with 0
        df["Transported"] = df["Transported"].astype(int)
    
    # Instantiate the OneHotEncoder
    ohe_1 = OneHotEncoder(sparse = False) 
    # Fit encoder
    ohe_1.fit(df[['HomePlanet']])
    # Transform the current "HomePlanet" column
    df[ohe_1.get_feature_names_out()] = ohe_1.transform(df[['HomePlanet']]) 
    # Drop the column "HomePlanet" which has been encoded
    df.drop(columns = ["HomePlanet"], inplace = True)
    
    # Instantiate the OneHotEncoder
    ohe_2 = OneHotEncoder(sparse = False) 
    # Fit encoder
    ohe_2.fit(df[['Destination']])
    # Transform the current "Destination" column
    df[ohe_2.get_feature_names_out()] = ohe_2.transform(df[['Destination']]) 
    # Drop the column "Destination" which has been encoded
    df.drop(columns = ["Destination"], inplace = True)
    
    # Encoding Cabin_1 with order
    # Instantiate the Ordinal Encoder
    ordinal_encoder = OrdinalEncoder(categories = [["A", "B", "C", "D", "E", "F", "G", "T"]]) # Fit it
    ordinal_encoder.fit(df[["Cabin_1"]]) 
    # Transforming categories into ordered numbers
    df["encoded_Cabin_1"] = ordinal_encoder.transform(df[["Cabin_1"]])
    # Showing the transformed classes
    df.drop(columns= "Cabin_1", inplace=True)
    
    # Instantiate the OneHotEncoder
    ohe_4 = OneHotEncoder(sparse = False) 
    # Fit encoder
    ohe_4.fit(df[['Cabin_3']])
    # Transform the current "Cabin_3" column
    df[ohe_4.get_feature_names_out()] = ohe_4.transform(df[['Cabin_3']]) 
    # Drop the column "Cabin_3" which has been encoded
    df.drop(columns = ["Cabin_3"], inplace = True)
    
    return df
    

In [569]:
# Encoding Cabin_1 without order

def encoding_no_order(df, set):
    
    # Drop PassengerId and Name columns
    df.drop(["PassengerId", "Name"], axis=1, inplace= True)
    # Replace "VIP" Trues with 1 and Falses with 0
    df["VIP"] = df["VIP"].astype(int)

    # Replace "CryoSleep" Trues with 1 and Falses with 0
    df["CryoSleep"] = df["CryoSleep"].astype(int)
    
    if set == "train":
        # Replace "Transported" Trues with 1 and Falses with 0
        df["Transported"] = df["Transported"].astype(int)
    
    # Instantiate the OneHotEncoder
    ohe_1 = OneHotEncoder(sparse = False) 
    # Fit encoder
    ohe_1.fit(df[['HomePlanet']])
    # Transform the current "HomePlanet" column
    df[ohe_1.get_feature_names_out()] = ohe_1.transform(df[['HomePlanet']]) 
    # Drop the column "HomePlanet" which has been encoded
    df.drop(columns = ["HomePlanet"], inplace = True)
    
    # Instantiate the OneHotEncoder
    ohe_2 = OneHotEncoder(sparse = False) 
    # Fit encoder
    ohe_2.fit(df[['Destination']])
    # Transform the current "Destination" column
    df[ohe_2.get_feature_names_out()] = ohe_2.transform(df[['Destination']]) 
    # Drop the column "Destination" which has been encoded
    df.drop(columns = ["Destination"], inplace = True)
    
    # Encoding Cabin_1 without order, with One Hot Encoder
    # Instantiate the OneHotEncoder
    ohe_3 = OneHotEncoder(sparse = False) 
    # Fit encoder
    ohe_3.fit(df[['Cabin_1']])
    # Transform the current "Cabin_1" column
    df[ohe_3.get_feature_names_out()] = ohe_3.transform(df[['Cabin_1']]) 
    # Drop the column "Cabin_1" which has been encoded
    df.drop(columns = ["Cabin_1"], inplace = True)
    
    # Instantiate the OneHotEncoder
    ohe_4 = OneHotEncoder(sparse = False) 
    # Fit encoder
    ohe_4.fit(df[['Cabin_3']])
    # Transform the current "Cabin_3" column
    df[ohe_4.get_feature_names_out()] = ohe_4.transform(df[['Cabin_3']]) 
    # Drop the column "Cabin_3" which has been encoded
    df.drop(columns = ["Cabin_3"], inplace = True)
    
    return df

In [570]:
def scaling(df):
    
    # Step 0 - Instanciate Standard Scaler
    std_scaler = StandardScaler()
    # Step 1- Fit the scaler
    std_scaler.fit(df[['Age']]) 
    # 2-Scale/Transform
    # <-> apply the transformation (value - median) / IQR for every house
    df['Age'] = std_scaler.transform(df[['Age']])
    
    # Step 1- Fit the scaler
    std_scaler.fit(df[['RoomService']]) 
    # 2-Scale/Transform
    df['RoomService'] = std_scaler.transform(df[['RoomService']])

    # Step 1- Fit the scaler
    std_scaler.fit(df[['FoodCourt']]) 
    # 2-Scale/Transform
    df['FoodCourt'] = std_scaler.transform(df[['FoodCourt']])

    # Step 1- Fit the scaler
    std_scaler.fit(df[['ShoppingMall']]) 
    # 2-Scale/Transform
    df['ShoppingMall'] = std_scaler.transform(df[['ShoppingMall']])

    # Step 1- Fit the scaler
    std_scaler.fit(df[['Spa']]) 
    # 2-Scale/Transform
    df['Spa'] = std_scaler.transform(df[['Spa']])

    # Step 1- Fit the scaler
    std_scaler.fit(df[['VRDeck']]) 
    # 2-Scale/Transform
    df['VRDeck'] = std_scaler.transform(df[['VRDeck']])

    return df

## Prprocessing train set with functions

In [571]:
def full_preprocessing(df, set):
    df = split_cabins(df)
    df = nan_values(df)
    if set == "train":
        df = encoding(df, "train")
    else:
        df = encoding(df, "test")
    df = scaling(df)
    return df

In [572]:
train_df = original_train_df.copy()
train_df = full_preprocessing(train_df, "train")

In [573]:
train_df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin_2,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,encoded_Cabin_1,Cabin_3_P,Cabin_3_S
0,0,0.709437,0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0,0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1,0,-0.336717,0,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,1,0,1.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,1.0
2,0,2.034566,1,-0.268001,1.959998,-0.283579,5.695623,-0.219796,0,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0,0.290975,0,-0.333105,0.523010,0.336851,2.687176,-0.092818,0,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0,-0.894666,0,0.125652,-0.237159,-0.031059,0.231374,-0.261240,1,1,1.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,0.848924,1,-0.333105,3.992336,-0.283579,1.189173,-0.197751,0,98,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8689,1,-0.755179,0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0,1499,1.0,0.0,0.0,0.0,1.0,0.0,6.0,0.0,1.0
8690,0,-0.197230,0,-0.333105,-0.281027,2.846999,-0.269737,-0.263003,1,1500,1.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,1.0
8691,0,0.221232,0,-0.333105,0.376365,-0.283579,0.043013,2.589576,0,608,0.0,1.0,0.0,1.0,0.0,0.0,4.0,0.0,1.0


## Cross validation

In [574]:
def cross_validation(X, y, model):
    cv_results = cross_validate(model, X, y, cv=5,
                                scoring=['r2']
    )
    print(pd.DataFrame(cv_results)) # Cross validation output

In [575]:
X = train_df.drop(columns = "Transported")
y = train_df["Transported"]
RFC_model_1 = RandomForestClassifier()
cross_validation(X, y, RFC_model_1)

   fit_time  score_time   test_r2
0  0.436693    0.019511  0.054576
1  0.433826    0.020224  0.020072
2  0.462069    0.021962  0.222498
3  0.465619    0.021439  0.284201
4  0.434407    0.019025  0.127691


## Fit the model with the full train test

In [576]:
RFC_model_1.fit(X, y)

## Preprocess prediction

In [577]:
test_df = original_test_df.copy()
test_df = full_preprocessing(test_df, "test")

## Prediction

In [578]:
prediction = RFC_model_1.predict(test_df)

## Create the submission file

In [579]:
def create_csv(prediction, pred_number):
    prediction_df = pd.DataFrame(columns=("PassengerId", "Transported"))
    prediction_df["PassengerId"] = original_test_df["PassengerId"]
    prediction_df["Transported"] = prediction.astype(bool)
    csv_name = f'submission_2023-04-17_{pred_number}.csv'
    prediction_df.to_csv(csv_name, index=False)

In [580]:
create_csv(prediction, 1)

## 2. Another method dropping the Cabin number column

In [585]:
train_df_2 = train_df.copy()
train_df_2 = train_df_2.drop(columns="Cabin_2")
X_2 = train_df_2.drop(columns = "Transported")
y_2 = train_df_2["Transported"]
RFC_model_2 = RandomForestClassifier()
cv_results_2 = cross_validate(RFC_model_2, X_2, y_2, cv=5,
                            scoring=['r2']
)
pd.DataFrame(cv_results_2)

Unnamed: 0,fit_time,score_time,test_r2
0,0.399654,0.022221,0.08678
1,0.391022,0.022359,0.107483
2,0.390657,0.022209,0.190294
3,0.391697,0.023754,0.173724
4,0.389693,0.021656,0.136898


In [582]:
RFC_model_2.fit(X_2, y_2)

In [583]:
X_pred_2 = test_df.copy()
X_pred_2 = X_pred_2.drop(columns="Cabin_2")
prediction_2 = RFC_model_2.predict(X_pred_2)
create_csv(prediction_2, 2)

## --> This second trial is not valid, it gives worse scoring.

## 3. Another trial with One Hot Encoder for Cabin_1

In [560]:
# Preprocess train
train_df_3 = original_train_df.copy()
train_df_3 = full_preprocessing(train_df_3, "train")

# Cross validation
X_3 = train_df_3.drop(columns = "Transported")
y_3 = train_df_3["Transported"]
RFC_model_3 = RandomForestClassifier()
cross_validation(X_3, y_3, RFC_model_3)

   fit_time  score_time   test_r2
0  0.440749    0.019619  0.049976
1  0.432910    0.020627  0.029273
2  0.447297    0.020257  0.220198
3  0.436437    0.020451  0.284201
4  0.433925    0.019513  0.139199


In [584]:
# Fit model
RFC_model_3.fit(X_3, y_3)

# Preprocess prediction
test_df_3 = original_test_df.copy()
test_df_3 = full_preprocessing(test_df_3, "test")

# Prediction
prediction_3 = RFC_model_3.predict(test_df_3)

# Submission file
create_csv(prediction_3, 3)

## --> Still worse scoring

## 4. Optimizing parameters

In [586]:
# Preprocess train
train_df_4 = original_train_df.copy()
train_df_4 = full_preprocessing(train_df_4, "train")

# X and y
X_4 = train_df_4.drop(columns = "Transported")
y_4 = train_df_4["Transported"]

In [525]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

In [587]:
# Train/Test split
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X_4, y_4, test_size=0.20, random_state=1) 
# Instanciate model
RFC_model_4 = RandomForestClassifier()
# Hyperparameter Grid
grid_4 = {'n_estimators': [150, 200, 500],
        'max_depth': [10, 15],
        'min_samples_split': [10, 15, 20],
        'min_samples_leaf': [1, 2, 5, 10, 15],
        'criterion': ['gini']}
# Instanciate Grid Search
search_4 = RandomizedSearchCV(RFC_model_4, grid_4,
                           scoring = 'r2',
                           cv = 5,
                           n_iter = 100,
                           n_jobs=-1 # paralellize computation
)
# Fit data to Grid Search
search_4.fit(X_train_4,y_train_4)
# Best score
print(search_4.best_score_)
# Best Params
print(search_4.best_params_)
# Best estimator
print(search_4.best_estimator_)



0.22174072090261915
{'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 10, 'criterion': 'gini'}
RandomForestClassifier(max_depth=10, min_samples_split=10, n_estimators=500)


In [588]:
# Cross validation
RFC_model_4 = RandomForestClassifier(n_estimators=500, min_samples_leaf=1, min_samples_split=10, max_depth=10, criterion='gini')
cv_results_4 = cross_validate(RFC_model_4, X_4, y_4, cv=5,
                            scoring=['r2']
)
pd.DataFrame(cv_results_4)

Unnamed: 0,fit_time,score_time,test_r2
0,1.696475,0.071263,0.049976
1,1.674112,0.073566,0.102883
2,1.644755,0.073461,0.194895
3,1.666711,0.074548,0.300312
4,1.647346,0.073049,0.215152


In [589]:
# Fit model
RFC_model_4.fit(X_4, y_4)

# Preprocess prediction
test_df_4 = original_test_df.copy()
test_df_4 = full_preprocessing(test_df_4, "test")

# Prediction
prediction_4 = RFC_model_4.predict(test_df_4)

# Submission file
create_csv(prediction_4, 4)

## With XGBoost