# Imports

In [1]:
import string
import sys
from collections import deque

import pandas as pd
import numpy as np
# import altair as alt
# alt.renderers.enable('mimetype')
# alt.data_transformers.enable('data_server')

from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import warnings

from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor

from sklearn.utils import shuffle

from sklearn.naive_bayes import MultinomialNB

# import kaggle

import matplotlib.pyplot as plt

from pycaret.classification import *

warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

# Basic Data Analysis

In [3]:
train_df = pd.read_csv("data/train.csv")
train_df.set_index("PassengerId", inplace=True)
train_df = shuffle(train_df)

# Convert the values of numeric column into absolute values
train_df.loc[
    :, ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
] = train_df.loc[
    :, ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
].abs()
train_df

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6751_01,Mars,True,F/1290/S,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,Trisp Berle,True
0141_01,Mars,False,F/30/P,TRAPPIST-1e,31.0,False,,0.0,97.0,0.0,0.0,Pyrohs Harte,False
3409_01,Earth,False,F/701/P,TRAPPIST-1e,19.0,False,90.0,0.0,0.0,74.0,595.0,Felice Wheelez,False
4706_02,Earth,True,G/762/P,TRAPPIST-1e,40.0,False,0.0,0.0,0.0,0.0,0.0,Verney Medington,True
5148_02,Earth,True,G/831/P,TRAPPIST-1e,16.0,False,0.0,0.0,0.0,0.0,0.0,Jeanny Moodsey,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0092_02,Earth,True,G/9/P,TRAPPIST-1e,0.0,False,0.0,0.0,,0.0,0.0,Stald Hewson,True
0743_01,Earth,True,G/111/S,TRAPPIST-1e,14.0,False,0.0,0.0,0.0,0.0,0.0,Loree Brighttt,True
5812_01,Earth,True,G/943/S,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,0.0,0.0,Holey Domington,True
5960_03,Europa,True,C/222/S,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,Merabik Dister,True


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 6751_01 to 5360_01
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Name          8493 non-null   object 
 12  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(6)
memory usage: 891.4+ KB


In [5]:
train_df.isna().sum()/8693   # percentage of null

HomePlanet      0.023122
CryoSleep       0.024963
Cabin           0.022892
Destination     0.020936
Age             0.020591
VIP             0.023352
RoomService     0.020821
FoodCourt       0.021051
ShoppingMall    0.023927
Spa             0.021051
VRDeck          0.021627
Name            0.023007
Transported     0.000000
dtype: float64

# Data Wrangling & EDA

#### 1. Create new columns for missing data and perform EDA on missing items

In [6]:
train_df["HomePlanet_missing"] = train_df["HomePlanet"].isna()
train_df["CryoSleep_missing"] = train_df["CryoSleep"].isna()
train_df["Cabin_missing"] = train_df["Cabin"].isna()
train_df["Destination_missing"] = train_df["Destination"].isna()
train_df["Age_missing"] = train_df["Age"].isna()
train_df["VIP_missing"] = train_df["VIP"].isna()
train_df["RoomService_missing"] = train_df["RoomService"].isna()
train_df["FoodCourt_missing"] = train_df["FoodCourt"].isna()
train_df["ShoppingMall_missing"] = train_df["ShoppingMall"].isna()
train_df["Spa_missing"] = train_df["Spa"].isna()
train_df["VRDeck_missing"] = train_df["VRDeck"].isna()
train_df["Name_missing"] = train_df["Name"].isna()

In [7]:
# alt.Chart(train_df).mark_bar(clip=True).encode(
#     alt.X(alt.repeat(), type='quantitative', bin=alt.Bin(maxbins=2)),
#     alt.Y('count()', scale=alt.Scale(domain=(0, 250))),
#     fill='Transported'
# ).properties(
#     width=200,
#     height=150
# ).repeat(
#     ["HomePlanet_missing", "CryoSleep_missing", "Cabin_missing",
#      "Destination_missing", "Age_missing", "VIP_missing", "RoomService_missing",
#      "FoodCourt_missing", "ShoppingMall_missing",
#      "Spa_missing", "VRDeck_missing", "Name_missing"],
#     columns=3
# )

#### 2. Imputation

In [8]:
cat_imputer = SimpleImputer(strategy="most_frequent")
num_imputer = SimpleImputer(strategy="median")

In [9]:
cat_columns = train_df.columns.tolist()
cat_columns.remove('Transported')
cat_columns.remove('Age')
cat_columns.remove('Name')
cat_columns

['HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'HomePlanet_missing',
 'CryoSleep_missing',
 'Cabin_missing',
 'Destination_missing',
 'Age_missing',
 'VIP_missing',
 'RoomService_missing',
 'FoodCourt_missing',
 'ShoppingMall_missing',
 'Spa_missing',
 'VRDeck_missing',
 'Name_missing']

In [10]:
train_df.loc[:, ["Age"]] = num_imputer.fit_transform(train_df.loc[:, ["Age"]])
train_df.loc[:, cat_columns] = cat_imputer.fit_transform(train_df.loc[:, cat_columns])
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 6751_01 to 5360_01
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   HomePlanet            8693 non-null   object 
 1   CryoSleep             8693 non-null   bool   
 2   Cabin                 8693 non-null   object 
 3   Destination           8693 non-null   object 
 4   Age                   8693 non-null   float64
 5   VIP                   8693 non-null   bool   
 6   RoomService           8693 non-null   float64
 7   FoodCourt             8693 non-null   float64
 8   ShoppingMall          8693 non-null   float64
 9   Spa                   8693 non-null   float64
 10  VRDeck                8693 non-null   float64
 11  Name                  8493 non-null   object 
 12  Transported           8693 non-null   bool   
 13  HomePlanet_missing    8693 non-null   bool   
 14  CryoSleep_missing     8693 non-null   bool   
 15  Cabin_missing    

#### 3. Categorical Variables

In [11]:
# # Perform EDA

# plot_homeplanet = alt.Chart(train_df).mark_bar().encode(
#     y="HomePlanet",
#     x='count()',
#     fill='Transported'
# )
# plot_cryosleep = alt.Chart(train_df).mark_bar().encode(
#     y="CryoSleep",
#     x='count()',
#     fill='Transported'
# )
# plot_destination = alt.Chart(train_df).mark_bar().encode(
#     y="Destination",
#     x='count()',
#     fill='Transported'
# )
# plot_vip = alt.Chart(train_df).mark_bar().encode(
#     y="VIP",
#     x='count()',
#     fill='Transported'
# )

# plot_homeplanet & plot_cryosleep & plot_destination & plot_vip

#### 4. Wrangling on `Cabin`

In [12]:
# Divide Cabin in to sub 3 section

train_df[['Cabin_dec', 'Cabin_num', 'Cabin_side']] = train_df["Cabin"].str.split("/", expand=True)

In [13]:
# # Perform EDA

# plot_cabin_dec = alt.Chart(train_df).mark_bar().encode(
#     y="Cabin_dec",
#     x='count()',
#     fill='Transported'
# )
# plot_cabin_side = alt.Chart(train_df).mark_bar().encode(
#     y="Cabin_side",
#     x='count()',
#     fill='Transported'
# )

# plot_cabin_dec & plot_cabin_side

#### 5. Wrangling on `Age`

In [14]:
# age_histogram = alt.Chart(train_df).mark_bar().encode(
#     alt.X('Age', bin=alt.Bin(maxbins=40)), 
#     y='count()',
#     fill='Transported')
# age_histogram

In [15]:
# Create Age groups
train_df['Age_bin'] = pd.cut(
    train_df['Age'], 
    bins=[-0.1, 12, 18, 30, 50, 66, 90], 
    labels=['Children', 'Teenage', '20s', '30-40s', '50-60s', 'Elder']
)

In [16]:
# # Perform EDA

# plot_age_bin = alt.Chart(train_df).mark_bar().encode(
#     y="Age_bin",
#     x='count()',
#     fill='Transported'
# )
# plot_age_bin

#### 6. Wrangling on `RoomService`

In [17]:
# roomservice_histogram = alt.Chart(train_df).mark_bar().encode(
#     alt.X('RoomService', bin=alt.Bin(maxbins=200)), 
#     y='count()',
#     fill='Transported')
# roomservice_histogram2 = alt.Chart(train_df).mark_bar(clip=True).encode(
#     alt.X('RoomService', bin=alt.Bin(maxbins=200), scale=alt.Scale(domain=(0, 3000))), 
#     alt.Y('count()', scale=alt.Scale(domain=(0, 500))),
#     fill='Transported')

# roomservice_histogram | roomservice_histogram2

In [18]:
# Create RoomService groups

train_df['RoomService_bin'] = pd.cut(
    train_df['RoomService'], 
    bins=[-0.1, 0.001, 100, 200, 400, 900, 2400, 15000], 
    labels=['RS$0', 'RS$0-100', 'RS$100-200', 'RS$200-400', 'RS$400-900', 'RS$900-2400', 'RS$2400-15000']
)

In [19]:
# # Perform EDA

# plot_roomservice_bin = alt.Chart(train_df).mark_bar().encode(
#     y="RoomService_bin",
#     x='count()',
#     fill='Transported'
# )
# plot_roomservice_bin

#### 7. Wrangling on `FoodCourt`

In [20]:
# foodcourt_histogram = alt.Chart(train_df).mark_bar().encode(
#     alt.X('FoodCourt', bin=alt.Bin(maxbins=400)), 
#     y='count()',
#     fill='Transported')
# foodcourt_histogram2 = alt.Chart(train_df).mark_bar(clip=True).encode(
#     alt.X('FoodCourt', bin=alt.Bin(maxbins=400), scale=alt.Scale(domain=(0, 3000))), 
#     alt.Y('count()', scale=alt.Scale(domain=(0, 500))),
#     fill='Transported')
# foodcourt_histogram | foodcourt_histogram2

In [21]:
# Create FoodCourt groups

train_df['FoodCourt_bin'] = pd.cut(
    train_df['FoodCourt'], 
    bins=[-0.1, 0.001, 100, 500, 900, 2000, 30000], 
    labels=['FC$0', 'FC$0-100', 'FC$100-500', 'FC$500-900', 'FC$900-2000', 'FC$2000-30000']
)

In [22]:
# # Perform EDA

# plot_roomservice_bin = alt.Chart(train_df).mark_bar().encode(
#     y="FoodCourt_bin",
#     x='count()',
#     fill='Transported'
# )
# plot_roomservice_bin

#### 8. Wrangling on `ShoppingMall`

In [23]:
# shoppingmall_histogram = alt.Chart(train_df).mark_bar().encode(
#     alt.X('ShoppingMall', bin=alt.Bin(maxbins=400)), 
#     y='count()',
#     fill='Transported')
# shoppingmall_histogram2 = alt.Chart(train_df).mark_bar(clip=True).encode(
#     alt.X('ShoppingMall', bin=alt.Bin(maxbins=400), scale=alt.Scale(domain=(0, 3000))), 
#     alt.Y('count()', scale=alt.Scale(domain=(0, 500))),
#     fill='Transported')
# shoppingmall_histogram | shoppingmall_histogram2

In [24]:
# Create ShoppingMall groups

train_df['ShoppingMall_bin'] = pd.cut(
    train_df['ShoppingMall'], 
    bins=[-0.1, 0.001, 100, 200, 600, 1200, 2000, 30000], 
    labels=['SM$0', 'SM$0-100', 'SM$100-200', 'SM$200-600', 'SM$600-1200', 'SM$1200-2000', 'SM$2000-30000']
)

In [25]:
# # Perform EDA

# plot_shoppingmall_bin = alt.Chart(train_df).mark_bar().encode(
#     y="ShoppingMall_bin",
#     x='count()',
#     fill='Transported'
# )
# plot_shoppingmall_bin

#### 9. Wrangling on `Spa`

In [26]:
# spa_histogram = alt.Chart(train_df).mark_bar().encode(
#     alt.X('Spa', bin=alt.Bin(maxbins=400)), 
#     y='count()',
#     fill='Transported')
# spa_histogram2 = alt.Chart(train_df).mark_bar(clip=True).encode(
#     alt.X('Spa', bin=alt.Bin(maxbins=400), scale=alt.Scale(domain=(0, 3000))), 
#     alt.Y('count()', scale=alt.Scale(domain=(0, 500))),
#     fill='Transported')
# spa_histogram | spa_histogram2

In [27]:
# Create Spa groups

train_df['Spa_bin'] = pd.cut(
    train_df['Spa'], 
    bins=[-0.1, 0.001, 100, 200, 600, 1200, 2000, 30000], 
    labels=['Sp$0', 'Sp$0-100', 'Sp$100-200', 'Sp$200-600', 'Sp$600-1200', 'Sp$1200-2000', 'Sp$2000-30000']
)

In [28]:
# # Perform EDA

# plot_spa_bin = alt.Chart(train_df).mark_bar().encode(
#     y="Spa_bin",
#     x='count()',
#     fill='Transported'
# )
# plot_spa_bin

#### 10. Wrangling on `VRDeck`

In [29]:
# vrdeck_histogram = alt.Chart(train_df).mark_bar().encode(
#     alt.X('VRDeck', bin=alt.Bin(maxbins=400)), 
#     y='count()',
#     fill='Transported')
# vrdeck_histogram2 = alt.Chart(train_df).mark_bar(clip=True).encode(
#     alt.X('VRDeck', bin=alt.Bin(maxbins=400), scale=alt.Scale(domain=(0, 3000))), 
#     alt.Y('count()', scale=alt.Scale(domain=(0, 500))),
#     fill='Transported')
# vrdeck_histogram | vrdeck_histogram2

In [30]:
# Create VRDeck groups

train_df['VRDeck_bin'] = pd.cut(
    train_df['VRDeck'], 
    bins=[-0.1, 0.001, 100, 200, 600, 1000, 2000, 30000], 
    labels=['VR$0', 'VR$0-100', 'VR$100-200', 'VR$200-600', 'VR$600-1000', 'VR$1000-2000', 'VR$2000-30000']
)

In [31]:
# # Perform EDA

# plot_vrdeck_bin = alt.Chart(train_df).mark_bar().encode(
#     y="VRDeck_bin",
#     x='count()',
#     fill='Transported'
# )
# plot_vrdeck_bin

#### 11. Wrangling on `Name`

In [32]:
train_df["Name"].nunique()

8473

In [33]:
# Split `Name` into first and last names

train_df[['Name_first', 'Name_last']] = train_df["Name"].str.split(" ", expand=True)

In [34]:
train_df["Name_first"].nunique()

2706

In [35]:
train_df["Name_last"].nunique()

2217

In [36]:
train_df["Name_last"].value_counts()

Casonston     18
Oneiles       16
Domington     15
Litthews      15
Browlerson    14
              ..
Rocketedy      1
Chnik          1
Win            1
Ingetrody      1
Monsintic      1
Name: Name_last, Length: 2217, dtype: int64

In [37]:
# Makes a dictionary using the last name and the number of corresponding last names

num_last_name = train_df["Name_last"].value_counts()
last_name_dict = dict(num_last_name)

In [38]:
# Create separate column with the number of the corresponding last names

for name, num in last_name_dict.items():
    train_df.loc[train_df["Name_last"] == name, "num_of_Name_last"] = num

In [39]:
# lastname_histogram = alt.Chart(train_df).mark_bar().encode(
#     alt.X('num_of_Name_last', bin=alt.Bin(maxbins=20)), 
#     y='count()',
#     fill='Transported')
# lastname_histogram

In [40]:
train_df["num_of_Name_last"].isna().sum()

200

In [41]:
# Impute the missing names

train_df["num_of_Name_last"] = train_df["num_of_Name_last"].fillna(1)

In [42]:
# Create groups for number of last name

train_df['num_of_Name_last'] = pd.cut(
    train_df['num_of_Name_last'], 
    bins=[-100, 1.001, 15, 20], 
    labels=['fam1', 'fam2-15', 'fam15-20']
)

In [43]:
# # Perform EDA

# plot_num_lastname = alt.Chart(train_df).mark_bar().encode(
#     y="num_of_Name_last",
#     x='count()',
#     fill='Transported'
# )
# plot_num_lastname

In [44]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 6751_01 to 5360_01
Data columns (total 37 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   HomePlanet            8693 non-null   object  
 1   CryoSleep             8693 non-null   bool    
 2   Cabin                 8693 non-null   object  
 3   Destination           8693 non-null   object  
 4   Age                   8693 non-null   float64 
 5   VIP                   8693 non-null   bool    
 6   RoomService           8693 non-null   float64 
 7   FoodCourt             8693 non-null   float64 
 8   ShoppingMall          8693 non-null   float64 
 9   Spa                   8693 non-null   float64 
 10  VRDeck                8693 non-null   float64 
 11  Name                  8493 non-null   object  
 12  Transported           8693 non-null   bool    
 13  HomePlanet_missing    8693 non-null   bool    
 14  CryoSleep_missing     8693 non-null   bool    
 15  

In [45]:
X_train, y_train = train_df.drop(columns=['Transported']), train_df['Transported']

In [46]:
y_train = y_train.astype(int)

In [47]:
pycaret_data = train_df.drop(
    columns=["Cabin", "Age", "RoomService", "FoodCourt", "ShoppingMall",
             "Spa", "VRDeck", "Name", "Cabin_num", "Name_first",
             "Name_last"]
)

# Application of PyCaret

In [48]:
experiment = setup(
    pycaret_data, 
    target='Transported',
    normalize=True,
    use_gpu=True,
)

Unnamed: 0,Description,Value
0,session_id,7641
1,Target,Transported
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(8693, 26)"
5,Missing Values,False
6,Numeric Features,0
7,Categorical Features,25
8,Ordinal Features,False
9,High Cardinality Features,False


In [49]:
# Model Selection

model = compare_models(n_select=1)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7969,0.8826,0.7933,0.7989,0.7958,0.5937,0.5943,2.55
catboost,CatBoost Classifier,0.7962,0.8849,0.8042,0.7914,0.7975,0.5924,0.5929,3.683
gbc,Gradient Boosting Classifier,0.7936,0.8788,0.8009,0.7893,0.7949,0.5872,0.5876,0.656
lr,Logistic Regression,0.7906,0.8744,0.8147,0.777,0.7952,0.5813,0.5824,0.136
xgboost,Extreme Gradient Boosting,0.7887,0.876,0.7858,0.7901,0.7878,0.5773,0.5776,0.843
ridge,Ridge Classifier,0.7882,0.0,0.8088,0.7765,0.7921,0.5763,0.5772,0.017
lda,Linear Discriminant Analysis,0.7877,0.8716,0.8088,0.7758,0.7917,0.5754,0.5763,0.069
ada,Ada Boost Classifier,0.7857,0.8704,0.8045,0.7752,0.7893,0.5714,0.5723,0.245
svm,SVM - Linear Kernel,0.7829,0.0,0.8124,0.7702,0.7883,0.5658,0.5708,0.063
rf,Random Forest Classifier,0.7819,0.8613,0.7581,0.7957,0.7763,0.5638,0.5647,0.723


In [50]:
# Model Optimization

tuned_model = tune_model(model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7997,0.8847,0.8224,0.7862,0.8039,0.5994,0.6
1,0.8144,0.8835,0.8487,0.7938,0.8203,0.6289,0.6304
2,0.7964,0.8889,0.8421,0.7711,0.805,0.5928,0.5954
3,0.8062,0.8889,0.8191,0.7981,0.8084,0.6125,0.6127
4,0.7783,0.869,0.7697,0.7826,0.7761,0.5566,0.5567
5,0.7747,0.8713,0.7862,0.7685,0.7772,0.5493,0.5495
6,0.7928,0.8862,0.7961,0.7908,0.7934,0.5855,0.5855
7,0.7961,0.8797,0.8026,0.7922,0.7974,0.5921,0.5922
8,0.8059,0.8946,0.8191,0.7981,0.8084,0.6118,0.6121
9,0.7961,0.8846,0.7822,0.8034,0.7926,0.5921,0.5923


In [51]:
# Finalize model

finalized_model = finalize_model(tuned_model)



# Test Set Preparation

#### 1. Basic Analysis

In [52]:
X_test = pd.read_csv("data/test.csv")

# Convert the values of numeric column into absolute values
X_test.loc[
    :, ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
] = X_test.loc[
    :, ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
].abs()
X_test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [53]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [54]:
X_test.isna().sum()/4277

PassengerId     0.000000
HomePlanet      0.020341
CryoSleep       0.021744
Cabin           0.023381
Destination     0.021510
Age             0.021277
VIP             0.021744
RoomService     0.019172
FoodCourt       0.024784
ShoppingMall    0.022913
Spa             0.023615
VRDeck          0.018705
Name            0.021978
dtype: float64

#### 2. Data Wrangling

In [55]:
# Make columns for missing data
X_test["HomePlanet_missing"] = X_test["HomePlanet"].isna()
X_test["CryoSleep_missing"] = X_test["CryoSleep"].isna()
X_test["Cabin_missing"] = X_test["Cabin"].isna()
X_test["Destination_missing"] = X_test["Destination"].isna()
X_test["Age_missing"] = X_test["Age"].isna()
X_test["VIP_missing"] = X_test["VIP"].isna()
X_test["RoomService_missing"] = X_test["RoomService"].isna()
X_test["FoodCourt_missing"] = X_test["FoodCourt"].isna()
X_test["ShoppingMall_missing"] = X_test["ShoppingMall"].isna()
X_test["Spa_missing"] = X_test["Spa"].isna()
X_test["VRDeck_missing"] = X_test["VRDeck"].isna()
X_test["Name_missing"] = X_test["Name"].isna()

# Perform imputation
X_test.loc[:, ["Age"]] = num_imputer.fit_transform(X_test.loc[:, ["Age"]])
X_test.loc[:, cat_columns] = cat_imputer.fit_transform(X_test.loc[:, cat_columns])

# Cabin
X_test[['Cabin_dec', 'Cabin_num', 'Cabin_side']] = X_test["Cabin"].str.split("/", expand=True)

# Age
X_test['Age_bin'] = pd.cut(
    X_test['Age'], 
    bins=[-0.1, 12, 18, 30, 50, 66, 90], 
    labels=['Children', 'Teenage', '20s', '30-40s', '50-60s', 'Elder']
)

# RoomService
X_test['RoomService_bin'] = pd.cut(
    X_test['RoomService'], 
    bins=[-0.1, 0.001, 100, 200, 400, 900, 2400, 15000], 
    labels=['RS$0', 'RS$0-100', 'RS$100-200', 'RS$200-400', 'RS$400-900', 'RS$900-2400', 'RS$2400-15000']
)

# FoodCourt
X_test['FoodCourt_bin'] = pd.cut(
    X_test['FoodCourt'], 
    bins=[-0.1, 0.001, 100, 500, 900, 2000, 30000], 
    labels=['FC$0', 'FC$0-100', 'FC$100-500', 'FC$500-900', 'FC$900-2000', 'FC$2000-30000']
)

# ShoppingMall
X_test['ShoppingMall_bin'] = pd.cut(
    X_test['ShoppingMall'], 
    bins=[-0.1, 0.001, 100, 200, 600, 1200, 2000, 30000], 
    labels=['SM$0', 'SM$0-100', 'SM$100-200', 'SM$200-600', 'SM$600-1200', 'SM$1200-2000', 'SM$2000-30000']
)

# Spa
X_test['Spa_bin'] = pd.cut(
    X_test['Spa'], 
    bins=[-0.1, 0.001, 100, 200, 600, 1200, 2000, 30000], 
    labels=['Sp$0', 'Sp$0-100', 'Sp$100-200', 'Sp$200-600', 'Sp$600-1200', 'Sp$1200-2000', 'Sp$2000-30000']
)

# VRDeck
X_test['VRDeck_bin'] = pd.cut(
    X_test['VRDeck'], 
    bins=[-0.1, 0.001, 100, 200, 600, 1000, 2000, 30000], 
    labels=['VR$0', 'VR$0-100', 'VR$100-200', 'VR$200-600', 'VR$600-1000', 'VR$1000-2000', 'VR$2000-30000']
)

# Name
X_test[['Name_first', 'Name_last']] = X_test["Name"].str.split(" ", expand=True)
num_last_name = X_test["Name_last"].value_counts()
last_name_dict = dict(num_last_name)
for name, num in last_name_dict.items():
    X_test.loc[X_test["Name_last"] == name, "num_of_Name_last"] = num

X_test["num_of_Name_last"] = X_test["num_of_Name_last"].fillna(1)
    
X_test['num_of_Name_last'] = pd.cut(
    X_test['num_of_Name_last'], 
    bins=[-100, 1.001, 15, 20], 
    labels=['fam1', 'fam2-15', 'fam15-20']
)

# Change data type to prevent error
X_test["HomePlanet"] = X_test["HomePlanet"].astype(str)
X_test["CryoSleep"] = X_test["CryoSleep"].astype(str)
X_test["Destination"] = X_test["Destination"].astype(str)
X_test["VIP"] = X_test["VIP"].astype(str)
X_test["Cabin_dec"] = X_test["Cabin_dec"].astype(str)
X_test["Cabin_side"] = X_test["Cabin_side"].astype(str)

In [56]:
X_test_pycaret = X_test.drop(
    columns=["Cabin", "Age", "RoomService", "FoodCourt", "ShoppingMall",
             "Spa", "VRDeck", "Name", "Cabin_num", "Name_first",
             "Name_last"]
)

In [57]:
X_test_pycaret

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,VIP,HomePlanet_missing,CryoSleep_missing,Cabin_missing,Destination_missing,Age_missing,...,Name_missing,Cabin_dec,Cabin_side,Age_bin,RoomService_bin,FoodCourt_bin,ShoppingMall_bin,Spa_bin,VRDeck_bin,num_of_Name_last
0,0013_01,Earth,True,TRAPPIST-1e,False,False,False,False,False,False,...,False,G,S,20s,RS$0,FC$0,SM$0,Sp$0,VR$0,fam2-15
1,0018_01,Earth,False,TRAPPIST-1e,False,False,False,False,False,False,...,False,F,S,20s,RS$0,FC$0-100,SM$0,Sp$2000-30000,VR$0,fam1
2,0019_01,Europa,True,55 Cancri e,False,False,False,False,False,False,...,False,C,S,30-40s,RS$0,FC$0,SM$0,Sp$0,VR$0,fam1
3,0021_01,Europa,False,TRAPPIST-1e,False,False,False,False,False,False,...,False,C,S,30-40s,RS$0,FC$2000-30000,SM$0,Sp$100-200,VR$200-600,fam1
4,0023_01,Earth,False,TRAPPIST-1e,False,False,False,False,False,False,...,False,F,S,20s,RS$0-100,FC$0,SM$600-1200,Sp$0,VR$0,fam2-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,TRAPPIST-1e,False,False,False,False,False,False,...,False,G,S,30-40s,RS$0,FC$0,SM$0,Sp$0,VR$0,fam2-15
4273,9269_01,Earth,False,TRAPPIST-1e,False,False,False,True,False,False,...,False,G,P,30-40s,RS$0,FC$500-900,SM$0-100,Sp$0-100,VR$100-200,fam2-15
4274,9271_01,Mars,True,55 Cancri e,False,False,False,False,False,True,...,False,D,P,20s,RS$0,FC$0,SM$0,Sp$0,VR$0,fam2-15
4275,9273_01,Europa,False,TRAPPIST-1e,False,False,False,False,True,True,...,False,D,P,20s,RS$0,FC$2000-30000,SM$0,Sp$0,VR$200-600,fam2-15


In [58]:
prediction = {}

In [59]:
prediction = predict_model(finalized_model, data=X_test_pycaret)

In [60]:
prediction

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,VIP,HomePlanet_missing,CryoSleep_missing,Cabin_missing,Destination_missing,Age_missing,...,Cabin_side,Age_bin,RoomService_bin,FoodCourt_bin,ShoppingMall_bin,Spa_bin,VRDeck_bin,num_of_Name_last,Label,Score
0,0013_01,Earth,True,TRAPPIST-1e,False,False,False,False,False,False,...,S,20s,RS$0,FC$0,SM$0,Sp$0,VR$0,fam2-15,True,0.6987
1,0018_01,Earth,False,TRAPPIST-1e,False,False,False,False,False,False,...,S,20s,RS$0,FC$0-100,SM$0,Sp$2000-30000,VR$0,fam1,False,0.9362
2,0019_01,Europa,True,55 Cancri e,False,False,False,False,False,False,...,S,30-40s,RS$0,FC$0,SM$0,Sp$0,VR$0,fam1,True,0.9922
3,0021_01,Europa,False,TRAPPIST-1e,False,False,False,False,False,False,...,S,30-40s,RS$0,FC$2000-30000,SM$0,Sp$100-200,VR$200-600,fam1,True,0.9458
4,0023_01,Earth,False,TRAPPIST-1e,False,False,False,False,False,False,...,S,20s,RS$0-100,FC$0,SM$600-1200,Sp$0,VR$0,fam2-15,True,0.5553
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,TRAPPIST-1e,False,False,False,False,False,False,...,S,30-40s,RS$0,FC$0,SM$0,Sp$0,VR$0,fam2-15,True,0.6987
4273,9269_01,Earth,False,TRAPPIST-1e,False,False,False,True,False,False,...,P,30-40s,RS$0,FC$500-900,SM$0-100,Sp$0-100,VR$100-200,fam2-15,False,0.7249
4274,9271_01,Mars,True,55 Cancri e,False,False,False,False,False,True,...,P,20s,RS$0,FC$0,SM$0,Sp$0,VR$0,fam2-15,True,0.9290
4275,9273_01,Europa,False,TRAPPIST-1e,False,False,False,False,True,True,...,P,20s,RS$0,FC$2000-30000,SM$0,Sp$0,VR$200-600,fam2-15,True,0.8006


In [61]:
predictions = prediction["Label"]

In [62]:
predictions

0        True
1       False
2        True
3        True
4        True
        ...  
4272     True
4273    False
4274     True
4275     True
4276     True
Name: Label, Length: 4277, dtype: object

In [63]:
temp_df = X_test
temp_df["Transported"] = predictions
temp_df = temp_df.loc[:, ["PassengerId", "Transported"]]
temp_df.to_csv("results_pycaret/spaceship_titanic_imputation_221014_pycaret.csv", encoding='utf-8', index=False)

In [64]:
temp_df

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
