In [88]:
try:
    from lets_plot import *
except:
    print('Installing Lets plot...')
    !pip install lets_plot --quiet
    print('Installation done.')

In [89]:
import sys
import pandas as pd
import numpy as np
from colorama import Fore, Back, Style
from lets_plot import *
LetsPlot.setup_html()

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score,f1_score

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [104]:
df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# Data preparation


In [93]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [105]:
df=df.drop(['Name'],axis=1)
df_test=df_test.drop(['Name'],axis=1)

In [95]:
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

In [96]:
def get_passenger_group(df):
    group = df['PassengerId'].str.split('_',expand=True)
    df['PassengerGroup']= group[1]
    return  df.drop(columns='PassengerId')

def get_cabin(df):
    group= df.Cabin.str.split('/',expand=True)
    df['CabinLetter']=group[0]
    df['CabinNumber']=group[2]
    return df.drop(columns='Cabin')

def age_groups(df):
    df['Age_group'] = np.nan
    conditions = [
        (df['Age'] <= 12),
        ((df['Age'] > 12) & (df['Age'] < 18)),
        ((df['Age'] >= 18) & (df['Age'] <= 25)),
        ((df['Age'] > 25) & (df['Age'] <= 30)),
        ((df['Age'] > 30) & (df['Age'] <= 50)),
        (df['Age'] > 50)
    ]
    age_groups = [
        'Age_0-12',
        'Age_13-17',
        'Age_18-25',
        'Age_26-30',
        'Age_31-50',
        'Age_51+'
    ]
    df['Age_group'] = np.select(conditions, age_groups, default=np.nan)
    return df

    

## Categorical to Numeric Variables

In [97]:
def categorical_to_numeric(df):
    columns =['CabinLetter','CabinNumber', 'PassengerGroup', 'HomePlanet', 'Destination']
    for column in columns:
        categories = df[column].value_counts().index.sort_values().to_list()
        mapping = {key: i for i, key in enumerate(categories)}
        print(mapping)
        df[column] = df[column].map(mapping)
    return df
     

## Changing variable types

In [63]:
def bool2num(df):
    features = ['CryoSleep', 'VIP']
    df[features] = df[features].astype(bool)
    return df

## Target variable to last

In [103]:
def move_to_last(df):
    target = 'Transported'
    columns = df.columns.to_list()
    
    if target in columns:
        columns.remove(target)
        columns.append(target)
    
    return df[columns]

## Preprocessing pipeline

In [64]:
def preprocessing(df,func):
    df_copy = df.copy()
    for f in func:
        df_copy =f(df_copy)
    return df_copy

In [106]:
functions = [get_passenger_group, get_cabin,age_groups,categorical_to_numeric,bool2num, move_to_last]


In [107]:
df_processed = preprocessing(df,functions)
df_processed_test = preprocessing(df_test,functions)

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}
{'P': 0, 'S': 1}
{'01': 0, '02': 1, '03': 2, '04': 3, '05': 4, '06': 5, '07': 6, '08': 7}
{'Earth': 0, 'Europa': 1, 'Mars': 2}
{'55 Cancri e': 0, 'PSO J318.5-22': 1, 'TRAPPIST-1e': 2}
{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}
{'P': 0, 'S': 1}
{'01': 0, '02': 1, '03': 2, '04': 3, '05': 4, '06': 5, '07': 6, '08': 7}
{'Earth': 0, 'Europa': 1, 'Mars': 2}
{'55 Cancri e': 0, 'PSO J318.5-22': 1, 'TRAPPIST-1e': 2}


In [100]:

df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   HomePlanet      8492 non-null   float64
 1   CryoSleep       8693 non-null   bool   
 2   Destination     8511 non-null   float64
 3   Age             8514 non-null   float64
 4   VIP             8693 non-null   bool   
 5   RoomService     8512 non-null   float64
 6   FoodCourt       8510 non-null   float64
 7   ShoppingMall    8485 non-null   float64
 8   Spa             8510 non-null   float64
 9   VRDeck          8505 non-null   float64
 10  Transported     8693 non-null   bool   
 11  PassengerGroup  8693 non-null   int64  
 12  CabinLetter     8494 non-null   float64
 13  CabinNumber     8494 non-null   float64
 14  Age_group       8693 non-null   object 
dtypes: bool(3), float64(10), int64(1), object(1)
memory usage: 840.6+ KB


# Exploratory Data Analysis

In [101]:
def plot_distributions(df):
    """
    Function to plot distribution plots of columns of a dataframe
    """
    columns = df.columns
    plots = []
    for col in columns:
        plot = ggplot(df) +\
               geom_histogram(aes(x=col, fill='Transported', color = 'Transported'), alpha = 0.4, size = 1.5) +\
               ggtitle(f'{col} Distribution') +\
               theme(panel_grid_major_x='blank')
        plots.append(plot)
    return plots

plots = plot_distributions(df_processed)

gggrid(plots, ncol=2)

In [110]:
df_processed.drop(columns='VIP', inplace=True)
df_processed_test.drop(columns='VIP', inplace=True)

# Modelling

In [111]:
X_train, X_val, y_train, y_val = train_test_split(df_processed.iloc[:,:-1], df_processed.iloc[:,-1], test_size=0.25, random_state=1)

print("X Train Shape: ", X_train.shape)
print("X Valid Shape: ", X_val.shape)

X Train Shape:  (6519, 13)
X Valid Shape:  (2174, 13)


In [113]:
numeric_features = X_train[['RoomService','FoodCourt','ShoppingMall','Spa','Age']].columns
categorical_features = X_train[['HomePlanet','CryoSleep','Destination','PassengerGroup','CabinLetter','CabinNumber']].columns
print("Numeric features: " + numeric_features)
print("Categorical fVIPeatures: " + categorical_features)

Index(['Numeric features: RoomService', 'Numeric features: FoodCourt',
       'Numeric features: ShoppingMall', 'Numeric features: Spa',
       'Numeric features: Age'],
      dtype='object')
Index(['Categorical features: HomePlanet', 'Categorical features: CryoSleep',
       'Categorical features: Destination',
       'Categorical features: PassengerGroup',
       'Categorical features: CabinLetter',
       'Categorical features: CabinNumber'],
      dtype='object')


In [114]:
# Define Preprocessing Pipelines
numeric_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaling', StandardScaler())
])

categorical_preprocessor = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore')),  
    ('imputer', SimpleImputer(strategy='constant'))
])
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_preprocessor, numeric_features),
    ('categorical', categorical_preprocessor, categorical_features)
])
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingClassifier())
])

model_pipeline


In [116]:
def evaluate_pipeline_classification(pipe, model_name, X_train = X_train, y_train = y_train, X_test = X_val, y_test = y_val, random_state=42):

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Calculate classification scores
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    classification_scores = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }

    print(f"\033[1;32mModel: {model_name}\033[0m")
    print("\033[;32mAccuracy:\033[0m", classification_scores['accuracy'])
    print("\033[;32mPrecision:\033[0m", classification_scores['precision'])
    print("\033[;32mRecall:\033[0m", classification_scores['recall'])
    print("\033[;32mF1 Score:\033[0m", classification_scores['f1_score'])

In [117]:
evaluate_pipeline_classification(model_pipeline, 'Gradient Boosting Classifier')

[1;32mModel: Gradient Boosting Classifier[0m
[;32mAccuracy:[0m 0.7907083716651334
[;32mPrecision:[0m 0.7846683893195521
[;32mRecall:[0m 0.8163082437275986
[;32mF1 Score:[0m 0.8001756697408872


In [118]:
model_pipeline2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier())
])

evaluate_pipeline_classification(model_pipeline2, 'XGBoost Classifier')

[1;32mModel: XGBoost Classifier[0m
[;32mAccuracy:[0m 0.7732290708371665
[;32mPrecision:[0m 0.7783735478105451
[;32mRecall:[0m 0.7804659498207885
[;32mF1 Score:[0m 0.7794183445190156


In [119]:
params = {
    'activation': 'tanh',
    'alpha': 0.0001,
    'hidden_layer_sizes': (20,),
    'learning_rate': 'constant',
    'solver': 'adam'
}

model_pipeline3 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', MLPClassifier(**params))
])

evaluate_pipeline_classification(model_pipeline3, 'MLP Classifier')

[1;32mModel: MLP Classifier[0m
[;32mAccuracy:[0m 0.7879484820607175
[;32mPrecision:[0m 0.8046511627906977
[;32mRecall:[0m 0.775089605734767
[;32mF1 Score:[0m 0.7895937927886809


In [120]:
ensemble_classifier = VotingClassifier(
    estimators=[
        ('Random Forest', RandomForestClassifier()),
        ('Gradient Boosting', GradientBoostingClassifier()),
        ('Support Vector Machine', SVC()),
        ('Ada Boosting', AdaBoostClassifier()),
        ('MLPClassifier', MLPClassifier(**params))
    ],
    voting='hard'
)

ensemble_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', ensemble_classifier)
])

ensemble_pipeline


In [121]:
evaluate_pipeline_classification(ensemble_pipeline, 'Ensemble Model')

[1;32mModel: Ensemble Model[0m
[;32mAccuracy:[0m 0.7920883164673413
[;32mPrecision:[0m 0.7932862190812721
[;32mRecall:[0m 0.8046594982078853
[;32mF1 Score:[0m 0.798932384341637
