### Define Objectives:  
*In this competition, the task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly.*

In [1]:
#### import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler


### Data Collection:  
*Load data into pandas DataFrame for further analysis.*

In [2]:
#### load the file
# Kaggle mounts the competition files under this directory
DIR_PATH = '/kaggle/input/spaceship-titanic'
# read both competition CSVs from the same directory in one pass
train, test = (
    pd.read_csv(os.path.join(DIR_PATH, file_name))
    for file_name in ('train.csv', 'test.csv')
)

### Data Cleaning:  
*Remove unhelpful columns and handle NaN values, duplicates, and inconsistencies.*

In [3]:
# summarise column dtypes and non-null counts of the raw training frame
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


Note: All feature columns (excluding `PassengerId`) have missing values.

In [4]:
# preview the first rows of the raw training frame
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
#### define the clean data function
def clean_data(df, reference=None):
    '''Impute missing values and engineer features for a raw passenger frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame holding 'PassengerId', 'Name', 'Cabin' and the categorical
        and numerical feature columns listed in the body.
    reference : pandas.DataFrame, optional
        Raw frame that supplies the imputation statistics (mode / mean) and
        the one-hot categories. Defaults to ``df`` itself. Pass the raw
        training frame when cleaning the test frame so both are imputed with
        the same values and end up with the same one-hot columns.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. ``df`` itself is not modified.
    '''
    # statistics come from an explicit frame rather than a notebook global,
    # so the function keeps working after `train` has been rebound
    if reference is None:
        reference = df
    # work on a copy so the caller's frame is left untouched
    df = df.copy()

    # data cleaning
    # list of categorical features to fill with mode
    categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
    # list of numerical features to fill with mean
    numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    # categorical features that are expanded into one-hot columns
    one_hot_features = ['HomePlanet', 'Destination']

    # fill missing values for categorical features with mode
    for feature in categorical_features:
        fill_value = reference[feature].mode()[0]
        # infer_objects() turns fully-filled boolean object columns into real
        # bool columns regardless of whether fillna downcasts on its own
        df[feature] = df[feature].fillna(fill_value).infer_objects()
    # fill missing values for numerical features with mean
    for feature in numerical_features:
        df[feature] = df[feature].fillna(reference[feature].mean())

    # feature engineering
    # pin the categories to those seen in the reference frame so every call
    # yields the same one-hot columns; values unseen in the reference become
    # all-False rows
    for feature in one_hot_features:
        categories = sorted(reference[feature].dropna().unique())
        df[feature] = pd.Categorical(df[feature], categories=categories)
    # perform one-hot encoding on 'HomePlanet' and 'Destination' columns
    df = pd.get_dummies(df, columns=one_hot_features)
    # extract and create a new 'group' column from 'PassengerId', assuming the format 'group_passenger'
    df['group'] = df['PassengerId'].str.split('_').str[0].astype(int)
    # drop 'PassengerId', 'Name' and 'Cabin' columns from the DataFrame
    df = df.drop(['PassengerId', 'Name', 'Cabin'], axis=1)

    return df

In [6]:
# NOTE: this rebinds `train` to the cleaned frame, so the raw 'HomePlanet',
# 'Destination', 'PassengerId', 'Name' and 'Cabin' columns are gone afterwards
# and re-running this cell requires reloading the data first
train = clean_data(train)

  df[feature] = df[feature].fillna(train[feature].mode()[0])


In [7]:
# inspect column dtypes and non-null counts after cleaning
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  8693 non-null   bool   
 1   Age                        8693 non-null   float64
 2   VIP                        8693 non-null   bool   
 3   RoomService                8693 non-null   float64
 4   FoodCourt                  8693 non-null   float64
 5   ShoppingMall               8693 non-null   float64
 6   Spa                        8693 non-null   float64
 7   VRDeck                     8693 non-null   float64
 8   Transported                8693 non-null   bool   
 9   HomePlanet_Earth           8693 non-null   bool   
 10  HomePlanet_Europa          8693 non-null   bool   
 11  HomePlanet_Mars            8693 non-null   bool   
 12  Destination_55 Cancri e    8693 non-null   bool   
 13  Destination_PSO J318.5-22  8693 non-null   bool 

In [8]:
# preview the first rows of the cleaned training frame
train.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,group
0,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False,False,True,False,False,False,True,1
1,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True,True,False,False,False,False,True,2
2,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,False,True,False,False,False,True,3
3,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,False,True,False,False,False,True,3
4,False,16.0,False,303.0,70.0,151.0,565.0,2.0,True,True,False,False,False,False,True,4


### Data Preprocessing:  

#### Feature Engineering:  
*Create new features from existing ones to improve model performance.*

In [9]:
#### feature engineering (these steps now live inside clean_data() for convenience)
# # perform one-hot encoding on 'HomePlanet' and 'Destination' columns
# df = pd.get_dummies(df, columns=['HomePlanet', 'Destination'])
# # extract and create a new 'group' column from 'PassengerId', assuming the format 'group_passenger'
# df['group'] = df['PassengerId'].str.split('_').str[0].astype(int)
# # drop 'Name' and 'Cabin' columns from the DataFrame
# df = df.drop(['PassengerId', 'Name', 'Cabin'], axis=1)

#### Data Transformation:  
*Normalize, scale, or encode data as necessary.*

In [10]:
# separate the target column from the feature columns
y = train['Transported']
X = train.drop(columns=['Transported'])
# hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# standardise features: statistics are learned from the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# reuse the training statistics for the validation split (X_test)
X_test = scaler.transform(X_test)

### Data Modeling:  
*Conduct a comprehensive assessment by deploying a suite of classification algorithms, such as K-Nearest Neighbors (KNN), Logistic Regression, and Random Forest, to ensure a robust evaluation of the dataset's predictive dynamics.*

#### KNN
The K-Nearest Neighbors algorithm boasts simplicity in its non-parametric and instance-based approach, excels at classifying non-linear data, and performs well with a small number of input variables.

In [11]:
# def make_submission(model, test_dataset, scaler):
#     PassengerId = test_dataset['PassengerId']
#     test_dataset = clean_data(test_dataset)
#     test_dataset = scaler.transform(test_dataset)
#     y_pred = model.predict(test_dataset)
    
#     submission = pd.from_dict({
#         'PassengerId' = PassengerId,
#         'Transported' = y_pred
#     })

In [12]:
def train_and_evaluate_knn(X_train, y_train, X_test, y_test, n_neighbors=3):
    '''Fit a k-Nearest Neighbors classifier, print its hold-out accuracy and return it.

    X_train / y_train are used for fitting, X_test / y_test for scoring;
    n_neighbors is the number of neighbours the classifier votes over.
    '''
    # build and fit the classifier for the requested neighbourhood size
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(X_train, y_train)

    # score the predictions made on the held-out split
    predictions = classifier.predict(X_test)
    print(f'The accuracy is: {accuracy_score(y_test, predictions)}')

    return classifier

In [13]:
#### implement K-fold cross-validation to choose the optimal K
# base estimator whose n_neighbors is tuned
knn = KNeighborsClassifier()
# candidate neighbourhood sizes: 1 through 30
param_grid = dict(n_neighbors=range(1, 31))
# 5-fold cross-validated search over the grid, fitted on the training split
grid_search = GridSearchCV(knn, param_grid, cv=5).fit(X_train, y_train)
# read back the winning neighbourhood size
best_k = grid_search.best_params_['n_neighbors']
print(f"The best value for 'k' is {best_k}")

The best value for 'k' is 30


In [14]:
# use the k selected by the grid search above instead of a hard-coded copy,
# so this cell stays correct if the grid or the data changes
knn = train_and_evaluate_knn(X_train, y_train, X_test, y_test, best_k)

The accuracy is: 0.7860839562967222


#### Logistic Regression
Logistic Regression offers a probabilistic understanding of class membership, excels with binary classification tasks, and maintains efficiency with resource use, making it a preferred model for its interpretability and speed in scenarios with dichotomous outcomes.

In [15]:
def train_and_evaluate_logreg(X_train, y_train, X_test, y_test, feature_names=None):
    '''Train a logistic regression classifier and evaluate its accuracy.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the model.
    X_test, y_test : held-out features and labels used for evaluation.
    feature_names : sequence of str, optional
        Labels for the coefficient table, one per feature column. When
        omitted, the columns of ``X_train`` are used if it is a DataFrame;
        otherwise the notebook-level feature frame ``X`` supplies them.

    Returns
    -------
    The fitted LogisticRegression model. Accuracy, the classification report
    and the per-feature coefficients are printed as a side effect.
    '''

    # initialize the logistic model
    logreg = LogisticRegression()
    # fit the model to the training data
    logreg.fit(X_train, y_train)
    # predict class labels (not probabilities) for the held-out data
    y_pred = logreg.predict(X_test)
    # calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)

    # evaluate accuracy
    print('#' * 50)
    print(f"Accuracy:\n {accuracy: .2f}")

    # evaluate classification report
    print('#' * 50)
    print(f"classification report:\n {classification_report(y_test, y_pred)}")

    # resolve the labels for the coefficient table
    if feature_names is None:
        feature_names = getattr(X_train, 'columns', None)
    if feature_names is None:
        # scaled inputs are plain arrays, so fall back to the notebook-level
        # feature frame the arrays were derived from
        feature_names = X.columns

    # matching the coefficients to the feature names
    feature_importance = pd.DataFrame(
        data=logreg.coef_.T, index=feature_names, columns=['Coefficient']
    )
    print('#' * 50)
    print(f"feature importance:\n {feature_importance}")

    return logreg

In [16]:
# fit logistic regression on the scaled training split and report hold-out metrics
logreg = train_and_evaluate_logreg(X_train, y_train, X_test, y_test)

##################################################
Accuracy:
  0.78
##################################################
classification report:
               precision    recall  f1-score   support

       False       0.79      0.75      0.77       861
        True       0.77      0.81      0.79       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739

##################################################
feature importance:
                            Coefficient
CryoSleep                     0.613314
Age                          -0.094857
VIP                          -0.041164
RoomService                  -1.021469
FoodCourt                     0.802463
ShoppingMall                  0.341046
Spa                          -2.145904
VRDeck                       -2.056558
HomePlanet_Earth             -0.421149
HomePlanet_Europa             0.542656
HomePlanet_Mars              

#### Random Forest
Random Forest is a versatile algorithm that can handle both classification and regression tasks with high accuracy, manages large datasets with thousands of input variables without variable deletion, and provides important measures of feature significance, all while being less prone to overfitting compared to individual decision trees.

In [17]:
def train_and_evaluate_rf(X_train, y_train, X_test, y_test):
    '''Fit a 100-tree random forest, print its hold-out metrics and return it.

    X_train / y_train are used for fitting; X_test / y_test are used to
    print the accuracy and the classification report.
    '''
    # build the forest with a fixed seed and fit it in one step
    forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
    # class predictions for the held-out split
    predictions = forest.predict(X_test)

    # overall accuracy followed by the per-class breakdown
    print(f"Accuracy:\n {accuracy_score(y_test, predictions)}")
    print(f"classification report:\n {classification_report(y_test, predictions)}")

    return forest

In [18]:
# fit the random forest on the scaled training split and report hold-out metrics
rf = train_and_evaluate_rf(X_train, y_train, X_test, y_test)

Accuracy:
 0.7757331799884991
classification report:
               precision    recall  f1-score   support

       False       0.77      0.79      0.78       861
        True       0.79      0.77      0.78       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739

