<a href="https://www.kaggle.com/code/jackren000/spaceship-titanic?scriptVersionId=162295150" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

### Define Objectives:  
*In this competition, the task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly.*

In [1]:
#### import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split  # automatic splitting method
from sklearn.model_selection import GridSearchCV  # cross validation
from sklearn.linear_model import LogisticRegression  # logistic regression classifier
from sklearn.neighbors import KNeighborsClassifier  # k-nearest neighbors classifier
from sklearn.ensemble import RandomForestClassifier  # random forest classifier
from sklearn.metrics import accuracy_score  # accuracy scoring metric
from sklearn.metrics import classification_report  # classification performance report
from sklearn.preprocessing import StandardScaler  # standard scaler for normalization
from sklearn.preprocessing import MinMaxScaler  # min-max scaler for normalization

# import PyTorch for Deep Learning
import torch
# import neural network
from torch import nn
# import relative math functions
import torch.nn.functional as F
# import PyTorch DataLoader
from torch.utils.data import Dataset, DataLoader

### Data Collection:  
*Load data into pandas DataFrame for further analysis.*

In [2]:
#### load the file
DIR_PATH = '/kaggle/input/spaceship-titanic'
train = pd.read_csv(os.path.join(DIR_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DIR_PATH, 'test.csv'))

### Data Cleaning:  
*Clean the unhelpful columns, NaN value, duplicates and inconsistencies.*

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


Note: All feature columns (exclude ID) have missing values.

In [4]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
#### define the clean data function
def data_cleaning(df):
    ######################## data cleaning ################################
    # list of categorical features to fill with mode
    categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
    # list of numerical features to fill with mean
    numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # fill missing values for categorical features with mode
    for feature in categorical_features:
        df[feature] = df[feature].fillna(df[feature].mode()[0])
    # fill missing values for numerical features with mean
    for feature in numerical_features:
        df[feature] = df[feature].fillna(df[feature].mean())
        
    ######################### feature engineering ##########################
    # define the age bins
#     age_bins = [0, 12, 18, 35, 60, float('inf')]
    # assign a number for each age group
#     age_labels = [1, 2, 3, 4, 5]  # These numbers correspond to 'Child', 'Teen', 'Young Adult', 'Adult', 'Senior'
    # create the 'AgeGroup' column
#     df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

    # split the 'Cabin' column into 'Deck', 'Num', and 'Side'
    df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)
    # fill NaN values in 'Deck' with the mode of the 'Deck' column
#     df['Deck'] = df['Deck'].fillna(df['Deck'].mode()[0])
    # map 'Deck' column
#     df['Deck'] = df['Deck'].map({'B':1,'F':2,'A':3,'G':4,'E':5,'D':6,'C':7,'T':8})
    # fill NaN values in 'Num' with the mode of the 'Num' column
    df['Num'] = df['Num'].fillna(df['Num'].mode()[0])
    # convert 'Num' into an integer
    df['Num'] = df['Num'].astype(int)
    # fill NaN values in 'Side' with the mode of the 'Side' column
    df['Side'] = df['Side'].fillna(df['Side'].mode()[0])
    # map 'P' to Port (0) and 'S' to Starboard (1)
    df['Side'] = df['Side'].map({'P': 0, 'S': 1})
    # map 'HomePlanet'
    df['HomePlanet'] = df['HomePlanet'].map({'Europa':1,'Earth':2,'Mars':3})
    # map 'Destination'
    df['Destination'] = df['Destination'].map({'TRAPPIST-1e':1,'PSO J318.5-22':2,'55 Cancri e':3})
    # perform one-hot encoding on 'HomePlanet' and 'Destination' columns
#     df = pd.get_dummies(df, columns=['HomePlanet', 'Destination'])
    # extract the group identifier from 'PassengerId'
    group = df['PassengerId'].str.split('_').str[0]
    # create a dictionary with group counts
    group_size_dict = group.value_counts().to_dict()
    # map the group size to each passenger
    df['group_size'] = group.map(group_size_dict)
    # drop 'Name' and 'Cabin' columns from the DataFrame
    df = df.drop(['PassengerId', 'Name', 'Cabin', 'Deck'], axis=1)
        
    return df

In [6]:
train = data_cleaning(train)

  df[feature] = df[feature].fillna(df[feature].mode()[0])


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   int64  
 1   CryoSleep     8693 non-null   bool   
 2   Destination   8693 non-null   int64  
 3   Age           8693 non-null   float64
 4   VIP           8693 non-null   bool   
 5   RoomService   8693 non-null   float64
 6   FoodCourt     8693 non-null   float64
 7   ShoppingMall  8693 non-null   float64
 8   Spa           8693 non-null   float64
 9   VRDeck        8693 non-null   float64
 10  Transported   8693 non-null   bool   
 11  Num           8693 non-null   int64  
 12  Side          8693 non-null   int64  
 13  group_size    8693 non-null   int64  
dtypes: bool(3), float64(6), int64(5)
memory usage: 772.6 KB


In [8]:
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Num,Side,group_size
0,1,False,1,39.0,False,0.0,0.0,0.0,0.0,0.0,False,0,0,1
1,2,False,1,24.0,False,109.0,9.0,25.0,549.0,44.0,True,0,1,1
2,1,False,1,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,0,1,2
3,1,False,1,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,0,1,2
4,2,False,1,16.0,False,303.0,70.0,151.0,565.0,2.0,True,1,1,1


### Data Preprocessing:  

#### Feature Engineering:  
*Create new features from existing ones to improve model performance.*

In [9]:
#### feature engineering (add to clean_data() for convenience)
# ######################### feature engineering ##########################
# # split the 'Cabin' column into 'Deck', 'Num', and 'Side'
# df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)
# # fill NaN values in 'Side' with the mode of the 'Side' column
# df['Side'] = df['Side'].fillna(df['Side'].mode()[0])
# # map 'P' to Port (0) and 'S' to Starboard (1)
# df['Side'] = df['Side'].map({'P': 0, 'S': 1})
# # perform one-hot encoding on 'HomePlanet' and 'Destination' columns
# df = pd.get_dummies(df, columns=['HomePlanet', 'Destination'])
# # extract the group identifier from 'PassengerId'
# group = df['PassengerId'].str.split('_').str[0]
# # create a dictionary with group counts
# group_size_dict = group.value_counts().to_dict()
# # map the group size to each passenger
# df['group_size'] = group.map(group_size_dict)
# # drop 'Name' and 'Cabin' columns from the DataFrame
# df = df.drop(['PassengerId', 'Name', 'Cabin', 'Num', 'Deck'], axis=1)

#### Data Transformation:  
*Normalize, scale, or encode data as necessary.*

In [10]:
# prepare feature and target
X = train.drop(['Transported'], axis=1)
y = train['Transported']
# split data into 80% train and 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# initialize the scaler
scaler = StandardScaler()
# fit the scaler with train set (X_train)
scaler.fit(X_train)
# apply the scaler to train set (X_train) and validation set (X_test)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Data Modeling:  
*Conduct a comprehensive assessment by deploying a suite of classification algorithms, such as K-Nearest Neighbors (KNN), Logistic Regression, and Random Forest, to ensure a robust evaluation of the dataset's predictive dynamics.*

In [11]:
def save_model_predictions_to_csv(model, test_dataset, scaler, output_filename='submission.csv'):
    
    try:
        # save the 'PassengerId' column
        PassengerId = test_dataset['PassengerId']
        # clean the dataset
        test_dataset = data_cleaning(test_dataset)
        test_dataset = scaler.transform(test_dataset)
        # make predictions using the provided model
        y_pred = model.predict(test_dataset)

        # create a DataFrame with 'PassengerId' and the predictions
        submission_df = pd.DataFrame({
            'PassengerId': PassengerId,
            'Transported': y_pred
        })

        # save the DataFrame to a CSV file
        submission_df.to_csv(output_filename, index=False)
        print(f"Your submission was successfully saved to {output_filename}!")
        
    except Exception as e:
        # handle any exceptions that might occur
        print("An error occurred while saving the submission:")
        print(e)

#### KNN
The K-Nearest Neighbors algorithm boasts simplicity in its non-parametric and instance-based approach, excels at classifying non-linear data, and performs well with a small number of input variables.

In [12]:
def train_and_evaluate_knn(X_train, y_train, X_test, y_test, n_neighbors=3):
    
    '''Train a k-Nearest Neighbors classifier and evaluate its accuracy.'''
    
    # initialize the kNN classifier with specified number of neighbors
    knn = KNeighborsClassifier(n_neighbors)
    # train the classifier on the training data
    knn.fit(X_train, y_train)
    # make predictions on the testing data
    y_pred = knn.predict(X_test)
    # calculate the accuracy of the predictions
    accuracy = accuracy_score(y_test, y_pred)
    # display accuracy
    print(f'The accuracy is: {accuracy}')
    
    return knn

In [13]:
# #### implement K-fold cross-validation to choose the optimal K
# knn = KNeighborsClassifier()
# # define the parameter grid
# param_grid = {'n_neighbors': range(1, 31)}
# # use GridSearchCV
# grid_search = GridSearchCV(knn, param_grid, cv=5) # cv is the number of folds
# # fit the grid search to the data
# grid_search.fit(X_train, y_train)
# # get the best parameter
# best_k = grid_search.best_params_['n_neighbors']
# print(f"The best value for 'k' is {best_k}")

In [14]:
knn = train_and_evaluate_knn(X_train, y_train, X_test, y_test, 30)
save_model_predictions_to_csv(knn, test, scaler)

The accuracy is: 0.7827586206896552


  df[feature] = df[feature].fillna(df[feature].mode()[0])


Your submission was successfully saved to submission.csv!


#### Logistic Regression
Logistic Regression offers a probabilistic understanding of class membership, excels with binary classification tasks, and maintains efficiency with resource use, making it a preferred model for its interpretability and speed in scenarios with dichotomous outcomes.

In [15]:
def train_and_evaluate_logreg(X_train, y_train, X_test, y_test):
    '''Train a logistic regression classifier and evaluate its accuracy.'''
    
    # initialize the logistic model
    logreg = LogisticRegression()
    # fit the model to the training data
    logreg.fit(X_train, y_train)
    # predict probabilities
    y_pred = logreg.predict(X_test)
    # calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)

    # evaluate accuracy
    print('#' * 50)
    print(f"Accuracy:\n {accuracy: .2f}")

    # evaluate classification report
    print('#' * 50)
    print(f"classification report:\n {classification_report(y_test, y_pred)}")
    
    # access the model's coefficients and intercept
    coefficients = logreg.coef_
    intercept = logreg.intercept_
    # matching the coefficients to the feature names
    feature_importance = pd.DataFrame(data=coefficients.T, index=X.columns, columns=['Coefficient'])
    print('#' * 50)
    print(f"feature importance:\n {feature_importance}")
    
    return logreg

In [16]:
logreg = train_and_evaluate_logreg(X_train, y_train, X_test, y_test)
# save_model_predictions_to_csv(logreg, test, scaler)

##################################################
Accuracy:
  0.75
##################################################
classification report:
               precision    recall  f1-score   support

       False       0.78      0.72      0.75       443
        True       0.73      0.79      0.76       427

    accuracy                           0.75       870
   macro avg       0.76      0.75      0.75       870
weighted avg       0.76      0.75      0.75       870

##################################################
feature importance:
               Coefficient
HomePlanet      -0.236909
CryoSleep        0.800497
Destination      0.171908
Age             -0.018787
VIP              0.052130
RoomService     -0.818237
FoodCourt        1.069406
ShoppingMall     0.431736
Spa             -1.897077
VRDeck          -1.776285
Num             -0.033014
Side             0.293276
group_size       0.096323


#### random forest
Random Forest is a versatile algorithm that can handle both classification and regression tasks with high accuracy, manages large datasets with thousands of input variables without variable deletion, and provides important measures of feature significance, all while being less prone to overfitting compared to individual decision trees.

In [17]:
def train_and_evaluate_rf(X_train, y_train, X_test, y_test):
    '''Train a random forest classifier and evaluate its accuracy.'''
    
    # initialize the Random Forest model
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    # fit the model to the training data
    rf.fit(X_train, y_train)
    # predict class
    y_pred = rf.predict(X_test)

    # evaluate accuracy
    print(f"Accuracy:\n {accuracy_score(y_test, y_pred)}")
    # detailed classification report
    print(f"classification report:\n {classification_report(y_test, y_pred)}")
    
    return rf

In [18]:
rf = train_and_evaluate_rf(X_train, y_train, X_test, y_test)
# save_model_predictions_to_csv(rf, test, scaler)

Accuracy:
 0.764367816091954
classification report:
               precision    recall  f1-score   support

       False       0.77      0.76      0.77       443
        True       0.76      0.77      0.76       427

    accuracy                           0.76       870
   macro avg       0.76      0.76      0.76       870
weighted avg       0.76      0.76      0.76       870



#### Deep Learning
Deep Neural Networks excel in handling complex patterns within vast datasets, often surpassing human-level performance in tasks with large feature spaces (thousands to millions), and are highly adaptable through their deep architecture and ability to learn feature representations automatically.

In [19]:
#### create a custom dataset class that inherits torch.utils.data.Dataset
# create TabularDataset class in order to use DataLoader
class TabularDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels.values, dtype=torch.float32)

    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, index):
        return self.features[index], self.labels[index]

# create PyTorch Dataset objects
train_dataset = TabularDataset(X_train, y_train)
test_dataset = TabularDataset(X_test, y_test)

# create DataLoaders
# in each epoch, the train process iterate each bach until all samples are calculated
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# create an iterator over the dataset
train_iter = iter(train_loader)
# get the next batch
features, labels = next(train_iter)
# print the shapes
print(f'Feature batch shape: {features.size()}')
print(f'Labels batch shape: {labels.size()}')

Feature batch shape: torch.Size([64, 13])
Labels batch shape: torch.Size([64])


In [20]:
#### define the model
class ClassificationModel(nn.Module):
    def __init__(self, num_features, num_classes=1):  # default num_classes to 1 for binary classification
        super().__init__()
        self.sequential1 = nn.Sequential(
            nn.Linear(num_features, 64),
            nn.ReLU()
        )
        self.sequential2 = nn.Sequential(
            nn.Linear(64, 48),
            nn.ReLU()
        )
        self.sequential3 = nn.Sequential(
            nn.Linear(48, 32),
            nn.ReLU()
        )
        self.output_layer = nn.Linear(32, num_classes) 

    def forward(self, x):
        x = self.sequential1(x)
        x = self.sequential2(x)
        x = self.sequential3(x)
        x = torch.sigmoid(self.output_layer(x))  # using sigmoid for binary classification
        return x

In [21]:
#### define the model 2
class ClassificationModel_2(nn.Module):
    def __init__(self, num_features, num_classes=1, dropout_rate=0.5):
        super().__init__()
        self.layer1 = nn.Linear(num_features, 64)
        self.batch_norm1 = nn.BatchNorm1d(64)
        self.dropout1 = nn.Dropout(dropout_rate)
        
        self.layer2 = nn.Linear(64, 48)
        self.batch_norm2 = nn.BatchNorm1d(48)
        self.dropout2 = nn.Dropout(dropout_rate)
        
        self.layer3 = nn.Linear(48, 32)
        self.batch_norm3 = nn.BatchNorm1d(32)
        self.dropout3 = nn.Dropout(dropout_rate)
        
        self.output_layer = nn.Linear(32, num_classes)
        
    def forward(self, x):
        x = F.relu(self.batch_norm1(self.layer1(x)))
        x = self.dropout1(x)
        
        x = F.relu(self.batch_norm2(self.layer2(x)))
        x = self.dropout2(x)
        
        x = F.relu(self.batch_norm3(self.layer3(x)))
        x = self.dropout3(x)
        
        x = torch.sigmoid(self.output_layer(x))  # still using sigmoid for binary classification
        return x

In [22]:
# create an instance of the ClassificationModel with the number of features from training data
model = ClassificationModel_2(num_features=X_train.shape[1])

# use BCEWithLogitsLoss which combines a sigmoid layer and the BCELoss in one single class
criterion = nn.BCEWithLogitsLoss()
# initialize the optimizer with the Adam algorithm and a learning rate of 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# set the device to cuda if available, otherwise use cpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# transfer the model to the chosen device
model.to(device)

# set the number of epochs for training
num_epochs = 1000

# start the training loop
for epoch in range(num_epochs):
    # set the model to training mode
    model.train()
    # initialize the total loss variable
    total_loss = 0

    # iterate over batches of data from the train_loader
    for inputs, labels in train_loader:
        # transfer inputs and labels to the device
        inputs, labels = inputs.to(device), labels.to(device)
        # reset the gradients of the model parameters
        optimizer.zero_grad()
        # forward pass: compute the model outputs
        outputs = model(inputs)
        # calculate the loss between outputs and labels
        loss = criterion(outputs, labels.unsqueeze(1))
        # backward pass: compute the gradient of the loss with respect to model parameters
        loss.backward()
        # add the current loss to the total loss
        total_loss += loss.item()
        # update the model parameters
        optimizer.step()
    
    # print the loss every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

# switch off gradients for validation, saves memory and computations
with torch.no_grad():
    # set the model to evaluation mode
    model.eval()
    # initialize correct prediction count
    correct = 0
    # initialize total prediction count
    total = 0

    # iterate over batches of data from the test_loader
    for inputs, labels in test_loader:
        # transfer inputs and labels to the device
        inputs, labels = inputs.to(device), labels.to(device)
        # forward pass: compute the model outputs
        outputs = model(inputs)
        # convert outputs probabilities to predicted class (0 or 1)
        predicted = outputs.round()
        # count the total number of labels
        total += labels.size(0)
        # count the number of correct predictions
        correct += (predicted == labels.unsqueeze(1)).sum().item()

# calculate the accuracy of predictions
accuracy = (correct / total) * 100
# print the accuracy of the model on the test data
print(f'Accuracy of the model on the test data: {accuracy:.2f}%')

Epoch 1/1000, Loss: 0.6933493943718391
Epoch 101/1000, Loss: 0.5968037296116837
Epoch 201/1000, Loss: 0.5933174224403815
Epoch 301/1000, Loss: 0.5900514915706666
Epoch 401/1000, Loss: 0.5931730885815815
Epoch 501/1000, Loss: 0.5908882942626147
Epoch 601/1000, Loss: 0.589815760046486
Epoch 701/1000, Loss: 0.5913383728120385
Epoch 801/1000, Loss: 0.5884961626394009
Epoch 901/1000, Loss: 0.5905789345260558
Accuracy of the model on the test data: 76.55%


In [23]:
def save_model_predictions_to_csv(model, test_dataset, scaler, output_filename='submission.csv'):
    
    try:
        # save the 'PassengerId' column
        PassengerId = test_dataset['PassengerId']
        # clean the dataset
        test_dataset = data_cleaning(test_dataset)
        test_dataset = scaler.transform(test_dataset)
        # make predictions using the provided model
        y_pred = model(torch.tensor(test_dataset, dtype=torch.float32).to(device))
        y_pred = y_pred.round().int()
        y_pred = y_pred.detach().cpu().numpy().flatten()

        # create a DataFrame with 'PassengerId' and the predictions
        submission_df = pd.DataFrame({
            'PassengerId': PassengerId,
            'Transported': y_pred
        })

        # save the DataFrame to a CSV file
        submission_df.to_csv(output_filename, index=False)
        print(f"Your submission was successfully saved to {output_filename}!")
        
    except Exception as e:
        # handle any exceptions that might occur
        print("An error occurred while saving the submission:")
        print(e)

In [24]:
# save_model_predictions_to_csv(model, test, scaler)