## **Data Analysis**

### Define the objective:
*The goal of this data analysis is to find relevance and patterns of the dataset, using statistical method and data visualization to show a clear relationship between different features within the dataset.*

In [1]:
#### import libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# import KNN from scikit-learn library (especially for ML)
from sklearn.neighbors import KNeighborsClassifier
# import logistic regression model
from sklearn.linear_model import LogisticRegression
# import random forest model
from sklearn.ensemble import RandomForestClassifier
# import data split method 
from sklearn.model_selection import train_test_split
# import K-fold cross-validation method
from sklearn.model_selection import GridSearchCV
# import evaluation method
from sklearn.metrics import accuracy_score, classification_report
# normalize the data
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# import PyTorch for Deep Learning
import torch
# import neural network
from torch import nn
# import relative math functions
import torch.nn.functional as F
# import PyTorch DataLoader
from torch.utils.data import Dataset, DataLoader

### Data Collection:
*Collect the relevant data from competition website, then convert it into pandas DataFrame.*

In [2]:
#### load the data
DIR_PATH = '/kaggle/input/titanic'
train = pd.read_csv(os.path.join(DIR_PATH, 'train.csv'))

### Data Clearning:
*Clean the unhelpful columns and NaN value.*

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# drop unhelpful feature from the observation
train.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
# drop NaN rows
train.dropna(subset=['Age', 'Embarked'], inplace=True)
# train.dropna(subset=['Cabin'], inplace=True)

In [6]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


### Data Preprocessing

#### Data Transformation

In [7]:
# convert categorical data into numerical data
# use get_dummies() to perform one-hot encoding on 'Sex' and 'Embarked'
train = pd.get_dummies(train, columns=['Sex', 'Embarked'])
# separate the feature matrix and the target value
X = train.drop('Survived', axis=1)
y = train['Survived']

# normalize the data
# split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# initialize the scaler
scaler = StandardScaler()
# fit on training set only
scaler.fit(X_train)
# apply transform to both the training set and the test set
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Feature Engineering

### Data Modeling

> > ### K-Nearest Neighbors: 
> > This is a type of instance-based supervised learning algorithm used for both classification and regression.  
> 
> Pros:
> 1. simple to understand and implement.
> 2. no need to build a model, tune several parameters.
> 3. the algorithm is versatile, it can be used for classification, regression and search (as in recommender system).
> 
> Cons:
> 1. the algorithm gets significantly slower as the dataset grows.
> 2. requires high memory - needs to store all the training data.
> 3. sensitive to the scale of the data and irrelevant features.
> 4. typically not as accurate as more sophisticated methods, especially on datasets with a lot of features.

In [8]:
def train_and_evaluate_knn(X_train, y_train, X_test, y_test, n_neighbors=3):
    '''Train a K-Nearest Neighbors classifier and evaluate its accuracy.'''
    
    # initialize the KNN classifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    # fit the model on the training data
    knn.fit(X_train, y_train)
    # predict the labels for the test set
    y_pred = knn.predict(X_test)
    # calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)

    print(f'Accuracy: {accuracy: .2f}')

In [9]:
# test it when K is 3
train_and_evaluate_knn(X_train, y_train, X_test, y_test, n_neighbors=3)

Accuracy:  0.78


In [10]:
#### implement K-fold cross-validation to choose the optimal K
knn = KNeighborsClassifier()
# define the parameter grid
param_grid = {'n_neighbors': range(1, 31)}
# use GridSearchCV
grid_search = GridSearchCV(knn, param_grid, cv=5) # cv is the number of folds
# fit the grid search to the data
grid_search.fit(X_train, y_train)
# get the best parameter
best_k = grid_search.best_params_['n_neighbors']
print(f"The best value for 'k' is {best_k}")

The best value for 'k' is 4


In [11]:
# test it when K is 20
train_and_evaluate_knn(X_train, y_train, X_test, y_test, n_neighbors=20)

Accuracy:  0.78


> > ### Logistic regression: 
> > This is a statistical model that can model a binomial outcome with one or more explanatory variables. It is used extensively in many fields, including the medical and social sciences.  
> 
> Pros:
> 1. it can perform well when the dataset is linearly separable or when the boundary between classes can be approximated with a linear combination of features.
> 2. logistic regression not only provides a classification but also gives the probabilities of the outcome, which can be a valuable insight.
> 3. can be extended to multiclass classification problems.
> 4. it is computationally less intensive.
> 
> Cons:
> 1. it assumes a linear relationship between the independent variables and log odds of the dependent variables.
> 2. logistic regression can't capture complex relationships with non-linear boundaries as accurately as neural networks and decision trees.
> 3. it is sensitive to outliers and may need scale the data.

In [12]:
# initialize the logistic model
logreg = LogisticRegression()
# fit the model to the training data
logreg.fit(X_train, y_train)
# predict probabilities
y_pred = logreg.predict(X_test)
# calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# evaluate accuracy
print(f"Accuracy: {accuracy: .2f}")

# evaluate classification report
print(classification_report(y_test, y_pred))

Accuracy:  0.79
              precision    recall  f1-score   support

           0       0.79      0.87      0.83       122
           1       0.80      0.70      0.74        92

    accuracy                           0.79       214
   macro avg       0.80      0.78      0.79       214
weighted avg       0.79      0.79      0.79       214



In [13]:
# access the model's coefficients and intercept
coefficients = logreg.coef_
intercept = logreg.intercept_
# matching the coefficients to the feature names
feature_importance = pd.DataFrame(data=coefficients.T, index=X.columns, columns=['Coefficient'])
print(feature_importance)

             Coefficient
PassengerId     0.081795
Pclass         -1.231483
Age            -0.571095
SibSp          -0.269481
Parch           0.010455
Fare           -0.010707
Sex_female      0.678744
Sex_male       -0.678744
Embarked_C      0.057743
Embarked_Q     -0.011158
Embarked_S     -0.049136


> > ### Random Forest: 
> > This is an ensemble machine learning algorithm that combines multiple decision trees to create a more accurate and robust model. It's particularly well-suited for classification and regression tasks and works well with both categorical and continuous data.
> 
> Pros: 
> 1. it can be used for both classification and regression tasks and has the ability to handle large datasets with higher dimensionality.
> 2. it can capture nonlinearity in the data by combining the results from various trees.
> 3. due to the averaging of multiple trees, it is quite robust to noisein the input data.
> 
> Cons:
> 1. Random Forest models are not easily interpretable.
> 2. it can be computationly intensive and slow to train.
> 3. it's not ideal for linear problems.

In [14]:
# initialize the Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# fit the model to the training data
rf.fit(X_train, y_train)
# predict class
y_pred = rf.predict(X_test)

# evaluate accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
# detailed classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.7570093457943925
              precision    recall  f1-score   support

           0       0.75      0.85      0.80       122
           1       0.76      0.63      0.69        92

    accuracy                           0.76       214
   macro avg       0.76      0.74      0.75       214
weighted avg       0.76      0.76      0.75       214



> > ### Deep Learning:
> > 
> > Deep learning is a subset of machine learning that utilizes artificial neural networks with multiple layers—hence the term "deep"—to model and understand complex patterns in data. It is known for its effectiveness in tasks that deal with unstructured data like images, text, and audio.
> 
> Pros:
> 1. Flexibility in handling unstructured data:** Deep learning models excel at processing data with high dimensionality and complexity, such as images, sound waves, and text.
> 2. High accuracy: They can achieve high levels of accuracy in various applications, including image recognition, natural language processing, and speech recognition, given enough data.
> 3. Automatic feature extraction: These models can automatically learn the representations needed for feature detection or classification, eliminating the need for manual feature engineering.
> 
> Cons:
> 1. Large data requirement: Deep learning models typically require vast amounts of labeled data to train effectively.
> 2. Computational intensity: The training process for deep learning models is resource-intensive, often necessitating the use of GPUs or even distributed computing environments.
> 3. Opacity: Often considered as "black boxes," deep learning models can be challenging to interpret, making it hard to understand the exact reasons behind their decisions or predictions.

In [15]:
#### create a custom dataset class that inherits torch.utils.data.Dataset
# create TabularDataset class in order to use DataLoader
class TabularDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels.values, dtype=torch.float32)

    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, index):
        return self.features[index], self.labels[index]

# create PyTorch Dataset objects
train_dataset = TabularDataset(X_train, y_train)
test_dataset = TabularDataset(X_test, y_test)

# create DataLoaders
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

In [16]:
#### define the model
class ClassificationModel(nn.Module):
    def __init__(self, num_features, num_classes=1): # default num_classes to 1 for binary classification
        super().__init__()
        self.layer1 = nn.Linear(num_features, 64)
        self.layer2 = nn.Linear(64, 64)
        self.output_layer = nn.Linear(64, num_classes)
        
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = torch.sigmoid(self.output_layer(x))
        return x

# initialize the model
model = ClassificationModel(num_features=X_train.shape[1])

# compile the model
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  

In [17]:
#### train the model

# if a GPU is available, use it, else use a CPU
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
# transfer the model to the device
model.to(device)
# set the whole process iteration number
num_epochs = 2000

for epoch in range(num_epochs):
    model.train() # set the model to training mode
    total_loss = 0    

    for inputs, labels in train_loader:
        # transfer data to GPU if available
        inputs, labels = inputs.to(device), labels.to(device)
        # zero the gradients for every batch
        optimizer.zero_grad()
        # make predictions for this batch
        outputs = model(inputs)
        # compute the loss and its gradients
        loss = criterion(outputs, labels.unsqueeze(1))
        loss.backward()
        # update the total loss
        total_loss += loss.item()
        # adjust learning weights
        optimizer.step()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")


Epoch 1/2000, Loss: 0.6853911578655243
Epoch 2/2000, Loss: 0.6458273157477379
Epoch 3/2000, Loss: 0.607136495411396
Epoch 4/2000, Loss: 0.5623162612318993
Epoch 5/2000, Loss: 0.5118650570511818
Epoch 6/2000, Loss: 0.47359927371144295
Epoch 7/2000, Loss: 0.4439462535083294
Epoch 8/2000, Loss: 0.4276914708316326
Epoch 9/2000, Loss: 0.41372331976890564
Epoch 10/2000, Loss: 0.4093751013278961
Epoch 11/2000, Loss: 0.39838262647390366
Epoch 12/2000, Loss: 0.3942129500210285
Epoch 13/2000, Loss: 0.3845435343682766
Epoch 14/2000, Loss: 0.3811558820307255
Epoch 15/2000, Loss: 0.3765643909573555
Epoch 16/2000, Loss: 0.3747381716966629
Epoch 17/2000, Loss: 0.3706602305173874
Epoch 18/2000, Loss: 0.3672422170639038
Epoch 19/2000, Loss: 0.3624321147799492
Epoch 20/2000, Loss: 0.36104701086878777
Epoch 21/2000, Loss: 0.35900741815567017
Epoch 22/2000, Loss: 0.3551968168467283
Epoch 23/2000, Loss: 0.3537999428808689
Epoch 24/2000, Loss: 0.348607137799263
Epoch 25/2000, Loss: 0.345942135900259
Epoch 2