### Import Dependencies

In [189]:
# Import Dependencies
import numpy as np
import pandas as pd
import os

### Load Dataset

In [190]:
# Load Dataset
train_df = pd.read_csv("./titanic_data/train.csv")
test_df = pd.read_csv("./titanic_data/test.csv")

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [191]:
test_df.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


### Preprocess Data

In [192]:
def preprocess(df):
    df = df.copy()
    
    def ticket_number(x):
        return x.split(" ")[-1]
        
    def ticket_str(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    
    def family_size(x, y):
        return x + y
        
    def format_fare(x):
        return round(x, 2)


    df["TicketNum"] = df["Ticket"].apply(ticket_number)
    df["TicketStr"] = df["Ticket"].apply(ticket_str)
    df["FamilySize"] = family_size(df["SibSp"], df["Parch"])
    df["Fare"] = df["Fare"].apply(format_fare)

    return df

preprocessed_train_df = preprocess(train_df)
preprocessed_test_df = preprocess(test_df)

preprocessed_train_df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,TicketNum,TicketStr,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,21171,A/5,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C,17599,PC,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S,3101282,STON/O2.,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,113803,NONE,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,373450,NONE,0
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.46,,Q,330877,NONE,0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.86,E46,S,17463,NONE,0
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.07,,S,349909,NONE,4
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.13,,S,347742,NONE,2
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.07,,C,237736,NONE,1


In [193]:
preprocessed_test_df.head(20)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,TicketNum,TicketStr,FamilySize
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.83,,Q,330911,NONE,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,363272,NONE,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.69,,Q,240276,NONE,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.66,,S,315154,NONE,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.29,,S,3101298,NONE,2
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.22,,S,7538,NONE,0
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.63,,Q,330972,NONE,0
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S,248738,NONE,2
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.23,,C,2657,NONE,0
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S,48871,A/4,2


In [194]:
# Set Categorical Variables
import pandas as pd

# Assuming you have a dataframe named data_processed_Train and data_processed_Test

# Convert 'Survived' column to categorical
preprocessed_train_df['Survived'] = pd.Categorical(preprocessed_train_df['Survived'])

# Convert 'Pclass' columns to categorical for both Train and Test data
preprocessed_train_df['Pclass'] = pd.Categorical(preprocessed_train_df['Pclass'])
preprocessed_test_df['Pclass'] = pd.Categorical(preprocessed_test_df['Pclass'])

In [195]:
# Create Training Partition
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming you have a dataframe named data_processed_Train

# Create training partition
train_index = train_test_split(preprocessed_train_df['Survived'], test_size=0.3, random_state=42)

# Create 'train' column and set it to False by default
preprocessed_train_df['train'] = False

# Set 'train' column to True for rows in the training partition
preprocessed_train_df.loc[train_index[0], 'train'] = True

# Create data sets
titanic_train = preprocessed_train_df[preprocessed_train_df['train'] == True].drop(columns=['train'])
titanic_test = preprocessed_train_df[preprocessed_train_df['train'] == False].drop(columns=['train'])


### Forward and Backward Selection

In [200]:
# Forward Step Model
import statsmodels.api as sm
import pandas as pd

# Assuming titanic_train is a pandas DataFrame with the necessary columns

# Define the model formula
formula = 'Survived ~ Pclass + Sex + Age + Fare + SibSp + Parch + Embarked'

# Fit the logistic regression model with stepwise variable selection
model = sm.GLM.from_formula(formula, family=sm.families.Binomial(), data=titanic_train).fit(method='bfgs')

# Display the summary of the fitted model
print(model.summary())


                       Generalized Linear Model Regression Results                        
Dep. Variable:     ['Survived[0]', 'Survived[1]']   No. Observations:                    2
Model:                                        GLM   Df Residuals:                        0
Model Family:                            Binomial   Df Model:                            1
Link Function:                              Logit   Scale:                          1.0000
Method:                                      bfgs   Log-Likelihood:            -2.9307e-07
Date:                            Thu, 30 Nov 2023   Deviance:                   5.8614e-07
Time:                                    17:44:33   Pearson chi2:                 2.93e-07
No. Iterations:                                 0   Pseudo R-squ. (CS):             0.7500
Covariance Type:                        nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
--------



In [None]:
# Backward Step Model

### Lasso and Ridge

In [None]:
# Lasso

In [None]:
# Ridge

### GLM, KNN, Naive Bayes Model with LOOCV

In [None]:
# Linear Regression with LOOVC

In [None]:
# KNN with LOOVC

In [None]:
# Naive bayes with LOOVC

### Spread Prediction on Train

In [None]:
# Spread rediction into test
# FS
# BS
# Lasso
# Ridge
# GLM
# KNN
# NB

### Train Prediction Summary

In [None]:
# Accuracy Summary

In [None]:
# Sensitivity

In [None]:
# Specificity

### Spread Prediction on Test

In [None]:
# Spread prediction into test
# FS
# BS
# Lasso
# Ridge
# GLM
# KNN
# NB

### Test Prediction Summary

In [None]:
# Accuracy Summary

In [None]:
# Sensitivity

In [None]:
# Specificity

### Spread Prediction into Kaggle Test Set

In [None]:
# Spread Prediction on Final Test (FS)

# Spread Prediction on Final Test (BS)

# Spread Prediction on Final Test (Ridge)

# Kaggle Submissions