In [None]:
# Cell 0: Install packages

import sys

# Equivalent of `python -m pip install <package>`
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install scikit-learn

In [1]:
# Cell 1: Import statements and warning suppression

import pandas as pd

import numpy as np

import pickle

from sklearn.feature_selection import SelectKBest, mutual_info_classif

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

from sklearn.preprocessing import StandardScaler

from sklearn.impute import KNNImputer

In [2]:
# Cell 2: Function for data preprocessing

def preprocess_data(df):

    # Fill missing values with KNNImputer
    knn_imputer = KNNImputer(n_neighbors=5)
    df[['Age']] = knn_imputer.fit_transform(df[['Age']])
    
    # Calculate the mode (most frequent value) of the 'Embarked' column
    mode_value = df['Embarked'].mode()[0]
    
    # Use fillna and assign the result back to the 'Embarked' column
    df['Embarked'] = df['Embarked'].fillna(mode_value)

    # Encode categorical variables
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})  
    
    return df

In [3]:
# Cell 3: Reading and preprocessing the dataset

# Read the dataset
url = 'https://raw.githubusercontent.com/kelly-olsson/WouldYouSurviveTheTitanic/main/dataset/train.csv'
df = pd.read_csv(url)

# Preprocess the data
df = preprocess_data(df)

In [4]:
# Cell 4: Setting up predictor and target variables

# Separate into x and y values
predictorVariables = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = df[predictorVariables]
y = df['Survived']

In [9]:
# Cell 5: Feature selection code (Run only if selecting features)


####### Comment this code if running entire notebook (only run while selecting best features) ########

# Scale the data when searching for best features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=predictorVariables)

# Use mutual_info_classif for feature selection
test = SelectKBest(score_func=mutual_info_classif, k=4)
mi_scores = test.fit(X, y)
np.set_printoptions(precision=3)

# Best predictor variables were consistently ['Pclass', 'Sex', 'Age', 'Fare']
print("\\nPredictor variables: " + str(predictorVariables))
print("Predictor Mutual Information Scores: " + str(mi_scores.scores_))

# Select significant variables using the get_support() function
cols = mi_scores.get_support(indices=True)
print(cols)
features = X.columns[cols]
print(features.values)

####### End Feature Selection Section ########

\nPredictor variables: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
Predictor Mutual Information Scores: [0.06  0.147 0.03  0.02  0.017 0.132 0.014]
[0 1 2 5]
['Pclass' 'Sex' 'Age' 'Fare']


In [10]:
# Cell 6: Feature selection and scaling

# Use consistently best features
features = ['Pclass', 'Sex', 'Age', 'Fare']

# Re-assign X with significant columns only after chi-square test
X = df[features]
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Pickle scaler
pickle.dump(scaler, open('sc_x.pkl', 'wb'))

In [11]:
# Cell 7: Data splitting

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [12]:
# Cell 8: Build logistic regression model and perform cross-validation

# Build logistic regression model
logisticModel = LogisticRegression(fit_intercept=True, solver='liblinear')
grid = GridSearchCV(logisticModel, param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]}, cv=5)
grid.fit(X_train, y_train)

# Report best parameters as evaluated by grid search
print("Best parameters found by grid search: ", grid.best_params_)

# Identify best model
best_model = grid.best_estimator_

# Report best model metrics
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']
for metric in scoring_metrics:
    scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring=metric)
    print(f"{metric.capitalize()} scores across folds: {scores}")
    print(f"Mean {metric.capitalize()}: {scores.mean()}")
    print(f"Standard deviation {metric.capitalize()}: {scores.std()}\n")

# Pickle trained model
with open('model_pkl', 'wb') as files:
    pickle.dump(grid.best_estimator_, files)

Best parameters found by grid search:  {'C': 0.1}
Accuracy scores across folds: [0.776 0.791 0.828 0.752 0.782]
Mean Accuracy: 0.7858713949051734
Standard deviation Accuracy: 0.02489083764691754

Precision scores across folds: [0.756 0.712 0.788 0.702 0.745]
Mean Precision: 0.7405380022870105
Standard deviation Precision: 0.031102684540876918

Recall scores across folds: [0.642 0.792 0.774 0.635 0.673]
Mean Recall: 0.7030478955007258
Standard deviation Recall: 0.06683855493866012

F1 scores across folds: [0.694 0.75  0.781 0.667 0.707]
Mean F1: 0.7197134611420325
Standard deviation F1: 0.040764630801629585



In [13]:
# Cell 9: Load best model and make some predictions and score them

# Load saved model
with open('model_pkl', 'rb') as f:
    loadedModel = pickle.load(f)

y_pred = loadedModel.predict(X_test)
print("***Predictions")
print(y_pred)

# Show confusion matrix and accuracy scores
cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('\nAccuracy:', accuracy)
print("\nConfusion Matrix")
print(cm)
print("Recall: " + str(recall))
print("Precision: " + str(precision))
print("F1-Score: " + str(f1))

***Predictions
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 1 1 0 1
 0 1 0 0 0 1 1 0 0 0 0 0 1 1 1 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0
 0 1 0 1 1 1 1 1 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 0 1 0 1 1 1 1 0 0 1 1 0 0 0 1 1 1 0 0 1 1 0 0 0 0 0 1 0 0 1 0
 1 1 1 0 1 0 1 1 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0
 0]

Accuracy: 0.8026905829596412

Confusion Matrix
Predicted    0   1
Actual            
0          122  22
1           22  57
Recall: 0.7215189873417721
Precision: 0.7215189873417721
F1-Score: 0.7215189873417721


In [14]:
# Cell 10: Load best model and make some predictions and score them

# Create a single prediction.
singleSampleDf = pd.DataFrame(columns=features)

# All men who died in train data
pClass =  3
sex = 0
age = 22
fare = 7.25

passengerData = {'Pclass': pClass, 'Sex': sex, 'Age': age, 'Fare': fare}
singleSampleDf = pd.DataFrame([passengerData])

# Scale the singleSampleDf using the same scaler object
loaded_scalerX = pickle.load(open('sc_x.pkl', 'rb'))
singleSampleDf_scaled = loaded_scalerX.transform(singleSampleDf)

singlePrediction = loadedModel.predict(singleSampleDf_scaled)
print("Single prediction: " + str(singlePrediction))

Single prediction: [0]
