# Lesson 9 Class Exercise: Titanic Survival Prediction

## Data Plumbing and Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder

# Read the Titanic passenger table from the data folder (path is relative to the notebook).
titanic_csv = '../data/titanic.csv'
df = pd.read_csv(titanic_csv)

# Show the first five rows as plain text to confirm the load worked.
preview = df.head()
print(preview)

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


### Question 1: What is the mode of the 'Embarked' and 'Pclass' columns? Fill the missing values in place with each column's mode.

In [2]:
# .mode() returns a Series of the most frequent value(s); keep the first one
mode_embarked = df['Embarked'].mode().iloc[0]
mode_pclass = df['Pclass'].mode().iloc[0]

# Replace the missing entries of each column with that column's mode
for column, fill_value in (('Embarked', mode_embarked), ('Pclass', mode_pclass)):
    df[column] = df[column].fillna(fill_value)

# Report the values that were used for filling
print(f'Mode of Embarked is {mode_embarked}')
print(f'Mode of Pclass is {mode_pclass}')



Mode of Embarked is S
Mode of Pclass is 3


### Question 2: What is the mode of the 'Age' column? Fill the missing values in place with the mode.

In [3]:
# Most frequent age; .mode() can return several values, so take the first
mode_age = df['Age'].mode().iloc[0]

# Substitute that value wherever Age is missing
df['Age'] = df['Age'].fillna(value=mode_age)

# Report the value used for filling
print(f'Mode of Age is {mode_age}')

Mode of Age is 24.0


### Question 3: Drop the following columns: PassengerId, Name, Ticket, and Cabin

In [4]:
# Columns the exercise asks to remove before modelling
col_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# `columns=` already says the labels are column names, so the extra `axis=1`
# that used to be passed alongside it was redundant and has been removed.
df = df.drop(columns=col_drop)

# check: the four columns above should no longer appear
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


### Question 4: Encode Binary Categorical Feature ('Sex')

In [5]:
# One-hot encoder producing a dense array; with handle_unknown='ignore',
# a category not seen during fit is encoded as all zeros instead of raising.
onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Single-column frame holding the feature to encode (keeps df's index)
df_processed = df[['Sex']].copy()

# Learn the categories of 'Sex' and get the generated column names
onehot.fit(df_processed)
col_names = onehot.get_feature_names_out(df_processed.columns)

# Build the encoded frame on df's own index. pd.concat(axis=1) aligns rows by
# index label, so without index=df.index the encoded frame would get a fresh
# 0..n-1 index and rows would be misaligned (NaN-padded) whenever df's index
# is not the default range, e.g. after filtering or shuffling.
df_processed_onehot = pd.DataFrame(
    onehot.transform(df_processed),
    columns=col_names,
    index=df.index,
)
df = pd.concat([df, df_processed_onehot], axis=1)

# The original text column is now represented by the encoded columns
df = df.drop(columns='Sex')

# check
df.head()


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male
0,0,3,22.0,1,0,7.25,S,0.0,1.0
1,1,1,38.0,1,0,71.2833,C,1.0,0.0
2,1,3,26.0,0,0,7.925,S,1.0,0.0
3,1,1,35.0,1,0,53.1,S,1.0,0.0
4,0,3,35.0,0,0,8.05,S,0.0,1.0


### Question 5: Separate the data into features ($X$) and the target variable ($y$) (Survived)

In [6]:
# Name of the column the models will learn to predict
target_col = 'Survived'

# y is the target column; X is every remaining column of df
y = df[target_col]
X = df.drop(columns=[target_col])

# Preview both pieces (each print emits a heading line followed by the head)
print("Features (X) Head:", X.head(), sep="\n")
print("\nTarget (y) Head:", y.head(), sep="\n")


Features (X) Head:
   Pclass   Age  SibSp  Parch     Fare Embarked  Sex_female  Sex_male
0       3  22.0      1      0   7.2500        S         0.0       1.0
1       1  38.0      1      0  71.2833        C         1.0       0.0
2       3  26.0      0      0   7.9250        S         1.0       0.0
3       1  35.0      1      0  53.1000        S         1.0       0.0
4       3  35.0      0      0   8.0500        S         0.0       1.0

Target (y) Head:
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


### Question 6: Split the data into training and testing sets ($80\%$ train, $20\%$ test) using a fixed random_state=42

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_options = {'test_size': 0.20, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each partition
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (712, 8)
X_test shape: (179, 8)
y_train shape: (712,)
y_test shape: (179,)


### Question 7: Apply One-Hot Encoding to the 'Pclass' and 'Embarked' columns, as required

In [8]:
# Copy the two categorical columns to encode (the copy keeps df's index)
df_processed2 = df[['Pclass', 'Embarked']].copy()

# Re-fit the shared `onehot` encoder on these two columns; after this line it
# holds the Pclass/Embarked categories, not whatever it was fitted on before.
onehot.fit(df_processed2)
col_names = onehot.get_feature_names_out(df_processed2.columns)

# Build the encoded frame on df's own index. pd.concat(axis=1) aligns rows by
# index label, so without index=df.index the encoded frame would get a fresh
# 0..n-1 index and rows would be misaligned (NaN-padded) whenever df's index
# is not the default range.
df_processed2_onehot = pd.DataFrame(
    onehot.transform(df_processed2),
    columns=col_names,
    index=df.index,
)

# Append the encoded columns to df
df = pd.concat([df, df_processed2_onehot], axis=1)

# The original columns are now represented by the encoded ones
df = df.drop(columns=['Pclass', 'Embarked'])
df.head()


Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1,35.0,1,0,53.1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


### Question 8: Create a new feature FamilySize from the existing SibSp and Parch

In [9]:
# FamilySize counts the passenger plus their relatives aboard:
# siblings/spouses (SibSp) + parents/children (Parch) + 1 for the passenger
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)

# Show the inputs next to the derived column to verify the arithmetic
family_cols = ['SibSp', 'Parch', 'FamilySize']
print(df[family_cols].head())

   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1


### Question 9: Create a binary feature `IsAlone` (1 if alone, 0 otherwise) based on the new `FamilySize` feature.

In [10]:
# A passenger is alone exactly when their family size is 1 (just themselves)
travels_alone = df['FamilySize'].eq(1)

# Store the flag as integers: 1 when alone, 0 otherwise
df['IsAlone'] = np.where(travels_alone, 1, 0)

# Show FamilySize beside the flag to verify the mapping
alone_cols = ['FamilySize', 'IsAlone']
print(df[alone_cols].head())

   FamilySize  IsAlone
0           2        0
1           2        0
2           1        1
3           2        0
4           1        1


### Question 10: Initialize and train the three specified `sklearn` models: Logistic Regression, K-Nearest Neighbors (K=5), and Random Forest Classifier.

In [11]:
# Redo the split on the updated df so that X includes the columns added
# since the first split.

# Target is 'Survived'; features are every other column
target_col = 'Survived'
y = df[target_col]
X = df.drop(columns=[target_col])

# Same 80/20 split with the same seed as before
split_options = {'test_size': 0.20, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [12]:
# --- Build the three classifiers ---

# Logistic Regression; the iteration cap is raised to 1000 to help the solver converge
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

# K-Nearest Neighbors with K = 5
# NOTE(review): no feature scaling is visible in this notebook; KNN is
# distance-based, so large-valued columns (e.g. Fare) likely dominate — confirm
# whether scaling was intended.
knn_model = KNeighborsClassifier(n_neighbors=5)

# Random Forest with a fixed seed so repeated runs build the same trees
rand_forest_model = RandomForestClassifier(random_state=42)


# --- Fit every classifier on the training split ---

for estimator in (log_reg_model, knn_model):
    estimator.fit(X_train, y_train)

# Fitted last as the cell's final expression so the notebook displays the model
rand_forest_model.fit(X_train, y_train)


### Question 11: Write the code structure to make predictions on the test set for all three models and calculate the Accuracy, Precision, and Recall scores for each.

In [14]:
# Test-set predictions from each fitted model
y_pred_log_reg = log_reg_model.predict(X_test)
y_pred_knn = knn_model.predict(X_test)
y_pred_rand_forest = rand_forest_model.predict(X_test)

# Predictions keyed by the model name shown in the results table
predictions = {
    'Logistic_Regression': y_pred_log_reg,
    'K_Nearest_Neighbors': y_pred_knn,
    'Random_Forest_Classifier': y_pred_rand_forest,
}

# Column label -> scoring function; insertion order sets the column order
metric_fns = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
}

# One row per model: its name followed by each metric computed against y_test
results = [
    {'Model': name, **{label: score(y_test, y_pred) for label, score in metric_fns.items()}}
    for name, y_pred in predictions.items()
]

# Tabulate with the model name as the row index for side-by-side comparison
results_df = pd.DataFrame(results).set_index('Model')

print("\n--- Model Performance Comparison ---")
print(results_df)


--- Model Performance Comparison ---
                          Accuracy  Precision    Recall
Model                                                  
Logistic_Regression       0.793296   0.776119  0.702703
K_Nearest_Neighbors       0.731844   0.724138  0.567568
Random_Forest_Classifier  0.826816   0.786667  0.797297
