# Step 1 Data Processing
Apply appropriate techniques to preprocess data (e.g., normalization, standardization)

In [1]:
## Data Processing libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

passenger = pd.read_csv("titanic_augmented.csv")

passenger.shape

(891, 26)

In [2]:
passenger.tail(6)
passenger.iloc[:, 0:13]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,name_length
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,23
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,22
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,44
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,21
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,28
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,40
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,21


In [64]:
passenger.iloc[:, 13:25]

Unnamed: 0,title,title_group,family_size,is_alone,ticket_group_size,fare_per_person,age_fare_ratio,cabin_deck,cabin_room_number,booking_reference,service_id,cabin_score
0,Mr,Mr,2,0,1,7.250,3.034483,Unknown,,92490,221958,6.134152
1,Mrs,Mrs,2,0,1,71.283,0.533084,C,85.0,15655423,771155,4.182430
2,Miss,Miss,1,1,1,7.925,3.280757,Unknown,,90218500,231932,9.327285
3,Mrs,Mrs,2,0,2,26.550,0.659134,C,123.0,2493079,465838,8.660639
4,Mr,Mr,1,1,1,8.050,4.347826,Unknown,,59517148,359178,0.452187
...,...,...,...,...,...,...,...,...,...,...,...,...
886,Rev,Other,1,1,1,13.000,2.076923,Unknown,,83757278,538661,4.308875
887,Miss,Miss,1,1,1,30.000,0.633333,B,42.0,91664020,498929,2.487143
888,Miss,Miss,4,0,2,11.725,0.000000,Unknown,,55618889,680466,6.171450
889,Mr,Mr,1,1,1,30.000,0.866667,C,148.0,94737372,673695,7.067772


In [3]:
passenger.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,is_alone,ticket_group_size,fare_per_person,age_fare_ratio,cabin_deck,cabin_room_number,booking_reference,service_id,cabin_score,name_word_count
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891,200.0,891.0,891.0,891.0,891.0
unique,,,,891,2,,,,681.0,,...,,,,,9,,,,,
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,...,,,,,Unknown,,,,,
freq,,,,1,577,,,,7.0,,...,,,,,687,,,,,
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,...,0.602694,1.787879,17.789001,1.572536,,50.49,51081180.0,536369.988777,4.956762,4.06734
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,...,0.489615,1.361142,21.218127,1.661773,,35.39497,28381740.0,261551.630299,2.915177,1.168866
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,...,0.0,1.0,0.0,0.0,,2.0,92490.0,102869.0,0.04632,3.0
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,...,0.0,1.0,7.7625,0.116026,,22.0,28319620.0,299638.0,2.325861,3.0
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,...,1.0,1.0,8.85,1.175795,,43.0,51288530.0,535564.0,4.954913,4.0
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,...,1.0,2.0,24.288,2.543045,,77.25,74931310.0,757663.0,7.479345,4.0


In [4]:
passenger.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PassengerId        891 non-null    int64  
 1   Survived           891 non-null    int64  
 2   Pclass             891 non-null    int64  
 3   Name               891 non-null    object 
 4   Sex                891 non-null    object 
 5   Age                714 non-null    float64
 6   SibSp              891 non-null    int64  
 7   Parch              891 non-null    int64  
 8   Ticket             891 non-null    object 
 9   Fare               891 non-null    float64
 10  Cabin              204 non-null    object 
 11  Embarked           889 non-null    object 
 12  name_length        891 non-null    int64  
 13  title              891 non-null    object 
 14  title_group        891 non-null    object 
 15  family_size        891 non-null    int64  
 16  is_alone           891 non

In [5]:
# Drop Irrelevant, Weak, Redundant Features
drop_cols = [
    "PassengerId",    # identifier
    "Name",           # title already extracted
    "Ticket",         # too high-cardinality
    "Cabin",           # raw cabin string; usually replaced by deck
    "name_length",                  # Too many missing data and it is redundant to the decomposed ones
    "name_word_count",      # Weak predictor
    "title",           # Keep the title_group instead
    "cabin_room_number",
    "booking_reference",
    "service_id"
]

cols_dropping = [col for col in drop_cols if col in passenger.columns]
passenger_clean = passenger.drop(columns=cols_dropping)


# Separate Predictors and Response 
target = "Survived"
X = passenger_clean.drop(columns=[target])
y = passenger_clean[target]

# separate numeric and categories for future pipelining
num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

if 'Pclass' in num_features:
    num_features.remove('Pclass')
    cat_features.append('Pclass')

# Pipelining
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, num_features),
        ('cat', categorical_pipeline, cat_features)
    ]
)

preprocessing_only = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Splitting train/test 
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y   # preserves class ratio
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)
print("Survival rate in full dataset:", y.mean())
print("Survival rate in training set:", y_train.mean())
print("Survival rate in test set:", y_test.mean())

# CV re-sampling
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

#StratifiedKFold(n_splits=10)


# example placeholder for a model
logreg_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),   # from earlier step
    ('model', LogisticRegression(max_iter=1000))
])

# useful in the future
# scores = cross_val_score(
#     logreg_pipeline,
#     X_train,
#     y_train,
#     cv=cv,
#     scoring='accuracy'
# )

# print("CV Accuracy:", scores)
# print("Mean Accuracy:", scores.mean())

##Check for missing value]
# X_check = preprocessing_only.fit_transform(X_train)
# X_check.shape
# np.isnan(X_check).sum()

Training set size: (668, 15)
Test set size: (223, 15)
Survival rate in full dataset: 0.3838383838383838
Survival rate in training set: 0.38323353293413176
Survival rate in test set: 0.38565022421524664


# Step 2 Data Splitting and Resampling
Split the dataset into training and test sets (25% test) and use appropriate resampling techniques (e.g., k-fold cross-validation) to ensure robust model evaluation.

# Step 3 Model Building
## LDA

### Linear Discriminant Analysis (LDA)

Linear Discriminant Analysis (LDA) is a supervised classification method that models class-specific means and covariance matrices to identify linear combinations of features that best separate different classes. In this project, LDA serves as a representative linear classifier and provides a theoretical comparison with Logistic Regression.

All preprocessing procedures, including missing value imputation, feature scaling, and categorical encoding, are integrated into a unified pipeline to ensure reproducibility and prevent data leakage.


In [6]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# --- LDA pipeline ---
lda_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),   
    ("model", LinearDiscriminantAnalysis())
])

# --- 5-fold CV on training set ---
lda_cv_scores = cross_val_score(
    lda_pipeline,
    X_train,
    y_train,
    cv=cv,                 
    scoring="accuracy"
)

print("LDA CV Accuracy (folds):", lda_cv_scores)
print("LDA CV Mean Accuracy:", lda_cv_scores.mean())
print("LDA CV Std:", lda_cv_scores.std())

# --- Fit on full training set ---
lda_pipeline.fit(X_train, y_train)

# --- Evaluate on test set ---
y_pred_lda = lda_pipeline.predict(X_test)

print("\nLDA Test Accuracy:", accuracy_score(y_test, y_pred_lda))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_lda))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lda))


LDA CV Accuracy (folds): [0.85820896 0.81343284 0.82835821 0.84210526 0.80451128]
LDA CV Mean Accuracy: 0.8293233082706767
LDA CV Std: 0.019322703426083988

LDA Test Accuracy: 0.8340807174887892

Confusion Matrix:
 [[120  17]
 [ 20  66]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87       137
           1       0.80      0.77      0.78        86

    accuracy                           0.83       223
   macro avg       0.83      0.82      0.82       223
weighted avg       0.83      0.83      0.83       223



### Hyperparameter Tuning

GridSearchCV with stratified 5-fold cross-validation is applied to optimize LDA hyperparameters. Different solvers and shrinkage parameters are evaluated to improve numerical stability and classification performance. The best-performing configuration is selected based on cross-validation accuracy.


In [None]:
#Model tuning
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline

lda_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", LinearDiscriminantAnalysis())
])

param_grid_lda = [
    {"model__solver": ["svd"]},  
    {"model__solver": ["lsqr"],  
     "model__shrinkage": [None, "auto", 0.0, 0.1, 0.3, 0.5, 0.7, 0.9]}
]

grid_lda = GridSearchCV(
    lda_pipeline,
    param_grid_lda,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,
    error_score="raise"  
)

grid_lda.fit(X_train, y_train)

print("Best LDA params:", grid_lda.best_params_)
print("Best CV accuracy:", grid_lda.best_score_)

### Model Evaluation

The optimized LDA model is evaluated on an independent test set. Classification accuracy, confusion matrix, and precision-recall metrics are reported to assess model generalization performance and class-wise prediction behavior.


In [None]:
#Test set accuracy + confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

best_lda = grid_lda.best_estimator_

y_pred_lda = best_lda.predict(X_test)

print("LDA Test Accuracy:", accuracy_score(y_test, y_pred_lda))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lda))
print("Report:\n", classification_report(y_test, y_pred_lda))


## Random Forest

## Logistic Regression