# Titanic Preprocessing Upgrades + ML (Improved Accuracy)

This notebook applies improved preprocessing to boost accuracy:
- Group-wise imputation (Age/Fare)
- Feature engineering (family_size, is_alone, Deck, log(Fare))
- One-hot encoding
- Scaling for Logistic Regression
- Class-weighted Logistic Regression & Random Forest


## Load Libraries & CSVs

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

train_df = pd.read_csv('train.csv')
test_df  = pd.read_csv('test.csv')

y = train_df['Survived']
train_df = train_df.drop(columns=['Survived'])

train_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Combine for Consistent Preprocessing

In [3]:
combined_pd = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
combined_pd.isna().sum()

PassengerId       0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

## Feature Engineering (Title, Family, Deck, Fare log)

In [4]:
# Extract raw title
combined_pd['raw_title'] = combined_pd['Name'].str.split(',').str[1].str.split('.').str[0].str.strip()

title_mapping = {
    'Capt': 'Military', 'Col': 'Military', 'Major': 'Military',
    'Don': 'Nobility', 'Dona': 'Nobility', 'Jonkheer': 'Nobility', 'Lady': 'Nobility', 'Sir': 'Nobility', 'Master': 'Nobility', 'the Countess': 'Nobility',
    'Miss': 'Ms', 'Mlle': 'Ms', 'Ms': 'Ms',
    'Mrs': 'Mrs', 'Mme': 'Mrs'
}

combined_pd['title'] = combined_pd['raw_title'].map(lambda t: title_mapping.get(t, t))

# Family features
combined_pd['family_size'] = combined_pd['SibSp'] + combined_pd['Parch'] + 1
combined_pd['is_alone'] = (combined_pd['family_size'] == 1).astype(int)

# Deck from Cabin
combined_pd['Deck'] = combined_pd['Cabin'].str[0].fillna('U')

# Log Fare
combined_pd['Fare'] = combined_pd['Fare'].fillna(combined_pd['Fare'].median())
combined_pd['Fare_log'] = np.log1p(combined_pd['Fare'])

combined_pd[['Name','raw_title','title','family_size','is_alone','Deck','Fare_log']].head()

Unnamed: 0,Name,raw_title,title,family_size,is_alone,Deck,Fare_log
0,"Braund, Mr. Owen Harris",Mr,Mr,2,0,U,2.110213
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs,Mrs,2,0,C,4.280593
2,"Heikkinen, Miss. Laina",Miss,Ms,1,1,U,2.188856
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs,Mrs,2,0,C,3.990834
4,"Allen, Mr. William Henry",Mr,Mr,1,1,U,2.202765


## Group-wise Imputation for Age & Fare

In [5]:
combined_pd['Age'] = combined_pd.groupby(['title','Sex','Pclass'])['Age'] \
    .transform(lambda x: x.fillna(x.median()))

combined_pd['Age'] = combined_pd['Age'].fillna(combined_pd['Age'].median())

combined_pd['Fare'] = combined_pd.groupby('Pclass')['Fare'] \
    .transform(lambda x: x.fillna(x.median()))

combined_pd[['Age','Fare']].isna().sum()

Age     0
Fare    0
dtype: int64

## One-Hot Encoding

In [6]:
one_hot = pd.get_dummies(
    combined_pd,
    columns=['Sex','Embarked','title','Deck','Pclass'],
    drop_first=True,
    dtype=int
)

one_hot.shape

(1309, 31)

## Split Back to Train/Test

In [7]:
X_all = one_hot.drop(columns=['PassengerId','Name','Ticket','Cabin','raw_title'], errors='ignore')

X_train = X_all.iloc[:len(y), :].copy()
X_test  = X_all.iloc[len(y):, :].copy()

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, test_size=0.2, random_state=42, stratify=y)

X_tr.shape, X_val.shape

((712, 26), (179, 26))

## Scale Numeric Features (for Logistic Regression)

In [8]:
num_cols = ['Age','Fare_log','family_size','SibSp','Parch','Fare']

scaler = StandardScaler()
scaler.fit(X_tr[num_cols])

X_tr[num_cols]  = scaler.transform(X_tr[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

X_tr[num_cols].head()

Unnamed: 0,Age,Fare_log,family_size,SibSp,Parch,Fare
692,-0.250335,1.124592,-0.556339,-0.465084,-0.466183,0.513812
481,0.044894,-3.014278,-0.556339,-0.465084,-0.466183,-0.662563
527,0.893679,2.508198,-0.556339,-0.465084,-0.466183,3.955399
855,-0.840794,-0.627019,0.073412,-0.465084,0.727782,-0.467874
801,0.118701,0.361872,0.703162,0.478335,0.727782,-0.115977


## Train Models

In [9]:
log_reg = LogisticRegression(max_iter=500, class_weight='balanced')
log_reg.fit(X_tr, y_tr)
y_pred_lr = log_reg.predict(X_val)

rf = RandomForestClassifier(n_estimators=400, random_state=42, class_weight='balanced_subsample')
rf.fit(X_tr, y_tr)
y_pred_rf = rf.predict(X_val)

print('Logistic Regression Accuracy:', accuracy_score(y_val, y_pred_lr))
print('Random Forest Accuracy:', accuracy_score(y_val, y_pred_rf))

print('LR Confusion Matrix:\n', confusion_matrix(y_val, y_pred_lr))
print('RF Confusion Matrix:\n', confusion_matrix(y_val, y_pred_rf))

Logistic Regression Accuracy: 0.8212290502793296
Random Forest Accuracy: 0.7932960893854749
LR Confusion Matrix:
 [[92 18]
 [14 55]]
RF Confusion Matrix:
 [[94 16]
 [21 48]]


## (Optional) Predict on test.csv

In [10]:
test_preds_rf = rf.predict(X_test)
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': test_preds_rf})
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
