# Titanic: Enhanced Modeling & Feature Engineering

This notebook adds:
- Rich feature engineering (titles, decks, family features, age/fare buckets, traveling alone, etc.).
- Multiple model comparison with `GridSearchCV` and stratified cross-validation.
- A leaderboard of models and tuned hyperparameters.
- Optional `submission.csv` if `test.csv` is present.

> Place `train.csv` (and optionally `test.csv`) next to this notebook or in `/mnt/data/`.


In [38]:
# Core setup
import os, re, math, warnings, textwrap
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier

import joblib
warnings.filterwarnings('ignore')

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


In [39]:

train_df = pd.read_csv('C:/Datasets/train.csv')
test_df = pd.read_csv('C:/Datasets/train.csv') 

In [40]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [41]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [42]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [43]:
train_df['Age'].median()

np.float64(28.0)

In [44]:
train_df["Survived"].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [45]:
train_df["Embarked"].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [46]:
train_df["Sex"].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [47]:
train_df["Pclass"].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

## Feature Engineering

We create:
- `Relatives = SibSp + Parch` and `IsAlone`.
- `FamilySize = Relatives + 1` (including the passenger).
- Extract `Title` from `Name` and group rare titles.
- `Deck` from the first letter of `Cabin`.
- `AgeBucket` via fixed bins; `FareBucket` via quantiles.
- Keep `Pclass`, `Sex`, `Embarked` as categorical features.


In [61]:
# Feature engineering helpers
def extract_title(name: str) -> str:
    if pd.isna(name):
        return "Unknown"
    m = re.search(r",\s*([^\.]+)\.", name)
    if m:
        return m.group(1).strip()
    return "Unknown"

def map_title(title: str) -> str:
    # Group rare titles
    title = title.strip()
    common = {"Mr","Mrs","Miss","Master"}
    if title in common:
        return title
    # Map similar/rare titles
    mapping = {
        "Mlle":"Miss","Ms":"Miss","Mme":"Mrs",
        "Lady":"Royalty","Countess":"Royalty","Sir":"Royalty","Don":"Royalty","Dona":"Royalty","Jonkheer":"Royalty",
        "Dr":"Officer","Rev":"Officer","Col":"Officer","Major":"Officer","Capt":"Officer"
    }
    return mapping.get(title, "Rare")

def first_letter_or_unknown(x):
    if isinstance(x, str) and len(x)>0:
        return x[0].upper()
    return "Unknown"

def build_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # Relatives & alone
    out["Relatives"] = out["SibSp"].fillna(0) + out["Parch"].fillna(0)
    out["FamilySize"] = out["Relatives"] + 1
    out["IsAlone"] = (out["Relatives"] == 0).astype(int)

    # Title
    out["TitleRaw"] = out["Name"].apply(extract_title)
    out["Title"] = out["TitleRaw"].apply(map_title)

    # Deck from Cabin
    out["Deck"] = out["Cabin"].apply(first_letter_or_unknown)

    # Age bucket (fixed sensible bins)
    # Infant, Child, Teen, YoungAdult, Adult, MidAge, Senior
    age_bins = [0, 2, 12, 18, 30, 45, 60, np.inf]
    age_labels = ["Infant","Child","Teen","YoungAdult","Adult","MidAge","Senior"]
    out["AgeBucket"] = pd.cut(out["Age"], bins=age_bins, labels=age_labels)

    # Fare bucket (quantiles)
    out["FareBucket"] = pd.qcut(out["Fare"], q=4, duplicates="drop")

    # Traveling alone category for emphasis
    out["TravelType"] = np.where(out["IsAlone"]==1, "Alone", "WithFamily")

    # Replace SibSp & Parch with sum if desired (keep originals too for comparison)
    out["SibSpParchSum"] = out["Relatives"]

    # Columns used downstream
    return out

fe_train = build_features(train_df)
fe_test  = build_features(test_df) if test_df is not None else None

fe_train.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Relatives,FamilySize,IsAlone,TitleRaw,Title,Deck,AgeBucket,FareBucket,TravelType,SibSpParchSum
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,1,2,0,Mr,Mr,Unknown,YoungAdult,"(-0.001, 7.91]",WithFamily,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,1,2,0,Mrs,Mrs,C,Adult,"(31.0, 512.329]",WithFamily,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,0,1,1,Miss,Miss,Unknown,YoungAdult,"(7.91, 14.454]",Alone,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,1,2,0,Mrs,Mrs,C,Adult,"(31.0, 512.329]",WithFamily,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,0,1,1,Mr,Mr,Unknown,Adult,"(7.91, 14.454]",Alone,0


In [49]:
corr_matrix = train_df.corr(numeric_only=True)

corr_matrix['Survived'].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

## Preprocessing & Column Transformer
- Numeric features imputed (median) + scaled (for certain models).
- Categorical features imputed (most frequent) + one-hot encoded.


In [62]:
# Define feature sets
target = "Survived"
numeric_features = ["Age","Fare","FamilySize","Relatives","SibSpParchSum"]
categorical_features = [
    "Pclass","Sex","Embarked","Title","Deck","AgeBucket","FareBucket","TravelType"
]

# Preprocessors
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False))  # sparse-safe
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

X = fe_train[numeric_features + categorical_features]
y = fe_train[target]


## Models & Hyperparameter Grids

We compare several classifiers with tuned grids using `GridSearchCV` and 5-fold `StratifiedKFold`.


In [51]:
# Define models and parameter grids
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

pipelines_and_grids = []

# Logistic Regression
pipelines_and_grids.append((
    "LogReg",
    Pipeline([("prep", preprocessor),
              ("clf", LogisticRegression(max_iter=500, random_state=RANDOM_STATE))]),
    {
        "clf__C":[0.1, 0.5, 1.0, 2.0, 5.0],
        "clf__penalty":["l2"],
        "clf__solver":["lbfgs","liblinear"]
    }
))

# SVC
pipelines_and_grids.append((
    "SVC",
    Pipeline([("prep", preprocessor),
              ("clf", SVC(random_state=RANDOM_STATE))]),
    {
        "clf__C":[0.5, 1, 2, 5],
        "clf__kernel":["rbf","linear"],
        "clf__gamma":["scale","auto"]
    }
))

# KNN
pipelines_and_grids.append((
    "KNN",
    Pipeline([("prep", preprocessor),
              ("clf", KNeighborsClassifier())]),
    {
        "clf__n_neighbors":[3,5,7,9,11],
        "clf__weights":["uniform","distance"],
        "clf__p":[1,2]
    }
))



results = []
best_estimators = {}

for name, pipe, grid in pipelines_and_grids:
    print(f"=== {name}: GridSearchCV ===")
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring="accuracy",
        cv=cv,
        n_jobs=-1,
        refit=True,
        verbose=0
    )
    gs.fit(X, y)
    best_score = gs.best_score_
    best_params = gs.best_params_
    results.append((name, best_score, best_params))
    best_estimators[name] = gs.best_estimator_
    print(f"Best CV accuracy: {best_score:.4f}")
    print(f"Best params: {best_params}\n")

leaderboard = pd.DataFrame(results, columns=["Model","CV_Accuracy","Best_Params"]).sort_values("CV_Accuracy", ascending=False).reset_index(drop=True)
leaderboard


=== LogReg: GridSearchCV ===
Best CV accuracy: 0.8294
Best params: {'clf__C': 1.0, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}

=== SVC: GridSearchCV ===
Best CV accuracy: 0.8373
Best params: {'clf__C': 2, 'clf__gamma': 'auto', 'clf__kernel': 'rbf'}

=== KNN: GridSearchCV ===
Best CV accuracy: 0.8440
Best params: {'clf__n_neighbors': 7, 'clf__p': 1, 'clf__weights': 'uniform'}



Unnamed: 0,Model,CV_Accuracy,Best_Params
0,KNN,0.844009,"{'clf__n_neighbors': 7, 'clf__p': 1, 'clf__wei..."
1,SVC,0.837254,"{'clf__C': 2, 'clf__gamma': 'auto', 'clf__kern..."
2,LogReg,0.829402,"{'clf__C': 1.0, 'clf__penalty': 'l2', 'clf__so..."


## Fit Best Model & Evaluate (Train CV)
We refit the top model (already refit by GridSearch) and keep it for export.


In [54]:
# Select the top model
top_row = leaderboard.iloc[0]
top_model_name = top_row["Model"]
top_model = best_estimators[top_model_name]

print("Top model:", top_model_name)
print("Params:", top_model.get_params())



Top model: KNN
Params: {'memory': None, 'steps': [('prep', ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler',
                                                  StandardScaler(with_mean=False))]),
                                 ['Age', 'Fare', 'FamilySize', 'Relatives',
                                  'SibSpParchSum']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['Pclass', 'Sex', 'Embarked', 'Title', 'Deck',
                                  'AgeBucket', 'FareBu

## Optional: Create `submission.csv`
If `test.csv` is available, we will produce a Kaggle-style submission with `PassengerId` and `Survived`.


In [64]:
if test_df is not None:
    X_test = preprocessor.fit_transform(fe_test[numeric_features + categorical_features])
    preds = top_model.predict(X_test)
    submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": preds.astype(int)
    })
    submission_path = "C:/Datasets/artifacts/submission4.csv"
    submission.to_csv(submission_path, index=False)
    print(f"Created submission at {submission_path}")
else:
    print("test.csv not found. Skipping submission generation.")




ValueError: X has 42 features, but ColumnTransformer is expecting 13 features as input.