<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-14/prepare_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# prepare_data.py - error-free preprocessing for Titanic
# - reads:  data/train.csv  and  data/test.csv
# - writes: data/processed/train_processed.csv  and  data/processed/test_processed.csv

import os
from pathlib import Path
import pandas as pd
import numpy as np

RAW_DIR = Path("data")
OUT_DIR = Path("data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# -------------------------
# 0) sanity check - files present
# -------------------------
train_path = RAW_DIR / "train.csv"
test_path = RAW_DIR / "test.csv"

if not train_path.exists():
    raise FileNotFoundError(f"train.csv not found at {train_path}. Put Kaggle train.csv in the 'data' folder.")
if not test_path.exists():
    raise FileNotFoundError(f"test.csv not found at {test_path}. Put Kaggle test.csv in the 'data' folder.")

# -------------------------
# 1) load raw files
# -------------------------
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
print("Raw shapes -> train:", train.shape, " test:", test.shape)

# preserve PassengerId for final submission
test_passenger_id = test["PassengerId"].copy()

# -------------------------
# 2) combine train+test so dummies/encoding align
# -------------------------
train["is_train"] = 1
test["is_train"] = 0
# Ensure Survived exists in test to keep columns consistent (fill with NaN)
if "Survived" not in test.columns:
    test["Survived"] = np.nan

full = pd.concat([train, test], ignore_index=True, sort=False)
print("Combined shape:", full.shape)

# -------------------------
# 3) feature engineering on combined set
# -------------------------

# Title extraction (use raw string to avoid escape warnings)
full["Title"] = full["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)

# Clean titles and collapse rare ones
full["Title"] = full["Title"].replace({
    "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
    "Lady":"Rare","Countess":"Rare","Capt":"Rare","Col":"Rare","Don":"Rare",
    "Dr":"Rare","Major":"Rare","Rev":"Rare","Sir":"Rare","Jonkheer":"Rare","Dona":"Rare"
})
# mark very rare titles as 'Rare'
title_counts = full["Title"].value_counts(dropna=True)
rare_titles = title_counts[title_counts < 10].index.tolist()
full["Title"] = full["Title"].replace(rare_titles, "Rare")
full["Title"] = full["Title"].fillna("Unknown")

# Family features
full["FamilySize"] = full["SibSp"].fillna(0).astype(int) + full["Parch"].fillna(0).astype(int) + 1
full["IsAlone"] = (full["FamilySize"] == 1).astype(int)

# Fare: fill missing then log transform
full["Fare"] = pd.to_numeric(full["Fare"], errors="coerce")
full["Fare"].fillna(full["Fare"].median(), inplace=True)
full["Fare_log"] = np.log1p(full["Fare"])

# Age: fill missing by median age per Title, fallback to global median
full["Age"] = pd.to_numeric(full["Age"], errors="coerce")
age_median_by_title = full.groupby("Title")["Age"].transform("median")
global_age_median = full["Age"].median()
full["Age"] = full["Age"].fillna(age_median_by_title)
full["Age"] = full["Age"].fillna(global_age_median)

# Age bins (optional numeric category)
full["AgeBin"] = pd.cut(full["Age"], bins=[0,12,20,40,60,120], labels=["Child","Teen","Adult","MidAge","Senior"])
full["AgeBin"] = full["AgeBin"].astype(str).fillna("Unknown")

# Embarked: fill with mode
if "Embarked" in full.columns:
    full["Embarked"] = full["Embarked"].fillna(full["Embarked"].mode().iloc[0])

# Deck from Cabin - take first letter, fill 'U' for unknown
if "Cabin" in full.columns:
    full["Deck"] = full["Cabin"].astype(str).str[0].replace("n", "U")  # guard vs NaN -> 'n'
    full["Deck"] = full["Deck"].replace("N", "U").fillna("U")
else:
    full["Deck"] = "U"

# Ticket group (group size) - optional
if "Ticket" in full.columns:
    full["Ticket"] = full["Ticket"].fillna("NA")
    ticket_counts = full["Ticket"].map(full["Ticket"].value_counts())
    full["TicketGroupSize"] = ticket_counts.fillna(1).astype(int)
else:
    full["TicketGroupSize"] = 1

# Drop columns we won't use directly (Name/Cabin/Ticket) BEFORE get_dummies
drop_cols = [c for c in ["Name","Ticket","Cabin"] if c in full.columns]
full = full.drop(columns=drop_cols)

# -------------------------
# 4) Encoding: one-hot on categorical columns (do on combined so both sets match)
# -------------------------
# Choose categorical cols to one-hot: Sex, Embarked, Title, Deck, AgeBin (only those that exist)
cat_cols = [c for c in ["Sex","Embarked","Title","Deck","AgeBin"] if c in full.columns]
full = pd.get_dummies(full, columns=cat_cols, drop_first=True)

# -------------------------
# 5) split back into train/test
# -------------------------
train_proc = full[full["is_train"] == 1].copy().drop(columns=["is_train"])
test_proc = full[full["is_train"] == 0].copy().drop(columns=["is_train"])

# Ensure Survived present only in train_proc (it will be NaN in test_proc) -> remove in test_proc
if "Survived" in test_proc.columns:
    test_proc = test_proc.drop(columns=["Survived"])

# -------------------------
# 6) ensure PassengerId preserved in test_proc
# -------------------------
# if PassengerId exists already, overwrite with original saved values to be safe
test_proc["PassengerId"] = test_passenger_id.values

# put PassengerId as first column
cols = list(test_proc.columns)
if cols[0] != "PassengerId":
    cols.remove("PassengerId")
    cols = ["PassengerId"] + cols
    test_proc = test_proc[cols]

# -------------------------
# 7) Align columns: make sure test_proc has exactly the same feature columns (except Survived)
# -------------------------
# For safety, get the feature columns from train_proc (excluding Survived)
train_feature_cols = [c for c in train_proc.columns if c != "Survived"]

# If some train feature columns are missing from test (rare), add them as zeros
for c in train_feature_cols:
    if c not in test_proc.columns:
        test_proc[c] = 0

# Reorder test_proc columns to match train feature order, but keep PassengerId first
ordered_test_cols = ["PassengerId"] + [c for c in train_feature_cols if c in test_proc.columns and c != "PassengerId"]
test_proc = test_proc[ordered_test_cols]

# If test has extra columns not in train (shouldn't happen since we used combined dummies), drop them (except PassengerId)
extra_cols = [c for c in test_proc.columns if c not in (["PassengerId"] + train_feature_cols)]
if extra_cols:
    # keep PassengerId and drop others
    drop_these = [c for c in extra_cols if c != "PassengerId"]
    if drop_these:
        test_proc = test_proc.drop(columns=drop_these)

# Final train_proc: keep Survived and feature columns
train_proc = train_proc[["Survived"] + [c for c in train_feature_cols if c != "Survived"]]

# -------------------------
# 8) Save outputs
# -------------------------
train_out = OUT_DIR / "train_processed.csv"
test_out = OUT_DIR / "test_processed.csv"

train_proc.to_csv(train_out, index=False)
test_proc.to_csv(test_out, index=False)

print("✅ Saved processed files:")
print(" -", train_out, " shape:", train_proc.shape)
print(" -", test_out, " shape:", test_proc.shape)


Raw shapes -> train: (891, 12)  test: (418, 11)
Combined shape: (1309, 13)
✅ Saved processed files:
 - data/processed/train_processed.csv  shape: (891, 30)
 - data/processed/test_processed.csv  shape: (418, 29)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  full["Fare"].fillna(full["Fare"].median(), inplace=True)


In [14]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ---------------------------
# 1) Paths
# ---------------------------
DATA_DIR = Path("data/processed")
RAW_DIR = Path("data/")
SUB_DIR = Path("submissions")
SUB_DIR.mkdir(parents=True, exist_ok=True)

train_path = DATA_DIR / "train_processed.csv"
test_path = DATA_DIR / "test_processed.csv"
raw_test_path = RAW_DIR / "test.csv"  # original test.csv for correct PassengerId order

# ---------------------------
# 2) Load data
# ---------------------------
train = pd.read_csv(train_path)
test_processed = pd.read_csv(test_path)
test_raw = pd.read_csv(raw_test_path)  # use for PassengerId

# Separate target and features
X = train.drop(["Survived", "PassengerId"], axis=1)
y = train["Survived"]
X_test = test_processed.drop("PassengerId", axis=1)

# ---------------------------
# 3) Train/Validation split
# ---------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------
# 4) Define base models
# ---------------------------
base_models = [
    ("rf", RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
    ("gb", GradientBoostingClassifier(n_estimators=200, random_state=42)),
]

# ---------------------------
# 5) Define stacking model
# ---------------------------
stack_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(max_iter=1000),
    passthrough=False,
    cv=5,
    n_jobs=-1
)

# ---------------------------
# 6) Train & Validate
# ---------------------------
stack_model.fit(X_train, y_train)
val_pred = stack_model.predict(X_val)

val_acc = accuracy_score(y_val, val_pred)
print("✅ Validation Accuracy:", val_acc)

# ---------------------------
# 7) Predict on test set
# ---------------------------
test_pred = stack_model.predict(X_test)
test_pred = test_pred.astype(int)  # ensure 0 or 1 for Kaggle

# ---------------------------
# 8) Save submission (Kaggle-ready)
# ---------------------------
submission = pd.DataFrame({
    "PassengerId": test_raw["PassengerId"],  # original order!
    "Survived": test_pred
})

out_path = SUB_DIR / "day13_stacking_clean.csv"
submission.to_csv(out_path, index=False)
print("✅ Submission saved at:", out_path)


✅ Validation Accuracy: 0.8044692737430168
✅ Submission saved at: submissions/day13_stacking_clean.csv
