<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-7/day07_make_submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# day07_make_submission.py
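# Usage: place the training data and test.csv relative to your project root. The training file is
# looked up as day02/day02_titanic_preserved.csv, then day02/train.csv, then train.csv.
# Then run: python day07_make_submission.py
# Output: day07/submission.csv (plus day07/readme.txt and model artifacts under day07/assets/)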

import pandas as pd
import numpy as np
from pathlib import Path
import joblib
from sklearn.ensemble import RandomForestClassifier

# Output folders: day07/ receives the submission, day07/assets/ the saved artifacts.
OUT = Path("day07")
OUT.mkdir(exist_ok=True)
ASSETS = OUT.joinpath("assets")
ASSETS.mkdir(exist_ok=True)

# =========================
# 1) Load data (train + test)
# =========================
# Candidate training files in priority order: the preserved Day 2 file is
# tried first, then the plain train.csv locations.
train_paths = [
    Path("day02/day02_titanic_preserved.csv"),
    Path("day02/train.csv"),
    Path("train.csv"),
]
# First candidate that exists on disk, or None when none of them do.
train_path = next(filter(Path.exists, train_paths), None)
if train_path is None:
    raise FileNotFoundError("train.csv not found. Place train.csv or day02/day02_titanic_preserved.csv in project root.")

test_path = Path("test.csv")
if not test_path.exists():
    raise FileNotFoundError("test.csv not found. Download it from the Kaggle Titanic competition and put test.csv in project root.")

print("Using train file:", train_path)
print("Using test file:", test_path)

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# PassengerId is not a model feature; keep a copy so it can be written
# into the submission file later.
test_passenger_ids = test["PassengerId"].copy()

# -------------------------
# 2) Feature engineering function (must match Day5)
# -------------------------
def feature_engineer(df, age_map=None, is_train=False):
    """Return a copy of ``df`` with the engineered Titanic features added.

    Columns added or overwritten on the copy: ``FamilySize``, ``IsAlone``,
    ``Title``, ``Deck`` (and ``Cabin`` with missing values set to ``'U'``),
    ``Fare``, ``FarePerPerson``, ``Age``, ``Fare_log``, ``AgeBin`` and
    ``Sex_bin``. The caller's frame is never modified.

    Args:
        df: Passenger data. ``Fare`` and ``Age`` columns are required.
            ``SibSp``, ``Parch``, ``Name``, ``Cabin`` and ``Sex`` are optional;
            neutral defaults are used when one of them is absent.
        age_map: Mapping of Title -> median Age, as returned by an earlier
            ``is_train=True`` call. Only read when ``is_train`` is False, where
            it drives Age imputation and also serves as the vocabulary of
            titles known from training.
        is_train: When True, titles occurring fewer than 10 times are
            collapsed into ``'Rare'`` and a fresh ``age_map`` is computed from
            this frame (any ``age_map`` argument is ignored).

    Returns:
        A tuple ``(engineered_df, age_map)``. ``age_map`` is the newly computed
        mapping when ``is_train`` is True and ``None`` otherwise.

    Raises:
        KeyError: If ``df`` has no ``Fare`` or no ``Age`` column.
    """
    df = df.copy()

    # FamilySize = SibSp + Parch + 1 (the passenger themself). A missing
    # column counts as 0 instead of crashing on ``int.fillna``.
    family_size = 1
    for col in ('SibSp', 'Parch'):
        if col in df.columns:
            family_size = family_size + df[col].fillna(0).astype(int)
    df['FamilySize'] = family_size
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Title: the text between the comma and the first period of Name.
    if 'Name' in df.columns:
        df['Title'] = (
            df['Name'].str.extract(r',\s*([^\.]+)\.', expand=False).str.strip()
        )
        # Fold spelling variants together and pre-group the honorifics.
        # The regex captures "the Countess" (with the article), so that
        # spelling is listed next to the bare "Countess".
        df['Title'] = df['Title'].replace({
            "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
            "Lady": "Rare", "Countess": "Rare", "the Countess": "Rare",
            "Sir": "Rare", "Don": "Rare", "Dona": "Rare",
            "Col": "Rare", "Major": "Rare", "Capt": "Rare", "Rev": "Rare",
            "Dr": "Rare",
        })
    else:
        df['Title'] = 'Unknown'

    if is_train:
        # Collapse titles seen fewer than 10 times into 'Rare'.
        title_counts = df['Title'].value_counts()
        rare_titles = title_counts[title_counts < 10].index
        df.loc[df['Title'].isin(rare_titles), 'Title'] = 'Rare'
    elif age_map is not None:
        # Mirror the train-time collapse: a title that training never kept
        # (i.e. not a key of age_map) is relabelled 'Rare'. Otherwise it would
        # keep its own label and lose its Title encoding entirely once the
        # dummy columns are aligned to the training columns.
        unseen = df['Title'].notna() & ~df['Title'].isin(list(age_map))
        df.loc[unseen, 'Title'] = 'Rare'

    # Deck = first letter of Cabin; missing cabins become 'U'.
    if 'Cabin' in df.columns:
        df['Cabin'] = df['Cabin'].fillna('U')
        df['Deck'] = df['Cabin'].astype(str).str[0]
    else:
        df['Deck'] = 'U'

    # Fare: coerce to numeric, fill gaps with this frame's median, then
    # split evenly over the family.
    df['Fare'] = pd.to_numeric(df['Fare'], errors='coerce')
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    # Age: impute from the per-Title median (computed here for train,
    # supplied through age_map otherwise). Vectorised with Series.map
    # rather than a row-wise apply.
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
    if is_train:
        age_map = df.groupby('Title')['Age'].median().to_dict()
        df['Age'] = df['Age'].fillna(df['Title'].map(age_map))
    elif age_map is not None:
        # Titles without a usable median fall back to the median of the
        # known per-title medians.
        known_medians = [v for v in age_map.values() if pd.notna(v)]
        overall_med = float(np.median(known_medians)) if known_medians else np.nan
        title_age = df['Title'].map(age_map).fillna(overall_med)
        df['Age'] = df['Age'].fillna(title_age)
    # Last resort for anything still missing: this frame's own median.
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # log(1 + Fare)
    df['Fare_log'] = np.log1p(df['Fare'])

    # Age bands; include_lowest keeps Age == 0 in 'Child' instead of
    # leaving it unbinned.
    df['AgeBin'] = pd.cut(
        df['Age'],
        bins=[0, 12, 20, 40, 60, 120],
        labels=['Child', 'Teen', 'Adult', 'MidAge', 'Senior'],
        include_lowest=True,
    )

    # Sex_bin: 1 when Sex starts with 'm' (case-insensitive), else 0.
    # A missing Sex value maps to 0, the same default used when the column
    # is absent, instead of raising on the int cast.
    if 'Sex' in df.columns:
        df['Sex_bin'] = (
            df['Sex'].str.lower().str.startswith('m', na=False).astype(int)
        )
    else:
        df['Sex_bin'] = 0

    return df, (age_map if is_train else None)

# Training pass: engineers the features and returns the per-Title age medians.
train_fe, age_map = feature_engineer(train, is_train=True)
# Test pass: reuses the medians from train; the second return value is unused.
test_fe, _ = feature_engineer(test, age_map=age_map, is_train=False)

# =========================
# 3) Pick features & encode
# =========================
# Wanted model inputs, restricted to the columns the engineered train frame has.
feature_cols = [
    col
    for col in (
        'Pclass', 'Sex_bin', 'Age', 'Fare_log', 'FamilySize', 'FarePerPerson',
        'IsAlone', 'Title', 'Deck', 'Embarked', 'AgeBin',
    )
    if col in train_fe.columns
]

X_train = train_fe.loc[:, feature_cols].copy()
y_train = train_fe.loc[:, 'Survived'].copy()
X_test = test_fe.loc[:, feature_cols].copy()

# Categorical inputs to one-hot encode (only those actually selected above).
cat_cols = [col for col in ('Title', 'Deck', 'Embarked', 'AgeBin') if col in X_train]
X_train = pd.get_dummies(
    X_train,
    columns=cat_cols,
    dummy_na=False,
)
X_test = pd.get_dummies(
    X_test,
    columns=cat_cols,
    dummy_na=False,
)

# Train and test are encoded separately, so force the test matrix onto the
# train column layout: dummy columns absent from test are added as 0 and
# columns train does not have are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# =========================
# 4) Train a model (RandomForest baseline)
# =========================
# fit() returns the estimator itself, so construction and fitting chain.
model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Persist the fitted model together with the column layout it was fitted on.
model_file = ASSETS / "rf_day07.joblib"
columns_file = ASSETS / "feature_columns_day07.csv"
joblib.dump(model, model_file)
pd.Series(X_train.columns).to_csv(columns_file, index=False)

# =========================
# 5) Predict on test, create submission
# =========================
# Cast the predicted labels to plain ints (0/1) for the submission file.
preds = model.predict(X_test).astype(int)

submission_path = OUT / "submission.csv"
submission = pd.DataFrame(
    {
        'PassengerId': test_passenger_ids,
        'Survived': preds,
    }
)
submission.to_csv(submission_path, index=False)
print("Submission saved to:", submission_path)

# Save a short run summary next to the submission.
# The tree count is read from the fitted model rather than typed a second
# time, so the note cannot drift from the actual configuration. The encoding
# is explicit so the file does not depend on the platform default.
with open(OUT / "readme.txt", "w", encoding="utf-8") as f:
    f.write("Day 7: Baseline RF submission\n")
    f.write(f"Model: RandomForestClassifier(n_estimators={model.n_estimators})\n")
    f.write(f"Features used: {len(X_train.columns)} columns\n")
    f.write("Submission: submission.csv\n")
print("All done. Upload submission.csv to Kaggle.")

Using train file: day02/day02_titanic_preserved.csv
Using test file: test.csv
Train shape: (891, 29) Test shape: (418, 29)
Submission saved to: day07/submission.csv
All done. Upload submission.csv to Kaggle.
