In [2]:
# 03_Predict_Survival.ipynb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import joblib

# =========================
# 1️⃣ Load and clean data
# =========================
df = pd.read_csv("../data/train.csv")

# Fix tab-separated issue if needed
if '\t' in df.columns[0]:
    df = pd.read_csv("../data/train.csv", sep='\t')

df.columns = [col.strip() for col in df.columns]

# Create CabinDeck feature
df["CabinDeck"] = df["Cabin"].astype(str).str[0]
df["CabinDeck"].replace("n", "Unknown", inplace=True)

# Select relevant features
features = ["Age", "Sex", "CabinDeck", "Embarked"]
df = df[features + ["Survived"]].dropna()

# =========================
# 2️⃣ Split and build model
# =========================
X = df[features]
y = df["Survived"]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ["Age"]),
        ("cat", OneHotEncoder(handle_unknown="ignore"), ["Sex", "CabinDeck", "Embarked"]),
    ]
)

# Model pipeline
model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=500))
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# Save model
joblib.dump(model, "../models/titanic_model.pkl")
print("✅ Model trained and saved successfully!")

# =========================
# 3️⃣ Interactive Prediction
# =========================

def get_valid_input(prompt, valid_options=None, numeric=False, min_val=None, max_val=None):
    """Ask user for input with validation."""
    while True:
        val = input(prompt).strip()
        if numeric:
            try:
                val = float(val)
                if min_val is not None and (val < min_val or val > max_val):
                    print(f"Please enter a number between {min_val} and {max_val}.")
                else:
                    return val
            except ValueError:
                print("Please enter a valid number.")
        else:
            if valid_options and val not in valid_options:
                print(f"Please choose one of: {', '.join(valid_options)}")
            else:
                return val


# Allowed options (based on dataset)
sex_options = ["male", "female"]
cabin_deck_options = sorted(df["CabinDeck"].unique().tolist())
embarked_options = sorted(df["Embarked"].dropna().unique().tolist())

print("\n--- Titanic Survival Predictor ---")
print("Please enter the following passenger details:\n")

age = get_valid_input("Age (0–80): ", numeric=True, min_val=0, max_val=80)
sex = get_valid_input(f"Sex ({', '.join(sex_options)}): ", valid_options=sex_options)
cabin_deck = get_valid_input(f"Cabin Deck ({', '.join(cabin_deck_options)}): ", valid_options=cabin_deck_options)
embarked = get_valid_input(f"Embarked Port ({', '.join(embarked_options)}): ", valid_options=embarked_options)

# Create DataFrame for input
user_input = pd.DataFrame([{
    "Age": age,
    "Sex": sex,
    "CabinDeck": cabin_deck,
    "Embarked": embarked
}])

# Load model
model = joblib.load("../models/titanic_model.pkl")

# Predict survival probability
survival_prob = model.predict_proba(user_input)[0][1] * 100

print("\n🎯 Prediction Result:")
print(f"Estimated survival probability: {survival_prob:.2f}%")
print("Higher % means higher chance of survival.\n")

✅ Model trained and saved successfully!

--- Titanic Survival Predictor ---
Please enter the following passenger details:



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["CabinDeck"].replace("n", "Unknown", inplace=True)


Age (0–80):  30
Sex (male, female):  male
Cabin Deck (A, B, C, D, E, F, G, T, Unknown):  T
Embarked Port (C, Q, S):  S



🎯 Prediction Result:
Estimated survival probability: 29.48%
Higher % means higher chance of survival.

