# Month 3: Python Basics + Project (Titanic Classification)

In [1]:
# =========================
# Titanic: Python basics -> EDA -> ML (Logistic Regression)
# Uses seaborn's Titanic dataset; outputs model file and metrics.
# =========================

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
from pathlib import Path

# Load the Titanic table that seaborn provides and report its raw shape
titanic = sns.load_dataset('titanic')
print("Raw columns:", list(titanic.columns))
print("Rows:", titanic.shape[0])

# Basic cleaning: work on a copy so the raw table stays untouched
df = titanic.copy()

# Keep the label plus seven features, then discard every row that has a
# missing value in any of them (simple approach: no imputation)
df = df.loc[:, ['survived','pclass','sex','age','sibsp','parch','fare','embarked']]
df = df.dropna()
df = df.reset_index(drop=True)

# Encode the two text columns as integers:
#   sex      -> male=0, female=1
#   embarked -> C=0, Q=1, S=2
df = df.assign(
    sex=df['sex'].map({'male':0,'female':1}),
    embarked=df['embarked'].map({'C':0,'Q':1,'S':2}),
)

# Label vector and feature matrix (every column except the label)
y = df['survived']
X = df.drop(columns='survived')

# Hold out 20% of the rows for testing; the split is seeded and stratified
# on the label so both parts keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit a plain logistic regression (iteration cap raised to 500);
# fit() returns the estimator itself, so `model` is the fitted classifier
model = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Score the held-out rows: compute every metric first, then print them
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", round(acc, 4))

print("\nClassification report:")
print(report)

# Confusion matrix as returned by sklearn's confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Persist the fitted model together with the ordered list of feature
# columns it was trained on, so both can be reloaded from one file
out_dir = Path("outputs")
out_dir.mkdir(exist_ok=True)  # no error if the directory already exists

model_path = out_dir.joinpath("titanic_logreg.joblib")
artifact = {'model': model, 'columns': list(X.columns)}
joblib.dump(artifact, model_path)

print(f"Saved model to: {model_path}")

# Example inference function (to customize later)
def predict_survival(sample_dict, estimator=None, columns=None):
    """Predict the survival label and probability for a single passenger.

    Args:
        sample_dict: Mapping of feature name to value. For the model trained
            in this script it should contain pclass, sex (0/1), age, sibsp,
            parch, fare and embarked (0/1/2), already encoded the same way
            as the training data. Key order does not matter when the
            expected feature order is known; keys that are not expected
            features are then ignored.
        estimator: Optional fitted classifier exposing ``predict`` and
            ``predict_proba``. Defaults to the module-level ``model``.
        columns: Optional ordered feature names the estimator was fitted on
            (e.g. the ``'columns'`` entry saved next to the model). When
            omitted, the estimator's ``feature_names_in_`` attribute is used
            if it has one; otherwise the sample's own key order is kept.

    Returns:
        Tuple ``(label, probability)``: the predicted class as ``int`` and
        the second ``predict_proba`` column (class 1, survived) as ``float``.

    Raises:
        ValueError: If an expected feature is missing from ``sample_dict``.
    """
    clf = model if estimator is None else estimator
    if columns is None:
        columns = getattr(clf, "feature_names_in_", None)

    row = pd.DataFrame([sample_dict])

    if columns is not None:
        expected = list(columns)
        missing = [name for name in expected if name not in row.columns]
        if missing:
            raise ValueError(
                f"sample_dict is missing required feature(s): {missing}"
            )
        # A dict-built frame inherits the dict's key order; the classifier
        # needs the columns in the order it was fitted on, so realign here.
        row = row[expected]

    return int(clf.predict(row)[0]), float(clf.predict_proba(row)[0][1])

# Smoke-test the inference helper on one hand-written, already-encoded passenger
test_sample = dict(
    pclass=3,
    sex=0,
    age=22.0,
    sibsp=1,
    parch=0,
    fare=7.25,
    embarked=2,
)
pred_label, prob_survive = predict_survival(test_sample)
print("\nTest sample prediction -> survived:", pred_label, "prob:", round(prob_survive,4))



Raw columns: ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']
Rows: 891
Accuracy: 0.7902

Classification report:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83        85
           1       0.75      0.72      0.74        58

    accuracy                           0.79       143
   macro avg       0.78      0.78      0.78       143
weighted avg       0.79      0.79      0.79       143

Confusion Matrix:
 [[71 14]
 [16 42]]
Saved model to: outputs/titanic_logreg.joblib

Test sample prediction -> survived: 0 prob: 0.0994
