In [1]:
# --- PART A: Model Building Script ---
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib
from google.colab import files

# 1. Robust Data Loading
# Use a Kaggle-style train.csv from the working directory when it exists;
# otherwise fall back to the Titanic dataset provided through Seaborn
# (the fallback needs internet access).
if not os.path.exists('train.csv'):
    print("⚠️ Local file not found. Loading from Seaborn library...")
    # Seaborn spells the columns in lowercase; rename the ones used below
    # so both data sources expose the same Kaggle-format names.
    df = sns.load_dataset('titanic').rename(columns={
        'survived': 'Survived',
        'pclass': 'Pclass',
        'sex': 'Sex',
        'age': 'Age',
        'sibsp': 'SibSp',
        'fare': 'Fare',
    })
else:
    df = pd.read_csv('train.csv')
    print("✅ Loaded from local train.csv")

# 2. Feature Selection (We pick 5 as requested)
# Selected: Pclass, Sex, Age, SibSp, Fare
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare']
target = 'Survived'

# Keep only the modelling columns; .copy() makes the column assignment below
# act on an independent frame.
df = df[features + [target]].copy()

# 3. Preprocessing (Rubric Requirements)
# a. Encoding Categorical Variables
# Map Sex: male = 0, female = 1
sex_codes = {'male': 0, 'female': 1}
unexpected_sex = set(df['Sex'].dropna().unique()) - set(sex_codes)
if unexpected_sex:
    # .map() would silently turn unknown labels into NaN; fail early instead.
    raise ValueError(f"Unexpected values in 'Sex' column: {unexpected_sex}")
df['Sex'] = df['Sex'].map(sex_codes)

# 4. Split Data
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# b. Handling Missing Values (fill Age and Fare with the median)
# The medians are taken from the training rows only, so the held-out rows do
# not influence the imputation values (avoids train/test leakage).
fill_values = X_train[['Age', 'Fare']].median().to_dict()
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)

# Leave df and X without missing Age/Fare as well, using the same
# train-derived medians, so later cells that read them see complete data.
df = df.fillna(fill_values)
X = df[features]

# 5. Build Pipeline
# One estimator with two stages: feature scaling (Rubric: "Feature scaling")
# followed by the random-forest classifier.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100)),
])

# 6. Train
# fit() returns the fitted pipeline, so the test-set prediction is chained onto it.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# 7. Evaluate (Rubric: "Classification report")
report_text = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print("\n--- Model Classification Report ---")
print(report_text)
print(f"Accuracy: {test_accuracy:.4f}")

# 8. Save Model
# Single source of truth for the artifact name used by save, reload and download.
model_path = 'titanic_survival_model.pkl'
joblib.dump(pipeline, model_path)
print(f"\n✅ Model saved as '{model_path}'")

# 9. Verify Reload (Rubric: "Demonstrate model can be reloaded")
# joblib files are pickle-based: only load files you trust. This one was
# written a few lines above.
loaded_model = joblib.load(model_path)

# Actually check the round trip: the reloaded model must reproduce the
# in-memory pipeline's predictions on the whole test split.
if not np.array_equal(loaded_model.predict(X_test), pipeline.predict(X_test)):
    raise RuntimeError("Reloaded model predictions differ from the in-memory pipeline")

test_pred = loaded_model.predict(X_test.iloc[0:1])
print(f"Test Reload Prediction: {test_pred[0]} (Works!)")

# 10. Download
# Uses the google.colab `files` helper imported at the top of the cell.
files.download(model_path)

✅ Loaded from local train.csv

--- Model Classification Report ---
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       105
           1       0.78      0.73      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179

Accuracy: 0.8045

✅ Model saved as 'titanic_survival_model.pkl'
Test Reload Prediction: 0 (Works!)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>