In [2]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# 1) Read the already-cleaned Titanic table from disk.
DATA_PATH = "downloads/titanic_clean.csv"
df = pd.read_csv(DATA_PATH)

# 2) Separate the predictor columns (X) from the label column (y).
FEATURE_COLS = ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch']
TARGET_COL = 'Survived'
X = df[FEATURE_COLS]
y = df[TARGET_COL]

# 3) Candidate models. The forest settings are the ones labelled "Day 9 best";
#    swap in other tuned values here if desired.
SEED = 42  # one seed shared by the forest and the fold shuffling
rf_params = {"n_estimators": 100, "max_depth": 5, "random_state": SEED}

lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(**rf_params)

# 4) Five stratified folds, shuffled with a fixed seed, so each fold keeps
#    the class balance of the target.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# 5) Cross-validated accuracy
# Both models are scored with the same `cv` splitter and the same metric, so
# the two score arrays are directly comparable.
lr_scores = cross_val_score(lr, X, y, cv=cv, scoring='accuracy')
rf_scores = cross_val_score(rf, X, y, cv=cv, scoring='accuracy')

# Report mean, spread and the per-fold values for each model.
# The labels carry their own padding so the printed "->" text is unchanged.
for label, scores in (
    ("Logistic Regression CV accuracy ->", lr_scores),
    ("Random Forest CV accuracy      ->", rf_scores),
):
    # Convert each NumPy scalar to a built-in float before rounding; otherwise
    # the list prints as [np.float64(0.793), ...] instead of [0.793, ...].
    fold_scores = [round(float(s), 3) for s in scores]
    print("{} mean={:.3f}  std={:.3f}  scores={}".format(
        label, scores.mean(), scores.std(), fold_scores
    ))


Logistic Regression CV accuracy -> mean=0.788  std=0.011  scores=[np.float64(0.793), np.float64(0.787), np.float64(0.787), np.float64(0.77), np.float64(0.803)]
Random Forest CV accuracy      -> mean=0.827  std=0.023  scores=[np.float64(0.86), np.float64(0.803), np.float64(0.803), np.float64(0.82), np.float64(0.848)]
