In [1]:
# data wrangling
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# ML
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# custom modules
# add project src folder to sys.path so we can import preprocess.py
sys.path.append("../../src")
import preprocess

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# load data: read the train, test and gender_submission CSVs from data/raw
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/train.csv',
        '../../data/raw/test.csv',
        '../../data/raw/gender_submission.csv',
    )
)

# separate the target column from the remaining columns of the training frame
y = train_df["Survived"]
X = train_df.drop(columns="Survived")

# preview the first rows of the raw training data
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Preprocess: run the project's preprocess step on the training features and on the test set
X_processed = preprocess.preprocess(X)
test_processed = preprocess.preprocess(test_df)

# Sanity checks (assumes preprocess returns DataFrames -- TODO confirm against src/preprocess.py).
# Warnings are silenced globally in this notebook, so fail loudly here instead of
# relying on a later fit/predict call to notice a mismatch.
assert len(X_processed) == len(y), "preprocess changed the number of training rows"
assert len(test_processed) == len(test_df), "preprocess changed the number of test rows"
assert list(X_processed.columns) == list(test_processed.columns), (
    "train and test feature columns differ after preprocessing"
)

# data leakage check (manual): inspect the columns displayed below and
# ensure all the features are known at boarding time
X_processed.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,0,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,3,1,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,3,0,35.0,0,0,8.05


In [3]:
# Baseline: logistic regression scored with shuffled, stratified 5-fold CV,
# then refit on every training row.
# All names used here (LogisticRegression, StratifiedKFold, cross_val_score,
# accuracy_score, np) are imported once in the notebook's first cell.

SEED = 42  # one seed for both the model and the CV splitter

# Model
model = LogisticRegression(solver="liblinear", random_state=SEED)

# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Run CV (scores are computed on clones; `model` itself is still unfitted afterwards)
cv_scores = cross_val_score(model, X_processed, y, cv=cv, scoring="accuracy")

print("Cross‑Validation Accuracy Scores:", cv_scores)
print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f}")
print(f"Std CV Accuracy:  {np.std(cv_scores):.4f}")

# Train final model on full dataset
model.fit(X_processed, y)

# --- Training accuracy on full dataset ---
# Measured on the same rows the model was fitted on, so it is an in-sample figure;
# the CV mean above is the out-of-sample estimate.
train_pred = model.predict(X_processed)
train_acc = accuracy_score(y, train_pred)
print(f"Training Accuracy (full data): {train_acc:.4f}")

Cross‑Validation Accuracy Scores: [0.7877095  0.79213483 0.79775281 0.76966292 0.82022472]
Mean CV Accuracy: 0.7935
Std CV Accuracy:  0.0163
Training Accuracy (full data): 0.8002


In [4]:
# Predict test
test_pred = model.predict(test_processed)

# Create submission: one row per test passenger with its predicted label
submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": test_pred
})

# Create the output folder if it is missing so to_csv does not fail on a fresh checkout
submission_path = Path("../../submissions/submission_exp001.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
print("submission_exp001.csv created")

submission_exp001.csv created
