In [2]:
#3_ML_Baseline_Template.ipynb

# ==========================
# ML Baseline Template
# ==========================
# WHEN TO USE:
# After cleaning data. A "baseline model" gives a quick benchmark.
# Use it before heavy feature engineering or hyperparameter tuning.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Read the cleaned dataset from disk into the working DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="cleaned_dataset.csv")

In [None]:
# --------------------------
# Setup Target + Features
# --------------------------
# WHEN TO USE:
# Always separate target variable (y) from features (X).
target_column = "Survived"   # change this

# Fail early with a readable message if the target is missing. Dropping with
# errors="ignore" would leave X untouched and the y lookup below would then
# raise a bare KeyError that does not say which columns are available.
if target_column not in df.columns:
    raise KeyError(
        f"Target column {target_column!r} not found in dataset. "
        f"Available columns: {list(df.columns)}"
    )

# X holds every column except the target; y is the target column alone.
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
# One-hot encode the categorical feature columns.
# drop_first=True removes the first level of each encoded column.
X = pd.get_dummies(data=X, drop_first=True)


In [None]:
# Train-test split (standard = 80/20)
# Stratify on the target so train and test keep the same class proportions;
# a plain random split can skew the class balance on imbalanced or small data.
# Stratification needs at least two samples per class, so fall back to a
# plain random split when some class is rarer than that.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

In [None]:
# --------------------------
# Baseline 1: Logistic Regression
# --------------------------
# WHEN TO USE:
# Best starting point for binary classification.
# max_iter=1000 caps the solver at 1000 iterations.
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Report overall accuracy first, then the per-class breakdown.
logreg_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", logreg_accuracy)
logreg_report = classification_report(y_test, y_pred)
print(logreg_report)

In [None]:
# --------------------------
# Baseline 2: Random Forest
# --------------------------
# WHEN TO USE:
# For non-linear datasets, handles mixed features well.
# random_state is fixed so the fitted forest is reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
# Per-class breakdown, printed the same way as for the Logistic Regression
# baseline so the models are compared on the same metrics, not accuracy alone.
print(classification_report(y_test, y_pred))

In [None]:
# --------------------------
# Baseline 3: Support Vector Machine
# --------------------------
# WHEN TO USE:
# Good for high-dimensional data. Sensitive to scaling.
# Because of that sensitivity, features are standardized before the SVM sees
# them. The scaler lives inside a pipeline so it is fit on the training split
# only and the same transform is then applied to the test split.
svc = SVC()
svm_pipeline = make_pipeline(StandardScaler(), svc)
svm_pipeline.fit(X_train, y_train)
# Predict through the pipeline: `svc` was fit on standardized features, so
# calling svc.predict on raw X_test would feed it unscaled inputs.
y_pred = svm_pipeline.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred))
# Per-class breakdown, printed the same way as for the Logistic Regression
# baseline so the models are compared on the same metrics.
print(classification_report(y_test, y_pred))

print("✅ Baselines tested – choose best model as benchmark.")