In [1]:
# loading data from the stroke sqlite database
import pandas as pd
import sqlite3

conn = sqlite3.connect("../data/stroke.db")

df = pd.read_sql_query("""
SELECT *
FROM patients
JOIN medical_conditions USING(patient_id)
JOIN lifestyle USING(patient_id)
""", conn)

conn.close()
df.head()

Unnamed: 0,patient_id,gender,age,ever_married,Residence_type,stroke,id,hypertension,heart_disease,avg_glucose_level,bmi,id.1,work_type,smoking_status
0,9046,Male,67.0,Yes,Urban,1,1,0,1,228.69,36.6,1,Private,formerly smoked
1,31112,Male,80.0,Yes,Rural,1,2,0,1,105.92,32.5,2,Private,never smoked
2,60182,Female,49.0,Yes,Urban,1,3,0,0,171.23,34.4,3,Private,smokes
3,1665,Female,79.0,Yes,Rural,1,4,1,0,174.12,24.0,4,Self-employed,never smoked
4,56669,Male,81.0,Yes,Urban,1,5,0,0,186.21,29.0,5,Private,formerly smoked


In [2]:
# creating X and y and splitting into train and test
from sklearn.model_selection import train_test_split

df = df.copy()

y = df["stroke"]
X = df.drop(columns=["stroke"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape

((3927, 13), (982, 13))

In [3]:
# defining preprocessing for numeric and categorical columns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_cols = ["age", "hypertension", "heart_disease", "avg_glucose_level", "bmi"]
categorical_cols = ["gender", "ever_married", "Residence_type", "work_type", "smoking_status"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

In [4]:
# setting up models and a small helper to run experiments (without xgboost)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from pathlib import Path
import joblib

base_dir = Path("..")
models_dir = base_dir / "models"
models_dir.mkdir(exist_ok=True)

models = {
    "logreg": LogisticRegression(max_iter=1000),
    "rf": RandomForestClassifier(n_estimators=200, random_state=42),
    "gb": GradientBoostingClassifier(random_state=42),
    "svm": SVC(kernel="rbf", probability=True, random_state=42)
}

def run_experiment(model_name, model, use_pca):
    steps = [("preprocess", preprocessor)]
    if use_pca:
        steps.append(("pca", PCA(n_components=5)))
    steps.append(("clf", model))

    pipe = Pipeline(steps)
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    pca_tag = "with_pca" if use_pca else "no_pca"
    model_path = models_dir / f"{model_name}_{pca_tag}_no_optuna.pkl"
    joblib.dump(pipe, model_path)

    return {
        "model_name": model_name,
        "pca": use_pca,
        "optuna": False,
        "f1_score": f1,
        "model_path": str(model_path)
    }


In [5]:
# running experiments for all models without pca
experiments_no_optuna = []

for name, model in models.items():
    result = run_experiment(name, model, use_pca=False)
    experiments_no_optuna.append(result)

pd.DataFrame(experiments_no_optuna)

Unnamed: 0,model_name,pca,optuna,f1_score,model_path
0,logreg,False,False,0.0,../models/logreg_no_pca_no_optuna.pkl
1,rf,False,False,0.0,../models/rf_no_pca_no_optuna.pkl
2,gb,False,False,0.0,../models/gb_no_pca_no_optuna.pkl
3,svm,False,False,0.0,../models/svm_no_pca_no_optuna.pkl


In [6]:
# running experiments for all models with pca
for name, model in models.items():
    result = run_experiment(name, model, use_pca=True)
    experiments_no_optuna.append(result)

results_df = pd.DataFrame(experiments_no_optuna)
results_df

Unnamed: 0,model_name,pca,optuna,f1_score,model_path
0,logreg,False,False,0.0,../models/logreg_no_pca_no_optuna.pkl
1,rf,False,False,0.0,../models/rf_no_pca_no_optuna.pkl
2,gb,False,False,0.0,../models/gb_no_pca_no_optuna.pkl
3,svm,False,False,0.0,../models/svm_no_pca_no_optuna.pkl
4,logreg,True,False,0.0,../models/logreg_with_pca_no_optuna.pkl
5,rf,True,False,0.0,../models/rf_with_pca_no_optuna.pkl
6,gb,True,False,0.041667,../models/gb_with_pca_no_optuna.pkl
7,svm,True,False,0.0,../models/svm_with_pca_no_optuna.pkl


In [7]:
# saving the metrics for experiments without optuna
metrics_path = base_dir / "data" / "metrics_no_optuna.csv"
results_df.to_csv(metrics_path, index=False)
metrics_path

PosixPath('../data/metrics_no_optuna.csv')