In [1]:
# loading data from the stroke sqlite database
import pandas as pd
import sqlite3

conn = sqlite3.connect("../data/stroke.db")

df = pd.read_sql_query("""
SELECT *
FROM patients
JOIN medical_conditions USING(patient_id)
JOIN lifestyle USING(patient_id)
""", conn)

conn.close()
df.head()

Unnamed: 0,patient_id,gender,age,ever_married,Residence_type,stroke,id,hypertension,heart_disease,avg_glucose_level,bmi,id.1,work_type,smoking_status
0,9046,Male,67.0,Yes,Urban,1,1,0,1,228.69,36.6,1,Private,formerly smoked
1,31112,Male,80.0,Yes,Rural,1,2,0,1,105.92,32.5,2,Private,never smoked
2,60182,Female,49.0,Yes,Urban,1,3,0,0,171.23,34.4,3,Private,smokes
3,1665,Female,79.0,Yes,Rural,1,4,1,0,174.12,24.0,4,Self-employed,never smoked
4,56669,Male,81.0,Yes,Urban,1,5,0,0,186.21,29.0,5,Private,formerly smoked


In [2]:
# creating X and y and splitting into train and test
from sklearn.model_selection import train_test_split

df = df.copy()

y = df["stroke"]
X = df.drop(columns=["stroke"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape

((3927, 13), (982, 13))

In [3]:
# defining preprocessing for numeric and categorical columns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_cols = ["age", "hypertension", "heart_disease", "avg_glucose_level", "bmi"]
categorical_cols = ["gender", "ever_married", "Residence_type", "work_type", "smoking_status"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

In [4]:
# setting up optuna, models and helper functions
import optuna
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from pathlib import Path
import joblib

base_dir = Path("..")
models_dir = base_dir / "models"
models_dir.mkdir(exist_ok=True)

model_names = ["logreg", "rf", "gb", "svm"]

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# building a pipeline for a given model type, pca flag and optuna trial
def build_pipeline(model_name, trial, use_pca):
    if model_name == "logreg":
        C = trial.suggest_float("C", 1e-3, 10.0, log=True)
        clf = LogisticRegression(
            C=C,
            max_iter=1000,
            class_weight="balanced"
        )
    elif model_name == "rf":
        n_estimators = trial.suggest_int("n_estimators", 100, 400, step=50)
        max_depth = trial.suggest_int("max_depth", 3, 15)
        max_features = trial.suggest_float("max_features", 0.3, 1.0)
        clf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
            n_jobs=-1
        )
    elif model_name == "gb":
        n_estimators = trial.suggest_int("n_estimators", 100, 400, step=50)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
        max_depth = trial.suggest_int("max_depth", 2, 5)
        clf = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=42
        )
    elif model_name == "svm":
        C = trial.suggest_float("C", 0.1, 20.0, log=True)
        gamma = trial.suggest_float("gamma", 1e-4, 1.0, log=True)
        clf = SVC(
            kernel="rbf",
            C=C,
            gamma=gamma,
            probability=True,
            random_state=42
        )
    else:
        raise ValueError("unknown model name")

    steps = [("preprocess", preprocessor)]
    if use_pca:
        n_components = trial.suggest_int("n_components", 3, 10)
        steps.append(("pca", PCA(n_components=n_components)))
    steps.append(("clf", clf))
    pipe = Pipeline(steps)
    return pipe

In [6]:
# defining optuna objective to optimize f1 score
from sklearn.model_selection import train_test_split

def make_objective(model_name, use_pca):
    def objective(trial):
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train, y_train, test_size=0.25, random_state=trial.number, stratify=y_train
        )
        pipe = build_pipeline(model_name, trial, use_pca)
        pipe.fit(X_tr, y_tr)
        y_pred = pipe.predict(X_val)
        f1 = f1_score(y_val, y_pred)
        return f1
    return objective

In [7]:
# running optuna tuning for each model with and without pca
tuned_results = []

for use_pca in [False, True]:
    for model_name in model_names:
        study_name = f"{model_name}_{'with_pca' if use_pca else 'no_pca'}"
        study = optuna.create_study(direction="maximize", study_name=study_name)
        study.optimize(make_objective(model_name, use_pca), n_trials=20)

        best_trial = study.best_trial
        best_pipe = build_pipeline(model_name, best_trial, use_pca)
        best_pipe.fit(X_train, y_train)

        y_pred_test = best_pipe.predict(X_test)
        f1 = f1_score(y_test, y_pred_test)

        pca_tag = "with_pca" if use_pca else "no_pca"
        model_path = models_dir / f"{model_name}_{pca_tag}_with_optuna.pkl"
        joblib.dump(best_pipe, model_path)

        tuned_results.append({
            "model_name": model_name,
            "pca": use_pca,
            "optuna": True,
            "f1_score": f1,
            "model_path": str(model_path)
        })

results_optuna_df = pd.DataFrame(tuned_results)
results_optuna_df

[I 2025-12-18 15:31:23,919] A new study created in memory with name: logreg_no_pca
[I 2025-12-18 15:31:23,943] Trial 0 finished with value: 0.20408163265306123 and parameters: {'C': 0.00373827833042037}. Best is trial 0 with value: 0.20408163265306123.
[I 2025-12-18 15:31:23,956] Trial 1 finished with value: 0.21710526315789475 and parameters: {'C': 0.310202494828186}. Best is trial 1 with value: 0.21710526315789475.
[I 2025-12-18 15:31:23,967] Trial 2 finished with value: 0.23026315789473684 and parameters: {'C': 0.21398978576569896}. Best is trial 2 with value: 0.23026315789473684.
[I 2025-12-18 15:31:23,976] Trial 3 finished with value: 0.2591362126245847 and parameters: {'C': 0.0018783050948409815}. Best is trial 3 with value: 0.2591362126245847.
[I 2025-12-18 15:31:23,984] Trial 4 finished with value: 0.24060150375939848 and parameters: {'C': 0.0014405552620481946}. Best is trial 3 with value: 0.2591362126245847.
[I 2025-12-18 15:31:23,994] Trial 5 finished with value: 0.214511041

Unnamed: 0,model_name,pca,optuna,f1_score,model_path
0,logreg,False,True,0.184932,../models/logreg_no_pca_with_optuna.pkl
1,rf,False,True,0.042553,../models/rf_no_pca_with_optuna.pkl
2,gb,False,True,0.0,../models/gb_no_pca_with_optuna.pkl
3,svm,False,True,0.040816,../models/svm_no_pca_with_optuna.pkl
4,logreg,True,True,0.18315,../models/logreg_with_pca_with_optuna.pkl
5,rf,True,True,0.0,../models/rf_with_pca_with_optuna.pkl
6,gb,True,True,0.0,../models/gb_with_pca_with_optuna.pkl
7,svm,True,True,0.083333,../models/svm_with_pca_with_optuna.pkl


In [8]:
# saving the metrics for experiments with optuna
metrics_optuna_path = base_dir / "data" / "metrics_with_optuna.csv"
results_optuna_df.to_csv(metrics_optuna_path, index=False)
metrics_optuna_path

PosixPath('../data/metrics_with_optuna.csv')

In [9]:
# checking all metrics and finding the best model overall
import pandas as pd

no_opt = pd.read_csv("../data/metrics_no_optuna.csv")
with_opt = pd.read_csv("../data/metrics_with_optuna.csv")

all_metrics = pd.concat([no_opt, with_opt], ignore_index=True)
all_metrics_sorted = all_metrics.sort_values("f1_score", ascending=False)
best = all_metrics_sorted.iloc[0]
best

model_name                                     logreg
pca                                             False
optuna                                           True
f1_score                                     0.184932
model_path    ../models/logreg_no_pca_with_optuna.pkl
Name: 8, dtype: object