In [20]:
# Import libraries
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Data preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, auc, RocCurveDisplay, make_scorer
from imblearn.over_sampling import SMOTE 

# Modelling libraries
import optuna
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
import pickle

# General settings 
pd.set_option('display.max_columns', None)

In [29]:
db_name = r'C:\Users\marci\OneDrive\Pulpit\Infoshare_Academy_Data_Science\[Projekt]\[REPO]\jdszr16-random-forest-rangers\database\db_heart_disease.db'
conn = sqlite3.connect(db_name)

query = 'SELECT * FROM tbl_observations;'
df = pd.read_sql_query(query, conn)

conn.close()

In [30]:
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4,0,0,0,0,0,0,195,106.0,70.0,26.97,80,77,0
1,0,46,2,0,0,0,0,0,0,250,121.0,81.0,28.73,95,76,0
2,1,48,1,1,20,0,0,0,0,245,127.5,80.0,25.34,75,70,0
3,0,61,3,1,30,0,0,1,0,225,150.0,95.0,28.58,65,103,1
4,0,46,3,1,23,0,0,0,0,285,130.0,84.0,23.1,85,85,0


In [11]:
df.rename(columns={'male': 'sex'}, inplace=True)

for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].replace('NA', np.nan)
    df[column] = pd.to_numeric(df[column], errors='coerce')
    df[column] = df[column].astype('float64')
    
df = df.dropna()

df['education'] = df['education'].astype(int)

cat_columns = [col for col in df.columns if col in ['sex', 'education', 'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'TenYearCHD']]

num_columns = [col for col in df.columns if col in ['age', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'glucose']]

scaled_data = df.loc[:,~df.columns.isin(['heartRate','currentSmoker'])].copy()

In [17]:
# Splitting the data into features (X) and target variable (y)
X = scaled_data.drop('TenYearCHD', axis=1)
y = scaled_data['TenYearCHD']

# Splitting into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# resampling of X_train and y_train with SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)



In [18]:
def objective(trial):
    # Define the hyperparameters to tune
    C = trial.suggest_float('C', 0.001, 100, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    
    # Create and train the logistic regression model
    lr_model = LogisticRegression(random_state=42, penalty=penalty, C=C, solver='liblinear')
    lr_model.fit(X_train_resampled, y_train_resampled)
    
    # Predict probabilities for the test set
    y_pred_prob_lr = lr_model.predict_proba(X_test)[:, 1]
    
    # Calculate the AUC score
    auc_score = roc_auc_score(y_test, y_pred_prob_lr)
    
    return auc_score

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Train the final model with the best hyperparameters
best_lr_model = LogisticRegression(random_state=42, **study.best_params, solver='liblinear')
best_lr_model.fit(X_train_resampled, y_train_resampled)

[I 2025-01-29 20:22:17,761] A new study created in memory with name: no-name-ecec657c-1b0a-41c7-a796-8f82c34d7ca8
[I 2025-01-29 20:22:17,927] Trial 0 finished with value: 0.7289119503056597 and parameters: {'C': 0.19345836651262704, 'penalty': 'l2'}. Best is trial 0 with value: 0.7289119503056597.
[I 2025-01-29 20:22:17,944] Trial 1 finished with value: 0.7226693452967856 and parameters: {'C': 0.0054157138950036415, 'penalty': 'l1'}. Best is trial 0 with value: 0.7289119503056597.
[I 2025-01-29 20:22:17,956] Trial 2 finished with value: 0.7362822914612501 and parameters: {'C': 0.04290656692942343, 'penalty': 'l1'}. Best is trial 2 with value: 0.7362822914612501.
[I 2025-01-29 20:22:17,977] Trial 3 finished with value: 0.7276486393216328 and parameters: {'C': 9.803942677522796, 'penalty': 'l1'}. Best is trial 2 with value: 0.7362822914612501.
[I 2025-01-29 20:22:17,991] Trial 4 finished with value: 0.7397702622756852 and parameters: {'C': 0.0013770820544791004, 'penalty': 'l2'}. Best is

In [32]:
# Update numerical and categorical columns based on the actual dataset
numerical_columns = [col for col in ['age', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'glucose'] if col in scaled_data.columns]
categorical_columns = [col for col in ['sex', 'education', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes'] if col in scaled_data.columns]

# Define preprocessing steps for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),  # Scale numerical features
        ('cat', OneHotEncoder(), categorical_columns)  # One-hot encode categorical features
    ]
)

# Define the pipeline with SMOTE and Logistic Regression
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),  # Preprocessing
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),  # Resampling
    ('classifier', LogisticRegression())  # Classifier
])

# Train the final pipeline with the best hyperparameters
pipeline.set_params(classifier__C=best_lr_model.C, classifier__penalty=best_lr_model.penalty)
pipeline.fit(X_train, y_train)

# Calibration of the pipeline model
calibrated_pipeline = CalibratedClassifierCV(pipeline, method='sigmoid', cv='prefit')
calibrated_pipeline.fit(X_train, y_train)

# Save the calibrated pipeline to disk
with open('pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)
    
# Save the calibrated pipeline to disk
with open('calibrated_pipeline.pkl', 'wb') as f:
    pickle.dump(calibrated_pipeline, f)




In [26]:
y_pred = pipeline.predict_proba(X_test)[:,1]
y_pred

array([0.34964962, 0.48409878, 0.24027393, ..., 0.30075   , 0.28645192,
       0.33808647])