In [17]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, ShuffleSplit

In [18]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [19]:
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [20]:
df = df.drop('id', axis=1)

In [21]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [23]:
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [24]:
df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,True,False,True,False,True,False,False,True,True,False,False
1,61.0,0,0,202.21,28.893237,1,False,False,True,False,False,True,False,False,False,True,False
2,80.0,0,1,105.92,32.5,1,True,False,True,False,True,False,False,False,False,True,False
3,49.0,0,0,171.23,34.4,1,False,False,True,False,True,False,False,True,False,False,True
4,79.0,1,0,174.12,24.0,1,False,False,True,False,False,True,False,False,False,True,False


In [6]:
X = df.drop('stroke', axis='columns')
y = df.stroke

In [7]:
X = df.drop('stroke', axis=1)  # Exclude the target variable
y = df['stroke']  # Target variable

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

logistic_params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

logistic_grid = GridSearchCV(LogisticRegression(max_iter=10000), logistic_params, cv=cv, scoring='accuracy')

logistic_grid.fit(X, y)

In [8]:
print("Best Parameters:", logistic_grid.best_params_)
print("Best Cross-Validation Score:", logistic_grid.best_score_)

Best Parameters: {'C': 0.1, 'solver': 'liblinear'}
Best Cross-Validation Score: 0.9526418786692759


In [11]:
import pickle

with open('logistic_model_stroke.pkl', 'wb') as file:
    pickle.dump(logistic_grid, file)

print("Model saved to 'logistic_model_stroke.pkl'")


Model saved to 'logistic_model_stroke.pkl'
