In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
import joblib


In [13]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")

# Store and save imputation values
imputation_values = {
    'bmi': df['bmi'].median(),
    'avg_glucose_level': df['avg_glucose_level'].median(),
    'hypertension': df['hypertension'].mode()[0],
    'heart_disease': df['heart_disease'].mode()[0]
}
joblib.dump(imputation_values, 'imputation_values.pkl')

# Handle missing values
df["bmi"].fillna(imputation_values['bmi'], inplace=True)
df["avg_glucose_level"].fillna(imputation_values['avg_glucose_level'], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["bmi"].fillna(imputation_values['bmi'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["avg_glucose_level"].fillna(imputation_values['avg_glucose_level'], inplace=True)


In [10]:
encoder = LabelEncoder()
columns_to_encode = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
encoders = {}  # Store encoders for production use

for col in columns_to_encode:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Save encoders for production
joblib.dump(encoders, 'encoders.pkl')

['encoders.pkl']

In [11]:
# Prepare features and target
X = df.drop(['stroke', 'id'], axis=1)
y = df['stroke']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save scaler for production
joblib.dump(scaler, 'scaler.pkl')

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize models
logreg = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
svc = SVC(probability=True, random_state=42)
knn = KNeighborsClassifier()

In [12]:
voting_clf = VotingClassifier(
    estimators=[('logreg', logreg), ('rf', rf), ('gb', gb), ('svc', svc), ('knn', knn)],
    voting='soft'
)
voting_clf.fit(X_train, y_train)

# Evaluate model
accuracy = voting_clf.score(X_test, y_test)
print(f'Model Accuracy: {accuracy:.4f}')

# Save final model
joblib.dump(voting_clf, 'stroke_model.pkl')

Model Accuracy: 0.9393


['stroke_model.pkl']