In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
import joblib
import logging
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

warnings.filterwarnings('ignore')



In [5]:
# Load the dataset
logger.info("Loading dataset...")
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

INFO:__main__:Loading dataset...


In [6]:
# Data Preprocessing
logger.info("Preprocessing data...")
# Handle missing values
df['bmi'] = df['bmi'].replace('N/A', np.nan)
df['bmi'] = df['bmi'].astype(float)
df['bmi'].fillna(df['bmi'].mean(), inplace=True)
logger.info(f"Missing BMI values filled. BMI null count: {df['bmi'].isna().sum()}")

INFO:__main__:Preprocessing data...
INFO:__main__:Missing BMI values filled. BMI null count: 0


In [7]:
# Drop 'id' column as it's not useful for prediction
df = df.drop('id', axis=1)

In [8]:
# Drop rows where gender is 'Other'
df = df[df['gender'] != 'Other']
logger.info(f"Gender value counts: {df['gender'].value_counts().to_dict()}")

INFO:__main__:Gender value counts: {'Female': 2994, 'Male': 2115}


In [9]:
# Encoding categorical values
df['gender'] = df['gender'].replace({'Male': 0, 'Female': 1}).astype(np.int8)
df['Residence_type'] = df['Residence_type'].replace({'Rural': 0, 'Urban': 1}).astype(np.int8)
df['work_type'] = df['work_type'].replace({'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3, 'Never_worked': 4}).astype(np.int8)

# Label encode ever_married and smoking_status
label_encoders = {}
for col in ['ever_married', 'smoking_status']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    logger.info(f"Label encoded {col}")

INFO:__main__:Label encoded ever_married
INFO:__main__:Label encoded smoking_status


In [10]:
# Define features and target
X = df.drop('stroke', axis=1)
y = df['stroke']

In [11]:
# Apply StandardScaler before train-test split
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
logger.info("Features scaled")

INFO:__main__:Features scaled


In [12]:
# Split the scaled data
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
logger.info("Data split into train and test sets")


INFO:__main__:Data split into train and test sets


In [13]:
# Apply SMOTE to balance the classes
logger.info("Applying SMOTE to balance classes...")
smote = SMOTE(random_state=42)
X_train_scaled, y_train = smote.fit_resample(X_train_scaled, y_train)
logger.info(f"Post-SMOTE class distribution: {pd.Series(y_train).value_counts().to_dict()}")

INFO:__main__:Applying SMOTE to balance classes...
INFO:__main__:Post-SMOTE class distribution: {0: 3888, 1: 3888}


In [14]:
# Initialize and train the model with GridSearchCV
logger.info("Training Gradient Boosting model with GridSearchCV...")
param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5]
}

INFO:__main__:Training Gradient Boosting model with GridSearchCV...


In [15]:
model = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
best_model = grid_search.best_estimator_
logger.info(f"Best parameters: {grid_search.best_params_}")

INFO:__main__:Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


In [16]:
# Make predictions
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
logger.info(f"Model Accuracy: {accuracy:.4f}")
logger.info(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Model Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

INFO:__main__:Model Accuracy: 0.9247
INFO:__main__:ROC-AUC Score: 0.5715


Model Accuracy: 0.9247
ROC-AUC Score: 0.5715

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       972
           1       0.20      0.18      0.19        50

    accuracy                           0.92      1022
   macro avg       0.58      0.57      0.57      1022
weighted avg       0.92      0.92      0.92      1022



In [17]:
# Save the model, scaler, and label encoders (commented out as per original code)
joblib.dump(best_model, './stroke_gb_model.pkl')
joblib.dump(scaler, './scaler_stroke.pkl')
joblib.dump(label_encoders, './label_encoder_stroke.pkl')

['./label_encoder_stroke.pkl']

In [18]:
# Define prediction function
def predict_stroke(sample, model, scaler, label_encoders, feature_order):
    try:
        logger.info("Preprocessing sample input...")
        sample_df = pd.DataFrame([sample])
        sample_df['gender'] = sample_df['gender'].replace({'Male': 0, 'Female': 1})
        for col in ['ever_married', 'smoking_status']:
            sample_df[col] = label_encoders[col].transform(sample_df[col])
        sample_df['work_type'] = sample_df['work_type'].replace({'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3, 'Never_worked': 4})
        sample_df['Residence_type'] = sample_df['Residence_type'].replace({'Rural': 0, 'Urban': 1})
        sample_df = sample_df[feature_order]
        sample_scaled = scaler.transform(sample_df)
        prediction = model.predict(sample_scaled)[0]
        probabilities = model.predict_proba(sample_scaled)[0]
        logger.info("Prediction successful")
        return {
            'prediction': 'Stroke' if prediction == 1 else 'No Stroke',
            'prob_no_stroke': probabilities[0],
            'prob_stroke': probabilities[1]
        }
    except Exception as e:
        logger.error(f"Error during prediction: {str(e)}")
        raise

In [19]:
# Test samples
feature_order = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 
                 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']

samples = [
    {
        'gender': 'Male',
        'age': 40,
        'hypertension': 1,
        'heart_disease': 0,
        'ever_married': 'Yes',
        'work_type': 'Govt_job',
        'Residence_type': 'Rural',
        'avg_glucose_level': 212.01,
        'bmi': 28.4,
        'smoking_status': 'never smoked'
    },
    {
        'gender': 'Male',
        'age': 67,
        'hypertension': 0,
        'heart_disease': 1,
        'ever_married': 'Yes',
        'work_type': 'Private',
        'Residence_type': 'Urban',
        'avg_glucose_level': 228.69,
        'bmi': 36.6,
        'smoking_status': 'formerly smoked'
    },
    {
        'gender': 'Female',
        'age': 49,
        'hypertension': 0,
        'heart_disease': 0,
        'ever_married': 'Yes',
        'work_type': 'Private',
        'Residence_type': 'Urban',
        'avg_glucose_level': 171.23,
        'bmi': 34.4,
        'smoking_status': 'smokes'
    },
    {
        'gender': 'Female',
        'age': 79,
        'hypertension': 1,
        'heart_disease': 0,
        'ever_married': 'Yes',
        'work_type': 'Self-employed',
        'Residence_type': 'Rural',
        'avg_glucose_level': 174.12,
        'bmi': 24.0,
        'smoking_status': 'never smoked'
    }
]

In [20]:
# Predict for each sample
for i, sample in enumerate(samples, 1):
    try:
        result = predict_stroke(sample, best_model, scaler, label_encoders, feature_order)
        print(f"\nSample {i}:")
        print(f"Prediction for stroke: {result['prediction']}")
        print(f"Probability of No Stroke (0): {result['prob_no_stroke']:.4f}")
        print(f"Probability of Stroke (1): {result['prob_stroke']:.4f}")
    except Exception as e:
        logger.error(f"Error predicting sample {i}: {str(e)}")

INFO:__main__:Preprocessing sample input...
INFO:__main__:Prediction successful
INFO:__main__:Preprocessing sample input...
INFO:__main__:Prediction successful
INFO:__main__:Preprocessing sample input...
INFO:__main__:Prediction successful
INFO:__main__:Preprocessing sample input...
INFO:__main__:Prediction successful



Sample 1:
Prediction for stroke: No Stroke
Probability of No Stroke (0): 0.9922
Probability of Stroke (1): 0.0078

Sample 2:
Prediction for stroke: Stroke
Probability of No Stroke (0): 0.2247
Probability of Stroke (1): 0.7753

Sample 3:
Prediction for stroke: No Stroke
Probability of No Stroke (0): 0.5185
Probability of Stroke (1): 0.4815

Sample 4:
Prediction for stroke: Stroke
Probability of No Stroke (0): 0.3719
Probability of Stroke (1): 0.6281


In [21]:
samples_two = [
    {#1
        'gender': 'Male',
        'age': 67,
        'hypertension': 0,
        'heart_disease': 1,
        'ever_married': 'Yes',
        'work_type': 'Private',
        'Residence_type': 'Urban',
        'avg_glucose_level': 228.69,
        'bmi': 36.6,
        'smoking_status': 'formerly smoked',
    },
    {#1
        'gender': 'Female',
        'age': 61,
        'hypertension': 0,
        'heart_disease': 1,
        'ever_married': 'Yes',
        'work_type': 'Govt_job',
        'Residence_type': 'Rural',
        'avg_glucose_level': 120.46,
        'bmi': 36.8,
        'smoking_status': 'smokes',
        
    },
    {
        'gender': 'Male',
        'age': 31,
        'hypertension': 1,
        'heart_disease': 0,
        'ever_married': 'Yes',
        'work_type': 'Govt_job',
        'Residence_type': 'Urban',
        'avg_glucose_level': 92.11,
        'bmi': None,
        'smoking_status': 'never smoked',
    } #0
]

In [22]:
# Predict for each sample
for i, sample in enumerate(samples_two, 1):
    try:
        result = predict_stroke(sample, best_model, scaler, label_encoders, feature_order)
        print(f"\nSample {i}:")
        print(f"Prediction for stroke: {result['prediction']}")
        print(f"Probability of No Stroke (0): {result['prob_no_stroke']:.4f}")
        print(f"Probability of Stroke (1): {result['prob_stroke']:.4f}")
    except Exception as e:
        logger.error(f"Error predicting sample {i}: {str(e)}")

INFO:__main__:Preprocessing sample input...
INFO:__main__:Prediction successful
INFO:__main__:Preprocessing sample input...
INFO:__main__:Prediction successful
INFO:__main__:Preprocessing sample input...
ERROR:__main__:Error during prediction: Input X contains NaN.
GradientBoostingClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
ERROR:__main__:Error predicting sample 3: Input X contains NaN.
GradientBoostingClassifier does not accept 


Sample 1:
Prediction for stroke: Stroke
Probability of No Stroke (0): 0.2247
Probability of Stroke (1): 0.7753

Sample 2:
Prediction for stroke: Stroke
Probability of No Stroke (0): 0.4291
Probability of Stroke (1): 0.5709
