In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the stroke dataset from the CSV file in the working directory
csv_path = 'healthcare-dataset-stroke-data.csv'
df = pd.read_csv(csv_path)

In [4]:
# Data Preprocessing
# Handle missing values in 'bmi'.
# Any literal 'N/A' strings are turned into NaN and the column is coerced to
# float so that a numeric mean can be computed.
df['bmi'] = df['bmi'].replace('N/A', np.nan).astype(float)

# Impute the gaps with the column mean. The filled Series is assigned back
# rather than calling fillna(inplace=True) on the df['bmi'] selection: the
# in-place form mutates an intermediate object, is deprecated in pandas 2.x
# and silently does nothing once Copy-on-Write is enabled.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [5]:
# Sanity check: count the missing 'bmi' values left after imputation
df['bmi'].isna().sum()

0

In [6]:
# The 'id' column is only a row identifier and carries no predictive signal
df = df.drop(columns='id')

In [7]:
# Encoding categorical values
# One lookup table per column: category label -> small integer code.
categorical_codes = {
    'gender': {'Male': 0, 'Female': 1, 'Other': 2},
    'Residence_type': {'Rural': 0, 'Urban': 1},
    'work_type': {'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3, 'Never_worked': 4},
}

# Apply each table in turn and store the codes as int8
for column, codes in categorical_codes.items():
    df[column] = df[column].replace(codes).astype(np.int8)

In [8]:
# Inspect the column dtypes after encoding gender, Residence_type and work_type
df.dtypes

gender                  int8
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type               int8
Residence_type          int8
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [9]:
# Keep only rows whose gender code is not 2 (the code assigned to 'Other')
keep_mask = df['gender'].ne(2)
df = df.loc[keep_mask]

In [10]:
# Confirm which gender codes remain after filtering out code 2
df['gender'].value_counts()

gender
1    2994
0    2115
Name: count, dtype: int64

In [11]:
# Re-check the column dtypes after the row filter
df.dtypes

gender                  int8
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type               int8
Residence_type          int8
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [12]:
# Look at the distribution of age values before rescaling them
df['age'].value_counts()

age
78.00    102
57.00     95
52.00     90
54.00     87
51.00     86
        ... 
1.40       3
0.48       3
0.16       3
0.40       2
0.08       2
Name: count, Length: 104, dtype: int64

In [13]:
# Express age in months instead of years (12 months per year).
# NOTE: re-running this cell multiplies the column again.
df['age'] = df['age'].mul(12)

In [14]:
# Truncate age (in months) to whole numbers. A vectorised cast replaces the
# per-element Python lambda; like int(x), astype truncates toward zero, and
# the resulting column is int64.
df['age'] = df['age'].astype('int64')

In [15]:
# Verify the dtype of 'age' after the integer conversion
df.dtypes

gender                  int8
age                    int64
hypertension           int64
heart_disease          int64
ever_married          object
work_type               int8
Residence_type          int8
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [16]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [17]:
# Integer-encode the remaining text columns and keep every fitted encoder,
# keyed by column name, so the same mapping can be reapplied later.
label_encoders = {}
for col in ('ever_married', 'smoking_status'):
    encoder = LabelEncoder().fit(df[col])
    df[col] = encoder.transform(df[col])
    label_encoders[col] = encoder

In [18]:
# Define features and target
X = df.drop('stroke', axis=1)
y = df['stroke']

# Split BEFORE scaling so the held-out rows never influence the scaling
# statistics (fitting the scaler on the full dataset leaks test-set
# information into training). The split is stratified on the target and
# uses the same seed and test fraction as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn mean / standard deviation from the training rows only, then apply
# that same transform to both partitions.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scaled version of the full feature matrix, kept under its original name
# for any code that still refers to it.
X_scaled = scaler.transform(X)

In [19]:
# Persist the fitted scaler and label encoders.
# The prediction cells further down load './scaler_stroke.pkl' and
# './label_encoder_stroke.pkl', so these two lines have to be un-commented
# and run once for those files to exist.
# joblib.dump(scaler, './scaler_stroke.pkl')
# joblib.dump(label_encoders, './label_encoder_stroke.pkl')

In [20]:
# Baseline: class-weighted random forest fitted on the scaled training split
model = RandomForestClassifier(
    n_estimators=50,
    class_weight='balanced',
    random_state=42,
).fit(X_train_scaled, y_train)

# Score the held-out split
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report overall accuracy followed by the per-class metrics
print(f"Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)


Model Accuracy: 0.9501

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       972
           1       0.00      0.00      0.00        50

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.90      0.95      0.93      1022



In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [22]:
# Candidate classifiers, each configured with regularisation or depth limits
# intended to reduce overfitting
models = dict(
    logistic_regression=LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000, C=0.1),
    svm=SVC(random_state=42, class_weight='balanced', probability=True, C=0.5),
    random_forest=RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=100, max_depth=5),
    gradient_boosting=GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=3, learning_rate=0.05),
    knn=KNeighborsClassifier(n_neighbors=5),
)

# Fit every candidate on the training split and report its test-set metrics.
# NOTE: `model` and `model_name` stay bound to the last entry after the loop.
for model_name, model in models.items():
    print(f"\nTraining {model_name.replace('_', ' ').title()}...")
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report for {model_name.replace('_', ' ').title()}:")
    print(report)


Training Logistic Regression...
Accuracy: 0.7339
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.99      0.73      0.84       972
           1       0.13      0.80      0.23        50

    accuracy                           0.73      1022
   macro avg       0.56      0.77      0.53      1022
weighted avg       0.94      0.73      0.81      1022


Training Svm...
Accuracy: 0.7153
Classification Report for Svm:
              precision    recall  f1-score   support

           0       0.98      0.71      0.83       972
           1       0.12      0.78      0.21        50

    accuracy                           0.72      1022
   macro avg       0.55      0.75      0.52      1022
weighted avg       0.94      0.72      0.80      1022


Training Random Forest...
Accuracy: 0.7035
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.99      0.70      0.82 

In [23]:
# Fit a standalone 5-nearest-neighbours classifier on the scaled training split
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Predict with the estimator fitted in this cell. Using `knn` (and a fixed
# report label) keeps the cell independent of the `model` / `model_name`
# variables left behind by the model-comparison loop.
y_pred = knn.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report for Knn:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9491
Classification Report for Knn:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       972
           1       0.25      0.02      0.04        50

    accuracy                           0.95      1022
   macro avg       0.60      0.51      0.51      1022
weighted avg       0.92      0.95      0.93      1022



In [24]:
# Save the model.
# NOTE(review): `model` at this point is whatever the model-comparison loop
# bound last (the 'knn' entry of `models`), not the separately fitted `knn`
# variable. The prediction cells below load './knn_Stroke-im.pkl', so this
# line has to be un-commented and run once for that file to exist.
# joblib.dump(model, f'./knn_Stroke-im.pkl')

1 stroke below

In [30]:
def predict_stroke(sample, model, scaler, label_encoders):
    """Encode one raw patient record the way the training data was encoded and score it.

    Parameters
    ----------
    sample : dict
        Raw record with the keys 'gender', 'age' (in years), 'hypertension',
        'heart_disease', 'ever_married', 'work_type', 'Residence_type',
        'avg_glucose_level', 'bmi' and 'smoking_status'.
    model : object
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    scaler : object
        Fitted scaler exposing ``transform``.
    label_encoders : dict
        Fitted encoders keyed by 'ever_married' and 'smoking_status'.

    Returns
    -------
    tuple
        ``(prediction, probabilities)`` for the single record.

    Raises
    ------
    ValueError
        If a manually encoded column holds a category with no known code.
    """
    # Column order used when the scaler and model were fitted
    feature_order = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type',
                     'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']
    # Manual category -> code tables, identical to the training-time codes
    manual_codes = {
        'gender': {'Male': 0, 'Female': 1},
        'work_type': {'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3, 'Never_worked': 4},
        'Residence_type': {'Rural': 0, 'Urban': 1},
    }

    sample_df = pd.DataFrame([sample])

    # map() is a strict lookup: an unknown category becomes NaN and is
    # reported here instead of slipping through as a raw string.
    for column, codes in manual_codes.items():
        sample_df[column] = sample_df[column].map(codes)
        if sample_df[column].isna().any():
            raise ValueError(f"Unknown value for '{column}': {sample[column]!r}")

    # Training stored age as whole months
    sample_df['age'] = (sample_df['age'] * 12).astype('int64')

    # Reuse the fitted label encoders for the remaining text columns
    for column in ('ever_married', 'smoking_status'):
        sample_df[column] = label_encoders[column].transform(sample_df[column])

    sample_scaled = scaler.transform(sample_df[feature_order])
    return model.predict(sample_scaled)[0], model.predict_proba(sample_scaled)[0]


# Load the trained KNN model, scaler, and label encoders.
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifact
# files that you produced yourself.
knn = joblib.load('./knn_Stroke-im.pkl')
scaler = joblib.load('./scaler_stroke.pkl')
label_encoders = joblib.load('./label_encoder_stroke.pkl')

# Define the sample input (including age, excluding id and stroke)
sample = {
    'gender': 'Male',
    'age': 40,  # Age in years, will be converted to months
    'hypertension': 1,
    'heart_disease': 0,
    'ever_married': 'Yes',
    'work_type': 'Govt_job',  # Encoded as 2
    'Residence_type': 'Rural',  # Encoded as 0
    'avg_glucose_level': 212.01,
    'bmi': 28.4,
    'smoking_status': 'never smoked'
}

# Make prediction
prediction, probabilities = predict_stroke(sample, knn, scaler, label_encoders)

# Output results
print(f"Prediction for stroke: {'Stroke' if prediction == 1 else 'No Stroke'}")
print(f"Probability of No Stroke (0): {probabilities[0]:.4f}")
print(f"Probability of Stroke (1): {probabilities[1]:.4f}")

Prediction for stroke: No Stroke
Probability of No Stroke (0): 1.0000
Probability of Stroke (1): 0.0000


0 stroke below

In [31]:
# Load the trained KNN model, scaler, and label encoders
knn = joblib.load('./knn_Stroke-im.pkl')
scaler = joblib.load('./scaler_stroke.pkl')
label_encoders = joblib.load('./label_encoder_stroke.pkl')

# Define the sample input (including age, excluding id and stroke)
sample = {
    'gender': 'Male',
    'age': 67,  # Age in years, will be converted to months
    'hypertension': 0,
    'heart_disease': 1,
    'ever_married': 'Yes',
    'work_type': 'Private',  # Encoded as 0
    'Residence_type': 'Urban',  # Encoded as 1
    'avg_glucose_level': 228.69,
    'bmi': 36.6,
    'smoking_status': 'formerly smoked'
}

# Convert sample to a one-row DataFrame
sample_df = pd.DataFrame([sample])

# Manual category -> code tables, identical to the training-time codes
manual_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'work_type': {'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3, 'Never_worked': 4},
    'Residence_type': {'Rural': 0, 'Urban': 1},
}
# map() is a strict lookup: an unknown category becomes NaN and is reported
# here instead of slipping through as a raw string.
for column, codes in manual_codes.items():
    sample_df[column] = sample_df[column].map(codes)
    if sample_df[column].isna().any():
        raise ValueError(f"Unknown value for '{column}': {sample[column]!r}")

# Convert age to months and store it as a whole number
sample_df['age'] = (sample_df['age'] * 12).astype('int64')

# Label encode ever_married and smoking_status with the fitted encoders
for col in ['ever_married', 'smoking_status']:
    sample_df[col] = label_encoders[col].transform(sample_df[col])

# Ensure columns are in the same order as training
feature_order = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type',
                 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']
sample_df = sample_df[feature_order]

# Scale the features
sample_scaled = scaler.transform(sample_df)

# Make prediction
prediction = knn.predict(sample_scaled)[0]
probabilities = knn.predict_proba(sample_scaled)[0]

# Output results
print(f"Prediction for stroke: {'Stroke' if prediction == 1 else 'No Stroke'}")
print(f"Probability of No Stroke (0): {probabilities[0]:.4f}")
print(f"Probability of Stroke (1): {probabilities[1]:.4f}")


Prediction for stroke: No Stroke
Probability of No Stroke (0): 0.6000
Probability of Stroke (1): 0.4000


In [32]:
# Load the trained KNN model, scaler, and label encoders
knn = joblib.load('./knn_Stroke-im.pkl')
scaler = joblib.load('./scaler_stroke.pkl')
label_encoders = joblib.load('./label_encoder_stroke.pkl')

# Define the sample input (including age, excluding id and stroke)
sample = {
    'gender': 'Female',
    'age': 49,  # Age in years, will be converted to months
    'hypertension': 0,
    'heart_disease': 0,
    'ever_married': 'Yes',
    'work_type': 'Private',  # Encoded as 0
    'Residence_type': 'Urban',  # Encoded as 1
    'avg_glucose_level': 171.23,
    'bmi': 34.4,
    'smoking_status': 'smokes'
}

# Convert sample to a one-row DataFrame
sample_df = pd.DataFrame([sample])

# Manual category -> code tables, identical to the training-time codes
manual_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'work_type': {'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3, 'Never_worked': 4},
    'Residence_type': {'Rural': 0, 'Urban': 1},
}
# map() is a strict lookup: an unknown category becomes NaN and is reported
# here instead of slipping through as a raw string.
for column, codes in manual_codes.items():
    sample_df[column] = sample_df[column].map(codes)
    if sample_df[column].isna().any():
        raise ValueError(f"Unknown value for '{column}': {sample[column]!r}")

# Convert age to months and store it as a whole number
sample_df['age'] = (sample_df['age'] * 12).astype('int64')

# Label encode ever_married and smoking_status with the fitted encoders
for col in ['ever_married', 'smoking_status']:
    sample_df[col] = label_encoders[col].transform(sample_df[col])

# Ensure columns are in the same order as training
feature_order = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type',
                 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']
sample_df = sample_df[feature_order]

# Scale the features
sample_scaled = scaler.transform(sample_df)

# Make prediction
prediction = knn.predict(sample_scaled)[0]
probabilities = knn.predict_proba(sample_scaled)[0]

# Output results
print(f"Prediction for stroke: {'Stroke' if prediction == 1 else 'No Stroke'}")
print(f"Probability of No Stroke (0): {probabilities[0]:.4f}")
print(f"Probability of Stroke (1): {probabilities[1]:.4f}")




Prediction for stroke: No Stroke
Probability of No Stroke (0): 0.8000
Probability of Stroke (1): 0.2000


1 stroke below

In [33]:
# Load the trained KNN model, scaler, and label encoders
knn = joblib.load('./knn_Stroke-im.pkl')
scaler = joblib.load('./scaler_stroke.pkl')
label_encoders = joblib.load('./label_encoder_stroke.pkl')

# Define the sample input (including age, excluding id and stroke)
sample = {
    'gender': 'Female',
    'age': 79,  # Age in years, will be converted to months
    'hypertension': 1,
    'heart_disease': 0,
    'ever_married': 'Yes',
    'work_type': 'Self-employed',  # Encoded as 1
    'Residence_type': 'Rural',  # Encoded as 0
    'avg_glucose_level': 174.12,
    'bmi': 24.0,
    'smoking_status': 'never smoked'
}

# Convert sample to a one-row DataFrame
sample_df = pd.DataFrame([sample])

# Manual category -> code tables, identical to the training-time codes
manual_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'work_type': {'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3, 'Never_worked': 4},
    'Residence_type': {'Rural': 0, 'Urban': 1},
}
# map() is a strict lookup: an unknown category becomes NaN and is reported
# here instead of slipping through as a raw string.
for column, codes in manual_codes.items():
    sample_df[column] = sample_df[column].map(codes)
    if sample_df[column].isna().any():
        raise ValueError(f"Unknown value for '{column}': {sample[column]!r}")

# Convert age to months and store it as a whole number
sample_df['age'] = (sample_df['age'] * 12).astype('int64')

# Label encode ever_married and smoking_status with the fitted encoders
for col in ['ever_married', 'smoking_status']:
    sample_df[col] = label_encoders[col].transform(sample_df[col])

# Ensure columns are in the same order as training
feature_order = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type',
                 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']
sample_df = sample_df[feature_order]

# Scale the features
sample_scaled = scaler.transform(sample_df)

# Make prediction
prediction = knn.predict(sample_scaled)[0]
probabilities = knn.predict_proba(sample_scaled)[0]

# Output results
print(f"Prediction for stroke: {'Stroke' if prediction == 1 else 'No Stroke'}")
print(f"Probability of No Stroke (0): {probabilities[0]:.4f}")
print(f"Probability of Stroke (1): {probabilities[1]:.4f}")


Prediction for stroke: No Stroke
Probability of No Stroke (0): 0.6000
Probability of Stroke (1): 0.4000
