<a href="https://colab.research.google.com/github/songthao1610/Map-Modelling---Project-1/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE




# New Section

In [17]:
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [18]:
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [19]:
data.isnull().sum()

Unnamed: 0,0
id,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,201


In [20]:
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())


In [51]:
data['stroke'].value_counts()

Unnamed: 0_level_0,count
stroke,Unnamed: 1_level_1
0,4861
1,249


In [21]:
data.isnull().sum()

Unnamed: 0,0
id,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,0


In [22]:
for feature in data.columns:
  print(f'{feature}:', data[feature].unique())

id: [ 9046 51676 31112 ... 19723 37544 44679]
gender: ['Male' 'Female' 'Other']
age: [6.70e+01 6.10e+01 8.00e+01 4.90e+01 7.90e+01 8.10e+01 7.40e+01 6.90e+01
 5.90e+01 7.80e+01 5.40e+01 5.00e+01 6.40e+01 7.50e+01 6.00e+01 5.70e+01
 7.10e+01 5.20e+01 8.20e+01 6.50e+01 5.80e+01 4.20e+01 4.80e+01 7.20e+01
 6.30e+01 7.60e+01 3.90e+01 7.70e+01 7.30e+01 5.60e+01 4.50e+01 7.00e+01
 6.60e+01 5.10e+01 4.30e+01 6.80e+01 4.70e+01 5.30e+01 3.80e+01 5.50e+01
 1.32e+00 4.60e+01 3.20e+01 1.40e+01 3.00e+00 8.00e+00 3.70e+01 4.00e+01
 3.50e+01 2.00e+01 4.40e+01 2.50e+01 2.70e+01 2.30e+01 1.70e+01 1.30e+01
 4.00e+00 1.60e+01 2.20e+01 3.00e+01 2.90e+01 1.10e+01 2.10e+01 1.80e+01
 3.30e+01 2.40e+01 3.40e+01 3.60e+01 6.40e-01 4.10e+01 8.80e-01 5.00e+00
 2.60e+01 3.10e+01 7.00e+00 1.20e+01 6.20e+01 2.00e+00 9.00e+00 1.50e+01
 2.80e+01 1.00e+01 1.80e+00 3.20e-01 1.08e+00 1.90e+01 6.00e+00 1.16e+00
 1.00e+00 1.40e+00 1.72e+00 2.40e-01 1.64e+00 1.56e+00 7.20e-01 1.88e+00
 1.24e+00 8.00e-01 4.00e-01 8.00e-02 1.

In [41]:
df = data.copy()
bins = [0, 18, 30, 40, 50, 100]  # Define the edges of the bins
labels = ['0-18', '19-30', '31-40', '41-50', '51+']  # Labels for the bins
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)


In [42]:
df['age_group']

Unnamed: 0,age_group
0,51+
1,51+
2,51+
3,41-50
4,51+
...,...
5105,51+
5106,51+
5107,31-40
5108,51+


In [58]:
label_encoder = LabelEncoder()

cat_columns = ['gender','ever_married','work_type','Residence_type','smoking_status','age_group']
for column in cat_columns:
    df[column] = label_encoder.fit_transform(df[column])

Y = df['stroke']
X = df.drop(columns = ['stroke','id','age'])
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Fit and apply SMOTE to the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)



In [59]:
X_train_resampled

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,age_group
0,1,0,0,0,4,0,90.420000,16.200000,0,0
1,1,0,0,0,2,1,207.580000,22.800000,3,1
2,1,1,0,1,2,0,91.280000,26.500000,2,3
3,0,1,0,1,2,1,150.740000,40.300000,0,4
4,0,0,0,1,2,0,82.570000,36.000000,1,4
...,...,...,...,...,...,...,...,...,...,...
6829,1,0,0,0,1,0,217.896120,28.320992,0,4
6830,1,1,0,1,2,0,247.301388,27.934177,2,4
6831,0,0,1,0,2,0,60.668965,29.473962,1,4
6832,0,0,0,1,1,1,110.723110,25.791870,0,4


In [63]:

classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=500),
    'Random Forest': RandomForestClassifier()}
param_grid = {
    'Logistic Regression': {'C': [0.1, 1, 10]},
    'Random Forest': {'n_estimators': [10, 50, 100], 'max_depth': [ 10, 20]}

}

best_estimators = {}

for clf_name, clf in classifiers.items():
    print(f"Performing GridSearchCV for {clf_name}...")

    # Use GridSearchCV for parameter tuning
    grid_search = GridSearchCV(clf, param_grid[clf_name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_resampled, y_train_resampled)

    # Store the best estimator
    best_estimators[clf_name] = grid_search.best_estimator_

    # Evaluate on test data
    y_pred = grid_search.best_estimator_.predict(X_test)
    print(f"Best Model for {clf_name}: {grid_search.best_params_}")
    print(f"Accuracy for {clf_name}: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report for {clf_name}:\n{classification_report(y_test, y_pred)}")

    print("-" * 50)
best_model_name = max(best_estimators, key=lambda clf: accuracy_score(y_test, best_estimators[clf].predict(X_test)))
print(f"Overall best model: {best_model_name}")


Performing GridSearchCV for Logistic Regression...
Best Model for Logistic Regression: {'C': 0.1}
Accuracy for Logistic Regression: 0.7195042400521853
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.97      0.73      0.83      1444
           1       0.12      0.57      0.19        89

    accuracy                           0.72      1533
   macro avg       0.54      0.65      0.51      1533
weighted avg       0.92      0.72      0.79      1533

--------------------------------------------------
Performing GridSearchCV for Random Forest...
Best Model for Random Forest: {'max_depth': 20, 'n_estimators': 50}
Accuracy for Random Forest: 0.8793215916503587
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.95      0.92      0.93      1444
           1       0.15      0.22      0.18        89

    accuracy                           0.88      1533
   ma