In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, clear_output
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw stroke dataset from the CSV in the working directory and preview the first rows.
dataset= pd.read_csv("stroke prediction.csv")
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [3]:
# Remove the columns that are not used as model features.
# The result is rebound to `dataset` rather than mutating the frame in place.
unused_columns = ["id", "Residence_type"]
dataset = dataset.drop(columns=unused_columns)

In [4]:
# Mark the text-valued columns as pandas categoricals.
# ("Residence_type" is not listed: that column is dropped from the frame earlier.)
for col in ("gender", "ever_married", "work_type", "smoking_status"):
    dataset[col] = dataset[col].astype("category")

In [5]:
# Work on a copy so the categorical `dataset` frame is not modified by the encoding steps that follow.
dataset_imputed=dataset.copy()

In [6]:
# Inspect the column dtypes before encoding.
dataset_imputed.dtypes

gender               category
age                   float64
hypertension            int64
heart_disease           int64
ever_married         category
work_type            category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
stroke                  int64
dtype: object

In [7]:
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder instance is re-fitted for each nominal column in turn,
# replacing the category labels with integer codes.
encoder = LabelEncoder()
for col in ("gender", "ever_married", "work_type"):
    dataset_imputed[col] = encoder.fit_transform(dataset_imputed[col])

# smoking_status gets an explicit ordinal mapping instead; entries that are
# not keys of the mapping (the missing values) remain NaN.
smoking_codes = {'never smoked': 0, 'formerly smoked': 1, 'smokes': 2}
dataset_imputed["smoking_status"] = dataset_imputed["smoking_status"].map(smoking_codes)

In [8]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix.
y = dataset_imputed["stroke"]
X = dataset_imputed.drop(columns="stroke")

# Hold out 33% of the rows. Stratifying on y keeps the class ratio the same in
# both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=88,
    stratify=y,
)

In [9]:
from sklearn.impute import KNNImputer

# Fill missing feature values with a KNNImputer (default settings).
# The imputer is fitted on the training split only and then applied to the
# test split, so no test-set information is used when learning the imputation.
imputer = KNNImputer()

# The imputer returns plain arrays, so the DataFrames are rebuilt here.
# Passing index= keeps the original row labels; without it both frames get a
# fresh 0..n-1 index and no longer align label-wise with y_train / y_test.
X_trimputed = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_teimputed = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# NOTE(review): the features are still on their raw scales at this point
# (min-max scaling is applied in a later cell), so wide-range columns
# presumably dominate the neighbour distances — consider scaling before imputing.

In [10]:
# KNN imputation averages neighbours, so imputed smoking_status codes can be
# fractional: snap them back to the nearest integer code.
X_trimputed["smoking_status"] = X_trimputed["smoking_status"].round().astype(int)

# work_type holds integer label codes that the imputer returned as floats;
# cast them back to int.
X_trimputed["work_type"] = X_trimputed["work_type"].astype(int)

# Kept as the cell's last expression so the counts are actually displayed
# (placed mid-cell, the result was computed and silently discarded).
X_trimputed["smoking_status"].value_counts()

In [11]:
# KNN imputation averages neighbours, so imputed smoking_status codes can be
# fractional: snap them back to the nearest integer code.
X_teimputed["smoking_status"] = X_teimputed["smoking_status"].round().astype(int)

# work_type holds integer label codes that the imputer returned as floats;
# cast them back to int.
X_teimputed["work_type"] = X_teimputed["work_type"].astype(int)

# Kept as the cell's last expression so the counts are actually displayed
# (placed mid-cell, the result was computed and silently discarded).
X_teimputed["smoking_status"].value_counts()

In [12]:
# Restore integer dtype for the 0/1 indicator columns in both the training
# and the test frame.
binary_cols = ["hypertension", "heart_disease", "ever_married"]
for frame in (X_trimputed, X_teimputed):
    frame[binary_cols] = frame[binary_cols].astype("int")

In [13]:
# One-hot encode any remaining object/category columns.
# NOTE(review): every column appears to be numeric by this point (the
# categorical columns were integer-coded earlier), in which case get_dummies
# encodes nothing — pass columns=[...] explicitly if one-hot encoding of
# gender / work_type is actually intended.
X_trimputed = pd.get_dummies(X_trimputed, drop_first=True)
X_teimputed = pd.get_dummies(X_teimputed, drop_first=True)

# Encoding the two splits separately can produce different dummy columns when
# a category is absent from one split. Force the test frame onto the training
# layout: dummies missing from test become 0, columns unseen in train are dropped.
X_teimputed = X_teimputed.reindex(columns=X_trimputed.columns, fill_value=0)

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every feature. The per-column min/max are learned from the
# training frame only and then reused unchanged for the test frame.
scaler = MinMaxScaler()
train_columns = X_trimputed.columns

X_trimputed = pd.DataFrame(scaler.fit_transform(X_trimputed), columns=train_columns)
X_teimputed = pd.DataFrame(scaler.transform(X_teimputed), columns=X_teimputed.columns)

In [15]:
from sklearn.metrics import classification_report,roc_auc_score,f1_score

In [16]:
# Show the final scaled training features (the rendered table is truncated by pandas).
X_trimputed

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status
0,0.0,0.865723,0.0,0.0,1.0,0.50,0.187511,0.219114,1.0
1,0.0,0.987793,0.0,0.0,1.0,0.50,0.188400,0.196970,0.0
2,0.0,0.609375,0.0,0.0,1.0,0.50,0.061769,0.136364,0.5
3,0.5,0.133301,0.0,0.0,0.0,1.00,0.125021,0.127040,0.0
4,0.5,0.731445,0.0,0.0,1.0,0.00,0.089731,0.219114,1.0
...,...,...,...,...,...,...,...,...,...
29073,0.0,0.890137,0.0,0.0,1.0,0.00,0.065582,0.033800,0.0
29074,0.0,0.523926,0.0,0.0,0.0,0.00,0.168658,0.226107,0.0
29075,0.0,0.829102,1.0,0.0,1.0,0.75,0.640061,0.191142,0.0
29076,0.0,0.279785,0.0,0.0,0.0,0.50,0.184926,0.209790,0.5


# AdaBoost

In [41]:
# Baseline AdaBoost ensemble built on depth-1 decision trees (stumps).
# class_weight='balanced' is set on the stump because the stroke target is
# heavily imbalanced.
stump = DecisionTreeClassifier(max_depth=1, class_weight='balanced')
clf = AdaBoostClassifier(base_estimator=stump, random_state=40)

In [42]:
# Fit the baseline AdaBoost ensemble on the scaled training features.
clf.fit(X_trimputed,y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight='balanced',
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                    

In [43]:
# Hard class predictions for both splits, so train and test performance can be compared.
y_train_pred=clf.predict(X_trimputed)
y_test_pred=clf.predict(X_teimputed)

In [44]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

           0       1.00      0.73      0.84     28553
           1       0.06      0.86      0.10       525

    accuracy                           0.73     29078
   macro avg       0.53      0.79      0.47     29078
weighted avg       0.98      0.73      0.83     29078



In [45]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       1.00      0.73      0.84     14064
           1       0.05      0.80      0.10       258

    accuracy                           0.73     14322
   macro avg       0.52      0.76      0.47     14322
weighted avg       0.98      0.73      0.83     14322



## Randomized Search (F1)

In [23]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [20]:
# Candidate weak learners for the search: class-balanced decision trees with
# max_depth 1 through 9.
list1 = [
    DecisionTreeClassifier(class_weight='balanced', max_depth=depth)
    for depth in range(1, 10)
]

In [22]:
# Hyper-parameter space sampled by the randomized search.
parameters = {
    'algorithm': ["SAMME", "SAMME.R"],
    # 50, 100, 150, 200, 250
    'n_estimators': np.arange(50, 300, 50),
    # 0.5, 0.6, ..., 2.4. Rounded linspace replaces np.arange(0.5, 2.5, 0.1):
    # with a fractional step, arange accumulates float error and its length is
    # not guaranteed, whereas this always yields exactly 20 clean values.
    'learning_rate': np.round(np.linspace(0.5, 2.4, 20), 1),
    # Candidate base estimators built in the preceding cell.
    'base_estimator': list1,
}

In [24]:
# Randomized hyper-parameter search over AdaBoost, scored with F1 under
# 5-fold cross-validation.
# random_state is set on the search itself: without it the 500 sampled
# candidates differ on every run, so best_params_ / best_score_ cannot be
# reproduced.
search = RandomizedSearchCV(
    AdaBoostClassifier(random_state=40),
    parameters,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
    n_iter=500,
    random_state=40,
)

In [25]:
# Run the search: 500 candidates x 5 folds = 2500 fits (the recorded run took about 34 minutes).
search.fit(X_trimputed,y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 23.4min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed: 29.3min
[Parallel(n_jobs=-1)]: Done 2500 out of 2500 | elapsed: 34.3min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                                base_estimator=None,
                                                learning_rate=1.0,
                                                n_estimators=50,
                                                random_state=40),
                   iid='deprecated', n_iter=500, n_jobs=-1,
                   param_distributions={'algorithm': ['SAMME', 'SAMME.R'],
                                        'base_estimator': [DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                  class_weight='balanced',
                                                                                  criterion='gini',
                                                                                  max...
                                                                                  min_samples_

In [26]:
# Best mean cross-validated F1 score found by the search.
search.best_score_

0.14027642034081564

In [27]:
# Hyper-parameter combination that achieved the best F1 score.
search.best_params_

{'n_estimators': 250,
 'learning_rate': 0.7,
 'base_estimator': DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                        max_depth=2, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),
 'algorithm': 'SAMME'}

In [28]:
# Rebuild AdaBoost with the best hyper-parameters reported by the F1 search:
# depth-2 class-balanced trees, SAMME, learning_rate=0.7, 250 estimators.
base_tree = DecisionTreeClassifier(max_depth=2, class_weight='balanced')
clf = AdaBoostClassifier(
    base_estimator=base_tree,
    algorithm='SAMME',
    random_state=40,
    learning_rate=0.7,
    n_estimators=250,
)

In [29]:
# Fit the tuned AdaBoost ensemble on the scaled training features.
clf.fit(X_trimputed,y_train)

AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight='balanced',
                                                         criterion='gini',
                                                         max_depth=2,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                      

In [30]:
# Hard class predictions from the tuned model for both splits.
y_train_pred=clf.predict(X_trimputed)
y_test_pred=clf.predict(X_teimputed)

In [41]:
# Predicted probabilities taken from column 1 of predict_proba (the second class) for both splits.
y_train_prob=clf.predict_proba(X_trimputed)[:,1]
y_test_prob=clf.predict_proba(X_teimputed)[:,1]

In [42]:
# Per-class precision / recall / F1 of the tuned model on the training split.
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

           0       0.99      0.91      0.95     28553
           1       0.09      0.46      0.15       525

    accuracy                           0.90     29078
   macro avg       0.54      0.69      0.55     29078
weighted avg       0.97      0.90      0.93     29078



In [43]:
# Per-class precision / recall / F1 of the tuned model on the held-out test split.
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.99      0.91      0.95     14064
           1       0.08      0.41      0.13       258

    accuracy                           0.90     14322
   macro avg       0.53      0.66      0.54     14322
weighted avg       0.97      0.90      0.93     14322



## Randomized Search (ROC AUC)

## Randomized Search

In [45]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [46]:
# Candidate weak learners for the search: class-balanced decision trees with
# max_depth 1 through 9.
list1 = [
    DecisionTreeClassifier(class_weight='balanced', max_depth=depth)
    for depth in range(1, 10)
]

In [47]:
# Hyper-parameter space sampled by the randomized search.
parameters = {
    'algorithm': ["SAMME", "SAMME.R"],
    # 50, 100, 150, 200, 250
    'n_estimators': np.arange(50, 300, 50),
    # 0.5, 0.6, ..., 2.4. Rounded linspace replaces np.arange(0.5, 2.5, 0.1):
    # with a fractional step, arange accumulates float error and its length is
    # not guaranteed, whereas this always yields exactly 20 clean values.
    'learning_rate': np.round(np.linspace(0.5, 2.4, 20), 1),
    # Candidate base estimators built in the preceding cell.
    'base_estimator': list1,
}

In [48]:
# Randomized hyper-parameter search over AdaBoost, scored with ROC AUC under
# 5-fold cross-validation.
# random_state is set on the search itself: without it the 500 sampled
# candidates differ on every run, so best_params_ / best_score_ cannot be
# reproduced.
search = RandomizedSearchCV(
    AdaBoostClassifier(random_state=40),
    parameters,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2,
    n_iter=500,
    random_state=40,
)

In [49]:
# Run the search: 500 candidates x 5 folds = 2500 fits (the recorded run took about 25 minutes).
search.fit(X_trimputed,y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed: 19.9min
[Parallel(n_jobs=-1)]: Done 2500 out of 2500 | elapsed: 24.7min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                                base_estimator=None,
                                                learning_rate=1.0,
                                                n_estimators=50,
                                                random_state=40),
                   iid='deprecated', n_iter=500, n_jobs=-1,
                   param_distributions={'algorithm': ['SAMME', 'SAMME.R'],
                                        'base_estimator': [DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                  class_weight='balanced',
                                                                                  criterion='gini',
                                                                                  max...
                                                                                  min_samples_

In [50]:
# Best mean cross-validated ROC AUC found by the search.
search.best_score_

0.8439353928859357

In [51]:
# Hyper-parameter combination that achieved the best ROC AUC.
search.best_params_

{'n_estimators': 50,
 'learning_rate': 0.7,
 'base_estimator': DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                        max_depth=1, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=None, splitter='best'),
 'algorithm': 'SAMME.R'}