In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import StratifiedKFold,cross_val_score,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("F:\dataset_for_ML\supervised\BrainStrokeDataset\strokePredictionDataset.csv")

In [3]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [4]:
X = df.iloc[:,:-1]
y = df["stroke"]

In [5]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,formerly smoked
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,never smoked
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,smokes
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,never smoked
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.000000,formerly smoked
...,...,...,...,...,...,...,...,...,...,...
5177,Male,41.0,0,0,No,Private,Rural,70.15,29.756631,formerly smoked
5178,Male,40.0,0,0,Yes,Private,Urban,191.15,31.124172,smokes
5179,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,31.798304,smokes
5180,Male,40.0,0,0,Yes,Private,Rural,83.94,29.951301,smokes


In [6]:
df["smoking_status"].value_counts()


smoking_status
never smoked       1878
Unknown            1561
formerly smoked     915
smokes              828
Name: count, dtype: int64

In [7]:
cat_ohe = ["gender","ever_married","work_type","Residence_type"]
cat_ordinal = ["smoking_status"]
smoking_status_label = [["smokes","formerly smoked","Unknown","never smoked"]]
col_to_scale = ["age","avg_glucose_level","bmi"]

In [8]:
cat_ohe_ppl = Pipeline(steps=[
    ("OheEncoder",OneHotEncoder(drop="first",sparse_output=False))
])
scaling_ppl = Pipeline(steps=[
    ("SS",StandardScaler())
])
ordinal_ppl = Pipeline(steps=[
    ("OrdinalEncoder",OrdinalEncoder(categories=smoking_status_label))
])

In [9]:
preprocessing = ColumnTransformer(
    transformers=[
        ("OHE",cat_ohe_ppl,cat_ohe),
        ("Ordinal",ordinal_ppl,cat_ordinal),
        ("scale",scaling_ppl,col_to_scale),
    ]
    ,remainder="passthrough",
    n_jobs=-1
)

In [10]:
preprocessing

In [11]:
lrpipe = make_pipeline(preprocessing,LogisticRegression())
dtpipe = make_pipeline(preprocessing,DecisionTreeClassifier())
rfcpipe = make_pipeline(preprocessing,RandomForestClassifier(random_state=42))

In [12]:
rfcpipe

In [13]:
def modelBuilding(X,y,pipe):
    skfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
    accuracy_list,precision_list,recall_list,f1_list = [],[],[],[]
    for train_idx,test_idx in skfold.split(X,y):
        # print(train_idx,test_idx)
        X_train,X_test = X.iloc[train_idx],X.iloc[test_idx]
        # print(X_train)
        y_train,y_test = y[train_idx],y[test_idx]
        # print(X_train)
        pipe.fit(X_train,y_train)
        pred = pipe.predict(X_test)

        accuracy_list.append(accuracy_score(pred,y_test))
        precision_list.append(precision_score(pred,y_test))
        recall_list.append(recall_score(pred,y_test))
        f1_list.append(f1_score(pred,y_test))
    
    return accuracy_list,precision_list,recall_list,f1_list
        

In [14]:
acc_list,precision_list,recall_list,f1_list = modelBuilding(X,y,lrpipe)

For DecisionTreeClassifier



In [15]:
# just simple DT 
param_grid = {
    "decisiontreeclassifier__max_depth" : [3,5,10,None],
    "decisiontreeclassifier__min_samples_split" : [2,5]
}

In [16]:
grid = GridSearchCV(dtpipe,param_grid,cv=5,n_jobs=-1,scoring="accuracy")

In [17]:
grid.fit(X,y)

In [18]:
grid.best_score_,grid.best_params_

(0.9438445471232356,
 {'decisiontreeclassifier__max_depth': 3,
  'decisiontreeclassifier__min_samples_split': 2})

In [19]:
best_pipe = grid.best_estimator_

In [20]:
acc_list,precision_list,recall_list,f1_list = modelBuilding(X,y,best_pipe)

In [21]:
acc_list,precision_list,recall_list,f1_list

([0.9431051108968177,
  0.944069431051109,
  0.944015444015444,
  0.9411196911196911,
  0.944980694980695],
 [0.0, 0.0, 0.0, 0.017543859649122806, 0.0],
 [0.0, 0.0, 0.0, 0.16666666666666666, 0.0],
 [0.0, 0.0, 0.0, 0.031746031746031744, 0.0])

FOR RANDOM FOREST CLASSIFIER 

In [22]:
param_grid_rfc = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_depth': [None, 5, 10],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2],
    'randomforestclassifier__max_features': ['auto', 'sqrt', 'log2']
}


In [23]:
grid_rfc = GridSearchCV(rfcpipe,param_grid_rfc,cv=5,scoring="accuracy",n_jobs=-1)

In [24]:
grid_rfc.fit(X,y)

In [25]:
grid_rfc.best_params_

{'randomforestclassifier__max_depth': None,
 'randomforestclassifier__max_features': 'sqrt',
 'randomforestclassifier__min_samples_leaf': 1,
 'randomforestclassifier__min_samples_split': 2,
 'randomforestclassifier__n_estimators': 100}

In [26]:
grid_rfc_model  = grid_rfc.best_estimator_

In [27]:
acc_list,precision_list,recall_list,f1_list = modelBuilding(X,y,grid_rfc_model)

In [28]:
acc_list,precision_list,recall_list,f1_list

([0.9604628736740598,
  0.9614271938283511,
  0.9555984555984556,
  0.9507722007722008,
  0.9555984555984556],
 [0.3103448275862069,
  0.3103448275862069,
  0.20689655172413793,
  0.15789473684210525,
  0.22807017543859648],
 [0.9473684210526315, 1.0, 1.0, 0.75, 0.8666666666666667],
 [0.4675324675324675,
  0.47368421052631576,
  0.34285714285714286,
  0.2608695652173913,
  0.3611111111111111])

LETS CALCULATE THE CROSS_VAL_SCORE ... 

In [40]:
def crossValScore(X,y,pipe):
    # score = []
    skfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
    # score.append(cross_val_score(pipe,X,y,cv=skfold,scoring="accuracy"))
    return (cross_val_score(pipe,X,y,cv=skfold,scoring="accuracy"))
    # return score

In [41]:
score_lr_pipe = crossValScore(X,y,lrpipe)

In [45]:
score_lr_pipe,score_lr_pipe.mean(),score_lr_pipe.std()

(array([0.94406943, 0.94406943, 0.94498069, 0.94498069, 0.94498069]),
 0.9446161894088606,
 0.0004464263296978743)

In [49]:
score_dtc = crossValScore(X,y,best_pipe)

In [50]:
score_dtc,score_dtc.mean(),score_dtc.std()

(array([0.94310511, 0.94406943, 0.94401544, 0.94111969, 0.94498069]),
 0.9434580744127514,
 0.0013111377407456879)

In [51]:
score_rfc = crossValScore(X,y,grid_rfc_model)

In [52]:
score_rfc,score_rfc.mean(),score_rfc.std()

(array([0.96046287, 0.96142719, 0.95559846, 0.9507722 , 0.95559846]),
 0.9567718358943045,
 0.0038482562706094407)