In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import ( 
     
    TimeSeriesSplit,
    KFold,
    StratifiedKFold,
    GroupKFold,
    StratifiedGroupKFold
    
    
)

DataSet

In [2]:
data = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

In [3]:
data.shape

(5110, 12)

In [4]:
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
def get_prep_data():
    
    data["doctor"] = np.random.randint(0, 8, size=len(data))
    data["smoking_status"] = data["smoking_status"].astype("category")
    data["ever_married"] = data["ever_married"].replace("Yes", True).replace("No", False)
    data["gender"] = data["gender"].astype("category")
    data["Residence_type"] = data["Residence_type"].astype("category")
    data["work_type"] = data["work_type"].astype("category")
    holdout_ids = data.sample(n= 300, random_state = 32).index
    
    train = ( 
        
        data.loc[~data.id.isin(holdout_ids)]
        .sample(frac =1, random_state = 32)
        .sort_values("doctor")
        .reset_index(drop = True)
        )
    
    holdout = ( 
        data.loc[data.index.isin(holdout_ids)]
        .sample(frac =1, random_state =32)
        
        .sort_values("doctor")
        .reset_index(drop = True)
      )
    
    return train, holdout
train, holdout = get_prep_data()
    

In [6]:
train.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,doctor
0,15528,Male,58.0,1,0,True,Private,Rural,223.36,41.5,formerly smoked,0,0
1,24892,Male,64.0,0,0,True,Private,Rural,97.08,31.7,Unknown,0,0
2,51257,Male,32.0,0,0,False,Private,Rural,72.1,23.2,never smoked,0,0
3,58936,Male,59.0,0,0,True,Private,Rural,203.16,43.4,Unknown,0,0
4,47456,Male,30.0,0,0,True,Private,Rural,58.89,26.1,formerly smoked,0,0


In [7]:
train.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke', 'doctor'],
      dtype='object')

In [8]:
def get_x_y():
    
    FEATURES = ['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
           'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
           'smoking_status', 'stroke', 'doctor']

    GROUP = "doctor"

    TARGET = "stroke"
    
    x = data[FEATURES]
    y = data[TARGET]
    groups = data[GROUP]
    
    return x, y, groups, FEATURES
x, y, groups, FEATURES = get_x_y()
    
 

In [9]:
print(FEATURES)

['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke', 'doctor']


In [10]:
x, y, groups, FEATURES = get_x_y()

clf = lgb.LGBMClassifier(n_estimators = 10)
clf.fit(x, y)

In [11]:
print(FEATURES)

['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke', 'doctor']


In [12]:
print(data.dtypes)

id                      int64
gender               category
age                   float64
hypertension            int64
heart_disease           int64
ever_married             bool
work_type            category
Residence_type       category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
stroke                  int64
doctor                  int64
dtype: object


## Predictions


In [13]:
pred = clf.predict(x)
pred_prob = clf.predict_proba(x)[:, 1]
pred_prob


array([0.72343714, 0.72343714, 0.72343714, ..., 0.01789695, 0.01789695,
       0.01789695])

In [14]:
from sklearn.metrics import accuracy_score, roc_auc_score

acc_score = accuracy_score(y, pred)
roc_auc = roc_auc_score(y, pred_prob)  # Corrected the function name


In [15]:
acc_score


1.0

In [16]:
roc_auc

1.0

In [17]:
print(train)

         id  gender   age  hypertension  heart_disease  ever_married  \
0     15528    Male  58.0             1              0          True   
1     24892    Male  64.0             0              0          True   
2     51257    Male  32.0             0              0         False   
3     58936    Male  59.0             0              0          True   
4     47456    Male  30.0             0              0          True   
...     ...     ...   ...           ...            ...           ...   
5085  19498  Female  81.0             0              1         False   
5086  59872  Female  38.0             0              0          True   
5087  11176    Male   9.0             0              0         False   
5088  72361  Female  37.0             0              0          True   
5089   9404  Female  44.0             0              0          True   

          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0           Private          Rural             223.36  

In [18]:
print(holdout)

        id  gender   age  hypertension  heart_disease  ever_married  \
0    43172  Female  60.0             0              0          True   
1    14147    Male  49.0             0              0          True   
2     6731  Female  53.0             0              0         False   
3    59164  Female  24.0             0              0         False   
4    69979    Male  73.0             0              0          True   
..     ...     ...   ...           ...            ...           ...   
295  44177  Female  60.0             0              0          True   
296  25643    Male  36.0             0              0          True   
297  32457    Male  62.0             0              0          True   
298  31091    Male  34.0             0              1          True   
299  38951  Female  50.0             0              0          True   

         work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0          Private          Urban              57.89  30.9  formerly

In [19]:
FEATURES = ['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
           'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
           'smoking_status', 'stroke', 'doctor']
GROUP = "doctor"
TARGET = "stroke"
    

# check on Holdouts


In [20]:
x_holdouts = holdout[FEATURES]
y_holdouts = holdout[TARGET]

In [21]:
x_holdouts.shape
y_holdouts.shape

(300,)

In [22]:
y_holdouts.shape

(300,)

In [23]:
predict = clf.predict(x_holdouts)
predict_proba = clf.predict_proba(x_holdouts)[:, 1]
predict_proba.shape

(300,)

In [24]:
ac = accuracy_score(y_holdouts , predict)
auc = roc_auc_score(y_holdouts, predict_proba)

print(f"The accuracy :{ac:0.4f} and auc :{auc:0.4f}" )
 

The accuracy :1.0000 and auc :1.0000
