## Model building and training

In [69]:
import pandas as pd

In [70]:
import lightgbm

In [71]:
# Load the heart-disease dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("heart.csv")

In [72]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [73]:
# Inspect column dtypes; object-typed columns are the ones treated as categorical below.
df.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [74]:
# Separate the target column from the predictor columns.
y = df['HeartDisease']
x = df.drop(columns=['HeartDisease'])

In [75]:
# Boolean mask over the feature columns: True where the dtype is object.
s = x.dtypes == 'object'
# Keep only the labels of the flagged columns.
object_cols = s[s].index.tolist()

print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']


In [76]:
from sklearn.model_selection import train_test_split

In [77]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=22,
)

### One-hot encode categorical variables

In [78]:
from sklearn.preprocessing import OneHotEncoder

In [79]:
# Dense one-hot encoder. With handle_unknown='ignore', categories that were not
# seen during fit are encoded as all zeros instead of raising.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was later removed, so try the new keyword
# first and fall back to the old one on older installations.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training split only, then reuse the fitted categories for the test split.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(x_train[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(x_test[object_cols]))
# NOTE(review): these frames get integer column labels (0..k-1) and a fresh
# RangeIndex. Recent scikit-learn versions reject frames that mix int and str
# column names -- confirm against the installed versions.

In [80]:
# The encoded frames were built from bare arrays, so give them the row labels
# of the splits they came from; the later column-wise concat aligns on the index.
OH_cols_train = OH_cols_train.set_axis(x_train.index, axis=0)
OH_cols_test = OH_cols_test.set_axis(x_test.index, axis=0)

In [81]:
# Keep only the non-categorical columns; the categorical ones are replaced by their encodings.
num_X_train = x_train.drop(columns=object_cols)
num_X_test = x_test.drop(columns=object_cols)

In [82]:
# Stitch the numeric columns and the one-hot columns back together, side by side.
oh_x_train = pd.concat(
    [num_X_train, OH_cols_train],
    axis='columns',
)
oh_x_test = pd.concat(
    [num_X_test, OH_cols_test],
    axis='columns',
)

In [83]:
# Preview the encoded training frame: numeric columns followed by the one-hot columns.
oh_x_train.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,0,1,2,3,4,5,6,7,8,9,10,11,12,13
414,54,130,0,1,110,3.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
676,51,130,305,0,142,1.2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
753,34,118,210,0,192,0.7,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
351,43,140,0,0,140,0.5,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
224,55,120,256,1,137,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


### LightGBM model

In [84]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [85]:
from lightgbm import LGBMClassifier

In [86]:
# Base estimator for the grid search; the seed is fixed for reproducibility.
lgbmc = LGBMClassifier(objective='binary', random_state=0)

#### GridSearch

In [87]:
# Pass the KFold splitter itself rather than the generator returned by .split().
# A generator can only be consumed once, so re-running the grid-search cell
# without re-running this one would find no folds left. GridSearchCV calls
# .split() on the splitter when it fits, and the fixed random_state keeps the
# folds identical on every run.
gkf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameter candidates to search (2 x 5 = 10 combinations).
param_grid = {
    'num_leaves': [5, 25],
    'min_data_in_leaf': [10, 20, 30, 40, 50],
}

In [88]:
# Exhaustive search over param_grid, scored with the folds configured in gkf.
gsearch = GridSearchCV(lgbmc, param_grid, cv=gkf)
# fit() returns the fitted search object, so lgb_model and gsearch are the same object.
lgb_model = gsearch.fit(oh_x_train, y_train)



In [89]:
# Report the winning hyperparameter combination and its cross-validated score.
print(lgb_model.best_params_, lgb_model.best_score_)

{'min_data_in_leaf': 40, 'num_leaves': 25} 0.8733016494268939


In [90]:
# Predict labels for the held-out test split using the fitted grid-search object.
preds = lgb_model.predict(oh_x_test)

### Evaluation

In [91]:
from sklearn.metrics import classification_report

In [92]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.89      0.86      0.87        76
           1       0.90      0.93      0.91       108

    accuracy                           0.90       184
   macro avg       0.90      0.89      0.89       184
weighted avg       0.90      0.90      0.90       184



# API 

### Pickle

In [93]:
# Build the final classifier from the grid-search winner instead of hand-copied
# values, so the exported model cannot drift from the search result when the
# data or the parameter grid changes.
model = LGBMClassifier(random_state=0, objective='binary', **lgb_model.best_params_)

In [94]:
# Train the final model on the encoded training split.
model.fit(oh_x_train, y_train)



LGBMClassifier(min_data_in_leaf=40, num_leaves=25, objective='binary',
               random_state=0)

In [95]:
# Spot check: first-column probability (first class in model.classes_) for the first test row.
model.predict_proba(oh_x_test)[0, 0]

0.05012792933560184

In [96]:
import pickle
# Persist the trained model wrapped in a dict under the 'model' key.
pickl = {'model': model}
# Use a context manager so the file handle is flushed and closed deterministically;
# a bare open() passed straight to pickle.dump is never closed explicitly.
with open('model_file' + ".p", "wb") as model_out:
    pickle.dump(pickl, model_out)

In [97]:
# Persist the fitted one-hot encoder so the same encoding can be reapplied outside this notebook.
with open("encoder", "wb") as encoder_out:
    pickle.dump(OH_encoder, encoder_out)

In [98]:
# Round-trip check: read the encoder back from disk.
# Only unpickle files you trust -- pickle.load can execute arbitrary code.
with open('encoder', 'rb') as encoder_in:
    encoder = pickle.load(encoder_in)