In [1]:
import pandas as pd
import numpy as np
import itertools
import scipy
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import math
from datetime import datetime

from sklearn.decomposition import PCA
import plotly.express as px

from dateutil import parser
from datetime import date

from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from flaml import AutoML
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from catboost import CatBoostClassifier

In [129]:
# Load the train / test CSVs. `rawData` / `rawTest` keep the values exactly as
# read from disk (`rawTest.Id` is needed again for the submission), while
# `data` / `testData` are independent working copies: the transformation cells
# below assign into them, and a plain `data = rawData` alias would overwrite
# the raw frames as well.
rawData = pd.read_csv("./diabetes_train.csv")
rawTest = pd.read_csv("./diabetes_test.csv")
data = rawData.copy()
testData = rawTest.copy()


In [130]:
# Print column names, non-null counts and dtypes of the training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Id                  576 non-null    int64  
 1   num_times_pregnant  576 non-null    int64  
 2   plasma_glucose      576 non-null    int64  
 3   DBP                 576 non-null    int64  
 4   triceps_skin        576 non-null    int64  
 5   serum_insulin       576 non-null    int64  
 6   BMI                 576 non-null    float64
 7   pedigree            576 non-null    float64
 8   age                 576 non-null    int64  
 9   diabetes            576 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 45.1 KB


## Data cleanup and transformation

In [131]:
### Apply a log10(x + 1) transformation to numeric columns
# The identical transform is applied to the training and the test frame.
for frame in (data, testData):
    for column in ("plasma_glucose", "pedigree"):
        frame[column] = np.log10(frame[column] + 1)

# The label only exists in the training frame; store it as a categorical.
data["diabetes"] = data["diabetes"].astype("category")

In [132]:
# Add a DBP_Null indicator (1 where DBP is not greater than 0, else 0) and
# then log10(x + 1)-transform DBP itself. The indicator must be computed
# before the transform overwrites the column.
# The categories are given explicitly so the training and test columns always
# share the same category set [0, 1], even if one frame happens to contain
# only one of the two values.
for frame in (data, testData):
    frame["DBP_Null"] = pd.Categorical(
        np.where(frame["DBP"] > 0, 0, 1), categories=[0, 1]
    )
    frame["DBP"] = np.log10(frame["DBP"] + 1)


In [133]:
# Add a triceps_skin_Null indicator (1 where triceps_skin is not greater than
# 0, else 0) and then log10(x + 1)-transform triceps_skin itself. The
# indicator must be computed before the transform overwrites the column.
# The categories are given explicitly so the training and test columns always
# share the same category set [0, 1], even if one frame happens to contain
# only one of the two values.
for frame in (data, testData):
    frame["triceps_skin_Null"] = pd.Categorical(
        np.where(frame["triceps_skin"] > 0, 0, 1), categories=[0, 1]
    )
    frame["triceps_skin"] = np.log10(frame["triceps_skin"] + 1)

In [134]:
# Add a serum_insulin_Null indicator (1 where serum_insulin is not greater
# than 0, else 0) and then log10(x + 1)-transform serum_insulin itself. The
# indicator must be computed before the transform overwrites the column.
# The categories are given explicitly so the training and test columns always
# share the same category set [0, 1], even if one frame happens to contain
# only one of the two values.
for frame in (data, testData):
    frame["serum_insulin_Null"] = pd.Categorical(
        np.where(frame["serum_insulin"] > 0, 0, 1), categories=[0, 1]
    )
    frame["serum_insulin"] = np.log10(frame["serum_insulin"] + 1)

In [135]:
# Add a num_times_pregnant_Null indicator: 1 where num_times_pregnant is not
# greater than 0, else 0. The count column itself is left untransformed.
# The categories are given explicitly so the training and test columns always
# share the same category set [0, 1], even if one frame happens to contain
# only one of the two values.
for frame in (data, testData):
    frame["num_times_pregnant_Null"] = pd.Categorical(
        np.where(frame["num_times_pregnant"] > 0, 0, 1), categories=[0, 1]
    )

In [136]:
# Preview the first 10 rows after the transformations and added *_Null flags.
data.head(10)

Unnamed: 0,Id,num_times_pregnant,plasma_glucose,DBP,triceps_skin,serum_insulin,BMI,pedigree,age,diabetes,DBP_Null,triceps_skin_Null,serum_insulin_Null,num_times_pregnant_Null
0,358,13,2.113943,0.0,1.491362,0.0,39.9,0.195623,44,1,1,0,1,0
1,74,4,2.113943,1.939519,1.322219,2.432969,35.1,0.090258,23,0,0,0,0,0
2,353,3,1.792392,1.919078,1.462398,0.0,34.4,0.094471,46,0,0,0,1,0
3,498,2,1.913814,1.863323,1.20412,1.886491,30.1,0.18949,25,0,0,0,0,0
4,146,0,2.012837,1.880814,1.380211,0.0,0.0,0.196453,21,0,0,0,1,1
5,515,3,2.0,1.740363,1.30103,1.939519,25.6,0.062206,24,0,0,0,0,0
6,292,0,2.033424,1.799341,1.491362,1.875061,36.6,0.244772,25,1,0,0,0,1
7,133,3,2.232996,1.812913,1.579784,2.354108,34.5,0.13226,30,1,0,0,0,0
8,560,11,1.934498,1.875061,0.0,0.0,30.1,0.113943,35,0,0,1,1,0
9,632,0,2.012837,1.897627,1.612784,1.959041,34.5,0.092721,24,0,0,0,0,1


In [137]:
# Keep the label aside, then remove the identifier (and, for the training
# frame, the label) so that only feature columns remain in both frames.
target = data["diabetes"]
data = data.drop(["Id", "diabetes"], axis="columns")

testData = testData.drop("Id", axis="columns")

In [138]:
# Re-check the feature frame: Id and diabetes are gone, *_Null flags are category.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   num_times_pregnant       576 non-null    int64   
 1   plasma_glucose           576 non-null    float64 
 2   DBP                      576 non-null    float64 
 3   triceps_skin             576 non-null    float64 
 4   serum_insulin            576 non-null    float64 
 5   BMI                      576 non-null    float64 
 6   pedigree                 576 non-null    float64 
 7   age                      576 non-null    int64   
 8   DBP_Null                 576 non-null    category
 9   triceps_skin_Null        576 non-null    category
 10  serum_insulin_Null       576 non-null    category
 11  num_times_pregnant_Null  576 non-null    category
dtypes: category(4), float64(6), int64(2)
memory usage: 38.9 KB


## Model training

In [139]:
## Splitting the data
# Hold out 30% of the labelled rows for evaluation.
# `random_state` makes the split reproducible across kernel restarts, and
# `stratify=target` keeps the class proportions of the label the same in the
# training and the held-out partition.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.3, random_state=42, stratify=target
)

In [140]:
## Try out AutoML models
# Run FLAML's automated model search on the training split as a
# classification task; all other settings are left at FLAML's defaults.
automl = AutoML()
automl.fit(X_train=X_train, y_train=y_train, task="classification")

[flaml.automl: 07-28 21:28:04] {912} INFO - Evaluation method: cv
[flaml.automl: 07-28 21:28:04] {606} INFO - Using StratifiedKFold
[flaml.automl: 07-28 21:28:04] {933} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl: 07-28 21:28:04] {952} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'lrl1']
[flaml.automl: 07-28 21:28:04] {1018} INFO - iteration 0, current learner lgbm
[flaml.automl: 07-28 21:28:04] {1173} INFO -  at 0.2s,	best lgbm's error=0.2174,	best lgbm's error=0.2174
[flaml.automl: 07-28 21:28:04] {1018} INFO - iteration 1, current learner lgbm
[flaml.automl: 07-28 21:28:04] {1173} INFO -  at 0.4s,	best lgbm's error=0.2174,	best lgbm's error=0.2174
[flaml.automl: 07-28 21:28:04] {1018} INFO - iteration 2, current learner lgbm
[flaml.automl: 07-28 21:28:04] {1173} INFO -  at 0.5s,	best lgbm's error=0.1977,	best lgbm's error=0.1977
[flaml.automl: 07-28 21:28:04] {1018} INFO - iteration 3, current learner xgboost
[flaml.automl

[flaml.automl: 07-28 21:28:11] {1018} INFO - iteration 42, current learner lgbm
[flaml.automl: 07-28 21:28:11] {1173} INFO -  at 6.9s,	best lgbm's error=0.1941,	best lgbm's error=0.1941
[flaml.automl: 07-28 21:28:11] {1018} INFO - iteration 43, current learner xgboost
[flaml.automl: 07-28 21:28:11] {1173} INFO -  at 7.1s,	best xgboost's error=0.1970,	best lgbm's error=0.1941
[flaml.automl: 07-28 21:28:11] {1018} INFO - iteration 44, current learner extra_tree
[flaml.automl: 07-28 21:28:11] {1173} INFO -  at 7.3s,	best extra_tree's error=0.2404,	best lgbm's error=0.1941
[flaml.automl: 07-28 21:28:11] {1018} INFO - iteration 45, current learner xgboost
[flaml.automl: 07-28 21:28:11] {1173} INFO -  at 7.4s,	best xgboost's error=0.1970,	best lgbm's error=0.1941
[flaml.automl: 07-28 21:28:11] {1018} INFO - iteration 46, current learner lgbm
[flaml.automl: 07-28 21:28:12] {1173} INFO -  at 7.6s,	best lgbm's error=0.1941,	best lgbm's error=0.1941
[flaml.automl: 07-28 21:28:12] {1018} INFO - i

[flaml.automl: 07-28 21:28:17] {1173} INFO -  at 13.4s,	best xgboost's error=0.1970,	best lgbm's error=0.1906
[flaml.automl: 07-28 21:28:17] {1018} INFO - iteration 86, current learner xgboost
[flaml.automl: 07-28 21:28:18] {1173} INFO -  at 13.6s,	best xgboost's error=0.1970,	best lgbm's error=0.1906
[flaml.automl: 07-28 21:28:18] {1018} INFO - iteration 87, current learner lgbm
[flaml.automl: 07-28 21:28:18] {1173} INFO -  at 13.7s,	best lgbm's error=0.1906,	best lgbm's error=0.1906
[flaml.automl: 07-28 21:28:18] {1018} INFO - iteration 88, current learner catboost
[flaml.automl: 07-28 21:28:19] {1173} INFO -  at 14.8s,	best catboost's error=0.1875,	best catboost's error=0.1875
[flaml.automl: 07-28 21:28:19] {1018} INFO - iteration 89, current learner catboost
[flaml.automl: 07-28 21:28:20] {1173} INFO -  at 15.7s,	best catboost's error=0.1875,	best catboost's error=0.1875
[flaml.automl: 07-28 21:28:20] {1018} INFO - iteration 90, current learner xgboost
[flaml.automl: 07-28 21:28:20

[flaml.automl: 07-28 21:28:34] {1173} INFO -  at 30.1s,	best lgbm's error=0.1906,	best catboost's error=0.1826
[flaml.automl: 07-28 21:28:34] {1018} INFO - iteration 128, current learner catboost
[flaml.automl: 07-28 21:28:35] {1173} INFO -  at 30.7s,	best catboost's error=0.1826,	best catboost's error=0.1826
[flaml.automl: 07-28 21:28:35] {1018} INFO - iteration 129, current learner xgboost
[flaml.automl: 07-28 21:28:35] {1173} INFO -  at 30.9s,	best xgboost's error=0.1970,	best catboost's error=0.1826
[flaml.automl: 07-28 21:28:35] {1018} INFO - iteration 130, current learner catboost
[flaml.automl: 07-28 21:28:36] {1173} INFO -  at 31.6s,	best catboost's error=0.1826,	best catboost's error=0.1826
[flaml.automl: 07-28 21:28:36] {1018} INFO - iteration 131, current learner catboost
[flaml.automl: 07-28 21:28:36] {1173} INFO -  at 32.4s,	best catboost's error=0.1826,	best catboost's error=0.1826
[flaml.automl: 07-28 21:28:36] {1018} INFO - iteration 132, current learner xgboost
[flaml.

[flaml.automl: 07-28 21:28:47] {1018} INFO - iteration 170, current learner lgbm
[flaml.automl: 07-28 21:28:47] {1173} INFO -  at 43.0s,	best lgbm's error=0.1796,	best lgbm's error=0.1796
[flaml.automl: 07-28 21:28:47] {1018} INFO - iteration 171, current learner lgbm
[flaml.automl: 07-28 21:28:47] {1173} INFO -  at 43.2s,	best lgbm's error=0.1796,	best lgbm's error=0.1796
[flaml.automl: 07-28 21:28:47] {1018} INFO - iteration 172, current learner lgbm
[flaml.automl: 07-28 21:28:47] {1173} INFO -  at 43.3s,	best lgbm's error=0.1796,	best lgbm's error=0.1796
[flaml.automl: 07-28 21:28:47] {1018} INFO - iteration 173, current learner lgbm
[flaml.automl: 07-28 21:28:47] {1173} INFO -  at 43.4s,	best lgbm's error=0.1796,	best lgbm's error=0.1796
[flaml.automl: 07-28 21:28:47] {1018} INFO - iteration 174, current learner lgbm
[flaml.automl: 07-28 21:28:48] {1173} INFO -  at 43.6s,	best lgbm's error=0.1796,	best lgbm's error=0.1796
[flaml.automl: 07-28 21:28:48] {1018} INFO - iteration 175, 

[flaml.automl: 07-28 21:28:54] {1173} INFO -  at 50.4s,	best xgboost's error=0.1970,	best lgbm's error=0.1793
[flaml.automl: 07-28 21:28:54] {1018} INFO - iteration 214, current learner xgboost
[flaml.automl: 07-28 21:28:55] {1173} INFO -  at 50.6s,	best xgboost's error=0.1970,	best lgbm's error=0.1793
[flaml.automl: 07-28 21:28:55] {1018} INFO - iteration 215, current learner lgbm
[flaml.automl: 07-28 21:28:55] {1173} INFO -  at 50.7s,	best lgbm's error=0.1793,	best lgbm's error=0.1793
[flaml.automl: 07-28 21:28:55] {1018} INFO - iteration 216, current learner xgboost
[flaml.automl: 07-28 21:28:55] {1173} INFO -  at 50.9s,	best xgboost's error=0.1970,	best lgbm's error=0.1793
[flaml.automl: 07-28 21:28:55] {1018} INFO - iteration 217, current learner lgbm
[flaml.automl: 07-28 21:28:55] {1173} INFO -  at 51.0s,	best lgbm's error=0.1793,	best lgbm's error=0.1793
[flaml.automl: 07-28 21:28:55] {1018} INFO - iteration 218, current learner lgbm
[flaml.automl: 07-28 21:28:55] {1173} INFO - 

[flaml.automl: 07-28 21:29:01] {1018} INFO - iteration 257, current learner lgbm
[flaml.automl: 07-28 21:29:02] {1173} INFO -  at 57.6s,	best lgbm's error=0.1793,	best lgbm's error=0.1793
[flaml.automl: 07-28 21:29:02] {1018} INFO - iteration 258, current learner extra_tree
[flaml.automl: 07-28 21:29:02] {1173} INFO -  at 57.9s,	best extra_tree's error=0.2230,	best lgbm's error=0.1793
[flaml.automl: 07-28 21:29:02] {1018} INFO - iteration 259, current learner xgboost
[flaml.automl: 07-28 21:29:02] {1173} INFO -  at 58.1s,	best xgboost's error=0.1970,	best lgbm's error=0.1793
[flaml.automl: 07-28 21:29:02] {1018} INFO - iteration 260, current learner extra_tree
[flaml.automl: 07-28 21:29:02] {1173} INFO -  at 58.3s,	best extra_tree's error=0.2230,	best lgbm's error=0.1793
[flaml.automl: 07-28 21:29:02] {1018} INFO - iteration 261, current learner xgboost
[flaml.automl: 07-28 21:29:02] {1173} INFO -  at 58.4s,	best xgboost's error=0.1970,	best lgbm's error=0.1793
[flaml.automl: 07-28 21:

In [141]:
# Show which estimator object the AutoML search ended up with.
print(automl.model)

<flaml.model.LGBMEstimator object at 0x000002C89DF49340>


In [145]:

# Score the held-out split: class probabilities are kept in
# `automl_predProb`, hard labels in `automl_pred` feed the per-class report.
automl_predProb = automl.predict_proba(X_test)
automl_pred = automl.predict(X_test)
print(classification_report(y_true=y_test, y_pred=automl_pred))

              precision    recall  f1-score   support

           0       0.87      0.90      0.89       120
           1       0.76      0.70      0.73        53

    accuracy                           0.84       173
   macro avg       0.81      0.80      0.81       173
weighted avg       0.84      0.84      0.84       173



In [146]:
# Confusion matrix on the held-out split; y_test is passed as the true labels,
# so rows are true classes and columns are predicted classes.
print(confusion_matrix(y_test, automl_pred))

[[108  12]
 [ 16  37]]


## Predicting test set

In [147]:
# Predict labels for the unlabelled test frame (Id already dropped above).
output = automl.predict(testData)

In [149]:
# Build the submission table: the Ids as read from the test CSV paired with
# the predicted label for each row.
my_submission = pd.DataFrame({'Id': rawTest.Id, 'Predicted': output})

# Any filename works; the submission is written to my_submission.csv,
# without the DataFrame index column.
my_submission.to_csv('./my_submission.csv', index=False)

# Preview the first rows. This has to be the last expression of the cell:
# a bare expression in the middle of a cell is evaluated but never displayed.
my_submission.head()