In [1]:
%load_ext nb_black
# !pip install nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics as m
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE  # doctest: +NORMALIZE_WHITESPACE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from mlxtend.feature_selection import ColumnSelector
from sklearn import model_selection
from mlxtend.classifier import StackingClassifier
from thundersvm import SVC as svmgpu

import warnings

warnings.filterwarnings("ignore")
randomseed = 7
np.random.seed(randomseed)

Using TensorFlow backend.


<IPython.core.display.Javascript object>

# 1. Read the dataset

In [3]:
x_original = pd.read_csv("XLable_onlyDiabeticRemoved.txt")

conditions = [
    (x_original.L100800 < 100),
    (x_original.L100800 >= 100) & (x_original.L100800 < 126),
    (x_original.L100800 >= 126),
]
choices = [0, 1, 2]
x_original["CLASS"] = np.select(conditions, choices, default=0)
x_original = x_original[
    [
        "Unnamed: 0",
        "L100800",
        "L104600",
        "S000300",
        "AGE",
        "SEX",
        "FIELD_16",
        "FIELD_23",
        "FIELD_15",
        "FIELD_22",
        "FIELD_17",
        "FIELD_24",
        "FIELD_1",
    ]
]

<IPython.core.display.Javascript object>

In [4]:
x_original2 = pd.read_csv("XLable_onlyDiabeticRemoved2.txt")

conditions = [
    (x_original2.L100800 < 100),
    (x_original2.L100800 >= 100) & (x_original2.L100800 < 126),
    (x_original2.L100800 >= 126),
]
choices = [0, 1, 2]
x_original2["CLASS"] = np.select(conditions, choices, default=0)
x_original2 = x_original2[
    [
        "Unnamed: 0",
        "L100800",
        "L104600",
        "S000300",
        "AGE",
        "SEX",
        "FIELD_16",
        "FIELD_23",
        "FIELD_15",
        "FIELD_22",
        "FIELD_17",
        "FIELD_24",
        "FIELD_1",
    ]
]

print(x_original2.shape)

(48095, 13)


<IPython.core.display.Javascript object>

In [5]:
x_original3 = pd.read_csv("XLable_onlyDiabeticRemoved3.txt")

conditions = [
    (x_original3.L100800 < 100),
    (x_original3.L100800 >= 100) & (x_original3.L100800 < 126),
    (x_original3.L100800 >= 126),
]

choices = [0, 1, 2]
x_original3["CLASS"] = np.select(conditions, choices, default=0)

x_original3 = x_original3[
    [
        "Unnamed: 0",
        "L100800",
        "L104600",
        "L103000",
        "S000300",
        "L101700",
        "L100700",
        "FIELD_33",
        "FIELD_38",
        "FIELD_40",
        "FIELD_31",
        "SEX",
        "AGE",  #'CLASS',
        "FIELD_16",
        "FIELD_23",
        "FIELD_15",
        "FIELD_22",
        "FIELD_17",
        "FIELD_24",
        "FIELD_1",
    ]
]

print(x_original3.shape)

(48095, 20)


<IPython.core.display.Javascript object>

In [6]:
y_original = pd.read_csv("TargetLable_onlyDiabeticRemoved.txt")

conditions = [
    (y_original.L100800 < 100),
    (y_original.L100800 >= 100) & (y_original.L100800 < 126),
    (y_original.L100800 >= 126),
]

choices = [0, 1, 2]
y_original["CLASS"] = np.select(conditions, choices, default=0)

y_original = y_original[["Unnamed: 0", "FIELD_1", "CLASS"]]

print(y_original.shape)

(48095, 3)


<IPython.core.display.Javascript object>

In [7]:
data = pd.merge(
    x_original,
    x_original2,
    how="inner",
    left_on=["Unnamed: 0", "FIELD_1"],
    right_on=["Unnamed: 0", "FIELD_1"],
)
data.head()

Unnamed: 0.1,Unnamed: 0,L100800_x,L104600_x,S000300_x,AGE_x,SEX_x,FIELD_16_x,FIELD_23_x,FIELD_15_x,FIELD_22_x,...,L104600_y,S000300_y,AGE_y,SEX_y,FIELD_16_y,FIELD_23_y,FIELD_15_y,FIELD_22_y,FIELD_17_y,FIELD_24_y
0,0,77.0,,20.1,44.0,1.0,0.0,0.0,0.0,0.0,...,,19.7,45.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,98.0,,24.7,50.0,1.0,0.0,0.0,0.0,0.0,...,,24.8,51.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,100.0,,22.1,35.0,1.0,0.0,0.0,0.0,0.0,...,,22.6,36.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,78.0,,23.5,67.0,1.0,0.0,0.0,0.0,0.0,...,,23.2,68.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
4,4,92.0,,23.2,68.0,1.0,0.0,0.0,0.0,0.0,...,5.97,23.5,69.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


<IPython.core.display.Javascript object>

In [8]:
data = pd.merge(
    data,
    x_original3,
    how="inner",
    left_on=["Unnamed: 0", "FIELD_1"],
    right_on=["Unnamed: 0", "FIELD_1"],
)
data.head()

Unnamed: 0.1,Unnamed: 0,L100800_x,L104600_x,S000300_x,AGE_x,SEX_x,FIELD_16_x,FIELD_23_x,FIELD_15_x,FIELD_22_x,...,FIELD_40,FIELD_31,SEX,AGE,FIELD_16,FIELD_23,FIELD_15,FIELD_22,FIELD_17,FIELD_24
0,0,77.0,,20.1,44.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,98.0,,24.7,50.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,100.0,,22.1,35.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,78.0,,23.5,67.0,1.0,0.0,0.0,0.0,0.0,...,4.0,0.0,1.0,69.0,0.0,0.0,0.0,0.0,1.0,1.0
4,4,92.0,,23.2,68.0,1.0,0.0,0.0,0.0,0.0,...,2.0,0.0,1.0,70.0,0.0,0.0,0.0,0.0,1.0,1.0


<IPython.core.display.Javascript object>

In [9]:
data = pd.merge(
    data,
    y_original,
    how="inner",
    left_on=["Unnamed: 0", "FIELD_1"],
    right_on=["Unnamed: 0", "FIELD_1"],
)

<IPython.core.display.Javascript object>

In [10]:
# filter the data set
data = data[data.FIELD_16_x != 1]  # exclude people who are diagnosed for (diabetes)
data = data[data.FIELD_23_x != 1]  # exclude people who are on medication for diabetes

data = data[data.FIELD_16_y != 1]  # exclude people who are diagnosed for (diabetes)
data = data[data.FIELD_23_y != 1]  # exclude people who are on medication for diabetes

data = data[data.FIELD_16 != 1]  # exclude people who are diagnosed for (diabetes)
data = data[data.FIELD_23 != 1]  # exclude people who are on medication for diabetes

# data = data[
#     data.FIELD_15_x != 1
# ]  # exclude people who are diagnosed for (high blood pressure)
# data = data[
#     data.FIELD_22_x != 1
# ]  # exclude people who are on medication for high blood pressure

# data = data[
#     data.FIELD_15_y != 1
# ]  # exclude people who are diagnosed for (high blood pressure)
# data = data[
#     data.FIELD_22_y != 1
# ]  # exclude people who are on medication for high blood pressure

# data = data[data.FIELD_17_x != 1]  # exclude people who are diagnosed for hyperlipidemia
# data = data[
#     data.FIELD_24_x != 1
# ]  # exclude people who are on medication for hyperlipidemia

# data = data[data.FIELD_17_y != 1]  # exclude people who are diagnosed for hyperlipidemia
# data = data[
#     data.FIELD_24_y != 1
# ]  # exclude people who are on medication for hyperlipidemia

# print(data.shape)

<IPython.core.display.Javascript object>

In [11]:
data.head()

Unnamed: 0.1,Unnamed: 0,L100800_x,L104600_x,S000300_x,AGE_x,SEX_x,FIELD_16_x,FIELD_23_x,FIELD_15_x,FIELD_22_x,...,FIELD_31,SEX,AGE,FIELD_16,FIELD_23,FIELD_15,FIELD_22,FIELD_17,FIELD_24,CLASS
0,0,77.0,,20.1,44.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1,98.0,,24.7,50.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2,100.0,,22.1,35.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,3,78.0,,23.5,67.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,69.0,0.0,0.0,0.0,0.0,1.0,1.0,0
4,4,92.0,,23.2,68.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,70.0,0.0,0.0,0.0,0.0,1.0,1.0,1


<IPython.core.display.Javascript object>

In [12]:
data = data.dropna()
print(data.shape)
data = data[
    [
        "L100800_x",
        "L104600_x",
        "S000300_x",
        "AGE_x",
        "SEX_x",
        "L100800_y",
        "L104600_y",
        "S000300_y",
        "AGE_y",
        "SEX_y",
        "L100800",
        "L104600",
        "S000300",
        "AGE",
        "SEX",
        "CLASS",
    ]
]
data.head()

(12830, 43)


Unnamed: 0,L100800_x,L104600_x,S000300_x,AGE_x,SEX_x,L100800_y,L104600_y,S000300_y,AGE_y,SEX_y,L100800,L104600,S000300,AGE,SEX,CLASS
13,84.0,5.0,19.9,19.0,1.0,86.0,5.3,20.3,20.0,1.0,87.0,5.06,20.4,21.0,1.0,0
14,86.0,5.3,20.3,20.0,1.0,87.0,5.06,20.4,21.0,1.0,80.0,5.02,20.0,22.0,1.0,0
15,81.0,4.9,18.4,19.0,1.0,70.0,4.9,18.6,20.0,1.0,75.0,5.11,18.8,21.0,1.0,0
16,70.0,4.9,18.6,20.0,1.0,75.0,5.11,18.8,21.0,1.0,77.0,5.23,18.9,22.0,1.0,0
17,90.0,5.2,18.7,20.0,1.0,89.0,5.3,20.0,21.0,1.0,85.0,5.13,18.9,22.0,1.0,0


<IPython.core.display.Javascript object>

In [13]:
# data["L100800diff"] = np.power(data.L100800_x - data.L100800_y, 3)
# data["L104600diff"] = np.power(data.L104600_x - data.L104600_y, 3)
# data["S000300diff"] = np.power(data.S000300_x - data.S000300_y, 3)

# tempclass = data.CLASS
# data = data.drop(columns="CLASS")
# data["CLASS"] = tempclass

<IPython.core.display.Javascript object>

In [14]:
data.head()

Unnamed: 0,L100800_x,L104600_x,S000300_x,AGE_x,SEX_x,L100800_y,L104600_y,S000300_y,AGE_y,SEX_y,L100800,L104600,S000300,AGE,SEX,CLASS
13,84.0,5.0,19.9,19.0,1.0,86.0,5.3,20.3,20.0,1.0,87.0,5.06,20.4,21.0,1.0,0
14,86.0,5.3,20.3,20.0,1.0,87.0,5.06,20.4,21.0,1.0,80.0,5.02,20.0,22.0,1.0,0
15,81.0,4.9,18.4,19.0,1.0,70.0,4.9,18.6,20.0,1.0,75.0,5.11,18.8,21.0,1.0,0
16,70.0,4.9,18.6,20.0,1.0,75.0,5.11,18.8,21.0,1.0,77.0,5.23,18.9,22.0,1.0,0
17,90.0,5.2,18.7,20.0,1.0,89.0,5.3,20.0,21.0,1.0,85.0,5.13,18.9,22.0,1.0,0


<IPython.core.display.Javascript object>

# 2. Downsample the majority class and upsample the minority

In [15]:
diabetic = data[data.CLASS == 2]
prediabetic = data[data.CLASS == 1]
normal = data[data.CLASS == 0]

print(diabetic.shape[0], prediabetic.shape[0], normal.shape[0])

324 4731 7775


<IPython.core.display.Javascript object>

In [16]:
diabetic_test = diabetic.sample(50, random_state=randomseed)
prediabetic_test = prediabetic.sample(50, random_state=randomseed)
normal_test = normal.sample(50, random_state=randomseed)
test = pd.concat([diabetic_test, prediabetic_test, normal_test])

diabetic_train = diabetic.drop(diabetic_test.index)
prediabetic_train = prediabetic.drop(prediabetic_test.index)
# .sample(
#     10 * diabetic_train.shape[0], random_state=randomseed
# )
normal_train = normal.drop(normal_test.index).sample(
    prediabetic_train.shape[0],
    random_state=randomseed
    #     10 * diabetic_train.shape[0], random_state=randomseed
)
train = pd.concat([diabetic_train, diabetic_train, prediabetic_train, normal_train])

<IPython.core.display.Javascript object>

In [17]:
xtrain = train.iloc[:, :-1]
ytrain = train.iloc[:, -1]
xtest = test.iloc[:, :-1]
ytest = test.iloc[:, -1]

<IPython.core.display.Javascript object>

In [18]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

<IPython.core.display.Javascript object>

In [19]:
from imblearn.over_sampling import SMOTE, SMOTENC  # doctest: +NORMALIZE_WHITESPACE

randomseed = 42

sm = SMOTENC(
    random_state=randomseed, categorical_features=[4], sampling_strategy="minority"
)
X_res, y_res = sm.fit_resample(xtrain, ytrain)

print("Resampled dataset shape %s" % Counter(y_res))
print(
    y_res[y_res == 0].shape[0], y_res[y_res == 1].shape[0], y_res[y_res == 2].shape[0]
)
print(X_res.shape, y_res.shape)

xtrain = X_res
ytrain = y_res

Resampled dataset shape Counter({2: 4681, 1: 4681, 0: 4681})
4681 4681 4681
(14043, 15) (14043,)


<IPython.core.display.Javascript object>

# 3. Generate the classifier models based on the selected 12 features

# 3.1.  Features : 12

# a. Random forest

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

parameters = {
    "max_features": ("auto", "sqrt", "log2"),
    "n_estimators": [10, 100, 200, 700],
    "max_depth": [2, 8, 10],
    "min_samples_split": [2, 8, 12],
    "min_samples_leaf": [2, 8, 12],
    "criterion": ["gini", "entropy"],
}

rf_clf = RandomForestClassifier(n_jobs=-1, verbose=1)
rf_clf = RandomizedSearchCV(rf_clf, parameters)
rf_clf.fit(xtrain, ytrain)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.4s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.6s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Paral

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      |

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs

<IPython.core.display.Javascript object>

In [21]:
rf_clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=10, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=1,
                       warm_start=False)

<IPython.core.display.Javascript object>

In [22]:
rf_5 = RandomForestClassifier(
    random_state=randomseed,
    n_estimators=100,
    max_depth=12,
    min_samples_split=2,
    min_samples_leaf=10,
    max_features="auto",
)

# # rf_5=rf_clf.best_estimator_
# rf_5 = RandomForestClassifier(
#     bootstrap=True,
#     class_weight=None,
#     criterion="entropy",
#     max_depth=10,
#     max_features="log2",
#     max_leaf_nodes=None,
#     min_impurity_decrease=0.0,
#     min_impurity_split=None,
#     min_samples_leaf=2,
#     min_samples_split=2,
#     min_weight_fraction_leaf=0.0,
#     n_estimators=200,
#     n_jobs=-1,
#     oob_score=False,
#     random_state=randomseed,
#     verbose=1,
#     warm_start=False,
# )

<IPython.core.display.Javascript object>

In [23]:
rf_5.fit(xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=12, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

<IPython.core.display.Javascript object>

In [24]:
ypred = rf_5.predict(xtest)
print("Accuracy = ", m.accuracy_score(ytest, ypred))

Accuracy =  0.7333333333333333


<IPython.core.display.Javascript object>

In [25]:
confmatrx = pd.DataFrame(
    m.confusion_matrix(ytest, ypred),
    columns=["Normal", "Prediabetes", "diabetes"],
    index=["Normal", "Prediabetes", "diabetes"],
)
confmatrx.head()

Unnamed: 0,Normal,Prediabetes,diabetes
Normal,37,13,0
Prediabetes,13,35,2
diabetes,1,11,38


<IPython.core.display.Javascript object>

In [26]:
print(m.classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.73      0.74      0.73        50
           1       0.59      0.70      0.64        50
           2       0.95      0.76      0.84        50

    accuracy                           0.73       150
   macro avg       0.76      0.73      0.74       150
weighted avg       0.76      0.73      0.74       150



<IPython.core.display.Javascript object>

In [27]:
from sklearn import model_selection

scores = model_selection.cross_val_score(
    rf_5, xtrain, ytrain, cv=10, scoring="accuracy"
)
print(
    "Accuracy: %0.2f (+/- %0.2f) [%s] \n [%s]"
    % (scores.mean(), scores.std(), "RandomForestClassifier", scores)
)

Accuracy: 0.77 (+/- 0.05) [RandomForestClassifier] 
 [[0.63539446 0.73717949 0.79131054 0.79700855 0.77421652 0.82122507
 0.78703704 0.7485755  0.79487179 0.81980057]]


<IPython.core.display.Javascript object>

# b. XGBOOST 

In [28]:
parameters = {
    "max_depth": [2, 8, 10],
    "n_estimators": [10, 100, 700],
    "learning_rate": [0.05, 0.15, 0.25],
    "min_child_weight": [1, 3, 5],
    "gamma": [0.0, 0.2, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5],
}
xgb_clf = xgb.XGBClassifier()
xgb_clf = RandomizedSearchCV(xgb_clf, parameters, verbose=2)
xgb_clf.fit(xtrain, ytrain)
xgb_clf.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.05, gamma=0.2, colsample_bytree=0.5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.05, gamma=0.2, colsample_bytree=0.5, total=   1.1s
[CV] n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.05, gamma=0.2, colsample_bytree=0.5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.05, gamma=0.2, colsample_bytree=0.5, total=   0.9s
[CV] n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.05, gamma=0.2, colsample_bytree=0.5 
[CV]  n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.05, gamma=0.2, colsample_bytree=0.5, total=   0.9s
[CV] n_estimators=10, min_child_weight=1, max_depth=2, learning_rate=0.05, gamma=0.2, colsample_bytree=0.4 
[CV]  n_estimators=10, min_child_weight=1, max_depth=2, learning_rate=0.05, gamma=0.2, colsample_bytree=0.4, total=   0.1s
[CV] n_estimators=10, min_child_weight=1, max_depth=2, learning_rate=0.05, gamma=0.2, colsample_bytree=0.4 
[CV]  n_estimators=10, min_child_weight=1, max_depth=2, learning_rate=0.05, gamma=0.2, colsample_bytree=0.4, total=   0.1s
[CV] n_estimators=10, min_child_weight=1, max_depth=2, learning_rate=0.05, gamma=0.2, colsample_bytree=0.4 
[CV]  n_estimators=10, min_child_weight=1, max_depth=2, learning_rate=0.0

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.8min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0.0,
              learning_rate=0.05, max_delta_step=0, max_depth=10,
              min_child_weight=3, missing=None, n_estimators=700, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

<IPython.core.display.Javascript object>

In [29]:
import xgboost as xgb
from sklearn import metrics as m

xgb_model_5 = xgb.XGBClassifier(objective="binary:logistic", random_state=randomseed)
# objective="multi:softmax"
# objective="binary:logistic"
# xgb_model=xgb_clf.best_estimator_

# xgb_model_5=xgb_clf.best_estimator_
# xgb_model_5=xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#               colsample_bynode=1, colsample_bytree=0.3, gamma=0.0,
#               learning_rate=0.05, max_delta_step=0, max_depth=10,
#               min_child_weight=3, missing=None, n_estimators=700, n_jobs=1,
#               nthread=None, objective='multi:softprob', random_state=randomseed,
#               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
#               silent=None, subsample=1, verbosity=1)


<IPython.core.display.Javascript object>

In [30]:
xgb_model_5.fit(xtrain, ytrain)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

<IPython.core.display.Javascript object>

In [31]:
ypred = xgb_model_5.predict(xtest)
print("Accuracy = ", m.accuracy_score(ytest, ypred))

Accuracy =  0.7133333333333334


<IPython.core.display.Javascript object>

In [32]:
confmatrx = pd.DataFrame(
    m.confusion_matrix(ytest, ypred),
    columns=["Normal", "Prediabetes", "diabetes"],
    index=["Normal", "Prediabetes", "diabetes"],
)
confmatrx.head()

Unnamed: 0,Normal,Prediabetes,diabetes
Normal,34,16,0
Prediabetes,13,33,4
diabetes,1,9,40


<IPython.core.display.Javascript object>

In [33]:
print(m.classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.71      0.68      0.69        50
           1       0.57      0.66      0.61        50
           2       0.91      0.80      0.85        50

    accuracy                           0.71       150
   macro avg       0.73      0.71      0.72       150
weighted avg       0.73      0.71      0.72       150



<IPython.core.display.Javascript object>

In [34]:
scores = model_selection.cross_val_score(
    xgb_model_5,xtrain, ytrain, cv=10, scoring="accuracy"
)
print(
    "Accuracy: %0.2f (+/- %0.2f) [%s] \n [%s]"
    % (scores.mean(), scores.std(), "xgb_model", scores)
)


Accuracy: 0.73 (+/- 0.05) [xgb_model] 
 [[0.61975835 0.68589744 0.752849   0.75712251 0.73219373 0.78490028
 0.73860399 0.67806268 0.73575499 0.79558405]]


<IPython.core.display.Javascript object>

# c. SVM

In [35]:
from thundersvm import SVC as svmgpu

parameters = {
    "C": [0.1, 1, 10, 100, 1000],
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001],
    "kernel": ["linear", "rbf"],
}

SVC_clf = svmgpu()
SVC_clf2 = RandomizedSearchCV(SVC_clf, parameters, verbose=2)
SVC_clf2.fit(xtrain, ytrain)


# sorted(SVC_clf2.cv_results_.keys())

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] kernel=rbf, gamma=1, C=1 ........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ......................... kernel=rbf, gamma=1, C=1, total=   0.8s
[CV] kernel=rbf, gamma=1, C=1 ........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV] ......................... kernel=rbf, gamma=1, C=1, total=   0.6s
[CV] kernel=rbf, gamma=1, C=1 ........................................
[CV] ......................... kernel=rbf, gamma=1, C=1, total=   0.6s
[CV] kernel=rbf, gamma=0.0001, C=100 .................................
[CV] .................. kernel=rbf, gamma=0.0001, C=100, total=   0.6s
[CV] kernel=rbf, gamma=0.0001, C=100 .................................
[CV] .................. kernel=rbf, gamma=0.0001, C=100, total=   0.7s
[CV] kernel=rbf, gamma=0.0001, C=100 .................................
[CV] .................. kernel=rbf, gamma=0.0001, C=100, total=   0.6s
[CV] kernel=linear, gamma=0.1, C=1 ...................................
[CV] .................... kernel=linear, gamma=0.1, C=1, total=   0.7s
[CV] kernel=linear, gamma=0.1, C=1 ...................................
[CV] .................... kernel=linear, gamma=0.1, C=1, total=   0.7s
[CV] kernel=linear, gamma=0.1, C=1 ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.1min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=SVC(C=1.0, cache_size=None, class_weight=None,
                                 coef0=0.0, decision_function_shape='ovo',
                                 degree=3, gamma='auto', gpu_id=0, kernel='rbf',
                                 max_iter=-1, max_mem_size=-1, n_jobs=-1,
                                 probability=False, random_state=None,
                                 shrinking=False, tol=0.001, verbose=False),
                   iid='warn', n_iter=10, n_jobs=None,
                   param_distributions={'C': [0.1, 1, 10, 100, 1000],
                                        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                                        'kernel': ['linear', 'rbf']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=2)

<IPython.core.display.Javascript object>

In [36]:
SVC_clf2.best_estimator_

SVC(C=1000, cache_size=None, class_weight={}, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma=1, gpu_id=0, kernel='linear',
    max_iter=-1, max_mem_size=-1, n_jobs=-1, probability=False,
    random_state=None, shrinking=False, tol=0.001, verbose=False)

<IPython.core.display.Javascript object>

In [37]:
# scv_5=SVC_clf2.best_estimator_
scv_5 = svmgpu(
    C=1,
    cache_size=None,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=False,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

<IPython.core.display.Javascript object>

In [38]:
# scv_5 = svmgpu(
#     C=70,
#     cache_size=200,
#     class_weight={},
#     coef0=0.0,
#     decision_function_shape="ovr",
#     degree=3,
#     gamma=0.001,
#     gpu_id=0,
#     kernel="linear",
#     max_iter=-1,
#     max_mem_size=-1,
#     n_jobs=-1,
#     probability=True,
#     random_state=42,
#     shrinking=True,
#     tol=0.001,
#     verbose=False,
# )

<IPython.core.display.Javascript object>

In [39]:
scv_5.fit(xtrain, ytrain)

SVC(C=1, cache_size=None, class_weight={}, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma=0.1, gpu_id=0,
    kernel='linear', max_iter=-1, max_mem_size=-1, n_jobs=-1, probability=False,
    random_state=42, shrinking=False, tol=0.001, verbose=False)

<IPython.core.display.Javascript object>

In [40]:
ypred = scv_5.predict(xtest)
print("Accuracy = ", m.accuracy_score(ytest, ypred))

Accuracy =  0.7466666666666667


<IPython.core.display.Javascript object>

In [41]:
confmatrx = pd.DataFrame(
    m.confusion_matrix(ytest, ypred),
    columns=["Normal", "Prediabetes", "diabetes"],
    index=["Normal", "Prediabetes", "diabetes"],
)
confmatrx.head()

Unnamed: 0,Normal,Prediabetes,diabetes
Normal,35,15,0
Prediabetes,13,32,5
diabetes,0,5,45


<IPython.core.display.Javascript object>

In [42]:
print(m.classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.73      0.70      0.71        50
           1       0.62      0.64      0.63        50
           2       0.90      0.90      0.90        50

    accuracy                           0.75       150
   macro avg       0.75      0.75      0.75       150
weighted avg       0.75      0.75      0.75       150



<IPython.core.display.Javascript object>

In [43]:
scores = model_selection.cross_val_score(
    scv_5, xtrain, ytrain, cv=10, scoring="accuracy"
)
print(
    "Accuracy: %0.2f (+/- %0.2f) [%s] \n [%s]"
    % (scores.mean(), scores.std(), "SVC_clf", scores)
)

Accuracy: 0.75 (+/- 0.03) [SVC_clf] 
 [[0.67732765 0.74002849 0.76923077 0.75569801 0.73931624 0.77706553
 0.76638177 0.752849   0.75783476 0.76566952]]


<IPython.core.display.Javascript object>