In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# pd.read_csv('./dataset.csv').to_parquet('dataset.parquet')

In [17]:
# Load the preprocessed dataset from a Parquet file (path is relative to the working directory).
df = pd.read_parquet('dataset_preprocessed.parquet')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,age,height,weight,waistline,SBP,DBP,BLDS,tot_chole,hemoglobin,...,SMK_stat_type_cd,sex,SIGHT,HEAR,BMI,TOTAL_CHOLESTEROL,KIDNEY_FUNCTION_INDEX,LIVER_FUNCTION_INDEX,CARDIOVASCULAR_HEALTH_INDEX,DRK_YN
0,0,35.0,170,75,90.0,120.0,80.0,99.0,193.0,17.1,...,1.0,0,2.0,2.0,25.951557,266.0,1.0,1.252381,2.479167,1
1,1,30.0,180,80,89.0,130.0,82.0,106.0,228.0,15.8,...,3.0,0,2.1,2.0,24.691358,324.0,0.916667,0.952778,2.643607,0
2,2,40.0,165,75,91.0,120.0,70.0,98.0,136.0,15.8,...,1.0,0,2.7,2.0,27.548209,219.0,0.916667,1.457779,2.522648,0
3,3,50.0,175,80,91.0,145.0,87.0,95.0,201.0,17.6,...,1.0,0,2.7,2.0,26.122449,286.0,1.083333,0.736815,1.902047,0
4,4,50.0,165,60,80.0,138.0,82.0,101.0,199.0,13.8,...,1.0,0,2.2,2.0,22.038567,282.0,0.833333,1.449561,2.216713,0


In [4]:
# Remove the 'Unnamed: 0' column (presumably a saved row index: it mirrors the row
# number in df.head()). errors='ignore' keeps this cell idempotent, so re-running it
# after the column is already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
df.head()

Unnamed: 0,age,height,weight,waistline,SBP,DBP,BLDS,tot_chole,hemoglobin,urine_protein,...,SMK_stat_type_cd,sex,SIGHT,HEAR,BMI,TOTAL_CHOLESTEROL,KIDNEY_FUNCTION_INDEX,LIVER_FUNCTION_INDEX,CARDIOVASCULAR_HEALTH_INDEX,DRK_YN
0,35.0,170,75,90.0,120.0,80.0,99.0,193.0,17.1,1.0,...,1.0,0,2.0,2.0,25.951557,266.0,1.0,1.252381,2.479167,1
1,30.0,180,80,89.0,130.0,82.0,106.0,228.0,15.8,1.0,...,3.0,0,2.1,2.0,24.691358,324.0,0.916667,0.952778,2.643607,0
2,40.0,165,75,91.0,120.0,70.0,98.0,136.0,15.8,1.0,...,1.0,0,2.7,2.0,27.548209,219.0,0.916667,1.457779,2.522648,0
3,50.0,175,80,91.0,145.0,87.0,95.0,201.0,17.6,1.0,...,1.0,0,2.7,2.0,26.122449,286.0,1.083333,0.736815,1.902047,0
4,50.0,165,60,80.0,138.0,82.0,101.0,199.0,13.8,1.0,...,1.0,0,2.2,2.0,22.038567,282.0,0.833333,1.449561,2.216713,0


In [6]:
df.columns

Index(['age', 'height', 'weight', 'waistline', 'SBP', 'DBP', 'BLDS',
       'tot_chole', 'hemoglobin', 'urine_protein', 'serum_creatinine',
       'SGOT_AST', 'SGOT_ALT', 'gamma_GTP', 'SMK_stat_type_cd', 'sex', 'SIGHT',
       'HEAR', 'BMI', 'TOTAL_CHOLESTEROL', 'KIDNEY_FUNCTION_INDEX',
       'LIVER_FUNCTION_INDEX', 'CARDIOVASCULAR_HEALTH_INDEX', 'DRK_YN'],
      dtype='object')

In [18]:
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [19]:
# Feature matrix: every column except the target 'DRK_YN'.
# Columns are selected by name instead of by position, and 'Unnamed: 0' is excluded
# as well, so the row-number column cannot end up as a model feature when `df` has
# been reloaded without re-running the cell that drops it.
data_df = df.drop(columns=['DRK_YN', 'Unnamed: 0'], errors='ignore')

>**Below, we split the dataset into a training set and a test set.**

In [64]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data_df,
    df["DRK_YN"],
    test_size=0.2,
    random_state=42,
)

>**The features are on different scales, so we standardize each feature. The scaler is fitted on the training set and then applied to both the training and the test set.**

In [65]:
standard_scaler = StandardScaler().fit(x_train)

In [66]:
x_train_scaled = standard_scaler.transform(x_train)

In [67]:
x_test_scaled = standard_scaler.transform(x_test)

In [68]:
x_train_scaled.shape, x_test_scaled.shape

((793076, 24), (198270, 24))

>**Since our dataset is large, we compare the models on a small random sample of 20,000 rows instead of the whole dataset.**

In [20]:
# Reproducible 20,000-row random sample of `df`, with its rows renumbered from 0.
# reset_index already returns a new DataFrame, so no extra copy is taken.
dff = df.sample(n=20000, random_state=42).reset_index(drop=True)

In [21]:
dff.shape

(20000, 24)

In [23]:
y_dff = dff['DRK_YN']

In [24]:
dff = dff.iloc[:,:-1]

>**Our dataset has 24 features, so we can use Principal Component Analysis (PCA) to reduce the number of features and improve the overall running time of the models.**

In [128]:
pca = PCA(n_components=0.95)

In [26]:
# Standardize the sampled features; `dff` is rebound to the scaler's transformed output.
# NOTE: this overwrites its own input, so re-running the cell re-scales already scaled data.
dff = StandardScaler().fit_transform(dff)

In [132]:
dff_pca = pca.fit_transform(dff)

In [133]:
dff_pca.shape

(20000, 15)

In [34]:
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score, accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [152]:
# Empty results table: one row per model is added later, one column per metric.
model_results = pd.DataFrame(
    columns=['Train_acc', 'Validation_acc', 'f1_score', 'precision', 'recall', 'auc']
)

In [58]:
# Baseline classifiers to compare, keyed by the label used in the results table.
# All of them share the same seed; XGBoost and CatBoost are configured to run on a GPU.
models = {}
models['Logistic Regression'] = LogisticRegression(random_state=12345)
models['KNN'] = KNeighborsClassifier()
models['Decision Tree'] = DecisionTreeClassifier(random_state=12345)
models['Random Forest'] = RandomForestClassifier(random_state=12345)
models['XGB'] = XGBClassifier(random_state=12345, tree_method="hist", device="cuda:0")
models['catboost'] = CatBoostClassifier(task_type='GPU', verbose=False, random_state=12345)

In [333]:
score_type = ['f1', 'accuracy', 'precision', 'recall', 'roc_auc']

In [153]:
# 5-fold cross-validation of every model, first on the scaled features (ind == 0)
# and then on their PCA projection (ind == 1, rows prefixed with 'pca_').
# The loop variable is called `features`, not `set`: in a top-level cell the name
# `set` would replace the builtin for the rest of the kernel session.
for ind, features in enumerate((dff, dff_pca)):
    for model_name, model in models.items():
        score = cross_validate(model, features, y_dff, cv=5,
                               scoring=score_type,
                               return_train_score=True)

        label = model_name if ind == 0 else ('pca_' + model_name)
        # Mean over the folds, in the column order of `model_results`.
        model_results.loc[label] = [
            round(score[key].mean(), 4)
            for key in ('train_accuracy', 'test_accuracy', 'test_f1',
                        'test_precision', 'test_recall', 'test_roc_auc')
        ]


In [154]:
model_results

Unnamed: 0,Train_acc,Validation_acc,f1_score,precision,recall,auc
Logistic Regression,0.7264,0.7254,0.721,0.7329,0.7097,0.8039
KNN,0.7847,0.6823,0.6861,0.6781,0.6944,0.7387
Decision Tree,1.0,0.6456,0.6452,0.6461,0.6445,0.6456
Random Forest,1.0,0.7276,0.7273,0.7283,0.7266,0.8061
XGB,0.9092,0.7179,0.7186,0.7168,0.7205,0.7969
catboost,0.7822,0.7364,0.7381,0.7332,0.7434,0.8164
pca_Logistic Regression,0.7194,0.7195,0.7134,0.7294,0.6983,0.7932
pca_KNN,0.7841,0.683,0.6875,0.6783,0.6971,0.7368
pca_Decision Tree,1.0,0.6275,0.6267,0.6282,0.6253,0.6275
pca_Random Forest,1.0,0.7128,0.7173,0.7064,0.7287,0.7885


>**The model performance of all the models is shown above**
>**As we can see, CatBoost, Random Forest and Logistic Regression have the highest validation accuracy.**
>**We can also see that the models trained on the PCA-reduced features are about as accurate as the same models trained on the full feature set: the validation accuracy differs by at most about 0.02.**
>**As shown above, the training accuracy of the decision tree and the random forest is much higher than their validation accuracy, so we can say that these models are overfitting.**

In [26]:
import tensorflow as tf
from tensorflow import keras

In [106]:
# Fully connected binary classifier: two ReLU hidden layers (300 and 100 units)
# followed by a single sigmoid output unit.
nn_model = keras.models.Sequential()
nn_model.add(keras.layers.Flatten())
nn_model.add(keras.layers.Dense(300, activation="relu"))
nn_model.add(keras.layers.Dense(100, activation="relu"))
nn_model.add(keras.layers.Dense(1, activation="sigmoid"))

# Binary cross-entropy loss with Adam; accuracy, AUC, precision and recall are tracked.
nn_model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy", 'AUC', keras.metrics.Precision(), keras.metrics.Recall()],
)

>**Here, we have used the Sequential method to create the Neural Network with 2 hidden layers. We have used Rectified Linear Unit function for the hidden layers and since we are using Neural Network as a binary classifier, we have used the sigmoid activation function for the output layer.**

>**For loss function, we have used the binary crossentropy and we have used Adam as the optimizer for our model.**

In [39]:
# Saves the model being trained to 'keras_model.keras'.
checkpoint_cb = keras.callbacks.ModelCheckpoint(filepath="keras_model.keras")
# Stops training after 10 epochs without improvement and restores the best weights seen.
early_stopping_cb = keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10)

In [171]:
from sklearn.model_selection import StratifiedKFold

In [201]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [306]:
# Five independent, empty float arrays that collect the neural-network metrics
# across the cross-validation folds.
acc, val_acc, precision, recall, auc = (np.array([]) for _ in range(5))

In [308]:
# Stratified k-fold cross-validation of the neural network.
# Each fold trains a fresh copy of `nn_model` on the training indices only. Fitting
# on all of `dff` would put the validation rows into the training data and inflate
# every validation metric, and reusing one model would carry weights across folds.
for train_index, test_index in skf.split(dff, y_dff):
    x_train_fold, x_test_fold = dff[train_index], dff[test_index]
    # Positional selection, so the split does not depend on the index labels of y_dff.
    y_train_fold, y_test_fold = y_dff.iloc[train_index], y_dff.iloc[test_index]

    # clone_model rebuilds the architecture with newly initialised weights.
    fold_model = keras.models.clone_model(nn_model)
    # Explicit metric names keep the history keys stable; unnamed metrics receive
    # auto-numbered suffixes (e.g. 'precision_13') that change between sessions.
    fold_model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=[
            "accuracy",
            keras.metrics.AUC(name="auc"),
            keras.metrics.Precision(name="precision"),
            keras.metrics.Recall(name="recall"),
        ],
    )
    history = fold_model.fit(x_train_fold, y_train_fold, epochs=10, batch_size=32, validation_data=(x_test_fold, y_test_fold), callbacks=[early_stopping_cb])

    # Keep the final-epoch value of each metric: one entry per fold, so the later
    # .mean() averages over folds rather than over every training epoch.
    acc = np.append(acc, history.history['accuracy'][-1])
    val_acc = np.append(val_acc, history.history['val_accuracy'][-1])
    precision = np.append(precision, history.history['val_precision'][-1])
    recall = np.append(recall, history.history['val_recall'][-1])
    auc = np.append(auc, history.history['val_auc'][-1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [325]:
# Add the neural-network row to the results table. F1 is derived from the mean
# precision and mean recall rather than measured directly.
pre = precision.mean()
re = recall.mean()
model_results.loc['NN'] = [
    round(value, 4)
    for value in (acc.mean(), val_acc.mean(), 2*(pre*re)/(pre+re), pre, re, auc.mean())
]

In [326]:
model_results

Unnamed: 0,Train_acc,Validation_acc,f1_score,precision,recall,auc
Logistic Regression,0.7264,0.7254,0.721,0.7329,0.7097,0.8039
KNN,0.7847,0.6823,0.6861,0.6781,0.6944,0.7387
Decision Tree,1.0,0.6456,0.6452,0.6461,0.6445,0.6456
Random Forest,1.0,0.7276,0.7273,0.7283,0.7266,0.8061
XGB,0.9092,0.7179,0.7186,0.7168,0.7205,0.7969
catboost,0.7822,0.7364,0.7381,0.7332,0.7434,0.8164
pca_Logistic Regression,0.7194,0.7195,0.7134,0.7294,0.6983,0.7932
pca_KNN,0.7841,0.683,0.6875,0.6783,0.6971,0.7368
pca_Decision Tree,1.0,0.6275,0.6267,0.6282,0.6253,0.6275
pca_Random Forest,1.0,0.7128,0.7173,0.7064,0.7287,0.7885


In [342]:
# Candidates for hyperparameter tuning: for each model name, the estimator to tune
# ('model') and the parameter values to sample from ('params').
model_dict = {
    'Logistic Regression': dict(
        model=LogisticRegression(random_state=22, n_jobs=-1),
        params=dict(
            solver=['liblinear'],
            C=np.logspace(-4, 4, num=20),
            penalty=['l2'],
        ),
    ),
    'Random Forest': dict(
        model=RandomForestClassifier(random_state=22, n_jobs=-1),
        params=dict(
            n_estimators=[*range(10, 100, 10)],
            max_depth=[*range(1, 20, 5)],
            min_samples_split=[2, 5, 8],
        ),
    ),
    'Catboost': dict(
        model=CatBoostClassifier(random_state=22, verbose=False),
        params=dict(
            n_estimators=[*range(10, 100, 10)],
            max_depth=[*range(1, 20, 5)],
            learning_rate=[0.01, 0.03, 0.05, 0.08, 0.1],
        ),
    ),
}

In [337]:
from sklearn.model_selection import RandomizedSearchCV
import math

In [343]:
def hyperparameter_tuning(candidates=None, X=None, y=None, n_iter=10, cv=5, random_state=22):
    """Tune every candidate model with a randomized search and return the best one.

    Each candidate is searched with ``RandomizedSearchCV`` using the metrics in the
    notebook-level ``score_type`` and is refit on accuracy. The refit estimator with
    the highest mean cross-validated accuracy across all candidates is returned.

    Parameters
    ----------
    candidates : dict, optional
        Maps a model name to ``{'model': estimator, 'params': search space}``.
        Defaults to the notebook-level ``model_dict``.
    X, y : array-like, optional
        Training features and labels. Default to ``x_train_scaled`` and ``y_train``.
    n_iter : int, default 10
        Number of parameter settings sampled per candidate.
    cv : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default 22
        Seed for the parameter sampling, so that repeated runs try the same settings.

    Returns
    -------
    estimator or None
        The best refit estimator, or ``None`` when ``candidates`` is empty.
    """
    # Fall back to the notebook globals so existing no-argument calls keep working.
    candidates = model_dict if candidates is None else candidates
    X = x_train_scaled if X is None else X
    y = y_train if y is None else y

    best_model = None
    best_score = -math.inf

    for model_name, clasi_model in candidates.items():
        hyper_tuning_model = RandomizedSearchCV(
            clasi_model['model'],
            clasi_model['params'],
            n_iter=n_iter,
            cv=cv,
            return_train_score=True,
            verbose=2,
            scoring=score_type,
            refit="accuracy",
            random_state=random_state,
        )
        hyper_tuning_model.fit(X, y)

        # With refit="accuracy", best_score_ is the best mean cross-validated accuracy.
        best_model_score = hyper_tuning_model.best_score_

        print(model_name, " :: ", best_model_score, hyper_tuning_model.best_params_)

        if best_model_score > best_score:
            best_score = best_model_score
            best_model = hyper_tuning_model.best_estimator_

    print("Best Model :: ", best_model)

    return best_model

In [346]:
import warnings
warnings.filterwarnings('ignore')

In [347]:
best_model = hyperparameter_tuning()

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ..C=78.47599703514607, penalty=l2, solver=liblinear; total time=   4.8s
[CV] END ..C=78.47599703514607, penalty=l2, solver=liblinear; total time=   4.3s
[CV] END ..C=78.47599703514607, penalty=l2, solver=liblinear; total time=   4.3s
[CV] END ..C=78.47599703514607, penalty=l2, solver=liblinear; total time=   3.9s
[CV] END ..C=78.47599703514607, penalty=l2, solver=liblinear; total time=   4.0s
[CV] END ..C=0.615848211066026, penalty=l2, solver=liblinear; total time=   3.9s
[CV] END ..C=0.615848211066026, penalty=l2, solver=liblinear; total time=   3.9s
[CV] END ..C=0.615848211066026, penalty=l2, solver=liblinear; total time=   3.9s
[CV] END ..C=0.615848211066026, penalty=l2, solver=liblinear; total time=   5.5s
[CV] END ..C=0.615848211066026, penalty=l2, solver=liblinear; total time=   4.4s
[CV] END ..C=4.281332398719396, penalty=l2, solver=liblinear; total time=   4.5s
[CV] END ..C=4.281332398719396, penalty=l2, solv

In [35]:
import keras_tuner as kt

In [82]:
x_train_scaled.shape

(793076, 24)

In [83]:
def build_model(hp):
    """Build and compile a binary classifier from the hyperparameters in ``hp``.

    Search space:
      * ``num_layers``: 2 to 5 hidden Dense layers (default 3)
      * ``units_<i>``: 32 to 256 units per hidden layer, in steps of 32
      * ``dropout``: whether each hidden layer is followed by Dropout(0.5)
      * ``learning_rate``: Adam learning rate, one of 1e-3, 1e-4, 1e-5

    Parameters
    ----------
    hp : hyperparameter container supplied by the tuner.

    Returns
    -------
    A compiled ``keras.Sequential`` model with a single sigmoid output unit.
    """
    model = keras.Sequential()
    # The input width is taken from the training matrix. The shape is passed as an
    # explicit tuple: a bare int handed positionally to InputLayer is not accepted
    # by every Keras version.
    model.add(keras.Input(shape=(x_train_scaled.shape[1],)))

    for i in range(hp.Int('num_layers', 2, 5, default=3)):
        model.add(keras.layers.Dense(units=hp.Int('units_' + str(i), 32, 256, step=32), activation='relu'))

        # One shared switch: dropout follows either every hidden layer or none.
        if hp.Boolean('dropout'):
            model.add(keras.layers.Dropout(rate=0.5))

    model.add(keras.layers.Dense(1, activation="sigmoid"))

    # Metrics are named explicitly so the logged keys are identical for every model
    # the tuner builds, instead of receiving auto-numbered suffixes.
    model.compile(
        loss="binary_crossentropy",
        optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', [1e-3, 1e-4, 1e-5])),
        metrics=[
            "accuracy",
            keras.metrics.AUC(name="auc"),
            keras.metrics.Precision(name="precision"),
            keras.metrics.Recall(name="recall"),
        ],
    )

    return model

In [91]:
# Random search over the hyperparameters defined in build_model, ranked by
# validation accuracy. Each of the 3 trials is trained twice.
# `seed` fixes which hyperparameter combinations are sampled, so the search can be
# repeated; overwrite=True discards the results of any previous search.
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=3,
    executions_per_trial=2,
    seed=42,
    overwrite=True
)

In [None]:
# Run the search on the scaled training data, holding out 10% of it for validation
# (validation_split=0.1). Each model trains for up to 30 epochs with early stopping.
# NOTE(review): the same checkpoint callback, and therefore the same output file, is shared by all trials.
tuner.search(x_train_scaled, y_train, validation_split=0.1, epochs=30, batch_size=32, callbacks=[early_stopping_cb, checkpoint_cb])

Trial 3 Complete [00h 59m 31s]
val_accuracy: 0.7388031482696533

Best val_accuracy So Far: 0.7388031482696533
Total elapsed time: 02h 21m 51s


In [71]:
x_test_scaled.shape

(198270, 24)

In [94]:
best_nn_model = tuner.get_best_models()[0]
best_nn_model

<keras.src.engine.sequential.Sequential at 0x18ba1d3a050>

In [112]:
best_nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               6400      
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 160)               41120     
                                                                 
 dropout_1 (Dropout)         (None, 160)               0         
                                                                 
 dense_2 (Dense)             (None, 160)               25760     
                                                                 
 dropout_2 (Dropout)         (None, 160)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 1

In [105]:
nn_loss = best_nn_model.evaluate(x_test_scaled, y_test)



In [115]:
y_pred = best_nn_model.predict(x_test_scaled)



In [404]:
predict = best_model.predict(x_test_scaled)

In [116]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [405]:
# Test-set accuracy of the tuned model. accuracy_score takes (y_true, y_pred);
# accuracy is symmetric in its arguments, so the value itself is unchanged.
acc = accuracy_score(y_test, predict)

In [406]:
acc

0.7370656175921723