In [1]:
import time
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression, RidgeCV, Lasso, BayesianRidge, SGDClassifier, RidgeClassifier, LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
# HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, recall_score

In [2]:
df_tr = pd.read_parquet('../dataset/train.parquet') 
df_te = pd.read_parquet('../dataset/test.parquet')

In [3]:
def training(model, X_train, y_train, X_test, y_test, model_name):
    t1 = time.time()
    
    model.fit(X_train, y_train)
    predicts = model.predict(X_test)

    print("\t\t\t--- Model:", model_name,"---")
    # if model_name in ["Linear","Ridge","Lasso","Bayesian Regression"]:
    #     roc = roc_auc_score(y_test, predicts)
    #     print("ROC: ", roc,"\n")
    # else:
    roc = roc_auc_score(y_test, predicts)
    acc = accuracy_score(y_test, predicts)
    rec = recall_score(y_test, predicts)
    print("Accuracy: ", acc, "\t","ROC: ",roc, "Recall: ",rec,"\n")
    

In [4]:
linear = LogisticRegression()
ridge = RidgeClassifier(solver = "auto", alpha=.5)
sgd = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
# svr = SVR(kernel='rbf', C=1, epsilon=0.3)
# svc = SVC()
knc =  KNeighborsClassifier(n_neighbors=3)
gnb = GaussianNB()
tree = DecisionTreeClassifier(random_state=0, max_depth=10)
rf = RandomForestClassifier(n_estimators=10)
et = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
ada = AdaBoostClassifier(n_estimators=100)
gradient = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
mlp = MLPClassifier()
# gbdt = HistGradientBoostingClassifier(min_samples_leaf=1,max_depth=2,learning_rate=1,max_iter=1)

m = [linear,ridge,sgd,knc,gnb,tree,rf,et,ada,gradient,mlp]
mn = ["Linear","Ridge","SGD",
# "Support Vector",
"K Neighbors Classifier",
"Gaussian Naive Bayes","Decision Tree","Random Forest","Extra Trees","AdaBoost","GradientBoosting","MLP"]


In [5]:
def ml_result(df_tr, df_te):
    x_train, x_test = train_test_split(df_tr, train_size=0.8, test_size=0.2, shuffle=True)

    cols = list(x_train.columns.values)
    cols.remove('encounter_id')
    cols.remove('diabetes_mellitus')

    x_tr = x_train[cols].values
    y_tr = x_train['diabetes_mellitus'].values
    x_te = x_test[cols].values
    y_te = x_test['diabetes_mellitus'].values

    for i in range(0,len(m)):
        training(model=m[i], X_train=x_tr, y_train=y_tr, X_test=x_te, y_test=y_te, model_name=mn[i])


In [6]:
ml_result(df_tr, df_te)

			--- Model: Linear ---
Accuracy:  0.81399815611555 	 ROC:  0.6242529892065527 Recall:  0.29216705502778273 

			--- Model: Ridge ---
Accuracy:  0.8114244007375537 	 ROC:  0.6063863870499347 Recall:  0.24753540060942822 

			--- Model: SGD ---
Accuracy:  0.8076982175783651 	 ROC:  0.5890899062388539 Recall:  0.20648861803190535 

			--- Model: K Neighbors Classifier ---
Accuracy:  0.7745082974800246 	 ROC:  0.5853049945881196 Recall:  0.25416741351496686 

			--- Model: Gaussian Naive Bayes ---
Accuracy:  0.7541103257529195 	 ROC:  0.6461026849231811 Recall:  0.4570711597060405 

			--- Model: Decision Tree ---
Accuracy:  0.8079287031346035 	 ROC:  0.6705756136328086 Recall:  0.4301846208998028 

			--- Model: Random Forest ---
Accuracy:  0.8038567916410572 	 ROC:  0.6133672339436131 Recall:  0.27997849076895504 

			--- Model: Extra Trees ---
Accuracy:  0.8047403196066379 	 ROC:  0.6006988712298453 Recall:  0.24359204158451336 

			--- Model: AdaBoost ---
Accuracy:  0.821681007990166

## Imputation: KNN

In [7]:
df_tr_knn = pd.read_parquet('../dataset/train_scale_knn_.parquet') 
df_te_knn = pd.read_parquet('../dataset/test_scale_knn_.parquet')

In [8]:
ml_result(df_tr_knn, df_te_knn)

			--- Model: Linear ---
Accuracy:  0.8176475107559926 	 ROC:  0.6378394186545819 Recall:  0.32306038344382726 

			--- Model: Ridge ---
Accuracy:  0.8142286416717885 	 ROC:  0.6171634427743263 Recall:  0.2721734456190647 

			--- Model: SGD ---
Accuracy:  0.8157652120467117 	 ROC:  0.6406150340462247 Recall:  0.33399032431463893 

			--- Model: Gaussian Naive Bayes ---
Accuracy:  0.272280270436386 	 ROC:  0.5208194788245368 Recall:  0.9559218777996775 

			--- Model: Decision Tree ---
Accuracy:  0.8078518746158574 	 ROC:  0.6749889087267392 Recall:  0.4423938362300663 

			--- Model: Random Forest ---
Accuracy:  0.8056622618315918 	 ROC:  0.6040898899849603 Recall:  0.2512094606701308 

			--- Model: Extra Trees ---
Accuracy:  0.7998617086662569 	 ROC:  0.5803998528750937 Recall:  0.1962013975989966 

			--- Model: AdaBoost ---
Accuracy:  0.8227181929932391 	 ROC:  0.6750702451687218 Recall:  0.4165920086006092 

			--- Model: GradientBoosting ---
Accuracy:  0.823140749846343 	 ROC:  

## Imputation: Mean

In [9]:
df_tr_mean = pd.read_parquet('../dataset/train_scale_mean_.parquet') 
df_te_mean = pd.read_parquet('../dataset/test_scale_mean_.parquet')

In [10]:
ml_result(df_tr_mean, df_te_mean)

			--- Model: Linear ---
Accuracy:  0.8192224953902889 	 ROC:  0.6344986313189841 Recall:  0.311938151743977 

			--- Model: Ridge ---
Accuracy:  0.8154963122311002 	 ROC:  0.6116372843359056 Recall:  0.255663430420712 

			--- Model: SGD ---
Accuracy:  0.8148048555623848 	 ROC:  0.6104774494805372 Recall:  0.253685724559511 

			--- Model: Gaussian Naive Bayes ---
Accuracy:  0.2720881991395206 	 ROC:  0.5236651649206607 Recall:  0.9629629629629629 

			--- Model: Decision Tree ---
Accuracy:  0.8108097725875845 	 ROC:  0.6793646238950994 Recall:  0.44983818770226536 

			--- Model: Random Forest ---
Accuracy:  0.8090427166564229 	 ROC:  0.6106107955318972 Recall:  0.2641136281912981 

			--- Model: Extra Trees ---
Accuracy:  0.8069299323909035 	 ROC:  0.5889717492925597 Recall:  0.20837828119381518 

			--- Model: AdaBoost ---
Accuracy:  0.8245620774431469 	 ROC:  0.6742295449247608 Recall:  0.41172240201366417 

			--- Model: GradientBoosting ---
Accuracy:  0.8261754763368162 	 ROC:  

## Imputation: Softimpute

In [11]:
df_tr_soft = pd.read_parquet('../dataset/train_scale_mean_.parquet') 
df_te_soft = pd.read_parquet('../dataset/test_scale_mean_.parquet')

In [12]:
ml_result(df_tr_soft, df_te_soft)

			--- Model: Linear ---
Accuracy:  0.8150737553779963 	 ROC:  0.6314716660144453 Recall:  0.30900981266726135 

			--- Model: Ridge ---
Accuracy:  0.8123847572218807 	 ROC:  0.6117636803924001 Recall:  0.2594112399643176 

			--- Model: SGD ---
Accuracy:  0.8085433312845728 	 ROC:  0.6041376469643408 Recall:  0.24513826940231936 

			--- Model: Gaussian Naive Bayes ---
Accuracy:  0.2783497234173325 	 ROC:  0.5243736327533826 Recall:  0.9564674397859054 

			--- Model: Decision Tree ---
Accuracy:  0.8083896742470805 	 ROC:  0.6791897100385801 Recall:  0.45227475468331846 

			--- Model: Random Forest ---
Accuracy:  0.8005147510755992 	 ROC:  0.5970800090677768 Recall:  0.23978590544157002 

			--- Model: Extra Trees ---
Accuracy:  0.8016671788567916 	 ROC:  0.58512751419111 Recall:  0.2048171275646744 

			--- Model: AdaBoost ---
Accuracy:  0.8241011063306699 	 ROC:  0.6750901220581966 Recall:  0.4133809099018733 

			--- Model: GradientBoosting ---
Accuracy:  0.8262138905961893 	 ROC: