In [2]:
import time
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression, RidgeCV, Lasso, BayesianRidge, SGDClassifier, RidgeClassifier, LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
# HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score

In [3]:
df_tr = pd.read_parquet('../dataset/train.parquet') 
df_te = pd.read_parquet('../dataset/test.parquet')

In [4]:
def training(model, X_train, y_train, X_test, y_test, model_name):
    t1 = time.time()
    
    model.fit(X_train, y_train)
    predicts = model.predict(X_test)

    print("\t\t\t--- Model:", model_name,"---")
    # if model_name in ["Linear","Ridge","Lasso","Bayesian Regression"]:
    #     roc = roc_auc_score(y_test, predicts)
    #     print("ROC: ", roc,"\n")
    # else:
    acc = accuracy_score(y_test, predicts)
    print("Accuracy: ", acc, "\n")
    

In [5]:
linear = LogisticRegression()
ridge = RidgeClassifier(solver = "auto", alpha=.5)
sgd = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
# svr = SVR(kernel='rbf', C=1, epsilon=0.3)
# svc = SVC()
# knc =  KNeighborsClassifier(n_neighbors=3)
gnb = GaussianNB()
tree = DecisionTreeClassifier(random_state=0, max_depth=10)
rf = RandomForestClassifier(n_estimators=10)
et = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
ada = AdaBoostClassifier(n_estimators=100)
gradient = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
# gbdt = HistGradientBoostingClassifier(min_samples_leaf=1,max_depth=2,learning_rate=1,max_iter=1)

m = [linear,ridge,sgd,gnb,tree,rf,et,ada,gradient]
mn = ["Linear","Ridge","SGD",
# "Support Vector",
# "K Neighbors Classifier",
"Gaussian Naive Bayes","Decision Tree","Random Forest","Extra Trees","AdaBoost","GradientBoosting"]


In [6]:
def ml_result(df_tr, df_te):
    x_train, x_test = train_test_split(df_tr, train_size=0.8, test_size=0.2, shuffle=True)

    cols = list(x_train.columns.values)
    cols.remove('encounter_id')
    cols.remove('diabetes_mellitus')

    x_tr = x_train[cols].values
    y_tr = x_train['diabetes_mellitus'].values
    x_te = x_test[cols].values
    y_te = x_test['diabetes_mellitus'].values

    for i in range(0,len(m)):
        training(model=m[i], X_train=x_tr, y_train=y_tr, X_test=x_te, y_test=y_te, model_name=mn[i])


In [7]:
ml_result(df_tr, df_te)

			--- Model: Linear ---
Accuracy:  0.8106561155500922 

			--- Model: Ridge ---
Accuracy:  0.8067762753534112 

			--- Model: SGD ---
Accuracy:  0.7945221266133989 

			--- Model: Gaussian Naive Bayes ---
Accuracy:  0.7483481868469576 

			--- Model: Decision Tree ---
Accuracy:  0.8098110018438844 

			--- Model: Random Forest ---
Accuracy:  0.8044714197910264 

			--- Model: Extra Trees ---
Accuracy:  0.8022433927473879 

			--- Model: AdaBoost ---
Accuracy:  0.8203365089121082 

			--- Model: GradientBoosting ---
Accuracy:  0.822679778733866 



## Imputation: KNN

In [8]:
df_tr_knn = pd.read_parquet('../dataset/train_scale_knn_.parquet') 
df_te_knn = pd.read_parquet('../dataset/test_scale_knn_.parquet')

In [9]:
ml_result(df_tr_knn, df_te_knn)

			--- Model: Linear ---
Accuracy:  0.8133066994468346 

			--- Model: Ridge ---
Accuracy:  0.8078518746158574 

			--- Model: SGD ---
Accuracy:  0.8161493546404426 

			--- Model: Gaussian Naive Bayes ---
Accuracy:  0.28353564843269824 

			--- Model: Decision Tree ---
Accuracy:  0.8053165334972342 

			--- Model: Random Forest ---
Accuracy:  0.7954440688383528 

			--- Model: Extra Trees ---
Accuracy:  0.7957129686539643 

			--- Model: AdaBoost ---
Accuracy:  0.8193761524277812 

			--- Model: GradientBoosting ---
Accuracy:  0.8227950215119852 



## Imputation: Mean

In [10]:
df_tr_mean = pd.read_parquet('../dataset/train_scale_mean_.parquet') 
df_te_mean = pd.read_parquet('../dataset/test_scale_mean_.parquet')

In [11]:
ml_result(df_tr_mean, df_te_mean)

			--- Model: Linear ---
Accuracy:  0.819798709280885 

			--- Model: Ridge ---
Accuracy:  0.8164950829748002 

			--- Model: SGD ---
Accuracy:  0.8143438844499078 

			--- Model: Gaussian Naive Bayes ---
Accuracy:  0.2660955746773202 

			--- Model: Decision Tree ---
Accuracy:  0.8096957590657652 

			--- Model: Random Forest ---
Accuracy:  0.8068146896127842 

			--- Model: Extra Trees ---
Accuracy:  0.8053933620159803 

			--- Model: AdaBoost ---
Accuracy:  0.8259449907805777 

			--- Model: GradientBoosting ---
Accuracy:  0.8304778733866011 



## Imputation: Softimpute

In [12]:
df_tr_soft = pd.read_parquet('../dataset/train_scale_mean_.parquet') 
df_te_soft = pd.read_parquet('../dataset/test_scale_mean_.parquet')

In [13]:
ml_result(df_tr_soft, df_te_soft)

			--- Model: Linear ---
Accuracy:  0.8147664413030117 

			--- Model: Ridge ---
Accuracy:  0.8114244007375537 

			--- Model: SGD ---
Accuracy:  0.8118853718500307 

			--- Model: Gaussian Naive Bayes ---
Accuracy:  0.27884910878918256 

			--- Model: Decision Tree ---
Accuracy:  0.8136140135218193 

			--- Model: Random Forest ---
Accuracy:  0.8050860479409957 

			--- Model: Extra Trees ---
Accuracy:  0.8010141364474493 

			--- Model: AdaBoost ---
Accuracy:  0.8222188076213891 

			--- Model: GradientBoosting ---
Accuracy:  0.8230639213275968 

