In [1]:
import sys
sys.path.append("../src/")
from results import Experiment_Results
# Regression Example With Boston Dataset: Baseline
import math
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sklearn.preprocessing import LabelEncoder
from joblib import dump, load
from datetime import datetime
# Standard ML Models for comparison
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, classification_report
# Splitting data into training/testing
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

#Classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


# Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, confusion_matrix

# Distributions
import scipy

In [2]:
multiclass = False
folds=2
experiment_name = "acute-inflammations-diagnosis-discretized"
#experiment_name = "acute-inflammations-diagnosis-discretized-oversampled"
if experiment_name == "acute-inflammations-diagnosis-discretized":
    df = pd.read_csv("./acute-inflammations-diagnosis-discretized.csv")
if experiment_name == "acute-inflammations-diagnosis-discretized":
    df = pd.read_csv("./acute-inflammations-diagnosis-discretized.csv")
cols = df.columns
print("Number of frame rows: " + str(df['X1'].count()))

Number of frame rows: 120


In [15]:
folds=2
#folds=5
#folds=10
#experiment_name = "audiology.standardized-discretized"
experiment_name = "audiology.standardized-discretized-resampled"
if experiment_name == "audiology.standardized-discretized":
    df = pd.read_csv("./audiology.standardized-discretized.csv")
if experiment_name == "audiology.standardized-discretized-resampled":
    df = pd.read_csv("./audiology.standardized-discretized-resampled.csv")
df = df.sample(frac=1)
cols = df.columns
X = df.iloc[:,0:len(cols)-1]
Y = df.iloc[:,len(cols)-1]

In [22]:
#oversample = SMOTE()
#X = df.iloc[:,0:len(cols)-1]
#Y = df.iloc[:,len(cols)-1]
#print("Number of rows before SMOTE: " + str(len(Y)))
#X, Y = oversample.fit_resample(X, Y)
#print("Number of rows after SMOTE: " + str(len(Y)))

In [68]:
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state=42)

In [17]:
def model_report(model_name, model, folds, train_time, y_true, y_pred):
    model_dict = {'model_name': model_name, 'model': model, 'folds': folds, 'train_time': train_time}
    cm = confusion_matrix(y_true, y_pred)

    FP = (cm.sum(axis=0) - np.diag(cm) ).sum()
    FN = (cm.sum(axis=1) - np.diag(cm)).sum()
    TP = (np.diag(cm)).sum()
    TN = (cm.sum() - (FP + FN + TP)).sum()

    with np.errstate(divide='ignore'):
        # Sensitivity, hit rate, recall, or true positive rate
        TPR = TP/(TP+FN) if (TP+FN) else 0
        # Specificity or true negative rate
        TNR = TN/(TN+FP) if (TN+FP) else 0
        # Precision or positive predictive value
        PPV = TP/(TP+FP) if (TP+FP) else 0
        # Negative predictive value
        NPV = TN/(TN+FN) if (TN+FN) else 0
        # Fall out or false positive rate
        FPR = FP/(FP+TN) if (FP+TN) else 0
        # False negative rate
        FNR = FN/(TP+FN) if (TP+FN) else 0
        # False discovery rate
        FDR = FP/(TP+FP) if (TP+FP) else 0

        # Overall accuracy
        ACC = (TP+TN)/(TP+FP+FN+TN)
    
    model_dict['accuracy'] = accuracy_score(y_true, y_pred)
    model_dict['recall'] = recall_score(y_true, y_pred, average='micro')
    model_dict['precision'] = precision_score(y_true, y_pred, average='micro')
    model_dict['roc_auc_score'] = 0 #roc_auc_score(y_true, y_pred, average='micro', multi_class="ovo")
    with np.errstate(divide='ignore'):
        model_dict['TN'] = TN
        model_dict['FN'] = FN
        model_dict['TP'] = TP
        model_dict['FP'] = FP
        model_dict['population'] = model_dict['TN']+model_dict['FN']+model_dict['TP']+model_dict['FP']
        #model_dict['NPV'] = model_dict['TN']/(model_dict['TN']+model_dict['FN'])
        model_dict['NPV'] = TN/(TN+FN) if (TN+FN) else 0
        #model_dict['PPV'] = model_dict['TP']/(model_dict['TP']+model_dict['FP'])
        model_dict['PPV'] = TP/(TP+FP) if (TP+FP) else 0
        #model_dict['sensitivity'] = model_dict['TP']/(model_dict['TP']+model_dict['FN'])
        model_dict['sensitivity'] = TP/(TP+FN) if (TP+FN) else 0
        #model_dict['specificity'] = model_dict['TN']/(model_dict['TN']+model_dict['FP'])
        model_dict['specificity'] = TN/(TN+FP) if (TN+FP) else 0
    
    return model_dict

In [18]:
def evaluate(folds=2):
    # Instantiate the models
    all_models = []
    scoring_metrics = ['accuracy', 'precision', 'recall']
    model_names = ['MLPClassifier', 'KNeighborsClassifier', 'SVC-Linear', 'SVC', \
                    'GaussianProcessClassifier', 'DecisionTreeClassifier', \
                    'RandomForestClassifier', 'AdaBoostClassifier', 'GaussianNB', \
                    'QuadraticDiscriminantAnalysis']
    model1 = MLPClassifier(alpha=1, max_iter=1000)
    model2 = KNeighborsClassifier(3)
    model3 = SVC(kernel="linear", C=0.025)
    model4 = SVC(gamma=2, C=1)
    model5 = GaussianProcessClassifier(1.0 * RBF(1.0))
    model6 = DecisionTreeClassifier(max_depth=5)
    model7 = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    model8 = AdaBoostClassifier()
    model9 = GaussianNB()
    model10 = QuadraticDiscriminantAnalysis()
    
    # Train and predict with each model
    for i, model in enumerate([model1, model2, model3, model4, model5, model6, model7, model8, model9, model10]):
        cv = KFold(n_splits=folds)
        grid = GridSearchCV(estimator=model, param_grid={}, n_jobs=-1, cv=cv, scoring='accuracy')
        start_training_time = datetime.now()
        grid_result = grid.fit(X, Y)
        total_training_time = datetime.now() - start_training_time
        model = grid_result.best_estimator_
        y_pred = model.predict(X)
        report = model_report(model_names[i], model, folds, total_training_time, Y, y_pred)
        all_models.append(report)
    
    return all_models
models = evaluate(folds)

KeyboardInterrupt: 

In [35]:
erc = Experiment_Results()
erc.load_df('../results/pd_results_dataframe.pkl')
for model in models:
    results_row = {'dt': datetime.now(),
                                'experiment_name': experiment_name,
                                'model_name': model['model_name'],
                                'graph': 0,
                                'runtime': model['train_time'].total_seconds() * 1000.0,
                                'folds': model['folds'],
                                'samples': model['population'],
                                'accuracy': model['accuracy'],
                                'ROC': model['roc_auc_score'],
                                'recall': model['recall'],
                                'precision': model['precision'],
                                'TN': model['TN'],
                                'FN': model['FN'],
                                'TP': model['TP'],
                                'FP': model['FP'],
                                'NPV': model['NPV'],
                                'PPV': model['PPV'],
                                'sensitivity': model['sensitivity'],
                                'specificity': model['specificity'],
                                'PRC': 0}
    erc.add_row_as_dict(results_row)
erc.save_df('../results/pd_results_dataframe.pkl')

In [36]:
erc.get_df()

Unnamed: 0,dt,experiment_name,model_name,graph,vertices,edge,runtime,folds,samples,accuracy,...,precision,TN,FN,TP,FP,NPV,PPV,sensitivity,specificity,PRC
0,2021-08-22 14:20:55.768391,acute-inflammations-diagnosis-discretized,MLPClassifier,0,,,2354.816,2,120,1.0,...,1.0,70.0,0.0,50.0,0.0,1.0,1.0,1.0,1.0,0.0
1,2021-08-22 14:20:55.781948,acute-inflammations-diagnosis-discretized,KNeighborsClassifier,0,,,1045.547,2,120,1.0,...,1.0,70.0,0.0,50.0,0.0,1.0,1.0,1.0,1.0,0.0
2,2021-08-22 14:20:55.793478,acute-inflammations-diagnosis-discretized,SVC-Linear,0,,,996.142,2,120,0.908333,...,0.867925,63.0,4.0,46.0,7.0,0.940299,0.867925,0.92,0.9,0.0
3,2021-08-22 14:20:55.804719,acute-inflammations-diagnosis-discretized,SVC,0,,,1013.24,2,120,1.0,...,1.0,70.0,0.0,50.0,0.0,1.0,1.0,1.0,1.0,0.0
4,2021-08-22 14:20:55.815834,acute-inflammations-diagnosis-discretized,GaussianProcessClassifier,0,,,745.378,2,120,1.0,...,1.0,70.0,0.0,50.0,0.0,1.0,1.0,1.0,1.0,0.0
5,2021-08-22 14:20:55.826900,acute-inflammations-diagnosis-discretized,DecisionTreeClassifier,0,,,25.915,2,120,1.0,...,1.0,70.0,0.0,50.0,0.0,1.0,1.0,1.0,1.0,0.0
6,2021-08-22 14:20:55.838017,acute-inflammations-diagnosis-discretized,RandomForestClassifier,0,,,114.314,2,120,1.0,...,1.0,70.0,0.0,50.0,0.0,1.0,1.0,1.0,1.0,0.0
7,2021-08-22 14:20:55.849048,acute-inflammations-diagnosis-discretized,AdaBoostClassifier,0,,,236.776,2,120,1.0,...,1.0,70.0,0.0,50.0,0.0,1.0,1.0,1.0,1.0,0.0
8,2021-08-22 14:20:55.860089,acute-inflammations-diagnosis-discretized,GaussianNB,0,,,19.105,2,120,0.966667,...,1.0,70.0,4.0,46.0,0.0,0.945946,1.0,0.92,1.0,0.0
9,2021-08-22 14:20:55.871343,acute-inflammations-diagnosis-discretized,QuadraticDiscriminantAnalysis,0,,,24.162,2,120,1.0,...,1.0,70.0,0.0,50.0,0.0,1.0,1.0,1.0,1.0,0.0
