In [1]:
import sys
sys.path.append("../src/")
from results import Experiment_Results
# Classification baselines on the Parkinson's datasets
import math
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sklearn.preprocessing import LabelEncoder
from joblib import dump, load
from datetime import datetime
# Standard ML Models for comparison
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, classification_report
# Splitting data into training/testing
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

#Classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


# Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, confusion_matrix

# Distributions
import scipy

In [2]:
# Benchmark inputs. Each entry names an experiment, the CSV file that is read
# with pd.read_csv, and the fold counts that are passed one by one to evaluate().
dataset1 = dict(
    name="parkinsons-discretized",
    filename="./parkinsons-discretized.csv",
    folds=[2, 5, 10],
)

dataset2 = dict(
    name="parkinsons-discretized-oversampled",
    filename="./parkinsons-discretized-oversampled.csv",
    folds=[2, 5, 10],
)

# Processed in this order by the driver loop.
datasets = [dataset1, dataset2]

In [22]:
#oversample = SMOTE()
#X = df.iloc[:,0:len(cols)-1]
#Y = df.iloc[:,len(cols)-1]
#print("Number of rows before SMOTE: " + str(len(Y)))
#X, Y = oversample.fit_resample(X, Y)
#print("Number of rows after SMOTE: " + str(len(Y)))

In [3]:
def model_report(model_name, model, folds, train_time, y_true, y_pred):
    """Summarise a binary classifier's predictions as a flat metrics dict.

    Parameters
    ----------
    model_name : stored unchanged under ``'model_name'``.
    model : stored unchanged under ``'model'``.
    folds : stored unchanged under ``'folds'``.
    train_time : stored unchanged under ``'train_time'``.
    y_true : true labels, passed to the scikit-learn metric functions.
    y_pred : predicted labels, aligned position-by-position with ``y_true``.

    Returns
    -------
    dict
        The four pass-through values plus ``'accuracy'``, ``'recall'``,
        ``'precision'``, ``'roc_auc_score'``, the confusion-matrix counts
        ``'TN'``, ``'FN'``, ``'TP'``, ``'FP'``, their sum ``'population'``,
        and the derived ratios ``'NPV'``, ``'PPV'``, ``'sensitivity'`` and
        ``'specificity'``.

    Notes
    -----
    Row/column 1 of the confusion matrix is treated as the positive class,
    so the positive label must sort after the negative one (e.g. 0/1 labels).
    A ratio whose denominator is zero (for example PPV when the model never
    predicts the positive class) is reported as NaN.
    """
    def _ratio(numerator, denominator):
        # An empty denominator means the ratio is undefined. Return NaN
        # explicitly instead of relying on numpy's 0/0 behaviour, which also
        # emits a RuntimeWarning for every affected model.
        if denominator == 0:
            return float('nan')
        return numerator / denominator

    model_dict = {'model_name': model_name, 'model': model, 'folds': folds, 'train_time': train_time}
    cm = confusion_matrix(y_true, y_pred)
    model_dict['accuracy'] = accuracy_score(y_true, y_pred)
    model_dict['recall'] = recall_score(y_true, y_pred)
    model_dict['precision'] = precision_score(y_true, y_pred)
    model_dict['roc_auc_score'] = roc_auc_score(y_true, y_pred)

    # confusion_matrix rows are true labels and columns are predicted labels.
    tn, fp = cm[0][0], cm[0][1]
    fn, tp = cm[1][0], cm[1][1]
    model_dict['TN'] = tn
    model_dict['FN'] = fn
    model_dict['TP'] = tp
    model_dict['FP'] = fp
    model_dict['population'] = tn + fn + tp + fp
    model_dict['NPV'] = _ratio(tn, tn + fn)
    model_dict['PPV'] = _ratio(tp, tp + fp)
    model_dict['sensitivity'] = _ratio(tp, tp + fn)
    model_dict['specificity'] = _ratio(tn, tn + fp)

    return model_dict

def evaluate(folds=2):
    """Benchmark a fixed set of classifiers with k-fold cross-validation.

    The feature matrix ``X`` and label vector ``Y`` are read from module-level
    globals, so they must be assigned before this function is called.

    Each classifier is scored on pooled out-of-fold predictions: every row is
    predicted by a model that did not see that row during fitting. Scoring
    ``predict(X)`` from an estimator that was fitted on all of ``X`` would
    report training-set metrics and make ``folds`` irrelevant to the results.

    Parameters
    ----------
    folds : int, default 2
        Number of ``KFold`` splits.

    Returns
    -------
    list of dict
        One ``model_report`` dict per classifier, in the order listed below.
        ``'model'`` holds the estimator fitted on all of ``X``/``Y`` and
        ``'train_time'`` is a ``timedelta`` covering the cross-validated fits
        plus that final fit.

    Notes
    -----
    No ``random_state`` is set on the classifiers or on ``KFold``, so results
    from the stochastic models can differ between runs.
    """
    # Imported here because the notebook's import cell does not bring in
    # cross_val_predict.
    from sklearn.model_selection import cross_val_predict

    # Name and estimator are kept together so the two cannot drift out of sync.
    candidates = [
        ('MLPClassifier', MLPClassifier(alpha=1, max_iter=1000)),
        ('KNeighborsClassifier', KNeighborsClassifier(3)),
        ('SVC-Linear', SVC(kernel="linear", C=0.025)),
        ('SVC', SVC(gamma=2, C=1)),
        ('GaussianProcessClassifier', GaussianProcessClassifier(1.0 * RBF(1.0))),
        ('DecisionTreeClassifier', DecisionTreeClassifier(max_depth=5)),
        ('RandomForestClassifier', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
        ('AdaBoostClassifier', AdaBoostClassifier()),
        ('GaussianNB', GaussianNB()),
        ('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()),
    ]

    all_models = []
    for model_name, model in candidates:
        cv = KFold(n_splits=folds)
        start_training_time = datetime.now()
        # cross_val_predict clones the estimator for each split and returns,
        # for every row, the prediction made while that row was held out.
        y_pred = cross_val_predict(model, X, Y, cv=cv, n_jobs=-1)
        # Fit once on the full data so the stored estimator is usable, as the
        # refit best_estimator_ was before.
        model.fit(X, Y)
        total_training_time = datetime.now() - start_training_time
        report = model_report(model_name, model, folds, total_training_time, Y, y_pred)
        all_models.append(report)

    return all_models

def save_model_results(models, experiment_name):
    """Append one results row per evaluated model and persist the results store.

    Parameters
    ----------
    models : iterable of dict
        Report dicts carrying the keys read below ('model_name', 'train_time',
        'folds', 'population', 'accuracy', 'roc_auc_score', ...).
    experiment_name : value written to every row's 'experiment_name' field.

    Notes
    -----
    'samples' is the non-null count of the first column of the module-level
    ``df``, so ``df`` must already be defined when this runs.
    'runtime' is ``train_time`` converted to milliseconds.
    'graph' and 'PRC' are always written as 0.
    """
    results_path = '../results/pd_results_dataframe.pkl'
    # Keys copied verbatim from the report dict, in output order.
    passthrough_keys = ('recall', 'precision', 'TN', 'FN', 'TP', 'FP',
                        'NPV', 'PPV', 'sensitivity', 'specificity')

    store = Experiment_Results()
    store.load_df(results_path)
    for entry in models:
        row = {
            'dt': datetime.now(),
            'experiment_name': experiment_name,
            'model_name': entry['model_name'],
            'graph': 0,
            'runtime': entry['train_time'].total_seconds() * 1000.0,
            'folds': entry['folds'],
            'samples': df.iloc[:, 0].count(),
            'population': entry['population'],
            'accuracy': entry['accuracy'],
            'ROC': entry['roc_auc_score'],
        }
        for key in passthrough_keys:
            row[key] = entry[key]
        row['PRC'] = 0
        store.add_row_as_dict(row)
    store.save_df(results_path)
    
# Driver: benchmark every configured dataset at each of its fold counts.
# df, X and Y are intentionally module-level, because evaluate() reads X and Y
# and save_model_results() reads df.
for dataset in datasets:
    # sample(frac=1) returns all rows in random order (no seed is set).
    df = pd.read_csv(dataset['filename']).sample(frac=1)
    cols = df.columns
    # Every column but the last is a feature; the last column is the label.
    X = df.iloc[:, :-1]
    Y = df.iloc[:, -1]
    for fold in dataset['folds']:
        models = evaluate(fold)
        save_model_results(models, dataset['name'])

