In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import time
import itertools
import pathlib
from IPython.display import display
%matplotlib inline

# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
# Scalars
from sklearn.preprocessing import StandardScaler,PowerTransformer,MinMaxScaler,QuantileTransformer,normalize
# Features
from sklearn.feature_selection import VarianceThreshold, SelectKBest

# Models
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier as MLP

# Required for importing modules from other directories
import os,sys,inspect
# A notebook cell has no source file of its own. Depending on the kernel
# version, inspect.getfile() on the cell frame yields either a pseudo-name
# (which abspath() silently resolves against the working directory) or a
# path inside a temporary directory, which would make `parentdir` point at
# the wrong place. Using the working directory directly covers both cases.
# NOTE(review): assumes the kernel's working directory is this notebook's
# directory -- confirm if the notebook is launched from elsewhere.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Guard the insert so that re-running this cell does not stack duplicates.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)
from common import misc
from common.data_parser import *
from common.model_trainer import *
from common.misc import *
from config import *
from heart_helpers import *

# NEW --> contains plot_params, plot_confusion_matrix and plot_corr_heatmap
from common.plotting import *

# matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and newer releases no longer ship the old names. Pick whichever name this
# installation actually provides so the cell runs on old and new versions.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

In [None]:
# Parameter setup for this notebook
MODEL = MLP
MODEL_TYPE = "MLP"

# Every two-layer architecture with 5..9 units per layer, in the order
# (5, 5), (5, 6), ..., (9, 9). itertools.product keeps the loop indices out
# of the notebook namespace.
hidden_layers = list(itertools.product(range(5, 10), repeat=2))
print("hidden_layers:")
print(hidden_layers)

# Parameter grid handed to ModelTrainer below.
params = {
    "hidden_layer_sizes": hidden_layers,
    "alpha": [0.001, 0.0001],
    # "solver": ["lbfgs", "sgd", "adam"],
}

TEST_SIZE = 0.25  # passed as test_size to train_test_split
RND_STATE = 42    # passed as random_state to train_test_split
OUT_DIR = f"out/{MODEL_TYPE}/"
SET = "multi"  # "binary" selects the two-class data, any other value the five-class data

In [None]:

def create_out_dirs(base_dir: str = "./out", model_types=("RFC", "KNN", "MLP")):
    """Create the output directory tree used by this notebook.

    The following directories are created (parents included):

    * ``<base_dir>/runtimes``
    * ``<base_dir>/<model>/cms`` and ``<base_dir>/<model>/params`` for every
      entry of ``model_types``

    Directories that already exist are left untouched, so the function can
    be called any number of times.

    Args:
        base_dir: Root of the output tree.
        model_types: Names of the per-model sub-directories.
    """
    leaf_dirs = [os.path.join(base_dir, "runtimes")]
    for md in model_types:
        leaf_dirs.append(os.path.join(base_dir, md, "cms"))
        leaf_dirs.append(os.path.join(base_dir, md, "params"))

    # exist_ok=True replaces the separate "exists?" check and also creates
    # the intermediate directories (<base_dir>, <base_dir>/<model>).
    for path in leaf_dirs:
        os.makedirs(path, exist_ok=True)


create_out_dirs()

# Filename function
def get_fname(save_cm=False, force_filename: str = None, file_format: str = "pdf"):
    """Build the output path of a plot from the notebook's current settings.

    The name is assembled from the notebook-level variables ``OUT_DIR``,
    ``SET``, ``MODEL_TYPE``, ``impute_mode``, ``scaler`` and either ``BW``
    (confusion matrices) or ``SCORES`` (parameter plots). These must be
    defined before the function is called.

    Args:
        save_cm: If true, return a confusion-matrix path inside
            ``OUT_DIR + "cms/"``; otherwise a parameter-plot path directly
            inside ``OUT_DIR``.
        force_filename: If given (and non-empty), it is appended to
            ``OUT_DIR`` as is and everything else is ignored; no extension
            is added.
        file_format: File extension (without the dot) of generated names.

    Returns:
        The path as a string.
    """
    if force_filename:
        return OUT_DIR + force_filename

    # Runs with a scaler configured are tagged so they do not overwrite the
    # unscaled outputs.
    scaler_suffix = "_scaler" if scaler else ""
    prefix = f"{SET}_{MODEL_TYPE}_{impute_mode}"
    if save_cm:
        return f"{OUT_DIR}cms/{prefix}_{BW}_CM{scaler_suffix}.{file_format}"
    return f"{OUT_DIR}{prefix}_{SCORES}{scaler_suffix}.{file_format}"

In [None]:
# Choose the dataset variant and its class labels from the SET switch:
# "binary" loads the "sma" variant with labels 0/1, every other value loads
# the "big" variant with labels 0..4.
dataset_key, SET_CLASSES = (
    ("sma", [0, 1]) if SET == "binary" else ("big", [0, 1, 2, 3, 4])
)
df_raw = parse_heart_disease(dataset_key)

df_raw.info()

# FIRST TEST

In [None]:
# Settings specific to this test run

# Scoring callable handed to ModelTrainer
eval_func = accuracy_score

# Imputation mode forwarded to process_heart
impute_mode = 0

# Scaler forwarded to process_heart (None is the alternative setting)
SCALER_NAME = "standard"
scaler = StandardScaler()

In [None]:
# Preprocess the raw frame into features (x) and target (y).
x, y = process_heart(
    df_raw,
    scaler=scaler,
    impute_mode=impute_mode,
    ret_xy=True,
)
# Alternative: get the processed frame back and select the columns manually.
# df = process_heart(df_raw, impute_mode=impute_mode, scaler=scaler, ret_xy=False)
# display(df.info())
# x, y = df[VOTING_FEATS], df[VOTING_TARGET]

# Hold out TEST_SIZE of the rows for evaluation; fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=RND_STATE,
    test_size=TEST_SIZE,
)
y

In [None]:
# Instantiate modeltrainer and train models
modeltrainer = ModelTrainer(
    MODEL,
    params,
    x_train, y_train, x_test, y_test,
    eval_func,
    thread_cnt=4,
)
# Register the class labels used for the confusion matrices
# (SET_CLASSES is [0, 1] for the binary set).
modeltrainer.cm_setup(SET_CLASSES)
modeltrainer.train()

# modeltrainer.save_result("out/knn_params.csv")
result = modeltrainer.result
# Only the last expression of a cell is rendered, so the preview has to be
# displayed explicitly.
display(result.head())

SCORES = "accuracy"
# No fileName is passed here, so this plot is only shown, not saved.
# If fileName is set, make sure that the directory exists.
plot_params(result, params=params, scores=SCORES, ylims=(0.4, 1.1));

# TESTS

In [None]:
# Scaler used by the impute-mode runs below (None is the alternative setting)
SCALER_NAME = "standard"
scaler = StandardScaler()

# IMPUTE Mode = 0

In [None]:
impute_mode = 0

# Scoring callable handed to ModelTrainer
eval_func = accuracy_score

# Prepare data and do split
# NOTE(review): the scaler is passed to process_heart before the train/test
# split; if it is fitted there on all rows, test-set statistics leak into
# training -- confirm against process_heart.
x, y = process_heart(df_raw, impute_mode=impute_mode, scaler=scaler, ret_xy=True)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SIZE, random_state=RND_STATE
)

# Instantiate modeltrainer and train models
modeltrainer = ModelTrainer(
    MODEL,
    params,
    x_train, y_train, x_test, y_test,
    eval_func,
    thread_cnt=4,
)
# Register the class labels used for the confusion matrices
modeltrainer.cm_setup(SET_CLASSES)
modeltrainer.train()

result = modeltrainer.result
# Only the last expression of a cell is rendered, so show the preview explicitly.
display(result.head())

# One parameter plot per metric. SCORES is deliberately the (global) loop
# variable because get_fname() reads it to build the file name.
for SCORES in ("accuracy", "f1", "recall", "precision"):
    plot_params(result, params=params, scores=SCORES, fileName=get_fname(), ylims=(0.4, 1.1))

bestscore, bestidx = modeltrainer.best_score(ret_index=True)
worstscore, worstidx = modeltrainer.worst_score(ret_index=True)

# Confusion matrices for the best and worst parameter sets. BW is the
# (global) loop variable because get_fname(save_cm=True) reads it.
for BW, cm_idx, cm_score in (("best", bestidx, bestscore), ("worst", worstidx, worstscore)):
    print(f"{BW.capitalize()} params")
    plt.clf()
    modeltrainer.plot_confusion_matrix(cm_idx, title=f"Confusion matrix\n(acc={cm_score})")
    plt.savefig(get_fname(save_cm=True))

# IMPUTE Mode = 1

In [None]:
impute_mode = 1

# Scoring callable handed to ModelTrainer
eval_func = accuracy_score

# Prepare data and do split
# NOTE(review): the scaler is passed to process_heart before the train/test
# split; if it is fitted there on all rows, test-set statistics leak into
# training -- confirm against process_heart.
x, y = process_heart(df_raw, impute_mode=impute_mode, scaler=scaler, ret_xy=True)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SIZE, random_state=RND_STATE
)

# Instantiate modeltrainer and train models
modeltrainer = ModelTrainer(
    MODEL,
    params,
    x_train, y_train, x_test, y_test,
    eval_func,
    thread_cnt=4,
)
# Register the class labels used for the confusion matrices
modeltrainer.cm_setup(SET_CLASSES)
modeltrainer.train()

result = modeltrainer.result
# Only the last expression of a cell is rendered, so show the preview explicitly.
display(result.head())

# One parameter plot per metric. SCORES is deliberately the (global) loop
# variable because get_fname() reads it to build the file name.
for SCORES in ("accuracy", "f1", "recall", "precision"):
    plot_params(result, params=params, scores=SCORES, fileName=get_fname(), ylims=(0.4, 1.1))

bestscore, bestidx = modeltrainer.best_score(ret_index=True)
worstscore, worstidx = modeltrainer.worst_score(ret_index=True)

# Confusion matrices for the best and worst parameter sets. BW is the
# (global) loop variable because get_fname(save_cm=True) reads it.
for BW, cm_idx, cm_score in (("best", bestidx, bestscore), ("worst", worstidx, worstscore)):
    print(f"{BW.capitalize()} params")
    plt.clf()
    modeltrainer.plot_confusion_matrix(cm_idx, title=f"Confusion matrix\n(acc={cm_score})")
    plt.savefig(get_fname(save_cm=True))

# IMPUTE Mode = 2

In [None]:
impute_mode = 2

# Scoring callable handed to ModelTrainer
eval_func = accuracy_score

# Prepare data and do split
# NOTE(review): the scaler is passed to process_heart before the train/test
# split; if it is fitted there on all rows, test-set statistics leak into
# training -- confirm against process_heart.
x, y = process_heart(df_raw, impute_mode=impute_mode, scaler=scaler, ret_xy=True)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SIZE, random_state=RND_STATE
)

# Instantiate modeltrainer and train models
modeltrainer = ModelTrainer(
    MODEL,
    params,
    x_train, y_train, x_test, y_test,
    eval_func,
    thread_cnt=4,
)
# Register the class labels used for the confusion matrices
modeltrainer.cm_setup(SET_CLASSES)
modeltrainer.train()

result = modeltrainer.result
# Only the last expression of a cell is rendered, so show the preview explicitly.
display(result.head())

# One parameter plot per metric. SCORES is deliberately the (global) loop
# variable because get_fname() reads it to build the file name.
for SCORES in ("accuracy", "f1", "recall", "precision"):
    plot_params(result, params=params, scores=SCORES, fileName=get_fname(), ylims=(0.4, 1.1))

bestscore, bestidx = modeltrainer.best_score(ret_index=True)
worstscore, worstidx = modeltrainer.worst_score(ret_index=True)

# Confusion matrices for the best and worst parameter sets. BW is the
# (global) loop variable because get_fname(save_cm=True) reads it.
for BW, cm_idx, cm_score in (("best", bestidx, bestscore), ("worst", worstidx, worstscore)):
    print(f"{BW.capitalize()} params")
    plt.clf()
    modeltrainer.plot_confusion_matrix(cm_idx, title=f"Confusion matrix\n(acc={cm_score})")
    plt.savefig(get_fname(save_cm=True))