In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import time
from IPython.display import display
%matplotlib inline

# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
# Scalers
from sklearn.preprocessing import StandardScaler,PowerTransformer,MinMaxScaler,QuantileTransformer,normalize
# Features
from sklearn.feature_selection import VarianceThreshold, SelectKBest

# Models
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN

# Required for importing modules from other directories: the parent of this
# notebook's directory is put first on sys.path so the project imports that
# follow (common, config, heart_helpers) can be resolved from there.
import os,sys,inspect
try:
    # Plain-script execution: __file__ points at this file.
    currentdir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # Notebook kernels define no __file__. The previous
    # inspect.getfile(inspect.currentframe()) trick returns the cell's
    # pseudo-filename there, which is not guaranteed to sit next to the
    # notebook, so rely on the kernel's working directory instead.
    currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)
from common import misc
from common.data_parser import *
from common.model_trainer import *
from common.misc import *
from config import *
from heart_helpers import *

# Plot styling. Matplotlib 3.6 renamed the bundled "seaborn" style to
# "seaborn-v0_8" and later dropped the old alias, so fall back to the new name
# when the old one is rejected.
try:
    plt.style.use("seaborn")
except OSError:
    plt.style.use("seaborn-v0_8")

# Estimator under study and the hyper-parameter grid handed to ModelTrainer.
MODEL = KNN
params = {
    "n_neighbors": list(range(3, 50)),
    "weights": ["uniform", "distance"],
    "p": [1, 2],
}

# Hold-out settings shared by every train_test_split call in this notebook.
TEST_SIZE = 0.25
RND_STATE = 42

# NOTE: an earlier train_test_split(preprocessed_data, labels, ...) call was
# removed from this cell. Neither name is defined in this notebook, it ignored
# TEST_SIZE / RND_STATE, and its outputs were overwritten by the per-imputation
# splits further down before anything used them.

# Two-hidden-layer MLP grid, kept for reference under its own name. It used to
# be assigned to `params`, silently replacing the KNN grid above even though
# MODEL is KNN.
hidden_layers = [(i, j) for i in range(5, 10) for j in range(5, 10)]
print(hidden_layers)
mlp_params = {
    "hidden_layer_sizes": hidden_layers,
    "alpha": [0.001, 0.0001],
    # "solver": ["lbfgs", "sgd", "adam"],
}

In [None]:
# Load the "big" variant of the heart-disease data.
# NOTE(review): parse_heart_disease is presumably provided by one of the
# star-imports in the setup cell — confirm which module owns it.
df_raw = parse_heart_disease("big")
# Print the loaded object's .info() summary as a quick sanity check.
df_raw.info()

In [None]:
def plot_params(results, scores="score", fileName=None,
                x="n_estimators", group_by=("criterion", "max_features"), logx=True):
    """Plot score curves from a hyper-parameter search, one line per parameter combination.

    Parameters
    ----------
    results : pandas.DataFrame
        One row per evaluated configuration. Must contain the ``x`` column,
        every column named in ``group_by`` and every column named in ``scores``.
    scores : str or sequence of str
        Score column(s) to plot. A single string produces one panel; a
        sequence produces one vertically stacked panel per column.
    fileName : str, optional
        When given, the figure is written to this path.
    x : str
        Column plotted on the horizontal axis.
    group_by : str or sequence of str
        Column(s) whose distinct value combinations each get their own line.
        Rows with a missing value in any of these columns are skipped by the
        underlying ``groupby``.
    logx : bool
        Whether the horizontal axis uses a logarithmic scale.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was drawn. End the calling line with ``;`` in a
        notebook to avoid it being echoed a second time.

    Raises
    ------
    KeyError
        If ``results`` lacks any of the required columns.
    ValueError
        If ``scores`` is empty.
    """
    score_cols = [scores] if isinstance(scores, str) else list(scores)
    group_cols = [group_by] if isinstance(group_by, str) else list(group_by)
    if not score_cols:
        raise ValueError("scores must name at least one column")

    # Fail early with one readable message instead of deep inside pandas.
    missing = [col for col in [x, *group_cols, *score_cols] if col not in results.columns]
    if missing:
        raise KeyError(f"results is missing column(s): {missing}")

    # The "seaborn" style was renamed to "seaborn-v0_8" in Matplotlib 3.6.
    try:
        plt.style.use("seaborn")
    except OSError:
        plt.style.use("seaborn-v0_8")

    # squeeze=False always yields a 2-D array of axes, so the single-panel and
    # multi-panel cases share one code path.
    fig, axes = plt.subplots(nrows=len(score_cols), figsize=(8, 6 * len(score_cols)),
                             squeeze=False)
    for ax, score in zip(axes[:, 0], score_cols):
        for key, group in results.groupby(group_cols, sort=False):
            # Depending on the pandas version a single grouping column yields
            # either a scalar key or a 1-tuple; normalise to a tuple.
            key = key if isinstance(key, tuple) else (key,)
            label = " + ".join(str(part) for part in key)
            group.sort_values(x).plot(x=x, y=score, label=label,
                                      ax=ax, marker="o", logx=logx)
        ax.legend()
        ax.set_title(score, fontsize=18)
        ax.set_ylim(0, 1.)

    # Save before showing: once plt.show() has run, the current figure may
    # already have been released, so saving afterwards can write a blank image.
    if fileName:
        fig.savefig(fileName)
    plt.show()
    return fig


    

# Impute mode = 0

In [None]:
# Imputation strategy used for this section of the notebook.
impute_mode = 0

# Build the modelling frame for that strategy.
df = process_heart(df_raw, impute_mode=impute_mode)

# Separate predictor columns from the target column, then hold out TEST_SIZE
# of the rows with a fixed seed so the split is repeatable.
x = df[HEART_FEATS]
y = df[HEART_TARGET]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SIZE, random_state=RND_STATE
)

# Summary of the processed frame.
df.info()

In [None]:
# Evaluate every combination in `params` on the current train/test split,
# scoring with accuracy_score.
modeltrainer = ModelTrainer(
    MODEL,
    params,
    x_train, y_train, x_test, y_test,
    accuracy_score,
    thread_cnt=4,
)

modeltrainer.train()
#modeltrainer.save_result("out/knn_params.csv")
result = modeltrainer.result
# A bare `result.head()` in the middle of a cell is discarded by the notebook;
# display() makes the preview actually appear.
display(result.head())

# NOTE(review): plot_params reads the 'criterion', 'max_features' and
# 'n_estimators' columns, which are random-forest hyper-parameters — confirm
# they exist in `result` when MODEL is KNN.
SCORES = "score"
plot_params(result, scores=SCORES);


# Impute mode = 1

In [None]:
# Imputation strategy for this section. The cell previously set 0 under the
# "Impute mode = 1" heading, which merely repeated the mode-0 experiment.
impute_mode = 1

# Build the modelling frame for that strategy.
df = process_heart(df_raw, impute_mode=impute_mode)

# Separate predictor columns from the target column, then hold out TEST_SIZE
# of the rows with a fixed seed so the split is repeatable.
x, y = df[HEART_FEATS], df[HEART_TARGET]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SIZE, random_state=RND_STATE
)

# Summary of the processed frame.
df.info()

In [None]:
# Evaluate every combination in `params` on the current train/test split,
# scoring with accuracy_score.
modeltrainer = ModelTrainer(
    MODEL,
    params,
    x_train, y_train, x_test, y_test,
    accuracy_score,
    thread_cnt=8,
)

modeltrainer.train()
#modeltrainer.save_result("out/knn_params.csv")
result = modeltrainer.result
# A bare `result.head()` in the middle of a cell is discarded by the notebook;
# display() makes the preview actually appear.
display(result.head())

# NOTE(review): plot_params reads the 'criterion', 'max_features' and
# 'n_estimators' columns, which are random-forest hyper-parameters — confirm
# they exist in `result` when MODEL is KNN.
SCORES = "score"
plot_params(result, scores=SCORES);

# Impute mode = 2

In [None]:
# Imputation strategy for this section. The cell previously set 0 under the
# "Impute mode = 2" heading, which merely repeated the mode-0 experiment.
impute_mode = 2

# Build the modelling frame for that strategy.
df = process_heart(df_raw, impute_mode=impute_mode)

# Separate predictor columns from the target column, then hold out TEST_SIZE
# of the rows with a fixed seed so the split is repeatable.
x, y = df[HEART_FEATS], df[HEART_TARGET]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SIZE, random_state=RND_STATE
)

# Summary of the processed frame.
df.info()

In [None]:
# Evaluate every combination in `params` on the current train/test split,
# scoring with accuracy_score.
modeltrainer = ModelTrainer(
    MODEL,
    params,
    x_train, y_train, x_test, y_test,
    accuracy_score,
    thread_cnt=8,
)

modeltrainer.train()
#modeltrainer.save_result("out/knn_params.csv")
result = modeltrainer.result
# A bare `result.head()` in the middle of a cell is discarded by the notebook;
# display() makes the preview actually appear.
display(result.head())

# NOTE(review): plot_params reads the 'criterion', 'max_features' and
# 'n_estimators' columns, which are random-forest hyper-parameters — confirm
# they exist in `result` when MODEL is KNN.
SCORES = "score"
plot_params(result, scores=SCORES);

# Confusion matrices

In [None]:
# Re-run the search with confusion-matrix collection enabled for the five
# class labels 0-4. This reuses whichever x_train / x_test split was created
# most recently above.
modeltrainer = ModelTrainer(
    MODEL, params, x_train, y_train, x_test, y_test, accuracy_score, thread_cnt=4
)
modeltrainer.cm_setup([0, 1, 2, 3, 4])
modeltrainer.train()
# MODEL is KNN, so write to the KNN results file used elsewhere in this
# notebook. The previous target "out/rf.csv" was a leftover name that could
# overwrite random-forest results.
modeltrainer.save_result("out/knn_params.csv")

# Keep the collected confusion matrices and the result table for the cells below.
cms = modeltrainer.cms

df_results = modeltrainer.result

In [None]:
# Draw the confusion matrix at index 0 through ModelTrainer's own helper.
# NOTE(review): the index presumably selects the first evaluated configuration — confirm.
# The trailing ';' suppresses the echo of the call's return value.
modeltrainer.plot_confusion_matrix(0);

In [None]:
# Same plot through the free-standing helper, with normalize=True and a red
# colour map. cms[0][1] is the second element of the first collected entry.
# NOTE(review): presumably that element is the matrix itself — confirm.
plot_confusion_matrix(
    cms[0][1],
    [0, 1, 2, 3, 4],
    normalize=True,
    title="Confusion matrix",
    cmap=plt.cm.Reds,
)