In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import time
import itertools
import pathlib
from IPython.display import display
%matplotlib inline

# Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
# Scalers
from sklearn.preprocessing import StandardScaler,PowerTransformer,MinMaxScaler,QuantileTransformer,normalize
# Features
from sklearn.feature_selection import VarianceThreshold, SelectKBest

# Models
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier as MLP

# Required for importing modules from other directories.
import os,sys,inspect
# Notebook cells are not backed by a real source file: depending on the
# kernel version, inspect.getfile() on the current frame yields either a
# placeholder name or a path inside a temporary kernel directory, so a
# directory derived from it can point at the wrong place. The kernel's
# working directory is used as the anchor instead.
# NOTE(review): assumes the kernel is started in the notebook's own folder
# (the Jupyter default) - confirm for non-standard launch setups.
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)
# Put the parent directory first so the project packages imported below
# (common, config, functions) are resolved from there.
sys.path.insert(0,parentdir)
from common import misc
from common.data_parser import *
from common.model_trainer import *
from common.misc import *
from config import *
from functions import *

# NEW --> contains plot_params, plot_confusion_matrix and plot_corr_heatmap
from common.plotting import *

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names, so prefer the new name when the
# installed version offers it and fall back to the legacy one otherwise.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

In [None]:
# Parameter setup for this notebook
MODEL = RFC  # estimator class handed to ModelTrainer further down
MODEL_TYPE = "RFC"  # short tag used to build output paths and file names
# Candidate values per hyper-parameter; passed to ModelTrainer as-is.
params = {
    "n_estimators": [1, 8, 10, 12, 15, 20, 50, 100, 1000],
    "max_features": ["sqrt", "log2"],
    "criterion": ["gini", "entropy"]
}
TEST_SIZE = 0.25  # test_size given to train_test_split
RND_STATE = 42  # random_state given to train_test_split
OUT_DIR = f"out/{MODEL_TYPE}/"
# Output files are written below OUT_DIR (see get_fname). Create the directory
# up front so a fresh "Restart & Run All" does not depend on it having been
# made by hand; exist_ok keeps re-running this cell harmless.
pathlib.Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
# Output filename helper
def get_fname(force_filename:str=None, file_format:str="pdf"):
    """Build the output path for the current test run.

    With a non-empty ``force_filename`` the result is OUT_DIR followed by that
    name, used verbatim (``file_format`` is not applied to it). Otherwise the
    file name is derived from the module-level MODEL_TYPE and SCALER_NAME
    values plus ``file_format`` as the extension.

    The globals are read at call time, so SCALER_NAME only has to be defined
    before the first call that does not force a filename.
    """
    basename = (
        force_filename
        if force_filename
        else f"{MODEL_TYPE}_{SCALER_NAME}.{file_format}"
    )
    return OUT_DIR + basename

In [None]:
# Load the "train" split of the congressional voting data through the project
# helper (made available by one of the star imports above; presumably returns
# a pandas DataFrame - TODO confirm).
df_raw = parse_congressional_voting("train")
# Quick sanity check of what was loaded before any processing.
df_raw.info()

# FIRST TEST

In [None]:
# Test-specific settings live in this cell.

# Evaluation function handed to the ModelTrainer.
eval_func = accuracy_score

# Scaler applied during preprocessing, and the tag for it that ends up in the
# output file name. (Alternative: scaler = None.)
SCALER_NAME = "standard"
scaler = StandardScaler()

In [None]:
# Resolve the output file name, preprocess the raw frame and split it.
fname = get_fname()

x, y = process_voting(
    df_raw,
    answer_mapping=DEF_MAPPING,
    scaler=scaler,
    ret_xy=True,
)
# Alternative: fetch the processed frame and select the columns manually.
#   df = process_voting(df_raw, answer_mapping=DEF_MAPPING, scaler=None, ret_xy=False)
#   display(df.info())
#   x, y = df[VOTING_FEATS], df[VOTING_TARGET]

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=TEST_SIZE,
    random_state=RND_STATE,
)
# Show the target values as the cell output.
y

In [None]:
# Instantiate the ModelTrainer with the model class, the parameter values to
# try, the train/test split and the evaluation function, then train.
modeltrainer = ModelTrainer(
    MODEL,
    params,
    x_train, y_train, x_test, y_test,
    eval_func,
    thread_cnt=4
    )
modeltrainer.train()
# Setup cm config if wanted
# modeltrainer.cm_setup([0, 1])
# or
modeltrainer.cm_setup(VOTING_CLASSES) # VOTING_CLASSES = ["democrat", "republican"]

#modeltrainer.save_result("out/knn_params.csv")
result = modeltrainer.result
# A bare expression is only rendered when it is the last statement of a cell;
# this one is followed by more code, so it has to be displayed explicitly.
display(result.head())

SCORES = "accuracy"
# if fileName is set, make sure that the directory exists
plot_params(result, scores=SCORES, fileName=fname, ylims=(0.4,1.1));

In [None]:
# plot_params also accepts a reduced parameter dict, which limits what is
# plotted.
param2 = {
    "n_estimators": [1, 8, 10, 12, 15, 20, 50, 100, 1000],
    "max_features": ["sqrt"],
    "criterion": ["gini", "entropy"],
}

# The figure is returned, so it can be used further afterwards.
fig = plot_params(
    result,
    scores="inference_time",
    params=param2,
    ylims=(0, 0.15),
);


# Confusion matrices

In [None]:
# Keep a handle on the confusion matrices collected by the trainer; indexed
# as cms[0][1] in a later cell (exact layout not visible here - TODO confirm).
cms = modeltrainer.cms

# Same object that was bound to `result` earlier, under a second name.
df_results = modeltrainer.result

# Render the confusion-matrix container as the cell output.
display(modeltrainer.cms)

In [None]:
# Plot a confusion matrix through the trainer itself; the argument presumably
# selects one of the trained configurations by index - TODO confirm.
modeltrainer.plot_confusion_matrix(1)

In [None]:
# Alternative: call the standalone plotting helper on a single entry of `cms`,
# which allows choosing normalisation, title and colour map explicitly.
plot_confusion_matrix(
    cms[0][1],
    VOTING_CLASSES,
    normalize=True,
    title="Confusion matrix",
    cmap=plt.cm.Reds,
)