In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import time
%matplotlib inline

# load the different classifiers
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import RandomForestClassifier as RFC

from sklearn.model_selection import train_test_split

# evaluation
from sklearn.metrics import accuracy_score

# preprocessing 
from sklearn.preprocessing import StandardScaler

# Make the parent of the notebook's working directory importable so that the
# project-local imports below (`common`, `config`) resolve.
import os,sys,inspect,pathlib
# inspect.getfile(inspect.currentframe()) does not point at the notebook inside
# an IPython kernel (cells are compiled under synthetic file names), so the
# kernel's working directory is used instead.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Guard against piling up duplicate entries when the cell is re-run.
if parentdir not in sys.path:
    sys.path.insert(0,parentdir)


from common import misc
from common.data_parser import *
from common.model_trainer import *
from common.misc import *
from config import *
from common.plotting import *

In [None]:
# Append the parent of the working directory to the import path (once) and
# print the resulting search path for inspection.
module_path = pathlib.Path.cwd().parent
if sys.path.count(str(module_path)) == 0:
    sys.path.append(str(module_path))
print(sys.path)

from common.dataset_grabber import get_data_path

In [None]:
# Resolve the three files of the Congressional_Voting dataset (training set,
# test set, example solution) and show each resolved path.
datapath_train = get_data_path("Congressional_Voting", "CongressionalVotingID.shuf.lrn.csv")
display(datapath_train)
datapath_test = get_data_path("Congressional_Voting", "CongressionalVotingID.shuf.tes.csv")
display(datapath_test)
datapath_examp = get_data_path("Congressional_Voting", "CongressionalVotingID.shuf.sol.ex.csv")
# Show the example-solution path (previously the test path was displayed twice).
display(datapath_examp)

# Data exploration
The data consists of a class label for the classification, and the feature values are mostly "y", "n" or "unknown".

In [None]:
# Read both splits without their "ID" column; the label column of the
# training frame is renamed from "class" to "Class".
df_train = (
    pd.read_csv(datapath_train)
    .drop(columns="ID")
    .rename(columns={"class": "Class"})
)
df_test = pd.read_csv(datapath_test).drop(columns="ID")
df_train

In [None]:
# Encode the vote strings numerically in both frames:
# "n" -> -1, "y" -> 1, "unknown" -> 0.
df_train, df_test = (
    frame.replace({"n": -1, "y": 1, "unknown": 0})
    for frame in (df_train, df_test)
)

# Training

In [None]:
# Features: every column from "handicapped-infants" to the last one.
X_lrn = df_train.loc[:, "handicapped-infants":]
# Target: the "Class" column, kept as a one-column DataFrame.
Y_lrn = df_train[["Class"]]
# Disabled column reduction of the test frame (kept for reference):
#df_test = df_test.loc[:,red_list[1]:red_list[-1]]

In [None]:
# Hold out 40 % of the labelled data for validation. The split is seeded so
# that restarting the kernel reproduces the same partition.
RANDOM_STATE = 42
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_lrn, Y_lrn, test_size=0.4, random_state=RANDOM_STATE
)

In [None]:
# Show the training features produced by the train/validation split.
X_train

# MLP classifier

In [None]:
# Model class, hyper-parameter grid and output locations for the MLP runs.
MODEL, MODEL_TYPE = MLP, "MLP"
params = dict(
    alpha=[1e-3, 1e-2, 1e-1, 1, 1e1],
    hidden_layer_sizes=[(20, 20), (50, 50), (100, 100)],
    solver=["adam", "lbfgs"],
    activation=["tanh", "relu"],
)
SCORES = "accuracy"
OUT_DIR = f"out/{MODEL_TYPE}/"

# Target file for the accuracy plot.
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"

In [None]:
# Plot defaults: logarithmic x axis and default y limits.
LOGX = True
YLIMS = (0.5, 1)


def plot_params(results, scores="score", fileName=None, params=params, ylims=YLIMS):
    """Plot one score column against the first hyper-parameter of a grid.

    One line is drawn for every combination of the remaining hyper-parameters.

    Parameters
    ----------
    results : pandas.DataFrame
        Result table; it is filtered with ``DataFrame.query`` and must contain
        one column per key of ``params`` plus the ``scores`` column.
        NOTE(review): the filters compare against double-quoted values, so the
        non-leading parameter columns are presumably stored as strings.
    scores : str
        Column plotted on the y axis. If it is not a string nothing is drawn
        and ``None`` is returned.
    fileName : str, optional
        If given, the figure is saved to this path; a missing parent directory
        is created first.
    params : dict
        Hyper-parameter grid. The first key is the x axis, every other key
        contributes to the per-line filter. The default is the ``params``
        object that exists when this cell is executed.
    ylims : tuple
        ``(bottom, top)`` limits of the y axis.

    Returns
    -------
    matplotlib.figure.Figure or None
        The figure that was drawn.
    """
    # Local imports keep the function usable even if the notebook's wildcard
    # imports do not happen to export these modules.
    import itertools
    import os

    param_keys = list(params)
    first_key = param_keys[0]
    rest = param_keys[1:]

    # The 'seaborn' style sheet is named 'seaborn-v0_8' in newer Matplotlib
    # releases; use whichever name this installation provides.
    style = "seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8"
    plt.style.use(style)
    if not isinstance(scores, str):
        return None

    fig, ax = plt.subplots(figsize=(8,6))
    for vals in itertools.product(*(params[key] for key in rest)):
        label = " / ".join(str(x) for x in vals)
        filters = " & ".join(
            str(key) + ' == "' + str(val) + '"' for key, val in zip(rest, vals)
        )
        # With a single-parameter grid there is nothing to filter on, and
        # DataFrame.query rejects an empty expression.
        subset = results.query(filters) if filters else results
        subset.plot(
            x=first_key, y=scores, label=label,
            ax=ax, marker="o", logx=LOGX)
    ax.legend()
    ax.set_title(scores, fontsize=18)
    ax.set_ylim(*ylims)

    if fileName:
        out_dir = os.path.dirname(fileName)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        fig.savefig(fileName)
    plt.show()
    # Return the figure that was drawn; plt.gcf() after plt.show() can hand
    # back a new, empty figure.
    return fig

In [None]:
# Run the ModelTrainer over the MLP grid in `params`, scoring with
# accuracy_score on the validation split.
# NOTE(review): thread_cnt=8 presumably sets the number of workers — confirm
# in common.model_trainer.
modeltrainer = ModelTrainer(
    MODEL, params,
    X_train, Y_train,
    X_valid, Y_valid,
    accuracy_score,
    thread_cnt=8,
)
modeltrainer.cm_setup(["democrat", "republican"])
modeltrainer.train()

# Persist the result table, then keep the confusion matrices and results.
modeltrainer.save_result("out/mlp_params.csv")
cms = modeltrainer.cms
result_MLP = modeltrainer.result

(bestscore, bestidx) = modeltrainer.best_score(ret_index=True)

In [None]:
# Show the result table collected from the MLP trainer.
result_MLP

In [None]:
# Plot the "accuracy" column of the MLP results; `fname` is whatever path was
# assigned last (the accuracy path when the cells are run in order).
SCORES = "accuracy"
plot_mlp(
    result_MLP,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0.8, 1),
)

In [None]:
# Plot the "train_time" column of the MLP results; the target path is passed
# as fileName.
SCORES = "train_time"
OUT_DIR = f"out/{MODEL_TYPE}/"
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"
plot_mlp(
    result_MLP,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0, 10),
)

In [None]:
# Plot the "f1" column of the MLP results; the target path is passed as
# fileName.
SCORES = "f1"
OUT_DIR = f"out/{MODEL_TYPE}/"
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"
plot_mlp(
    result_MLP,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0.94, 1),
);

In [None]:
# Plot the "recall" column of the MLP results; the target path is passed as
# fileName.
SCORES = "recall"
OUT_DIR = f"out/{MODEL_TYPE}/"
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"
plot_mlp(
    result_MLP,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0.94, 1),
);

In [None]:
# Plot the "precision" column of the MLP results; the target path is passed
# as fileName.
SCORES = "precision"
OUT_DIR = f"out/{MODEL_TYPE}/"
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"
plot_mlp(
    result_MLP,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0.94, 1),
);

In [None]:
# Draw a normalised confusion matrix and save it next to the other plots.
# NOTE(review): cms[0] is the first stored entry, not necessarily the
# best-scoring configuration — confirm this is the intended matrix.
plot_confusion_matrix(
    cms[0][1],
    ["democrat", "republican"],
    normalize=True,
    title="Confusion matrix",
    cmap=plt.cm.Reds,
)
plt.savefig(f"{OUT_DIR}CM_{MODEL_TYPE}.pdf")

In [None]:
# Rows of the MLP results whose "score" equals the column maximum.
result_MLP.loc[result_MLP["score"] == result_MLP["score"].max()]

In [None]:
# Show the values returned by modeltrainer.best_score(ret_index=True) in the
# MLP training cell.
bestscore, bestidx

# KNN classifier

In [None]:
# Model class, hyper-parameter grid and output locations for the KNN runs.
MODEL, MODEL_TYPE = KNN, "KNN"
params = dict(
    n_neighbors=[*range(1, 50)],
    weights=["uniform", "distance"],
    # Reduced alternative: algorithm=["auto"]
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
)

SCORES = "accuracy"
OUT_DIR = f"out/{MODEL_TYPE}/"

# Target file for the accuracy plot.
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"

In [None]:
# Run the ModelTrainer over the KNN grid in `params`, scoring with
# accuracy_score on the validation split.
modeltrainer = ModelTrainer(
    MODEL, params,
    X_train, Y_train,
    X_valid, Y_valid,
    accuracy_score,
    thread_cnt=8,
)
modeltrainer.cm_setup(["democrat", "republican"])
modeltrainer.train()

# Persist the result table, then keep the confusion matrices and results.
modeltrainer.save_result("out/knn_params.csv")
cms = modeltrainer.cms
result_KNN = modeltrainer.result

In [None]:
# Show the result table collected from the KNN trainer.
result_KNN

In [None]:
# Plot the "accuracy" column of the KNN results; `fname` is whatever path was
# assigned last (the accuracy path when the cells are run in order).
SCORES = "accuracy"
plot_params(
    result_KNN,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0.8, 1),
);

In [None]:
# Plot the "train_time" column of the KNN results and save it to `fname`.
SCORES = "train_time"
OUT_DIR = f"out/{MODEL_TYPE}/"
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"
plot_params(
    result_KNN,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0, 0.02),
);

In [None]:
# Plot the "recall" column of the KNN results and save it to `fname`.
SCORES = "recall"
OUT_DIR = f"out/{MODEL_TYPE}/"
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"
plot_params(
    result_KNN,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0.8, 1),
);

In [None]:
# Plot the "precision" column of the KNN results and save it to `fname`.
SCORES = "precision"
OUT_DIR = f"out/{MODEL_TYPE}/"
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"
plot_params(
    result_KNN,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0.8, 1),
);

In [None]:
# Plot the "f1" column of the KNN results and save it to `fname`.
SCORES = "f1"
OUT_DIR = f"out/{MODEL_TYPE}/"
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"
plot_params(
    result_KNN,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0.8, 1),
);

In [None]:
# Draw a normalised confusion matrix and save it next to the other plots.
# NOTE(review): cms[0] is the first stored entry, not necessarily the
# best-scoring configuration — confirm this is the intended matrix.
plot_confusion_matrix(
    cms[0][1],
    ["democrat", "republican"],
    normalize=True,
    title="Confusion matrix",
    cmap=plt.cm.Reds,
)
plt.savefig(f"{OUT_DIR}CM_{MODEL_TYPE}.pdf")

In [None]:
# Rows of the KNN results whose "score" equals the column maximum.
result_KNN.loc[result_KNN["score"] == result_KNN["score"].max()]

# Random forest classifier

In [None]:
# Model class, hyper-parameter grid and output locations for the random
# forest runs.
MODEL, MODEL_TYPE = RFC, "RFC"
params = dict(
    n_estimators=[1, 8, 10, 12, 15, 20, 50, 100, 1000],
    max_features=["sqrt", "log2"],
    criterion=["gini", "entropy"],
)
SCORES = "accuracy"
OUT_DIR = f"out/{MODEL_TYPE}/"

# Target file for the accuracy plot.
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"

In [None]:
# Run the ModelTrainer over the random forest grid in `params`, scoring with
# accuracy_score on the validation split.
modeltrainer = ModelTrainer(
    MODEL, params,
    X_train, Y_train,
    X_valid, Y_valid,
    accuracy_score,
    thread_cnt=8,
)
modeltrainer.cm_setup(["democrat", "republican"])
modeltrainer.train()

# Persist the result table, then keep the confusion matrices and results.
modeltrainer.save_result("out/rf_params.csv")
cms = modeltrainer.cms
result_RFC = modeltrainer.result

In [None]:
# Reload the random forest results that were saved to disk by the trainer cell.
# NOTE(review): df_results_RFC is not referenced by any other visible cell;
# the plots below use result_RFC instead.
df_results_RFC = pd.read_csv("out/rf_params.csv")

In [None]:
# Plot the "accuracy" column of the RFC results; `fname` is whatever path was
# assigned last (the accuracy path when the cells are run in order).
SCORES = "accuracy"
plot_params(
    result_RFC,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0.8, 1),
);

In [None]:
# Plot the "train_time" column of the RFC results and save it to `fname`.
SCORES = "train_time"
OUT_DIR = f"out/{MODEL_TYPE}/"
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"
plot_params(
    result_RFC,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0, 2.5),
);

In [None]:
# Plot the "f1" column of the RFC results and save it to `fname`.
SCORES = "f1"
OUT_DIR = f"out/{MODEL_TYPE}/"
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"
plot_params(
    result_RFC,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0.8, 1),
);

In [None]:
# Plot the "recall" column of the RFC results and save it to `fname`.
SCORES = "recall"
OUT_DIR = f"out/{MODEL_TYPE}/"
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"
plot_params(
    result_RFC,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0, 1),
);

In [None]:
# Plot the "precision" column of the RFC results and save it to `fname`.
SCORES = "precision"
OUT_DIR = f"out/{MODEL_TYPE}/"
fname = f"{OUT_DIR}{MODEL_TYPE}_{SCORES}.pdf"
plot_params(
    result_RFC,
    scores=SCORES,
    fileName=fname,
    params=params,
    ylims=(0, 1),
);

In [None]:
# Draw a normalised confusion matrix and save it next to the other plots.
# NOTE(review): cms[0] is the first stored entry, not necessarily the
# best-scoring configuration — confirm this is the intended matrix.
plot_confusion_matrix(
    cms[0][1],
    ["democrat", "republican"],
    normalize=True,
    title="Confusion matrix",
    cmap=plt.cm.Reds,
)
plt.savefig(f"{OUT_DIR}CM_{MODEL_TYPE}.pdf")

In [None]:
# Rows of the RFC results whose "score" equals the column maximum.
result_RFC.loc[result_RFC["score"] == result_RFC["score"].max()]

# File preparation for the Kaggle competition

In [None]:
# Re-read the raw test set without its "ID" column and preview it.
df_test = pd.read_csv(datapath_test).drop(columns="ID")
df_test.head()

In [None]:
# Encode the vote strings of the test set numerically:
# "n" -> -1, "y" -> 1, "unknown" -> 0.
df_test = df_test.replace({"n": -1, "y": 1, "unknown": 0})
df_test.head()

In [None]:
# Standardise the encoded test features with StandardScaler.
# NOTE(review): the scaler is fitted on the test set only and X_lrn is never
# scaled in the visible cells — any model fed df_test_SC should be trained on
# identically scaled features.
scaler = StandardScaler()
df_test_SC = scaler.fit_transform(df_test)

In [None]:
# Fit the final random forest on the complete labelled set. The target is
# passed as a 1-D Series, which is the shape scikit-learn expects.
clf = RFC(n_estimators = 2, max_features = 'sqrt', criterion =  'entropy')
clf.fit(X_lrn, Y_lrn["Class"])
# Predict on df_test (same -1/0/1 encoding as X_lrn) rather than on the
# standardised df_test_SC: the forest is fitted on unscaled features, so
# scaled inputs would be compared against split thresholds learned on a
# different scale.
Y_pred = clf.predict(df_test)

# Upload the file for Kaggle

In [None]:
# Take the "ID" column of the example solution file; these IDs label the
# submission rows.
df_examp = pd.read_csv(datapath_examp)
ID_df = df_examp.loc[:, "ID"]
ID_df

In [None]:
# Pair every ID with the prediction at the same position.
solution_data = [(row_id, label) for row_id, label in zip(ID_df, Y_pred)]

In [None]:
#Y_pred = Y_pred.drop("ID",axis = 1)
#solution_data = list(zip(list(range(0,len(Y_pred))), Y_pred))

In [None]:
# Build the two-column submission table from the (ID, prediction) pairs.
solution_table = pd.DataFrame(
    data=solution_data,
    columns=["ID", "Class"],
)
solution_table

In [None]:
# Write the submission file. The target directory is created first so the
# export does not fail when "results/" is missing.
os.makedirs("results", exist_ok=True)
solution_table.to_csv("results/RFC.csv", index=False)