# Model Evaluation

In [None]:
# Load libraries
import sys
import os
import warnings
import numpy as np
import matplotlib.pylab as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, \
accuracy_score, ConfusionMatrixDisplay, f1_score
from sklearn.preprocessing import binarize

In [None]:
# Move the working directory one level up - RUN THIS CELL ONLY ONCE per
# kernel session: every additional run climbs one more directory.
os.chdir(os.pardir)

In [None]:
import resources.data_preprocessing as dp
import resources.split_normalization as sn
import resources.dummies_colinearity as dc
import models.random_forest as rf
import models.endline_analysis as ea
import plots.plots as plts

In [None]:
# Hide warnings: install an "ignore" filter that matches every warning
warnings.filterwarnings(action="ignore")

### Data loading and preprocessing

In [None]:
# Load the train / test / dev feature sets, their targets, and the
# attribute list (X_attr is filtered by column name further down).
(
    X_train, X_test, X_dev,
    y_train, y_test, y_dev,
    X_attr,
) = dp.load_data()

In [None]:
# Political variables to remove for model testing
political_attr = ["PLR", "PDC", "PS", "UDC", "PEV_PCS", "PVL", "PBD", "PST_Sol", "PES", "small_right_parties"]

# Drop the political columns by NAME.
# The previous loop dropped `columns[political_attr.index(col)]`, i.e. the
# column sitting at position 0, 1, 2, ... of the already-shrunk frame, so the
# columns removed had nothing to do with the names listed above.
# `DataFrame.drop` returns a new frame: X_train / X_test / X_dev are untouched.
# NOTE(review): assumes the frames use these names as column labels; a
# KeyError here means they do not and the labels need checking.
X_train_apol = X_train.drop(columns=political_attr)
X_test_apol = X_test.drop(columns=political_attr)
X_dev_apol = X_dev.drop(columns=political_attr)

# Attribute list matching the reduced frames
X_attr_apol = [c for c in X_attr if c not in political_attr]

### Modelling 

In [None]:
# Class names passed as `display_labels` to the confusion-matrix plots below
# NOTE(review): assumes the target classes sort as (no, yes) - confirm against the label encoding
labels = ["no", "yes"]

#### Logistic Regression

In [None]:
# Hyper-parameter search for logistic regression; keep the winning settings
lg_dict, accuracies_lg = ea.logistic_reg_hp(X_train, y_train)
best_lg_params = lg_dict["max_params"]
C, solver, penalty = (best_lg_params[key] for key in ("C", "solver", "penalty"))

In [None]:
# Show the accuracies returned by the logistic-regression search.
# (`accuracies` is only assigned later, by the KNN search, so it was either
# undefined here or showed the wrong model's results.)
accuracies_lg

In [None]:
# Fit logistic regression with the selected hyper-parameters and predict on the test set

# TODO: move to a function
logreg = LogisticRegression(
    C=C,
    penalty=penalty,
    solver=solver,
    random_state=0,
).fit(X_train, y_train)  # fit() returns the estimator itself
y_pred_lg = logreg.predict(X_test)

In [None]:
# Accuracy, F-score and per-class report for logistic regression
for caption, scorer in (
    ("Accuracy: ", accuracy_score),
    ("F score: ", f1_score),
    ("Classification report: \n", classification_report),
):
    print(caption, scorer(y_test, y_pred_lg))

In [None]:
# Confusion matrix for logistic regression
cm = confusion_matrix(y_test, y_pred_lg)
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
cm_plot.plot()

In [None]:
# Plot % Yes & Error for the logistic-regression predictions
# NOTE(review): X_test_all is not assigned in any cell shown in this notebook
# (the loader cell only returns X_test) - confirm where it should come from.
plts.plot_yes_perc_error(X_test_all, y_test, y_pred_lg) 

#### KNN

In [None]:
# Search for the best k using the train and dev splits
knn_search = ea.knn_analysis_hp(X_train, X_dev, y_train, y_dev)
best_k_dict, accuracies = knn_search
best_k = best_k_dict["max_k"]

In [None]:
# Plot k vs accuracy - Move Pavan's code to function from knn_analysis.py

In [None]:
# Fit KNN with the selected k and predict on the test set

# TODO: move to a function
classifier = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)  # fit() returns the estimator
y_pred_knn = classifier.predict(X_test)

In [None]:
# Accuracy, F-score and per-class report for KNN
for caption, scorer in (
    ("Accuracy: ", accuracy_score),
    ("F score: ", f1_score),
    ("Classification report: \n", classification_report),
):
    print(caption, scorer(y_test, y_pred_knn))

In [None]:
# Confusion matrix for KNN
cm = confusion_matrix(y_test, y_pred_knn)
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
cm_plot.plot()

In [None]:
# Plot % Yes & Error for the KNN predictions
# NOTE(review): X_test_all is not assigned in any cell shown in this notebook
# (the loader cell only returns X_test) - confirm where it should come from.
plts.plot_yes_perc_error(X_test_all, y_test, y_pred_knn) 

#### Decision Tree

In [None]:
# Find best params for decision tree (the search receives the train and dev splits)
dt_best_params_dict, accuracies_dt = ea.decision_tree_hp(X_train, X_dev, y_train, y_dev)
# Bare expression: the notebook shows the returned accuracies as the cell output
accuracies_dt

In [None]:
# Make prediction with best params

# TODO: move to a function in the models package
def xx(dt_best_params_dict):
    """Fit a decision tree with the selected hyper-parameters and predict on the test set.

    Reads the notebook-level variables X_train, y_train and X_test.

    Parameters
    ----------
    dt_best_params_dict : dict
        Must provide the keys "best_criterion" and "best_depth_tree".

    Returns
    -------
    tuple
        (fitted DecisionTreeClassifier, its predictions for X_test).
    """
    clf_model = DecisionTreeClassifier(
        criterion=dt_best_params_dict["best_criterion"],
        max_depth=dt_best_params_dict["best_depth_tree"],
        random_state=42,  # fixed seed so the fitted tree is reproducible
    )
    clf_model.fit(X_train, y_train)
    return clf_model, clf_model.predict(X_test)


# The helper used to be defined but never called, which left `y_pred_dt`
# undefined for the decision-tree evaluation cells.
clf_dt, y_pred_dt = xx(dt_best_params_dict)

In [None]:
# Plot best decision tree - Move Pavan's code in decision_tree.py to function

In [None]:
# Accuracy, F-score and per-class report for the decision tree
for caption, scorer in (
    ("Accuracy: ", accuracy_score),
    ("F score: ", f1_score),
    ("Classification report: \n", classification_report),
):
    print(caption, scorer(y_test, y_pred_dt))

In [None]:
# Confusion matrix for the decision tree
cm = confusion_matrix(y_test, y_pred_dt)
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
cm_plot.plot()

In [None]:
# Plot % Yes & Error for the decision-tree predictions
# NOTE(review): X_test_all is not assigned in any cell shown in this notebook
# (the loader cell only returns X_test) - confirm where it should come from.
plts.plot_yes_perc_error(X_test_all, y_test, y_pred_dt) 

#### Random Forest

In [None]:
# Hyper-parameter search for the Random Forest model on the train and dev splits
rf_search_kwargs = dict(
    X_train=X_train,
    X_dev=X_dev,
    y_train=y_train,
    y_dev=y_dev,
    random_state=True,
)
best_params_rf, best_acc_rf, accuracies_rf = rf.random_forest(**rf_search_kwargs)

In [None]:
# Bare expression: show the accuracies returned by the random-forest search as the cell output
accuracies_rf

In [None]:
# Plot random forests' accuracies based on parameters
# (input is the accuracies object returned by the search cell above)
rf.plot_random_forest(accuracies_rf)

In [None]:
# Predict on the test set with the best random-forest parameters
y_test_pred_rf = rf.predict_random_forest(
    best_params_rf,
    X_train,
    y_train,
    X_test,
    random_state=True,
)

In [None]:
# Evaluation metrics: per-class report for the random-forest test predictions
rf_report = classification_report(y_test, y_test_pred_rf)
print("Test Data - Classification Report: \n", rf_report)

In [None]:
# Confusion matrix for the random forest
cm = confusion_matrix(y_test, y_test_pred_rf)
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
cm_plot.plot()

In [None]:
# REVIEW PLOT
# Plot % Yes & Error for the random-forest predictions.
# The predictions live in `y_test_pred_rf` (assigned by the random-forest
# prediction cell); `y_pred_rf` is never assigned in this notebook.
# NOTE(review): X_test_all is not assigned in any cell shown in this notebook
# (the loader cell only returns X_test) - confirm where it should come from.
ys = plts.plot_yes_perc_error(X_test_all, y_test, y_test_pred_rf)
print(ys)

#### Neural Networks 

In [None]:
# Find best params for neural net

# MISSING IMPLEMENTATION
# NOTE(review): `nn` is neither defined nor imported in the cells shown, so this call cannot run yet.
# NOTE(review): this rebinds `accuracies`, the same name that receives the KNN search results.
nn_best_params, accuracies = nn(X_train, X_dev, y_train, y_dev)

In [None]:
# Plots??

In [None]:
# Make prediction with best model 

# MISSING IMPLEMENTATION 
# Placeholder: an empty prediction list until the neural-net model exists;
# the evaluation cells below consume it as-is.
y_pred_nn = []

In [None]:
# Accuracy, F-score and per-class report for the neural net
for caption, scorer in (
    ("Accuracy: ", accuracy_score),
    ("F score: ", f1_score),
    ("Classification report: \n", classification_report),
):
    print(caption, scorer(y_test, y_pred_nn))

In [None]:
# Confusion matrix for the neural net
cm = confusion_matrix(y_test, y_pred_nn)
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
cm_plot.plot()

In [None]:
# Plot % Yes & Error for the neural-net predictions
# NOTE(review): X_test_all is not assigned in any cell shown in this notebook
# (the loader cell only returns X_test) - confirm where it should come from.
plts.plot_yes_perc_error(X_test_all, y_test, y_pred_nn) 

### Conclusions

In [None]:
# Table comparing best models and accuracies?