In [None]:
# relevant imports

# base
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# data prep
from sklearn import preprocessing
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from scipy import stats

# nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

# import metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, average_precision_score
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, log_loss, precision_recall_curve
# !pip install seaborn
import seaborn as sn
import pandas as pd

# import matplotlib
import matplotlib.pyplot as plt

# uncomment !pip install commands if you get an error

In [None]:
# import feature data - make sure csv files are in the same directory
# x_train = pd.read_csv("./x_train.csv")
# x_val = pd.read_csv("./x_val.csv")

In [None]:
# load the combined train+validation feature table (path is relative to the working directory)
x_train_val = pd.read_csv(filepath_or_buffer="./x_train_val.csv")

In [None]:
# Target ("state") labels: the csv has no header row, so the column name is
# supplied through `names` and header=None keeps the first line as data.
y_name = ["state"]
# variant for separately stored train / validation label files:
# y_train = pd.read_csv("./y_train.csv", names=y_name, header=None)
# y_val = pd.read_csv("./y_val.csv", names=y_name, header=None)
y_train_val = pd.read_csv(
    "./y_train_val.csv",
    header=None,
    names=y_name,
)

In [None]:
# store the target as a pandas categorical; the ROC / precision-recall cells
# further down read its integer codes through `.cat.codes`
# y_train['state'] = y_train['state'].astype('category')
# y_val['state'] = y_val['state'].astype('category')
y_train_val['state'] = y_train_val['state'].astype(pd.CategoricalDtype())

In [None]:
# put features and target side by side (column-wise) in one frame
# train = pd.concat([x_train,y_train],axis=1)
# val = pd.concat([x_val,y_val],axis=1)
train_val = pd.concat(objs=[x_train_val, y_train_val], axis="columns")

In [None]:
# sanity check: prints True when at least one cell of the combined frame is missing
# print(train.isnull().values.any())
# print(val.isnull().values.any())
print(np.any(train_val.isna().values))

In [None]:
# Draw three random subsets of increasing size from the combined data.
# Without a seed, every kernel restart produced different subsets, so none of
# the metrics or plots below could be reproduced. Each subset gets its own
# fixed seed so the draws stay distinct from one another.
SAMPLE_SEED = 1
df1 = train_val.sample(n=15000, random_state=SAMPLE_SEED)
df2 = train_val.sample(n=25000, random_state=SAMPLE_SEED + 1)
df3 = train_val.sample(n=35000, random_state=SAMPLE_SEED + 2)

In [None]:
# df1 split: 'state' is the target, every other column is a feature
x = df1.drop(columns='state')
y = df1['state']
# hold out 25% of the rows for validation; the fixed seed keeps the split repeatable
x_train_1, x_val_1, y_train_1, y_val_1 = train_test_split(
    x, y, random_state=1, test_size=0.25
)

In [None]:
# df2 split: 'state' is the target, every other column is a feature
x = df2.drop(columns='state')
y = df2['state']
# hold out 25% of the rows for validation; the fixed seed keeps the split repeatable
x_train_2, x_val_2, y_train_2, y_val_2 = train_test_split(
    x, y, random_state=1, test_size=0.25
)

In [None]:
# df3 split: 'state' is the target, every other column is a feature
x = df3.drop(columns='state')
y = df3['state']
# hold out 25% of the rows for validation; the fixed seed keeps the split repeatable
x_train_3, x_val_3, y_train_3, y_val_3 = train_test_split(
    x, y, random_state=1, test_size=0.25
)

In [None]:
# flatten 1-column dataframe into series
# y_train = y_train['state']
# y_val = y_val['state']

In [None]:
#Start of KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours model: 30 neighbours, uniform (unweighted) voting,
# neighbour-search algorithm left to scikit-learn ('auto')
classifier = KNeighborsClassifier(
    algorithm='auto',
    weights='uniform',
    n_neighbors=30,
)

In [None]:
# train the kNN model on the training split taken from df3
classifier.fit(X=x_train_3, y=y_train_3)

In [None]:
# hard labels for the training split (needed for the training-set metrics)
train_predictions = classifier.predict(x_train_3)
# hard labels and per-class probabilities for the validation split
predictions = classifier.predict(x_val_3)
probabilities = classifier.predict_proba(x_val_3)

In [None]:
# plain accuracy on the training and validation splits
print("Training Accuracy: ", accuracy_score(y_true=y_train_3, y_pred=train_predictions))
print("Validation Accuracy: ", accuracy_score(y_true=y_val_3, y_pred=predictions))
# balanced accuracy on the same two splits
print("Training Balanced Accuracy: ", balanced_accuracy_score(y_true=y_train_3, y_pred=train_predictions))
print("Validation Balanced Accuracy: ", balanced_accuracy_score(y_true=y_val_3, y_pred=predictions))

In [None]:
# confusion matrix basic stats (predictions come from x_val_3, so compare against y_val_3)
# cm = confusion_matrix(y_val_3, predictions)
# print("Confusion matrix")
# print(cm)
# better confusion matrix depiction
# pd.crosstab(y_val_3, predictions, rownames=['Actual Status'], colnames=['Predicted Status'])

In [None]:
# Confusion matrix of the validation predictions, drawn as a seaborn heatmap.
# The matrix is computed here so the cell runs on a fresh kernel; previously
# `cm` was only defined in a commented-out line and this cell raised NameError.
# Use the fitted classifier's label order and pass it to confusion_matrix
# explicitly, so the row/column labels are guaranteed to match the matrix
# (Series.unique() returns labels in order of appearance, not sorted order).
class_names = classifier.classes_
cm = confusion_matrix(y_val_3, predictions, labels=class_names)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
fig = plt.figure()
# fmt="d" prints the cell counts as integers
heatmap = sn.heatmap(df_cm, annot=True, fmt="d")
plt.xlabel('Predicted status')
plt.ylabel('Actual status')
plt.title('Confusion Matrix - kNN with k=30 and 35000 instances')
# show the plot
plt.show()

In [None]:
# classification report with relevant statistics (predictions come from x_val_3)
# cr = classification_report(y_val_3, predictions)
# print(cr)

In [None]:
# probabilities for the positive class (second column of the predict_proba output)
positive_probabilities = probabilities[:, 1]
# log loss is computed from the full per-class probability matrix
print("Log loss: ", log_loss(y_true=y_val_3, y_pred=probabilities))
# a no-skill model scores 0.5 on ROC AUC, so a value above 0.5 beats that baseline
print("ROC_AUC score: ", roc_auc_score(y_true=y_val_3, y_score=positive_probabilities))

In [None]:
# ROC curve - per the original note, the dataset is balanced, which makes this
# the preferred of the two curves
# https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/
fpr, tpr, thresholds = roc_curve(
    y_true=y_val_3.cat.codes,  # integer codes of the categorical target
    y_score=positive_probabilities,
    pos_label=1,
)
# dashed diagonal: no-skill reference
plt.plot([0, 1], [0, 1], linestyle='--')
# the model's ROC curve
plt.plot(fpr, tpr, marker='.')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve - kNN with k=30 and 35000 instances")
# show the plot
plt.show()

In [None]:
# plot precision-recall curve
# integer codes of the categorical target; code 1 is treated as the positive class
y_val_codes = y_val_3.cat.codes
precision, recall, thresholds = precision_recall_curve(y_val_codes, positive_probabilities)
# No-skill baseline: a classifier with no skill has precision equal to the
# share of positive instances. Compute it from the validation labels instead
# of hard-coding 0.5, which is only right for a perfectly balanced split.
no_skill = (y_val_codes == 1).mean()
plt.plot([0, 1], [no_skill, no_skill], linestyle='--')
# plot the precision-recall curve for the model
plt.plot(recall, precision, marker='.')
plt.title("Precision-Recall Curve - kNN with k=30 and 35000 instances")
plt.xlabel("Recall")
plt.ylabel("Precision")
# show the plot
plt.show()