In [None]:
# Mount Google Drive at /content/drive; the data-loading cell reads the
# workbook from a path under this mount point. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import imblearn
from collections import Counter
from matplotlib import pyplot
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Read the thesis subset workbook from the mounted Drive into `data`,
# then show the number of missing values in every column.
SUBSET_PATH = '/content/drive/MyDrive/thesis/subset.xlsx'
data = pd.read_excel(SUBSET_PATH)
data.isna().sum()

In [None]:
# Display pandas' descriptive summary statistics for the loaded frame.
data.describe()

In [None]:
# Label every row as "short" or "long" from its `length` value.
# Rows with length strictly below the threshold are "short"; everything else
# (including a missing length, which compares False) is "long".
SHORT_LENGTH_THRESHOLD = 50

# Vectorised comparison instead of a Python loop over a list copy of the column.
data['short_long'] = np.where(data['length'] < SHORT_LENGTH_THRESHOLD, "short", "long")

# Preview only the first rows rather than rendering the whole frame.
data.head()

In [None]:
# Add a positional row number (0 .. n-1) as an `index` column. It is carried
# through train_test_split later so test rows can be looked up with `iloc`.
# The length is taken from the frame itself, so the cell keeps working when
# the workbook has a different number of rows.
data['index'] = np.arange(len(data))

In [None]:
# SMOTE: inspect the class balance, split into train/test, then oversample
# the training split only.

def summarize_class_distribution(labels, title):
  """Print the count and share of every class in `labels` and plot them.

  The percentage of each class is computed against len(labels) itself, so the
  figures are correct for whichever collection is passed in (full label
  column or resampled training labels).

  Args:
    labels: sized iterable of integer class labels.
    title: title shown above the bar chart.

  Returns:
    collections.Counter mapping class label -> number of occurrences.
  """
  counts = Counter(labels)
  total = len(labels)
  for label, n in counts.items():
    print('Class=%d, n=%d (%.3f%%)' % (label, n, n / total * 100))
  class_labels = list(counts.keys())
  pyplot.bar(class_labels, list(counts.values()))
  pyplot.xticks(class_labels)
  pyplot.xlabel('class')
  pyplot.ylabel('number of samples')
  pyplot.title(title)
  pyplot.show()
  return counts


# Single feature as an (n_samples, 1) NumPy array, as scikit-learn expects.
X = data[['rhyme_score']].to_numpy()
y = data['hit']
# Positional row numbers travel through the split so the test rows can be
# recovered from `data` afterwards.
indices = data['index']

# Distribution of the full label column.
counter = summarize_class_distribution(y, 'Class distribution (full dataset)')

# Stratified 80/20 split; the test split is left untouched by SMOTE.
X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(
    X, y, indices, test_size=0.2, random_state=42, stratify=y)

# SMOTE oversample: transform the training split only.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

# Distribution of the resampled training labels (percentages are relative to
# the resampled training set, not to the full dataset).
counter = summarize_class_distribution(y_train, 'Class distribution (training set after SMOTE)')

In [None]:
# Exhaustive 5-fold cross-validated search over KNN hyperparameters.
knn_model = KNeighborsClassifier()

# Only the Minkowski metric is listed: with p=1 it is the Manhattan distance
# and with p=2 the Euclidean distance, and `p` has no effect on the
# 'euclidean'/'manhattan' metrics. Listing those metrics as well would refit
# the same candidates three times without changing the selected model.
hyperparameters = {'n_neighbors': list(range(1, 30)),
                   'p': [1, 2],
                   'weights': ['uniform', 'distance'],
                   'metric': ['minkowski']}

# NOTE(review): X_train/y_train were already oversampled with SMOTE before
# this search, so the validation folds contain synthetic rows. Consider
# resampling inside each CV fold instead - confirm whether this is intended.
gs = GridSearchCV(estimator=knn_model, param_grid=hyperparameters, cv=5)
# fit() returns the fitted search object itself, so `best_model` is `gs`.
best_model = gs.fit(X_train, y_train)

#Print The value of best Hyperparameters
for param_name in ('p', 'n_neighbors', 'weights', 'metric'):
  print('Best %s:' % param_name, best_model.best_params_[param_name])

In [None]:
# Create the KNN object from the hyperparameters selected by the grid search,
# instead of values copied by hand from an earlier run's printed output
# (those go stale as soon as the data or the grid changes).
knn = KNeighborsClassifier(**best_model.best_params_)
# Train the model on the (oversampled) training split.
knn.fit(X_train, y_train)
# Predict the train and test data sets.
y_hat = knn.predict(X_train)
y_pred = knn.predict(X_test)
# Check the performance of the model: accuracy on both splits...
print('Training set accuracy: ', metrics.accuracy_score(y_train, y_hat))
print('Test set accuracy: ', metrics.accuracy_score(y_test, y_pred))
# ...and the per-class precision / recall / F1 classification report on the test split.
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Get the subset of the dataset that formed the test set and add the
# predicted label. `index_test` holds the positional row numbers that went
# through train_test_split alongside X and y, so `iloc` returns the test rows
# in the same order as `y_pred`.
test_set_indices = index_test.tolist()
# assign() returns a new frame, which avoids writing a column into a slice of
# `data` (the source of pandas' SettingWithCopyWarning).
test_data = data.iloc[test_set_indices].assign(predicted_value=y_pred)

In [None]:
# Keep only the test rows whose predicted label differs from the true `hit` label.
is_misclassified = test_data["hit"].ne(test_data["predicted_value"])
wrong_classification = test_data[is_misclassified]

In [None]:
# Add a column with the type of error for every misclassified row.
# A row with true label 0 predicted as 1 is a false positive ("FP"); every
# other row is labelled "FN" (assumes binary 0/1 labels, so the remaining
# misclassifications are true 1 predicted as 0 - TODO confirm).
actual_label = wrong_classification['hit'].to_numpy()
predicted_label = wrong_classification['predicted_value'].to_numpy()
type_of_error = np.where((actual_label == 0) & (predicted_label == 1), "FP", "FN")

# assign() builds a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning.
wrong_classification = wrong_classification.assign(type_of_error=type_of_error)

In [None]:
# Number of misclassified rows for each (short_long, type_of_error) combination.
error_breakdown_keys = ['short_long', 'type_of_error']
wrong_classification.groupby(error_breakdown_keys).size()