In [42]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay


# Source corpus; the code below reads a `preprocessed_text` column and a `label`
# column from it (presumably one document per row — TODO confirm).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
raw_docs = pd.read_csv("C:/Users/imruh/Documents/Uni/thesis/text generation/datafiles/clan_data/processed_merge.csv")
# Alternative input (generated output):
# raw_docs = pd.read_csv("C:/Users/imruh/Documents/Uni/thesis/text generation/datafiles/generated output/merge_all.csv")
results_fp = "knn-results.csv"  # file the grid-search results table is saved to

# A missing text value (an empty string round-tripped through CSV is read back
# as NaN) cannot be passed to sent_tokenize, so such rows are dropped up front.
raw_docs = raw_docs.dropna(subset=["preprocessed_text"])

# Split every text into sentences; each sentence inherits the label of its row.
labels = []
sentences = []
for text, label in zip(raw_docs["preprocessed_text"], raw_docs["label"]):
    doc_sentences = sent_tokenize(text)
    sentences.extend(doc_sentences)
    labels.extend([label] * len(doc_sentences))

# Sentence-level dataset consumed by the following cells.
doc = pd.DataFrame({"text": sentences, "label": labels})

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\imruh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\imruh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\imruh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [43]:
# 60/40 stratified split at sentence level. A fixed seed keeps the split (and
# every score derived from it) reproducible across kernel restarts.
# NOTE(review): sentences that came from the same source row can end up on both
# sides of the split — confirm this is acceptable for the evaluation.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    doc['text'], doc['label'], train_size=.6, stratify=doc['label'], random_state=42
)

# Label-encode the target variable. The encoder is fitted on the training labels
# only and then reused for validation, so both splits share one label -> integer mapping.
# NOTE(review): the original note here listed bad / good / neutral as possible
# targets, but the confusion-matrix cell unpacks four values (two classes) — confirm.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)

# Word-level TF-IDF feature vectors. Vocabulary and IDF weights are learned from
# the training sentences only, so no validation information leaks into the features.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
xtrain_tfidf_word = tfidf_vect.fit_transform(train_x)
xvalid_tfidf_word = tfidf_vect.transform(valid_x)


In [44]:
# Hyper-parameter candidates for the KNN grid search.
# Only odd values of k are tried, to prevent tied votes.
grid_params = dict(
    n_neighbors=list(range(3, 12, 2)),
    metric=['manhattan', 'minkowski', 'cosine'],
)

In [45]:
# Exhaustive search over grid_params using GridSearchCV's default CV splitting
# and scoring. Train scores are recorded as well so they can be plotted later.
knn = KNeighborsClassifier()
clf = GridSearchCV(
    estimator=knn,
    param_grid=grid_params,
    n_jobs=10,
    verbose=1,
    return_train_score=True,
)
clf.fit(xtrain_tfidf_word, train_y)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


KeyboardInterrupt: 

In [None]:
# One row per parameter combination: the parameters themselves, followed by the
# mean / std of the cross-validated test and train scores.
cv_results = clf.cv_results_
result_df = pd.DataFrame(cv_results["params"]).assign(
    mean_test_acc=cv_results["mean_test_score"],
    mean_train_acc=cv_results["mean_train_score"],
    std_train_acc=cv_results["std_train_score"],
    std_test_acc=cv_results["std_test_score"],
)

In [None]:
# Persist the grid-search table. The default integer index carries no information,
# so it is not written; otherwise it comes back as an "Unnamed: 0" column on reload.
result_df.to_csv(results_fp, index=False)

In [None]:
# Show the parameter combination that achieved the best mean cross-validated score.
clf.best_params_

In [None]:
# Mean cross-validated test accuracy against k, one line per distance metric.
# The table is reloaded from results_fp — the file the grid-search results are
# saved to — so the figure reflects this notebook's own run rather than an
# unrelated, hard-coded CSV. (matplotlib is already imported in the first cell.)
result_df = pd.read_csv(results_fp, encoding="utf-8")
cols = ['red', 'orange', 'green']
for metric, col in zip(grid_params['metric'], cols):
    # Select the rows for this metric once and reuse them for both axes.
    per_metric = result_df[result_df.metric == metric]
    plt.plot(per_metric.n_neighbors, per_metric.mean_test_acc, color=col, label=metric)

plt.xlabel("# neighbours")
plt.ylabel("CV Test accuracy")
plt.xticks(grid_params['n_neighbors'])
plt.legend()
plt.show()

In [None]:
# Mean cross-validated train accuracy against k, one coloured line per distance metric.
for metric, col in zip(grid_params['metric'], cols):
    per_metric = result_df[result_df.metric == metric]
    plt.plot(per_metric.n_neighbors, per_metric.mean_train_acc, color=col, label=metric)
    # Optional +/- 1 std band (left disabled):
    # plt.fill_between(per_metric.n_neighbors, per_metric.mean_train_acc - per_metric.std_train_acc, per_metric.mean_train_acc + per_metric.std_train_acc)

plt.xlabel("# neighbours")
plt.ylabel("CV Train accuracy")
plt.xticks(grid_params['n_neighbors'])
plt.legend()
plt.show()

In [None]:
# Build a fresh KNN with the winning grid-search parameters and fit it on the
# full training split.
best_params = clf.best_params_
knn = KNeighborsClassifier(
    metric=best_params['metric'],
    n_neighbors=best_params['n_neighbors'],
)
knn.fit(xtrain_tfidf_word, train_y)

In [None]:
# Predicted (label-encoded) classes for the validation TF-IDF features.
result = knn.predict(xvalid_tfidf_word)

In [None]:
# Raw (un-normalized) confusion matrix. Its four cell counts feed the precision /
# recall / F1 computation, and precision is only correct when computed from counts:
# row-normalized rates would distort it whenever the classes are imbalanced.
cm = confusion_matrix(valid_y, result)
assert cm.shape == (2, 2), "tn/fp/fn/tp unpacking requires exactly two classes"
tn, fp, fn, tp = cm.ravel()

# The figure is still row-normalized (each true class sums to 1). It is drawn from
# the predictions already computed, so the classifier is not run a second time.
ConfusionMatrixDisplay.from_predictions(valid_y, result, normalize="true", cmap=plt.cm.Blues)
plt.grid(False)
plt.show()

In [None]:
# Binary-classification metrics derived from the confusion-matrix cells
# (tp, fp, fn) unpacked in the previous cell.
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)

# Printed one value per line, in order: accuracy, precision, recall, F1.
for score in (accuracy_score(valid_y, result), precision, recall, f1):
    print(score)