## Show similarity between reviews using the embeddings

In [9]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import pandas as pd
import os

x_train_df = pd.read_csv('../data_reviews/x_train.csv')
x_test_df = pd.read_csv('../data_reviews/x_test.csv')
y_train_df = pd.read_csv('../data_reviews/y_train.csv')

save_dir = os.path.abspath('../data_reviews/')
tr_embeddings_ND = np.load(os.path.join(save_dir, 'x_train_BERT_embeddings.npy'))
te_embeddings_ND = np.load(os.path.join(save_dir, 'x_test_BERT_embeddings.npy'))

tr_text_list = x_train_df['text'].values.tolist()
te_text_list = x_test_df['text'].values.tolist()

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

SEED = 12345

In [11]:
x_train, x_val, y_train, y_val = train_test_split(tr_embeddings_ND, y_train_df, test_size=0.2, random_state=SEED)

# ON TRAINING SET

In [40]:
# choose some query sentences
neighbors = np.linspace(2, 500, dtype=int)
yhat_val_pred_list = list()
results = np.empty(neighbors.shape)
for i, neighbor in enumerate(neighbors): 
# use K-nearest neighbors to find the 5 reviews that most closely resemble the query review
# for test_id in range(len(x_train)):
#     query_QF = tr_embeddings_ND[test_id][np.newaxis, :]
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(x_train, y_train.values.ravel())

    yhat_val_pred = knn.predict_proba(x_val)
    yhat_val_pred_list.append(yhat_val_pred)

    score = roc_auc_score(y_val, yhat_val_pred[:,1])
    results[i] = score

best_neighbors = np.argmax(results)
print(results[best_neighbors])
print(neighbors[best_neighbors])

0.9184247538677919
123


# TESTING

In [45]:
# choose some query sentences

# use K-nearest neighbors to find the 5 reviews that most closely resemble the query review
# for test_id in range(len(x_train)):
#     query_QF = tr_embeddings_ND[test_id][np.newaxis, :]
knn_best = KNeighborsClassifier(n_neighbors=123)
knn_best.fit(tr_embeddings_ND, y_train_df.values.ravel())

yhat = knn.predict_proba(te_embeddings_ND)
yhat_pred = list()
for value in yhat[:, 1]:
    if value < 0.5:
        yhat_pred.append(0)
    else:
        yhat_pred.append(1)
print(yhat_pred)

file = open("yproba1_test.txt", "w+")
for value in yhat_pred:
    line = str(value) + "\n"
    file.write(line)
file.close()


[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 