## Classify reviews by embedding similarity (K-nearest neighbors on BERT embeddings)

In [1]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import pandas as pd
import os

# Directory holding the cached BERT embedding matrices (resolved to an absolute path).
save_dir = os.path.abspath('../data_reviews/')

# Review tables: inputs for both splits, labels for the training split only.
x_train_df = pd.read_csv('../data_reviews/x_train.csv')
x_test_df = pd.read_csv('../data_reviews/x_test.csv')
y_train_df = pd.read_csv('../data_reviews/y_train.csv')

# Precomputed embedding arrays, one per split.
# NOTE(review): presumably row i of each array corresponds to row i of the
# matching x_*_df table -- confirm against the notebook that produced them.
tr_embeddings_ND, te_embeddings_ND = (
    np.load(os.path.join(save_dir, fname))
    for fname in ('x_train_BERT_embeddings.npy', 'x_test_BERT_embeddings.npy')
)

# Optional row normalisation of the embeddings (currently disabled):
# from sklearn.preprocessing import normalize
# tr_embeddings_ND = normalize(tr_embeddings_ND)
# te_embeddings_ND = normalize(te_embeddings_ND)

# Raw review strings as plain Python lists, one list per split.
tr_text_list, te_text_list = (
    frame['text'].values.tolist() for frame in (x_train_df, x_test_df)
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score

# Fixed random seed; passed as `random_state` to the StratifiedKFold splitter
# so the shuffled fold assignment is reproducible between runs.
SEED = 12345

In [3]:
# x_train, x_val, y_train, y_val = train_test_split(tr_embeddings_ND, y_train_df, test_size=0.2, random_state=SEED)

# Cross-validation on the training set: choosing the number of neighbors

In [4]:
# Stratified 5-fold splitter; shuffling is seeded so every candidate sees the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Candidate neighbor counts: np.linspace's default number of evenly spaced
# points between 2 and 500, truncated to integers by dtype=int.
neighbors = np.linspace(2, 500, dtype=int)


def _cv_auroc(n_neighbors):
    """Return the cross-validated AUROC of a distance-weighted KNN.

    Out-of-fold class probabilities are collected with `cross_val_predict`
    over the folds in `cv`, and the column at index 1 (read as the
    probability of class 1) is scored against the training labels.
    """
    model = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
    oof_proba = cross_val_predict(
        model,
        tr_embeddings_ND,
        y_train_df.values.ravel(),
        cv=cv,
        method='predict_proba',
    )
    return roc_auc_score(y_train_df, oof_proba[:, 1])


# One AUROC per candidate, in the same order as `neighbors`.
results = np.array([_cv_auroc(k) for k in neighbors], dtype=float)

# `best_neighbors` is the *index* of the winning candidate, not the count itself.
best_neighbors = results.argmax()
print(f"Best AUROC score: {results[best_neighbors]}")
print(f"Optimal number of neighbors: {neighbors[best_neighbors]}")


Best AUROC score: 0.9299777777777778
Optimal number of neighbors: 83


# Predictions on the test set

In [5]:
# Refit a distance-weighted KNN on the full training set, using the neighbor
# count that achieved the best cross-validated AUROC (`best_neighbors` is an
# index into `neighbors`).
knn = KNeighborsClassifier(n_neighbors=neighbors[best_neighbors], weights='distance')
knn.fit(tr_embeddings_ND, y_train_df.values.ravel())

# predict_proba returns one column per class; column 1 is read as the
# probability of class 1, the same column the cross-validation cell scores.
# NOTE(review): this assumes the labels are 0/1 so that knn.classes_ == [0, 1].
yhat = knn.predict_proba(te_embeddings_ND)
yproba1_test = yhat[:, 1]

# Hard 0/1 labels at a 0.5 threshold, kept available for inspection.
yhat_pred = np.where(yproba1_test < 0.5, 0, 1).tolist()

# Write the class-1 probabilities, one per line. The file is named "yproba1"
# and the model was selected by AUROC, a ranking metric over scores; writing
# the thresholded labels here would discard exactly the information that
# metric uses. np.savetxt also opens and closes the file itself, so no handle
# is left open if a write fails.
np.savetxt("yproba1_test.txt", yproba1_test)
