# Data Cell

In [None]:
#RUN ME!!!
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

balanced_file_path = '/content/drive/My Drive/NLP Final Project/balanced_df.csv'
tech_file_path = '/content/drive/My Drive/NLP Final Project/tech_data.csv'


# Load both question-pair datasets. Every later cell reads `balanced_df` and
# `tech_data`, so a missing file is reported and then re-raised here instead of
# letting the notebook continue and fail later with a confusing NameError.
try:
    balanced_df = pd.read_csv(balanced_file_path)
    tech_data = pd.read_csv(tech_file_path)
except FileNotFoundError:
    print("Csv not found in Google Drive.")
    raise
else:
    print("DataFrame loaded successfully from Google Drive.")

## BoW with TF-IDF ##

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split

In [None]:
def get_tfidf_embeddings(train_data, test_data):
  """Embed ``test_data`` with a bag-of-words TF-IDF model fitted on ``train_data``.

  Args:
    train_data: iterable of raw text documents used to learn the vocabulary
      and the IDF weights.
    test_data: iterable of raw text documents to embed.

  Returns:
    The TF-IDF matrix of ``test_data`` (one row per document), expressed in
    the vocabulary learned from ``train_data``.
  """
  # TfidfVectorizer bundles the CountVectorizer + TfidfTransformer steps.
  tfidf_model = TfidfVectorizer()
  tfidf_model.fit(train_data)
  return tfidf_model.transform(test_data)

In [None]:
# Getting the P@1 accuracy
def p_at_1_accuracy(cosine_distances_matrix, keyword_rows):
  """Print precision@1 for a (queries x candidates) cosine-distance matrix.

  Args:
    cosine_distances_matrix: 2-D array; row ``i`` holds the distances from
      query ``i`` to every candidate column.
    keyword_rows: one entry per matrix row. When it is a pandas
      DataFrame/Series, its index labels are taken as the candidate column
      holding the true match of each row (this is what is needed when the rows
      are a subset, e.g. duplicates only, while the columns still span every
      question). For any other sized object the true match of row ``i`` is
      column ``i``.

  Prints:
    ``Top-1 Accuracy`` - the share of rows whose nearest candidate is the
    true match.
  """
  best_matches = np.argmin(cosine_distances_matrix, axis=1)
  if isinstance(keyword_rows, (pd.DataFrame, pd.Series)):
    # Row subset of a larger frame: the target column is the original row label.
    correct_indices = keyword_rows.index.to_numpy()
  else:
    correct_indices = np.arange(len(keyword_rows))
  accuracy = np.mean(best_matches == correct_indices)
  print(f"Top-1 Accuracy: {accuracy:.4f}")

In [None]:
def p_at_1_accuracy_bow(cosine_distances_matrix, duplicates_df):
  """Print precision@1 where the true match of row ``i`` is ``duplicates_df.index[i]``.

  Args:
    cosine_distances_matrix: 2-D array of distances, one row per entry of
      ``duplicates_df`` and one column per candidate.
    duplicates_df: frame whose index labels name the correct candidate column
      for each matrix row.
  """
  nearest = np.argmin(cosine_distances_matrix, axis=1)
  hits = sum(1 for row, col in enumerate(nearest) if duplicates_df.index[row] == col)
  accuracy = hits / len(duplicates_df)
  print(f"Top-1 Accuracy: {accuracy:.4f}")

In [None]:
# Getting the P@3 accuracy
def p_at_3_accuracy(cosine_distances_matrix, keyword_rows):
  """Print precision@3 for a (queries x candidates) cosine-distance matrix.

  Args:
    cosine_distances_matrix: 2-D array; row ``i`` holds the distances from
      query ``i`` to every candidate column.
    keyword_rows: one entry per matrix row. For a pandas DataFrame/Series the
      index labels give the candidate column of each row's true match (needed
      when rows are a subset of the full frame); otherwise the true match of
      row ``i`` is column ``i``.

  Prints:
    ``Top-3 Accuracy`` - the share of rows whose true match is among the three
    nearest candidates.
  """
  top_3_matches = np.argsort(cosine_distances_matrix, axis=1)[:, :3]
  if isinstance(keyword_rows, (pd.DataFrame, pd.Series)):
    correct_indices = keyword_rows.index.to_numpy()
  else:
    correct_indices = np.arange(len(keyword_rows))
  # Compare each row's three nearest columns against that row's target column.
  correct_matches = (top_3_matches == correct_indices[:, None]).any(axis=1)
  top_3_accuracy = np.mean(correct_matches)
  print(f"Top-3 Accuracy: {top_3_accuracy:.4f}")

In [None]:
def p_at_3_accuracy_bow(cosine_distances_matrix, duplicates_df):
  """Print precision@3 where the true match of row ``i`` is ``duplicates_df.index[i]``.

  Args:
    cosine_distances_matrix: 2-D array of distances, one row per entry of
      ``duplicates_df`` and one column per candidate.
    duplicates_df: frame whose index labels name the correct candidate column
      for each matrix row.
  """
  n_rows = len(duplicates_df)
  hits = sum(
      1 for row in range(n_rows)
      if duplicates_df.index[row] in np.argsort(cosine_distances_matrix[row])[:3])
  top_3_accuracy = hits / n_rows
  print(f"Top-3 Accuracy: {top_3_accuracy:.4f}")

In [None]:
# Getting the MRR
def mrr(cosine_distances_matrix, keyword_rows):
  """Print the mean reciprocal rank of the true match in each matrix row.

  Args:
    cosine_distances_matrix: 2-D array; row ``i`` holds the distances from
      query ``i`` to every candidate column.
    keyword_rows: one entry per matrix row. For a pandas DataFrame/Series the
      index labels give the candidate column of each row's true match (needed
      when rows are a subset of the full frame); otherwise the true match of
      row ``i`` is column ``i``.

  Prints:
    ``MRR`` - mean of 1/rank of the true match (rank 1 = nearest candidate).
    A row whose target column does not exist contributes 0.
  """
  if isinstance(keyword_rows, (pd.DataFrame, pd.Series)):
    correct_indices = keyword_rows.index.to_numpy()
  else:
    correct_indices = np.arange(len(keyword_rows))

  reciprocal_ranks = []
  for i, target in enumerate(correct_indices):
    sorted_indices = np.argsort(cosine_distances_matrix[i])
    positions = np.where(sorted_indices == target)[0]
    # positions is empty only when the target column is outside the matrix.
    reciprocal_ranks.append(1 / (positions[0] + 1) if positions.size else 0)
  mrr_value = np.mean(reciprocal_ranks)
  print(f"MRR: {mrr_value:.4f}")

In [None]:
def mrr_bow(cosine_distances_matrix, duplicates_df):
  """Print the mean reciprocal rank where row ``i``'s true match is ``duplicates_df.index[i]``.

  Args:
    cosine_distances_matrix: 2-D array of distances, one row per entry of
      ``duplicates_df`` and one column per candidate.
    duplicates_df: frame whose index labels name the correct candidate column
      for each matrix row. A label that is not a column of the matrix scores 0.
  """
  reciprocal_ranks = []
  for row, target in enumerate(duplicates_df.index):
    order = np.argsort(cosine_distances_matrix[row])
    positions = np.where(order == target)[0]
    reciprocal_ranks.append(1 / (positions[0] + 1) if len(positions) else 0)
  score = np.mean(reciprocal_ranks)
  print(f"MRR: {score:.4f}")

In [None]:
# Getting accuracy with tau threshold
def tau_accuracy(cosine_distances_matrix, balanced_df, threshold=0.8):
  """Print pairwise duplicate-classification accuracy at a similarity threshold.

  The similarity of pair ``k`` is read from the diagonal entry ``[k, k]`` of
  ``1 - cosine_distances_matrix``, where ``k`` is the *position* of the row in
  ``balanced_df`` (not its index label), so frames with a non-default index
  are handled correctly.

  Args:
    cosine_distances_matrix: 2-D array whose row/column ``k`` belong to the
      question1/question2 of the ``k``-th row of ``balanced_df``.
    balanced_df: frame with an ``is_duplicate`` column (1 = duplicate).
    threshold: a pair is predicted duplicate when similarity >= threshold.

  Prints:
    The share of pairs whose prediction agrees with ``is_duplicate``.
  """
  n_pairs = len(balanced_df)
  pair_similarity = 1 - np.diagonal(np.asarray(cosine_distances_matrix))[:n_pairs]

  is_duplicate = balanced_df["is_duplicate"].to_numpy() == 1
  predicted_duplicate = pair_similarity >= threshold

  row_wise_accuracy = np.mean(predicted_duplicate == is_duplicate)
  print(f"Overall Accuracy with threshold {threshold}: {row_wise_accuracy:.4f}")

For the tech dataset

In [None]:
# Hold out 30% of the tech question pairs for evaluation (fixed seed).
train_df, test_df = train_test_split(tech_data, test_size=0.3, random_state=42)

In [None]:
# Stack every question1 followed by every question2, so the first
# len(test_df) test rows are question1 and the remaining rows are question2.
train_questions = train_df["question1"].tolist() + train_df["question2"].tolist()
test_questions = test_df["question1"].tolist() + test_df["question2"].tolist()

In [None]:
# TF-IDF vocabulary/weights are learned on the training questions only.
test_embeddings = get_tfidf_embeddings(train_questions, test_questions)

# Undo the stacking done above: first half question1, second half question2.
question1_embeddings = test_embeddings[:len(test_df)]
question2_embeddings = test_embeddings[len(test_df):]

# Entry [i, j] = distance between test question1 i and test question2 j.
cosine_distances_matrix = cosine_distances(question1_embeddings, question2_embeddings)

In [None]:
# Make index labels equal to row positions so they can address matrix rows/columns.
test_df.reset_index(drop=True, inplace=True)

In [None]:
# Retrieval metrics are computed for true duplicate pairs only: keep their
# rows of the matrix while the columns still span every test question2.
duplicates = test_df[test_df["is_duplicate"] == 1]
duplicate_indices = duplicates.index.tolist()
cosine_distances_duplicates = cosine_distances_matrix[duplicate_indices, :]

p_at_1_accuracy_bow(cosine_distances_duplicates, duplicates)

In [None]:
p_at_3_accuracy_bow(cosine_distances_duplicates, duplicates)

In [None]:
mrr_bow(cosine_distances_duplicates, duplicates)

In [None]:
# Threshold-based duplicate classification over all test pairs (diagonal of the matrix).
tau_accuracy(cosine_distances_matrix, test_df)

For the general dataset

In [None]:
# Same pipeline as the tech dataset, run on the general (balanced) dataset.
# Hold out 30% of the question pairs for evaluation (fixed seed).
train_df, test_df = train_test_split(balanced_df, test_size=0.3, random_state=42)

In [None]:
# Stack every question1 followed by every question2, so the first
# len(test_df) test rows are question1 and the remaining rows are question2.
train_questions = train_df["question1"].tolist() + train_df["question2"].tolist()
test_questions = test_df["question1"].tolist() + test_df["question2"].tolist()

In [None]:
# TF-IDF vocabulary/weights are learned on the training questions only.
test_embeddings = get_tfidf_embeddings(train_questions, test_questions)

# Undo the stacking done above: first half question1, second half question2.
question1_embeddings = test_embeddings[:len(test_df)]
question2_embeddings = test_embeddings[len(test_df):]

# Entry [i, j] = distance between test question1 i and test question2 j.
cosine_distances_matrix = cosine_distances(question1_embeddings, question2_embeddings)

In [None]:
# Make index labels equal to row positions so they can address matrix rows/columns.
test_df.reset_index(drop=True, inplace=True)

In [None]:
# Retrieval metrics are computed for true duplicate pairs only: keep their
# rows of the matrix while the columns still span every test question2.
duplicates = test_df[test_df["is_duplicate"] == 1]
duplicate_indices = duplicates.index.tolist()
cosine_distances_duplicates = cosine_distances_matrix[duplicate_indices, :]

p_at_1_accuracy_bow(cosine_distances_duplicates, duplicates)

In [None]:
p_at_3_accuracy_bow(cosine_distances_duplicates, duplicates)

In [None]:
mrr_bow(cosine_distances_duplicates, duplicates)

In [None]:
# Threshold-based duplicate classification over all test pairs (diagonal of the matrix).
tau_accuracy(cosine_distances_matrix, test_df)

## Pre-Trained Word2Vec ##

In [None]:
import gensim.models
# Load the pre-trained GoogleNews word vectors (binary word2vec format) from Drive.
# NOTE(review): `model` is rebound to a Keras model in the CBOW section further
# down, so this cell must be re-run before re-running the Word2Vec cells.
model = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/NLP Final Project/GoogleNews-vectors-negative300.bin", binary=True)

For the tech dataset

In [None]:
# Every question1 followed by every question2: row i of the TF-IDF matrix is
# question1 i, row i + len(tech_data) is question2 i.
all_questions = tech_data["question1"].tolist() + tech_data["question2"].tolist()

In [None]:
# TF-IDF weights used to weight the word vectors of each question.
# Note: fitted on the full dataset (no train/test split in this section).
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(all_questions)

In [None]:
def get_weighted_question_embedding(question, tfidf_vectorizer, tfidf, i, word2vec):
  """Return the TF-IDF-weighted average of the word vectors of ``question``.

  The question is tokenised with the vectorizer's own analyzer so the tokens
  match the keys of ``vocabulary_`` (by default lowercased, punctuation
  stripped). A plain ``str.split`` would leave tokens such as ``"What"`` or
  ``"phone?"`` that can never be found in the vocabulary.

  Args:
    question: raw question text.
    tfidf_vectorizer: fitted vectorizer exposing ``vocabulary_`` and
      ``build_analyzer()``.
    tfidf: TF-IDF matrix produced by ``tfidf_vectorizer``; indexed as
      ``tfidf[row, column]``.
    i: row of ``tfidf`` that belongs to ``question``.
    word2vec: word-vector lookup supporting ``word in word2vec``,
      ``word2vec[word]`` and ``vector_size``.

  Returns:
    1-D array of length ``word2vec.vector_size``; all zeros (after printing
    "No words found") when no token is known to both the vocabulary and
    ``word2vec``.
  """
  vocab_questions = tfidf_vectorizer.vocabulary_
  analyzer = tfidf_vectorizer.build_analyzer()

  weighted_sum = np.zeros(word2vec.vector_size)
  total_weight = 0.0
  found_any = False

  for word in analyzer(question):
    if word in vocab_questions and word in word2vec:
      word_weight = tfidf[i, vocab_questions[word]]
      weighted_sum += word2vec[word] * word_weight
      total_weight += word_weight
      found_any = True

  if found_any:
    question_embedding = weighted_sum / total_weight

  else:
    print("No words found")
    question_embedding = np.zeros(word2vec.vector_size)

  return question_embedding

In [None]:
# question1 i uses TF-IDF row i.
question1_embeddings = np.array([get_weighted_question_embedding(q, tfidf_vectorizer, tfidf, i, model)
                                for i, q in enumerate(tech_data["question1"])])

In [None]:
# question2 i uses TF-IDF row i + len(tech_data) (question2 rows follow all question1 rows).
question2_embeddings = np.array([get_weighted_question_embedding(q, tfidf_vectorizer, tfidf, i + len(tech_data), model)
                                for i, q in enumerate(tech_data["question2"])])

In [None]:
# Entry [i, j] = distance between question1 i and question2 j.
cosine_distances_matrix = cosine_distances(question1_embeddings, question2_embeddings)

In [None]:
# Retrieval metrics on true duplicate pairs only; columns still span every question2.
# Index labels are used as matrix positions - assumes tech_data has the default
# 0..n-1 index (TODO confirm).
duplicates = tech_data[tech_data["is_duplicate"] == 1]
duplicate_indices = duplicates.index.tolist()
cosine_distances_duplicates = cosine_distances_matrix[duplicate_indices, :]

p_at_1_accuracy_bow(cosine_distances_duplicates, duplicates)

In [None]:
p_at_3_accuracy_bow(cosine_distances_duplicates, duplicates)

In [None]:
mrr_bow(cosine_distances_duplicates, duplicates)

In [None]:
# Threshold-based duplicate classification over all pairs.
tau_accuracy(cosine_distances_matrix, tech_data)

For the general dataset

In [None]:
# True duplicate pairs of the general dataset, used for the retrieval metrics below.
duplicates = balanced_df[balanced_df["is_duplicate"] == 1]
# Every question1 followed by every question2: TF-IDF row i is question1 i,
# row i + len(balanced_df) is question2 i.
all_questions = balanced_df["question1"].tolist() + balanced_df["question2"].tolist()

In [None]:
# Note: fitted on the full dataset (no train/test split in this section).
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(all_questions)

In [None]:
question1_embeddings = np.array([get_weighted_question_embedding(q, tfidf_vectorizer, tfidf, i, model)
                                for i, q in enumerate(balanced_df["question1"])])

In [None]:
question2_embeddings = np.array([get_weighted_question_embedding(q, tfidf_vectorizer, tfidf, i + len(balanced_df), model)
                                for i, q in enumerate(balanced_df["question2"])])

In [None]:
# Index labels are used as array positions - assumes balanced_df has the
# default 0..n-1 index (TODO confirm).
q1_duplicate_embeddings = question1_embeddings[duplicates.index]
# q2_duplicate_embeddings is not used by any cell shown in this notebook.
q2_duplicate_embeddings = question2_embeddings[duplicates.index]

In [None]:
# Rows: duplicate question1 only. Columns: every question2, so the true match
# of row i sits in column duplicates.index[i], not in column i.
cosine_distances_matrix = cosine_distances(q1_duplicate_embeddings, question2_embeddings)

In [None]:
p_at_1_accuracy(cosine_distances_matrix, duplicates)

In [None]:
p_at_3_accuracy(cosine_distances_matrix, duplicates)

In [None]:
mrr(cosine_distances_matrix, duplicates)

In [None]:
# Recompute the full (all question1 x all question2) matrix for the threshold metric.
cosine_distances_matrix = cosine_distances(question1_embeddings, question2_embeddings)
tau_accuracy(cosine_distances_matrix, balanced_df)

## Training Our Own CBOW ##

In [None]:
import pandas as pd
# NOTE(review): this reads a CSV relative to the working directory, while the
# first cell loads the same-named file from Google Drive - confirm the file is
# available at this relative path.
text_corpus = pd.read_csv('balanced_df.csv')
# text_corpus = pd.read_csv('tech_data.csv')
# Training corpus: 10500 question1 strings sampled with a fixed seed
# (pandas raises if the frame has fewer rows than that).
train_corpus = text_corpus['question1'].sample(n=10500, random_state=35).tolist()
# Held-out rows: every pair whose question1 text is not part of the training sample.
test_corpus = text_corpus[~text_corpus['question1'].isin(train_corpus)]

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

corpus = train_corpus

# Build the word -> integer id vocabulary from the training questions and
# encode every question as a list of ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
print("After converting our words in the corpus into vector of integers:")
# Show only a small sample; printing every sequence floods the cell output.
print(sequences[:5])

In [None]:
from sklearn.model_selection import train_test_split
# Define the parameters
# vocab_size has +1 because Keras Tokenizer word ids start at 1 (0 is left unused).
vocab_size = len(tokenizer.word_index) + 1
embedding_size = 400
window_size = 5

# Generate the context-target pairs
# Only positions with a full window on both sides are used, so a sequence
# shorter than 2 * window_size + 1 tokens contributes no training example.
contexts = []
targets = []
for sequence in sequences:
    for i in range(window_size, len(sequence) - window_size):
        context = sequence[i - window_size:i] + sequence[i + 1:i + window_size + 1]
        target = sequence[i]
        contexts.append(context)
        targets.append(target)

# Convert the contexts and targets to numpy arrays
X = np.array(contexts)
# NOTE(review): dense one-hot targets take len(targets) x vocab_size values;
# integer targets with a sparse categorical loss would avoid that - confirm memory is acceptable.
y = to_categorical(targets, num_classes=vocab_size)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the CBOW model
# Embedding lookup -> mean over the context words -> softmax over the vocabulary.
# NOTE(review): rebinding `model` here replaces the word2vec vectors loaded in
# the Pre-Trained Word2Vec section.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=2 * window_size))
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))
model.add(Dense(units=vocab_size, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=200, verbose=0)

# Accuracy of predicting the centre word on held-out context windows.
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)

print(f"Test Accuracy: {accuracy * 100:.2f}%")

In [None]:
# The first layer of the trained CBOW model is the Embedding layer; its single
# weight array is the (vocab_size x embedding_size) lookup table.
embedding_layer = model.layers[0]
embedding_weights = embedding_layer.get_weights()[0]


def get_sentence_embedding(context):
    """Average the embedding rows selected by the word ids in ``context``.

    Reads the module-level ``embedding_weights`` table and returns one vector
    of length ``embedding_size`` (mean over the context words).
    """
    return embedding_weights[context].mean(axis=0)


# Quick sanity check on one held-out context window.
context_example = X_test[0]
sentence_embedding = get_sentence_embedding(context_example)

print("Sentence Embedding (Vector for this context):")
print(sentence_embedding)

In [None]:
def get_sentence_embedding(context):
    """Average the rows of the global ``embedding_weights`` selected by ``context``.

    ``context`` is a sequence of integer word ids; the result is a single
    vector (mean over those ids). This redefines the identical helper from the
    previous cell.
    """
    looked_up = embedding_weights[context]
    return looked_up.mean(axis=0)

def embed_new_sentence(new_sentence, tokenizer, model, window_size=2):
    """Embed a sentence as the mean of its context-window embeddings.

    Every position that has ``window_size`` tokens on both sides yields one
    context (the surrounding ``2 * window_size`` word ids); each context is
    embedded as the mean of its word vectors, and the sentence embedding is
    the mean over all contexts.

    The embedding table is read from ``model`` itself (weights of its first
    layer), so the result always reflects the model that was passed in rather
    than whatever module-level ``embedding_weights`` happens to hold.

    Args:
        new_sentence: raw sentence text.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
        model: model whose ``layers[0].get_weights()[0]`` is the
            (vocab_size x embedding_dim) embedding table.
        window_size: number of context tokens taken on each side.

    Returns:
        1-D array of length embedding_dim; all zeros when the sentence has
        ``2 * window_size`` known tokens or fewer (no full window available).
    """
    new_sequence = tokenizer.texts_to_sequences([new_sentence])[0]

    embedding_weights = model.layers[0].get_weights()[0]

    if len(new_sequence) <= 2 * window_size:
        return np.zeros((embedding_weights.shape[1],))

    contexts = [
        new_sequence[i - window_size:i] + new_sequence[i + 1:i + window_size + 1]
        for i in range(window_size, len(new_sequence) - window_size)
    ]
    X_new = np.array(contexts)

    # (n_contexts, 2 * window_size, dim) -> mean over words -> mean over contexts.
    context_embeddings = embedding_weights[X_new].mean(axis=1)
    final_sentence_embedding = context_embeddings.mean(axis=0)

    return final_sentence_embedding

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two 1-D embedding vectors as a scalar."""
    # cosine_similarity works on 2-D inputs, so wrap each vector as a one-row matrix.
    similarity_matrix = cosine_similarity([embedding1], [embedding2])
    return similarity_matrix[0][0]

In [None]:
# Evaluate the CBOW embeddings on the tech question pairs.
test_corpus = pd.read_csv('tech_data.csv')
# The pairs embedded below must be the same rows whose `is_duplicate` labels
# are read from `test_corpus` in the following cells.
keyword_rows = test_corpus


all_questions1 = keyword_rows['question1'].tolist()
all_questions2 = keyword_rows['question2'].tolist()

# One embedding per question, in row order.
embeddings_question1 = np.array([
    embed_new_sentence(q, tokenizer, model) for q in all_questions1
])

embeddings_question2 = np.array([
    embed_new_sentence(q, tokenizer, model) for q in all_questions2
])

# Similarity of each question1 with its own paired question2.
cosine_similarities = np.array([
    compute_cosine_similarity(emb1, emb2)
    for emb1, emb2 in zip(embeddings_question1, embeddings_question2)
])

In [None]:
# Preview the first 10 question pairs with their cosine similarity.
# (Slicing replaces the manual counter that was named `max` and shadowed the builtin.)
print("Cosine Similarities for Question Pairs:")
preview = zip(all_questions1[:10], all_questions2[:10], cosine_similarities[:10])
for question1_text, question2_text, similarity in preview:
    print(f"Question 1: {question1_text}")
    print(f"Question 2: {question2_text}")
    print(f"Cosine Similarity: {similarity}")

In [None]:
# Threshold classifier: a pair is predicted duplicate when its similarity is above 0.8.
total_predictions = len(cosine_similarities)

predicted_labels = np.where(cosine_similarities > 0.8, 1, 0)
true_labels = test_corpus['is_duplicate'].to_numpy()[:total_predictions]

correct_predictions = int(np.sum(predicted_labels == true_labels))
accuracy = correct_predictions / total_predictions


print(f"Number of correct predictions: {correct_predictions}")
print(f"Total number of comparisons: {total_predictions}")
print(f"Accuracy: {accuracy}")

In [None]:
correct_matches = 0
total_checked = 20

# Candidate pool: every question1 followed by every question2, so the partner
# of question1 i sits at row i + n_questions. The two arrays must be stacked;
# `+` on numpy arrays would add them element-wise instead.
n_questions = len(embeddings_question1)
all_embeddings = np.vstack([embeddings_question1, embeddings_question2])
n_checked = min(total_checked, len(cosine_similarities))

for i in range(n_checked):
    is_duplicate = test_corpus.iloc[i]['is_duplicate']

    question1 = all_questions1[i]
    question2 = all_questions2[i]

    emb1 = embeddings_question1[i]

    similarities_with_others = cosine_similarity([emb1], all_embeddings)[0]

    # Exclude the query question itself from the candidates.
    similarities_with_others[i] = -1
    most_similar_index = np.argmax(similarities_with_others)
    most_similar_similarity = similarities_with_others[most_similar_index]

    if most_similar_index < n_questions:
        most_similar_question = all_questions1[most_similar_index]
    else:
        most_similar_question = all_questions2[most_similar_index - n_questions]

    if most_similar_index == i + n_questions:
        correct_matches += 1
        correct_match_status = "Correct"
    else:
        correct_match_status = "Incorrect"

    print(f"Question 1: {question1}")
    print(f"Question 2: {question2}")
    print(f"Cosine Similarity with most similar question: {most_similar_similarity:.4f}")
    print(f"Most Similar Question: {most_similar_question}")
    print(f"True Label (is_duplicate): {is_duplicate}")
    print(f"Prediction: {correct_match_status}\n")

print(f"Correct matches: {correct_matches} / {n_checked}")

In [None]:
from sklearn.metrics.pairwise import cosine_distances
## Try top 3
# Entry [i, j] = distance between question1 i and question2 j; the paired
# question2 of row i is column i.
cosine_distances_matrix = cosine_distances(embeddings_question1, embeddings_question2)
n_pairs = cosine_distances_matrix.shape[0]

# Candidate columns of every row ordered from nearest to farthest.
ranked_candidates = np.argsort(cosine_distances_matrix, axis=1)
top_3_matches = ranked_candidates[:, :3]
correct_matches = np.array([i in top_3_matches[i] for i in range(n_pairs)])
top_3_accuracy = np.mean(correct_matches)
print(f"P@3: {top_3_accuracy:.2f}")

# 1-based rank of the paired question2 (column i) within row i. Ranks come
# from the distance matrix, not from the 1-D array of pairwise similarities.
correct_ranks = np.argmax(ranked_candidates == np.arange(n_pairs)[:, None], axis=1) + 1

# P@1
p_at_1 = np.mean(correct_ranks == 1)

# MRR (stored as `mrr_score` so the `mrr` function defined earlier is not overwritten)
mrr_score = np.mean(1 / correct_ranks)


print(f"P@1: {p_at_1:.2f}")
print(f"MRR: {mrr_score:.2f}")

## Trying Binary Classification Approach -- As Is (No Fine-Tuning)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [None]:
class QuestionPairDataset(Dataset):
  """Torch dataset that encodes (question1, question2) rows as sentence pairs.

  Each item is a dict with ``input_ids`` and ``attention_mask`` (the batch
  dimension added by ``return_tensors="pt"`` is squeezed away) plus ``labels``
  built from the ``is_duplicate`` column as a long tensor.
  """

  def __init__(self, df, tokenizer):
    # df: frame with question1 / question2 / is_duplicate columns.
    self.tokenizer = tokenizer
    self.df = df

  def __len__(self):
    return len(self.df)

  def __getitem__(self, index):
    # Positional lookup, so the frame's index labels do not matter.
    record = self.df.iloc[index]
    question1, question2, label = (
        record[column] for column in ('question1', 'question2', 'is_duplicate'))

    # Encode the two questions as one pair, padded/truncated to 128 tokens.
    encoded = self.tokenizer(
        text=question1,
        text_pair=question2,
        truncation=True,
        max_length=128,
        padding='max_length',
        add_special_tokens=True,
        return_tensors="pt")

    item = {name: encoded[name].squeeze(0) for name in ('input_ids', 'attention_mask')}
    item['labels'] = torch.tensor(label, dtype=torch.long)
    return item

In [None]:
def evaluate(model, data_loader):
  """Run ``model`` over ``data_loader`` and collect predicted and true labels.

  The model is switched to eval mode and gradients are disabled. Batches are
  dicts with ``input_ids``, ``attention_mask`` and ``labels``; they are moved
  to the module-level ``device``. The predicted class is the argmax of the
  logits.

  Returns:
    (predictions, true_labels): two lists with one entry per example.
  """
  model.eval()
  predictions, true_labels = [], []

  with torch.no_grad():
    for batch in data_loader:
      input_ids, attention_mask, labels = (
          batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels'))

      logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

      predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
      true_labels.extend(labels.cpu().numpy())

  return predictions, true_labels

In [None]:
# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# NOTE(review): `tokenizer` is rebound here; earlier sections bound the same
# name to a Keras Tokenizer.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# Baseline: roberta-base with a 2-class sequence-classification head and no fine-tuning.
baseline_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2).to(device)

For tech dataset

In [None]:
# Score the un-fine-tuned baseline on every tech question pair.
test_dataset = QuestionPairDataset(tech_data, tokenizer)

test_loader = DataLoader(test_dataset, batch_size=16)

baseline_predictions, baseline_true_labels = evaluate(baseline_model, test_loader)
baseline_accuracy = accuracy_score(baseline_true_labels, baseline_predictions)
baseline_report = classification_report(baseline_true_labels, baseline_predictions)

print(f"Baseline Test Accuracy: {baseline_accuracy}")
print(baseline_report)

For general dataset

In [None]:
# Score the un-fine-tuned baseline on every general (balanced) question pair.
test_dataset = QuestionPairDataset(balanced_df, tokenizer)

test_loader = DataLoader(test_dataset, batch_size=16)

baseline_predictions, baseline_true_labels = evaluate(baseline_model, test_loader)
baseline_accuracy = accuracy_score(baseline_true_labels, baseline_predictions)
baseline_report = classification_report(baseline_true_labels, baseline_predictions)

print(f"Baseline Test Accuracy: {baseline_accuracy}")
print(baseline_report)

## Trying Binary Classification Approach -- With Fine-Tuning

In [None]:
# Fresh roberta-base classifier that will be fine-tuned below.
fined_tuned_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2).to(device)

In [None]:
# 70/30 split of the tech pairs with a fixed seed.
train_data, test_data = train_test_split(tech_data, test_size=0.3, random_state=42)

train_dataset = QuestionPairDataset(train_data, tokenizer)
test_dataset = QuestionPairDataset(test_data, tokenizer)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [None]:
# Fine-tune the classifier on the training pairs.
# torch.optim.AdamW replaces the deprecated AdamW from transformers; eps and
# weight_decay are set to that implementation's defaults so the optimisation
# settings stay the same.
optimizer = torch.optim.AdamW(fined_tuned_model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
epochs = 3
train_losses = []
train_accuracies = []

for epoch in range(epochs):
  epoch_loss = 0
  correct_predictions = 0
  total = 0
  fined_tuned_model.train()

  for batch in train_loader:
    optimizer.zero_grad()

    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Passing labels makes the model return the classification loss as well.
    outputs = fined_tuned_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    loss = outputs.loss
    epoch_loss += loss.item()

    # Running training accuracy from the argmax of the logits.
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    correct_predictions += (preds == labels).sum().item()
    total += labels.size(0)

    loss.backward()
    optimizer.step()

  # Mean loss per batch and accuracy per example for this epoch.
  train_losses.append(epoch_loss / len(train_loader))
  train_accuracies.append(correct_predictions / total)

  print(f"Epoch loss: {train_losses[-1]}, Epoch accuracy: {train_accuracies[-1]}")

In [None]:
# Evaluate the fine-tuned classifier on the held-out 30% of the tech pairs.
fine_tuned_predictions, fine_tuned_true_labels = evaluate(fined_tuned_model, test_loader)
fine_tuned_accuracy = accuracy_score(fine_tuned_true_labels, fine_tuned_predictions)
fine_tuned_report = classification_report(fine_tuned_true_labels, fine_tuned_predictions)

print(f"Fine-Tuned Test Accuracy: {fine_tuned_accuracy}")
print(fine_tuned_report)

## Trying BERT w/ Cosine Distance -- As Is (No Fine-Tuning) ##

In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Pre-trained BERT
# NOTE(review): `tokenizer` is rebound from the RoBERTa tokenizer to the BERT one here.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

In [None]:
# Use the GPU when available and put the encoder in inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)
bert_model.eval()

In [None]:
def cls_embeddings(questions, tokenizer, model):
  """Return one [CLS] hidden-state embedding per question.

  Each question is tokenised and encoded on its own. The embedding is the
  hidden state of the first token of the encoder output.

  Notes:
    * The encoder is ``model.base_model`` when the model exposes one (task
      wrappers such as a masked-LM head return logits, not hidden states, as
      their first output); otherwise ``model`` itself is used.
    * Inputs are moved to the device of the model's parameters, so the
      function does not depend on a module-level ``device``.
    * The model is put in eval mode for the duration of the call (dropout
      off) and its previous train/eval mode is restored afterwards.

  Args:
    questions: iterable of question strings.
    tokenizer: tokenizer callable returning ``input_ids`` and
      ``attention_mask`` tensors.
    model: torch module (encoder or wrapper around one).

  Returns:
    2-D numpy array, one row per question.
  """
  encoder = getattr(model, "base_model", model)
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device("cpu")

  was_training = model.training
  model.eval()
  embeddings = []
  try:
    with torch.no_grad():
      for question in questions:
        inputs = tokenizer(text = question,
                 truncation = True,
                 padding = True,
                 add_special_tokens = True,
                 return_tensors = "pt")

        input_ids = inputs["input_ids"].to(model_device)
        attention_mask = inputs["attention_mask"].to(model_device)

        outputs = encoder(input_ids, attention_mask=attention_mask)

        # outputs[0] of the encoder is the last hidden state; position 0 is [CLS].
        last_hidden_state = outputs[0]
        cls_representation = last_hidden_state[:,0,:]
        embeddings.append(cls_representation.cpu().numpy())
  finally:
    model.train(was_training)

  return np.vstack(embeddings)

For the tech dataset

In [None]:
# [CLS] embeddings of every tech question, in row order.
question1_embeddings = cls_embeddings(tech_data["question1"], tokenizer, bert_model)
question2_embeddings = cls_embeddings(tech_data["question2"], tokenizer, bert_model)

In [None]:
# Entry [i, j] = distance between question1 i and question2 j.
cosine_distances_matrix = cosine_distances(question1_embeddings, question2_embeddings)

In [None]:
# Keep only the rows of true duplicate pairs; the columns still span every
# question2, so the true match of row i sits in column duplicates.index[i].
# Index labels are used as positions - assumes the default 0..n-1 index (TODO confirm).
duplicates = tech_data[tech_data["is_duplicate"] == 1]
duplicate_indices = duplicates.index.tolist()
cosine_distances_duplicates = cosine_distances_matrix[duplicate_indices, :]

p_at_1_accuracy(cosine_distances_duplicates, duplicates)

In [None]:
p_at_3_accuracy(cosine_distances_duplicates, duplicates)

In [None]:
mrr(cosine_distances_duplicates, duplicates)

In [None]:
# Threshold-based duplicate classification over all pairs.
tau_accuracy(cosine_distances_matrix, tech_data)

For the general dataset

In [None]:
# [CLS] embeddings of every general-dataset question, in row order.
question1_embeddings = cls_embeddings(balanced_df["question1"], tokenizer, bert_model)
question2_embeddings = cls_embeddings(balanced_df["question2"], tokenizer, bert_model)

In [None]:
# Entry [i, j] = distance between question1 i and question2 j.
cosine_distances_matrix = cosine_distances(question1_embeddings, question2_embeddings)

In [None]:
# Getting P@1
# Keep only the rows of true duplicate pairs; the columns still span every
# question2, so the true match of row i sits in column duplicates.index[i].
# Index labels are used as positions - assumes the default 0..n-1 index (TODO confirm).
duplicates = balanced_df[balanced_df["is_duplicate"] == 1]
duplicate_indices = duplicates.index.tolist()
cosine_distances_duplicates = cosine_distances_matrix[duplicate_indices, :]

p_at_1_accuracy(cosine_distances_duplicates, duplicates)

In [None]:
# Getting P@3
p_at_3_accuracy(cosine_distances_duplicates, duplicates)

In [None]:
# Getting MRR
mrr(cosine_distances_duplicates, duplicates)

In [None]:
# Threshold-based duplicate classification over all pairs.
tau_accuracy(cosine_distances_matrix, balanced_df)

## Trying BERT w/ Cosine Distance -- With Fine-Tuning ##

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Source
# https://huggingface.co/docs/transformers/en/tasks/masked_language_modeling

class TokenizeQuestion(Dataset):
  """Torch dataset that tokenises one question per item for masked-LM training.

  Args:
    q: indexable collection of question strings.
    tokenizer: tokenizer callable returning tensors (``return_tensors="pt"``).
    max_length: maximum number of tokens per question; longer questions are
      truncated to this length.
  """

  def __init__(self, q, tokenizer, max_length=128):
    self.questions = q
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.questions)

  def __getitem__(self, idx):
    quest = self.questions[idx]
    # max_length is passed explicitly so the limit given to the constructor is
    # actually applied when truncating.
    tokens = self.tokenizer(text = quest, truncation = True, max_length = self.max_length,
                            padding= True, add_special_tokens = True, return_tensors="pt")
    # Drop the batch dimension added by return_tensors="pt".
    return {key: val.squeeze(0) for key, val in tokens.items()}

In [None]:
# BERT with a masked-language-modelling head; this rebinds `bert_model`, which
# was a plain BertModel in the previous section.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertForMaskedLM.from_pretrained("bert-base-uncased")

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)
# NOTE(review): eval mode is set before fine-tuning; the training run presumably
# switches the model back to train mode - confirm the mode used when embedding afterwards.
bert_model.eval()

In [None]:
# One epoch of MLM fine-tuning, no evaluation during training, nothing pushed to the Hub.
training_args = TrainingArguments(
    output_dir="./bert_mlm",
    eval_strategy="no",
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False,
)

In [None]:
def cls_embeddings(questions, tokenizer, model):
  """Return one [CLS] hidden-state embedding per question.

  Each question is tokenised and encoded on its own. The embedding is the
  hidden state of the first token of the encoder output.

  Notes:
    * The encoder is ``model.base_model`` when the model exposes one. This
      matters for a masked-LM model: the first output of the full model is the
      vocabulary logits, not the hidden states. For a model without a
      ``base_model`` attribute, ``model`` itself is used.
    * Inputs are moved to the device of the model's parameters, so the
      function does not depend on a module-level ``device``.
    * The model is put in eval mode for the duration of the call (dropout
      off, which matters right after training) and its previous train/eval
      mode is restored afterwards.

  Args:
    questions: iterable of question strings.
    tokenizer: tokenizer callable returning ``input_ids`` and
      ``attention_mask`` tensors.
    model: torch module (encoder or wrapper around one).

  Returns:
    2-D numpy array, one row per question.
  """
  encoder = getattr(model, "base_model", model)
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device("cpu")

  was_training = model.training
  model.eval()
  embeddings = []
  try:
    with torch.no_grad():
      for question in questions:
        inputs = tokenizer(text = question,
                 truncation = True,
                 padding = True,
                 add_special_tokens = True,
                 return_tensors = "pt")

        input_ids = inputs["input_ids"].to(model_device)
        attention_mask = inputs["attention_mask"].to(model_device)

        outputs = encoder(input_ids, attention_mask=attention_mask)

        # outputs[0] of the encoder is the last hidden state; position 0 is [CLS].
        last_hidden_state = outputs[0]
        cls_representation = last_hidden_state[:,0,:]
        embeddings.append(cls_representation.cpu().numpy())
  finally:
    model.train(was_training)

  return np.vstack(embeddings)

For tech dataset:

In [None]:
# 70/30 split of the tech pairs (same seed as the other sections).
train_df, test_df = train_test_split(tech_data, test_size=0.3, random_state=42)

In [None]:
# MLM fine-tuning corpus: every training question1 followed by every training question2.
train_questions = train_df["question1"].tolist() + train_df["question2"].tolist()

In [None]:
dataset = TokenizeQuestion(train_questions, tokenizer)
# The collator masks 15% of the tokens for the masked-LM objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm = True, mlm_probability = 0.15)

In [None]:
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [None]:
trainer.train()

In [None]:
# NOTE(review): bert_model is a BertForMaskedLM here - make sure cls_embeddings
# reads encoder hidden states rather than the masked-LM logits, and that the
# model is in eval mode after training.
question1_embeddings = cls_embeddings(test_df["question1"], tokenizer, bert_model)
question2_embeddings = cls_embeddings(test_df["question2"], tokenizer, bert_model)

In [None]:
# Entry [i, j] = distance between test question1 i and test question2 j.
cosine_distances_matrix = cosine_distances(question1_embeddings, question2_embeddings)

In [None]:
# Make index labels equal to row positions so they can address matrix rows/columns.
test_df = test_df.reset_index(drop=True)
# Keep only the rows of true duplicate pairs; the columns still span every
# test question2, so the true match of row i sits in column duplicates.index[i].
duplicates = test_df[test_df["is_duplicate"] == 1]
duplicate_indices = duplicates.index.tolist()
cosine_distances_duplicates = cosine_distances_matrix[duplicate_indices, :]

p_at_1_accuracy(cosine_distances_duplicates, duplicates)

In [None]:
# Threshold-based duplicate classification over all test pairs.
tau_accuracy(cosine_distances_matrix, test_df)

For general dataset

In [None]:
# 70/30 split of the general (balanced) pairs (same seed as the other sections).
train_df, test_df = train_test_split(balanced_df, test_size=0.3, random_state=42)

In [None]:
# MLM fine-tuning corpus: every training question1 followed by every training question2.
train_questions = train_df["question1"].tolist() + train_df["question2"].tolist()

In [None]:
dataset = TokenizeQuestion(train_questions, tokenizer)
# The collator masks 15% of the tokens for the masked-LM objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm = True, mlm_probability = 0.15)

In [None]:
# NOTE(review): `bert_model` is not reloaded between the two datasets, so this
# run continues from the weights already fine-tuned on the tech questions -
# confirm that is intended.
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [None]:
trainer.train()

In [None]:
# NOTE(review): bert_model is a BertForMaskedLM here - make sure cls_embeddings
# reads encoder hidden states rather than the masked-LM logits, and that the
# model is in eval mode after training.
question1_embeddings = cls_embeddings(test_df["question1"], tokenizer, bert_model)
question2_embeddings = cls_embeddings(test_df["question2"], tokenizer, bert_model)

In [None]:
# Entry [i, j] = distance between test question1 i and test question2 j.
cosine_distances_matrix = cosine_distances(question1_embeddings, question2_embeddings)