# In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

# One checkpoint name drives both the tokenizer and the model weights.
model_name = "hkunlp/instructor-large"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)

# Read the Common Voice CSV into a DataFrame.
dataset_path = "cv-valid-dev.csv"
df = pd.read_csv(dataset_path)

# Define the hot words and their embeddings
hot_words = ["be careful", "destroy", "stranger"]


def _embed(texts):
    """Return one mean-pooled encoder embedding per input text.

    Args:
        texts: A list of strings to embed.

    Returns:
        A float tensor with one row per text; the row width is the encoder's
        hidden size, so rows are comparable regardless of text length.
    """
    batch = tokenizer(list(texts), return_tensors="pt", padding=True, truncation=True)
    # NOTE(review): instructor-large is a T5-style encoder-decoder checkpoint
    # (per its model card), so calling the full model would demand decoder
    # inputs. Only the encoder is run; models without `get_encoder` are
    # called directly.
    encoder = model.get_encoder() if hasattr(model, "get_encoder") else model
    with torch.no_grad():  # inference only, no gradients needed
        hidden = encoder(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        ).last_hidden_state
    # Average the token vectors, ignoring padding positions.
    mask = batch["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)


# Previously this stored raw token ids, which are vocabulary indices rather
# than embeddings and have a different length for every hot word.
hot_word_embeddings = {word: _embed([word]) for word in hot_words}


# Define a function to calculate similarity
def calculate_similarity(phrase):
    """Return the highest cosine similarity between `phrase` and any hot word.

    Args:
        phrase: The sentence to score. Non-string or blank values (for
            example NaN from a missing CSV cell) are treated as "no match".

    Returns:
        A float cosine similarity; 0.0 when `phrase` holds no usable text.
    """
    if not isinstance(phrase, str) or not phrase.strip():
        return 0.0
    phrase_embedding = _embed([phrase])
    return max(
        torch.cosine_similarity(phrase_embedding, hot_word_embedding, dim=1).item()
        for hot_word_embedding in hot_word_embeddings.values()
    )

# Score every sentence against the hot words; tqdm reports progress.
similarities = [calculate_similarity(sentence) for sentence in tqdm(df["sentence"])]

# Store a boolean flag per row: True when the best score clears the threshold.
df["similarity"] = [score > 0.7 for score in similarities]  # Adjust the threshold as needed

# Write the flagged dataset to a new CSV without the index column.
output_path = "cv-valid-dev-with-similarity.csv"
df.to_csv(output_path, index=False)
