# Hot Word Similarity Detection for `cv-valid-dev.csv`

This notebook demonstrates how to detect phrases similar to specific hot words (`'be careful'`, `'destroy'`, `'stranger'`) in a CSV file and add a `similarity` column. The comparison is performed against each hot word individually.

In [1]:
# Install required libraries
# !pip install transformers pandas scikit-learn

In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

  from pandas.core import (


## Step 1: Load the Pre-trained Model and Tokenizer

In [3]:
model_name = 'hkunlp/instructor-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Some weights of T5Model were not initialized from the model checkpoint at hkunlp/instructor-large and are newly initialized: ['decoder.block.0.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.0.SelfAttention.o.weight', 'decoder.block.0.layer.0.SelfAttention.q.weight', 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'decoder.block.0.layer.0.SelfAttention.v.weight', 'decoder.block.0.layer.0.layer_norm.weight', 'decoder.block.0.layer.1.EncDecAttention.k.weight', 'decoder.block.0.layer.1.EncDecAttention.o.weight', 'decoder.block.0.layer.1.EncDecAttention.q.weight', 'decoder.block.0.layer.1.EncDecAttention.v.weight', 'decoder.block.0.layer.1.layer_norm.weight', 'decoder.block.0.layer.2.DenseReluDense.wi.weight', 'decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.0.layer.2.layer_norm.weight', 'decoder.block.1.layer.0.SelfAttention.k.weight', 'decoder.block.1.layer.0.SelfAttention.o.weight', 'decoder.block.1.layer.0.SelfAttention.q.weight', 'deco

## Step 2: Load the CSV File

In [4]:
input_file = 'cv-valid-dev.csv'
output_file = 'cv-valid-dev-with-similarity.csv'
df = pd.read_csv(input_file)

## Step 3: Embed Each Hot Word

In [None]:
# Hot words and embedding computation
hot_words = ['be careful', 'destroy', 'stranger']
hot_word_embeddings = []

for word in hot_words:
    # Tokenize input
    inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        # Use the encoder and process input_ids
        outputs = model.encoder(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        hot_word_embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy())

hot_word_embeddings = torch.tensor(hot_word_embeddings)

ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

## Step 4: Define Similarity Detection Function

In [None]:
# Calculate similarity for each transcription
def check_similarity(text):
    # Ensure text is a valid string
    if not isinstance(text, str):
        return False

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        # Use the encoder to generate embeddings for the transcription
        outputs = model.encoder(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        text_embedding = outputs.last_hidden_state.mean(dim=1).numpy()

    # Compute cosine similarity with each hot word embedding
    for hot_word_embedding in hot_word_embeddings:
        similarity = cosine_similarity(text_embedding, hot_word_embedding.numpy().reshape(1, -1))
        if similarity.max() > 0.8:  # Set threshold for similarity
            return True
    return False

## Step 5: Apply the Similarity Function to Each Row

In [None]:
df['similarity'] = df['generated_text'].apply(lambda x: check_similarity(x))

## Step 6: Save the Updated CSV File

In [None]:
df.to_csv(output_file, index=False)
print(f'Updated file saved as {output_file}')