In [1]:
import pandas as pd
from tqdm.notebook import tqdm
from InstructorEmbedding import INSTRUCTOR
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import trange


In [2]:
# Embedding checkpoint to load and the CSV of labels to analyse.
MODEL_NAME = 'hkunlp/instructor-large'
LABELS_CSV = "../data/common_voice/cv-valid-dev.csv"

# Instantiate the INSTRUCTOR embedding model first, then read the labels
# into a DataFrame (the `text` column is what the later cells embed).
model = INSTRUCTOR(MODEL_NAME)
df = pd.read_csv(LABELS_CSV)

No sentence-transformers model found with name hkunlp/instructor-large. Creating a new one with mean pooling.


Methodology: The 3 hot words are: "be careful", "destroy" and "stranger".

Since "be careful" is a bigram, we will extract bigrams from the text, embed them using the embedding model and calculate the cosine similarity between the bigram embeddings and the embeddings of "be careful". As for "destroy" and "stranger", we will extract unigrams from the text, embed them using the embedding model and calculate the cosine similarity between the unigram embeddings and the embeddings of "destroy" and "stranger".

Note: Since the instructions in the PDF for task 5b did not specify whether we should find similar phrases among the provided labels in cv-valid-dev.csv or among the transcribed results from the finetuned model in task 3, I assumed that we should find similar phrases among the labels themselves.

In [4]:
def get_bigrams(words: list[str]):
    """Return every pair of adjacent words, each joined by a single space.

    The bigrams keep the order of the input: for ``n`` words the result has
    ``n - 1`` entries, and an input with fewer than two words gives ``[]``.
    """
    # Pair each word with its successor; zip stops at the shorter sequence,
    # so the last word never starts a bigram.
    adjacent_pairs = zip(words, words[1:])
    return [" ".join(pair) for pair in adjacent_pairs]

In [None]:
# Reference hot words, each paired with the instruction used to embed it.
# The position in this list is the `ref_idx` used when querying matches.
sentences_a = [
    ['Represent the phrase: ', 'be careful'],
    ['Represent the word: ', 'destroy'],
    ['Represent the word: ', 'stranger']
]

# Encode reference embeddings
embeddings_a = model.encode(sentences_a)

# Placeholder for rows without usable text: one row per reference and zero
# columns, so any threshold check on it is simply False.
# Assumes `model.encode` returns a 2-D array (it is passed to
# `cosine_similarity` below) -- TODO confirm if encode options change.
no_similarities = embeddings_a[:, :0]

# Compute cosine similarities for each row
precomputed_sims = []

# Wrap the column itself (not the enumerate object) so tqdm can read its
# length and show real progress instead of a bare "0it" counter.
for row_idx, sentence in enumerate(tqdm(df['text'])):
    # Missing values come through as non-strings (e.g. NaN); treat them like
    # empty text rather than crashing on `.split()`.
    words = sentence.split() if isinstance(sentence, str) else []

    if not words:
        # Nothing to embed: cosine_similarity rejects zero-sample input, so
        # record empty matrices and keep the row in the results.
        precomputed_sims.append({
            "row_idx": row_idx,
            "bigram_similarities": no_similarities,
            "word_similarities": no_similarities
        })
        continue

    bigrams = get_bigrams(words)
    if len(bigrams) == 0:
        # Single-word text has no bigram; fall back to the word itself.
        bigrams = words
    sentences_b = [['Represent the phrase: ', b] for b in bigrams]
    embeddings_b = model.encode(sentences_b)
    # Rows: references in `sentences_a`; columns: bigrams of this text.
    sim_matrix_bigram = cosine_similarity(embeddings_a, embeddings_b)

    sentences_c = [['Represent the word: ', w] for w in words]
    embeddings_c = model.encode(sentences_c)
    # Rows: references in `sentences_a`; columns: words of this text.
    sim_matrix_word = cosine_similarity(embeddings_a, embeddings_c)

    precomputed_sims.append({
        "row_idx": row_idx,
        "bigram_similarities": sim_matrix_bigram,
        "word_similarities": sim_matrix_word
    })

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


0it [00:00, ?it/s]

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
`SentenceTransformer._target_device` 

In [None]:
def get_matches(threshold=0.7, ref_idx=None, use=('word', 'bigram'), sims=None, n_refs=None):
    """
    Return the rows whose similarity to a reference exceeds the threshold.

    A row matches a reference when ANY of the selected similarity matrices
    has at least one value strictly greater than ``threshold`` in that
    reference's row. Each row index is reported at most once per reference.

    Args:
        threshold (float): cosine similarity threshold (strict ``>``).
        ref_idx (int or None): index of the reference sentence in sentences_a.
                               If None, returns matches for all references.
        use (iterable): which similarity matrices to check.
                        Options: ('word',), ('bigram',), or ('word', 'bigram').
                        Unknown entries are ignored.
        sims (list[dict] or None): precomputed results to search; each item
                        needs "row_idx" plus "<type>_similarities" for every
                        type in ``use``. Defaults to the notebook-level
                        ``precomputed_sims``.
        n_refs (int or None): number of references to report when
                        ``ref_idx`` is None. Defaults to ``len(sentences_a)``.

    Returns:
        list: matching row indices, when ``ref_idx`` is given.
        dict: ``{reference index: [matching row indices]}``, when it is None.

    Raises:
        ValueError: if ``use`` contains neither 'word' nor 'bigram'.
    """
    # Validate input
    valid_types = {'word', 'bigram'}
    use = tuple(u for u in use if u in valid_types)
    if not use:
        raise ValueError("Argument 'use' must include 'word' and/or 'bigram'")

    if sims is None:
        sims = precomputed_sims

    def _row_matches(item, ref):
        # True as soon as one selected matrix has a value above the threshold
        # for this reference; `any` short-circuits across matrix types.
        return any(
            (item[f"{sim_type}_similarities"][ref] > threshold).any()
            for sim_type in use
        )

    if ref_idx is not None:
        return [item["row_idx"] for item in sims if _row_matches(item, ref_idx)]

    if n_refs is None:
        n_refs = len(sentences_a)
    matches = {i: [] for i in range(n_refs)}

    # Check every reference independently for every row. The previous version
    # stopped at the first matching reference (hiding later ones) and could
    # add the same row twice when both matrix types matched.
    for item in sims:
        for i in matches:
            if _row_matches(item, i):
                matches[i].append(item["row_idx"])

    return matches

First, let's use the cosine similarity of bigram embeddings to find texts that are similar to "be careful".

In [32]:
# Rows with at least one bigram scoring above 0.86 against reference 0
# ("be careful").
be_careful_matches_idx = get_matches(threshold=0.86, ref_idx=0, use=['bigram'])

# Fetch the matched transcripts by position, then report the count followed
# by the texts themselves.
be_careful_matches = df['text'].iloc[be_careful_matches_idx].values
print(len(be_careful_matches_idx), be_careful_matches, sep="\n")

4
['be careful with your prognostications said the stranger'
 'be careful with your prognostications said the stranger'
 'watch out for his venom the boy said'
 'watch out for his venom the boy said']


We can see that text with "be careful" and "watch out" are highlighted.

Next, let's use the word embeddings to find texts that are similar to "destroy".

In [33]:
# Rows with at least one word scoring above 0.8 against reference 1
# ("destroy").
destroy_matches_idx = get_matches(threshold=0.8, ref_idx=1, use=['word'])

# Fetch the matched transcripts by position, then report the count followed
# by the texts themselves.
destroy_matches = df['text'].iloc[destroy_matches_idx].values
print(len(destroy_matches_idx), destroy_matches, sep="\n")

8
['i thought that everything i owned would be destroyed'
 "that's what killed him" 'hitting the delete and escape keys did nothing'
 'hitting the delete and escape keys did nothing'
 'paracetamol should get rid of that pain'
 'it was a cobra whose venom could kill a person in minutes'
 "i can't think of any reason why she was killed"
 'the picnic was ruined by a marching band']


We can see that text with words similar to "destroy" -  "destroyed", "killed", "delete", "rid", "kill", and "ruined" are highlighted.

Next, we use the word embeddings to find text that are similar to "stranger".

We can see that text with words similar to "stranger" -  "strange", "strangeness" are highlighted. Interestingly, even words like "foreigner" and "foreign" are highlighted, as they are similar to "stranger".

In [34]:
# Rows with at least one word scoring above 0.8 against reference 2
# ("stranger").
stranger_matches_idx = get_matches(threshold=0.8, ref_idx=2, use=['word'])

# Fetch the matched transcripts by position, then report the count followed
# by the texts themselves.
stranger_matches = df['text'].iloc[stranger_matches_idx].values
print(len(stranger_matches_idx), stranger_matches, sep="\n")

44
['be careful with your prognostications said the stranger'
 'the stranger seemed satisfied with the answer'
 'they were in an immense setting surrounded by thousands of people speaking a strange language'
 'how strange africa is thought the boy'
 "the boy noticed that the man's clothing was strange"
 'the boy was strong and wanted to retaliate but he was in a foreign country'
 'it was only when i got this close to it that the strangeness of it was at all evident to me'
 'i had to test your courage the stranger said'
 'the years of research the magic symbols the strange words and the laboratory equipment'
 'it was only when i got this close to it that the strangeness of it was at all evident to me'
 'i had to test your courage the stranger said'
 'how strange africa is thought the boy'
 'sandra read aloud the strange excerpt'
 'be careful with your prognostications said the stranger'
 'the stranger was speaking of things that very few people knew about'
 'the stranger was speaking of

Combine all the matched row indices into a single, de-duplicated collection.

In [35]:
# Union of the row indices matched by any of the three hot words; using a set
# means a row that matched several hot words is counted only once.
all_matches_idx = set().union(
    be_careful_matches_idx,
    destroy_matches_idx,
    stranger_matches_idx,
)

# Number of distinct matched rows.
len(all_matches_idx)

54

Add a new boolean 'similarity' column to the file and save the file.

In [37]:
# Flag each row whose index label is among the matched row indices.
is_similar = df.index.isin(all_matches_idx)
df['similarity'] = is_similar

# Write the augmented table to the working directory, without the index column.
df.to_csv('cv-valid-dev.csv', index=False)