# Task 5b
## Requirements

Text embedding model to use: hkunlp/instructor-large
https://huggingface.co/hkunlp/instructor-large
Using the text embedding model, write a python jupyter notebook called
cv-hotword-similarity-5b.ipynb to find similar phrases to the 3
hot words in task 5a. Using cv-valid-dev.csv, write the Boolean (true
for a record containing similar phrases to the hot words; false for a record
that is not similar) into a new column called similarity. Save this
updated file in this folder.

In [1]:
import numpy as np
import pandas as pd
import csv
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the hkunlp/instructor-large text embedding model once. `model` is a
# notebook-level global that compute_similarity() uses to encode sentences.
from InstructorEmbedding import INSTRUCTOR
model = INSTRUCTOR('hkunlp/instructor-large')

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


  model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')))


# Functions

In [3]:
def compute_similarity(sentences_a, sentences_b):
    """Return the cosine-similarity matrix between two lists of sentences.

    Both lists are encoded with the notebook-level ``model``. Row ``i`` of the
    result corresponds to ``sentences_a[i]`` and column ``j`` to
    ``sentences_b[j]``.
    """
    vectors_a, vectors_b = model.encode(sentences_a), model.encode(sentences_b)
    return cosine_similarity(vectors_a, vectors_b)

In [4]:
def extract_candidate_phrases(text):
    """Extract single-word, 2-word, and 3-word candidate phrases from text.

    Args:
        text: Input sentence. Non-string values (e.g. ``None`` or the float
            ``NaN`` that pandas produces for an empty CSV field) are treated
            as empty text.

    Returns:
        A list of lowercase phrases: first every single word longer than
        4 characters, then every run of 2 consecutive words, then every run
        of 3 consecutive words. Empty when nothing can be extracted.
    """
    # A missing transcript arrives as NaN (a float), which has no .lower();
    # return no candidates so the caller can skip the record instead of
    # crashing the whole run.
    if not isinstance(text, str):
        return []

    # Split text to words and lowercase
    words = text.lower().split()

    # Extract all single words longer than 4 characters
    single_words = [word for word in words if len(word) > 4]

    # Extract all phrases with 2 words
    two_words = [' '.join(words[i:i+2]) for i in range(len(words)-1)]

    # Extract all phrases with 3 words
    three_words = [' '.join(words[i:i+3]) for i in range(len(words)-2)]

    return single_words + two_words + three_words

In [5]:
def detect_hotwords(instruction, hotwords, text):
    """Find the phrase in ``text`` that is most similar to any hotword.

    Returns a 3-tuple:
        - the highest similarity score found
        - the hotword that produced that score
        - the candidate phrase from ``text`` that produced that score
    When ``text`` yields no candidate phrases the result is ``(0.0, '', '')``.
    """
    # Pair each hotword with the instruction in the [instruction, text] form
    # that is passed to the model
    hotword_inputs = [[instruction, hw] for hw in hotwords]

    # Single-, 2- and 3-word phrases taken from the text
    phrases = extract_candidate_phrases(text)
    if not phrases:
        # Nothing to compare against
        return 0.0, '', ''

    # Same [instruction, text] pairing for every candidate phrase
    phrase_inputs = [[instruction, phrase] for phrase in phrases]

    # Rows follow the hotwords, columns follow the candidate phrases
    score_matrix = compute_similarity(hotword_inputs, phrase_inputs)

    # Translate the flat position of the best score into (row, column)
    hotword_idx, phrase_idx = np.unravel_index(score_matrix.argmax(), score_matrix.shape)

    return score_matrix.max(), hotwords[hotword_idx], phrases[phrase_idx]

# Main Processing

In [6]:
# Instruction string that is paired with every hotword and candidate phrase
# before it is passed to the embedding model
instruction = 'Represent the text for classification: '

In [7]:
# The three hot words from task 5a that records are compared against
hotwords = ['be careful', 'destroy', 'stranger']

In [8]:
# Load the records from csv and collect the 'text' column as a list,
# one entry per record, in file order
csv_filename = "cv-valid-dev.csv"
df = pd.read_csv(csv_filename)
text_list = df['text'].tolist()

In [9]:
# Flag each record whose best hotword similarity is above the threshold
detection_threshold = 0.9

results = []
for i, text in enumerate(text_list):
    max_score, hotword, candidate = detect_hotwords(instruction, hotwords, text)
    is_match = max_score > detection_threshold
    if is_match:
        # Log the record index, score, matching phrase and the hotword it matched
        print(f"({i})\t{max_score:.3f} {candidate} [{hotword}]")
    results.append('true' if is_match else 'false')

(0)	1.000 be careful [be careful]
(3)	0.972 be destroyed [destroy]
(89)	1.000 stranger [stranger]
(395)	0.906 danger [be careful]
(508)	1.000 stranger [stranger]
(539)	0.908 the strange [stranger]
(674)	1.000 stranger [stranger]
(693)	0.914 take care [be careful]
(900)	0.906 danger [be careful]
(1036)	0.908 the strange [stranger]
(1067)	0.909 need to worry [be careful]
(1093)	1.000 be careful [be careful]
(1101)	1.000 stranger [stranger]
(1243)	1.000 stranger [stranger]
(1311)	0.908 the strange [stranger]
(1445)	0.909 need to worry [be careful]
(1501)	1.000 stranger [stranger]
(1561)	0.909 need to worry [be careful]
(1691)	0.906 danger [be careful]
(1775)	0.906 danger [be careful]
(1781)	0.906 danger [be careful]
(1919)	0.906 danger [be careful]
(1933)	1.000 stranger [stranger]
(2092)	0.923 carefully [be careful]
(2166)	0.915 be sure you [be careful]
(2405)	1.000 stranger [stranger]
(2449)	0.909 need to worry [be careful]
(2453)	0.975 strangers [stranger]
(2685)	0.914 take care [be car

In [10]:
# Write results to the csv file: add the per-record flags as a new
# "similarity" column and save back to the same file that was loaded.
# index=False keeps the DataFrame row index out of the written file.
df["similarity"] = results
df.to_csv(csv_filename, index=False)