# Semantic Similarity

In [2]:
import transformers
import pickle
import torch
# https://huggingface.co/KaiLv
from datasets import load_dataset
import numpy as np
import spacy
import pandas as pd
import time
from transformers import LlamaForCausalLM, LlamaTokenizer
import sys
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer, util

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


## Semantic Embeddings

1. https://huggingface.co/sentence-transformers
2. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks https://arxiv.org/pdf/1908.10084.pdf
3. https://www.sbert.net/docs/pretrained_models.html#model-overview
4. https://www.sbert.net/examples/applications/semantic-search/README.html#semantic-search

In [3]:
# Load the pretrained sentence-embedding model; `embedder` is reused by later cells.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Two example inputs, each a one-element list of sentences.
sentence = ['This framework generates embeddings for each input sentence']
sentence2 = ['Bears are Really Cool']
# Sentences are encoded by calling model.encode()
embedding = embedder.encode(sentence)
embedding2 = embedder.encode(sentence2)
# The recorded output shows shape (1, 384) for each one-sentence input.
print(embedding.shape)
print(embedding2.shape)

(1, 384)
(1, 384)


### Grab the Data

In [61]:
def create_data_arrays(datasetname="KaiLv/UDR_Yelp"):
    '''
    Loads `datasetname` and builds the X_* and y_* arrays.

    The train questions, train labels and test questions are returned as
    numpy arrays. For "KaiLv/UDR_ComE" every question additionally loses its
    first 80 characters and gets a line break in front of "Options:", "A.",
    "B." and "C."; every train label is reduced to its first character.

    Returns:
        (X_train, y_train, X_test)
    '''
    def reformat_come(question):
        # Drop the first 80 characters, then put each option on its own line.
        text = question[80:]
        for marker in ("Options:", "A.", "B.", "C."):
            text = text.replace(" " + marker, "\n" + marker)
        return text

    splits = load_dataset(datasetname)
    X_train = np.array(splits["train"]["question"])
    y_train = np.array(splits["train"]["label"])
    X_test = np.array(splits["test"]["question"])

    # Every dataset other than ComE is returned untouched.
    if datasetname != "KaiLv/UDR_ComE":
        return X_train, y_train, X_test

    X_train = np.array([reformat_come(question) for question in X_train])
    y_train = np.array([answer[0] for answer in y_train])
    X_test = np.array([reformat_come(question) for question in X_test])

    return X_train, y_train, X_test

### Embed the Dataset (this run loads `KaiLv/UDR_ComE`)

In [62]:
# Corpus with example sentences
X_train, y_train, X_test = create_data_arrays(datasetname="KaiLv/UDR_ComE")
print(X_train)
corpus = X_train
queries = X_test

corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True).to(device)
corpus_embeddings = util.normalize_embeddings(corpus_embeddings)

query_embeddings = embedder.encode(queries, convert_to_tensor=True).to(device)
query_embeddings = util.normalize_embeddings(query_embeddings)

hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k=3, score_function=util.dot_score)

[" He poured orange juice on his cereal.\nOptions:\nA. Orange juice is usually bright orange.\nB. Orange juice doesn't taste good on cereal.\nC. Orange juice is sticky if you spill it on the table."
 ' He drinks apple.\nOptions:\nA. Apple juice are very tasty and milk too\nB. Apple can not be drunk\nC. Apple cannot eat a human'
 ' Jeff ran 100,000 miles today\nOptions:\nA. 100,000 miles is way to long for one person to be able to run in one day.\nB. Jeff is a four letter name and 100,000 has six numerical digest\nC. 100,000 miles is longer than 100,000 km.'
 ...
 " Harry went to the barbershop to have his glasses repaired\nOptions:\nA. a barbershop usually don't provide the service of repairing glasses\nB. a barbershop usually repairs computers instead of glasses\nC. the barbershop lacked the necessary tools to repair his glasses"
 ' Reilly is sleeping on the window\nOptions:\nA. the window is open and a person cannot lay on it\nB. the window is too cold to sleep on it\nC. a person can

### Print Top-K Examples for Every Query

In [63]:
# Show the first three test queries, each followed by its retrieved
# training examples and their labels.
hits = np.array(hits)
for query_idx, neighbours in enumerate(hits[:3]):
    print(X_test[query_idx])
    for rank, neighbour in enumerate(neighbours, start=1):
        corpus_idx = neighbour["corpus_id"]
        example, answer = X_train[corpus_idx], y_train[corpus_idx]
        print(f"\t{rank}. {example} = {answer}")
    

 He loves to stroll at the park with his bed
Options:
A. A bed is too heavy to carry with when strolling at a park
B. walking at a park is good for health
C. Some beds are big while some are smaller
	1.  a plane is on his bed
Options:
A. a toy plane is on the bed
B. a plane is too large to park on bed
C. there are beds in some plane = B
	2.  he usually goes to the gym to sleep
Options:
A. there is no bed in the gym
B. people seldom do that
C. The admission fee at the gym is too expensive = B
	3.  I bought some beds to open a cafe
Options:
A. beds are too expensive
B. sitting on the bed is uncomfortable
C. the cafe is not a place to sleep = C
 The inverter was able to power the continent.
Options:
A. An inverter is smaller than a car
B. An inverter is incapable of powering an entire continent.
C. An inverter is rechargeable.
	1.  Air can power cars.
Options:
A. Air can't be burned.
B. Air contains no energy.
C. Car can't run with air. = C
	2.  Atlantic is the biggest continent in the wo

## Llama Model

In [None]:
# Read the Hugging Face access token from a local file so it is not hard-coded in the notebook.
with open("./token.txt") as f:
    # readline() keeps the trailing newline; strip it so the token is passed on clean.
    token = f.readline().strip()

# The token file is already closed here; loading the weights does not need it open.
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=token)
model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map="auto",
    token=token,
)

In [None]:
def query_model(prompt, max_tokens=20):
    '''
    Queries the llama model.

    Args:
        prompt: text sent to the model as a single sequence.
        max_tokens: maximum number of newly generated tokens.

    Returns:
        The first generated sequence, decoded with special tokens skipped
        and surrounding whitespace stripped.
        NOTE(review): the decoded text presumably still starts with the
        prompt (the evaluation code splits on "Response:") — confirm.
    '''
    inputs = tokenizer(
        f"{prompt}",
        return_tensors="pt",
    )
    # Send the inputs to wherever the model was placed instead of assuming "cuda",
    # so the call also works with the CPU fallback or a different device mapping.
    input_ids = inputs["input_ids"].to(model.device)

    # NOTE(review): with top_k=1 only the top-ranked token can be sampled, so
    # do_sample / temperature / top_p likely have no effect — confirm.
    generation_config = transformers.GenerationConfig(
        do_sample=True,
        temperature=0.1,
        top_p=0.75,
        top_k=1,
        repetition_penalty=1.5,
        max_new_tokens=max_tokens,
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            # A single sequence is tokenized without padding, so every position is attended to.
            attention_mask=torch.ones_like(input_ids),
            generation_config=generation_config,
        )

    # decode() only needs the token ids; no extra device transfer is required.
    output_text = tokenizer.decode(
        generation_output[0], skip_special_tokens=True
    ).strip()

    return output_text


In [None]:
# Few-shot prompting: for every test query, use its three retrieved training
# examples as demonstrations, query the model, and collect the raw output.
# NOTE(review): the prompt below asks for a Yelp sentiment rating, while the
# embedding cell above currently loads KaiLv/UDR_ComE — make sure the dataset
# and the prompt match before running.
hits = np.array(hits)
records = []  # one {"sentence", "label"} dict per query; turned into a frame once at the end

for i, entry in enumerate(hits):
    query = X_test[i]
    train_idx_1 = entry[0]["corpus_id"]
    train_idx_2 = entry[1]["corpus_id"]
    train_idx_3 = entry[2]["corpus_id"]
    
    prompt = f"""
    Here are some examples of my task:
    1. {X_train[train_idx_1]} Response: {y_train[train_idx_1]}
    2. {X_train[train_idx_2]} Response: {y_train[train_idx_2]}
    3. {X_train[train_idx_3]} Response: {y_train[train_idx_3]}
    rate the sentiment of the below review: "very negative", "negative", "neutral", "positive", or "very positive".
    ###\"{query}\" Response: """

    output_text = query_model(prompt)

    print(output_text)
    print("\n")

    print(f"Finished {i + 1}/{len(X_test)}\n")
    # Appending to a list avoids re-copying the whole frame on every iteration.
    records.append({"sentence": query, "label": output_text})
    torch.cuda.empty_cache()

# Build the frame once; rows get a unique 0..n-1 index instead of every row sharing index 0.
df = pd.DataFrame(records, columns=["sentence", "label"])

# NOTE(review): the evaluation cell reads ".../UDR_Yelp_results/UDR_Yelp-fewshot-llama.csv",
# which differs from this path in case and separator — confirm which one is intended.
df.to_csv("/home/grads/hassledw/ICL_Research/UDR_yelp_results/UDR-yelp-fewshot-llama.csv")

## Evaluation

### Yelp Result Class to Run Metrics

In [64]:
class YelpResults:
    '''
    Scores zero-shot and few-shot model outputs for the Yelp task against a
    ground-truth CSV.

    Both result frames are expected to have a "sentence" column and a "label"
    column holding the raw model output text.
    '''
    # Phrases are tried in this order with a substring test, so the "very ..."
    # phrases must come before the plain ones they contain.
    _LABEL_IDS = {
        "very negative": 0,
        "very positive": 4,
        "negative": 1,
        "positive": 3,
        "neutral": 2,
    }

    def __init__(self, zero_yelp_df, few_yelp_df,
                 truth_path="/home/grads/hassledw/ICL_Research/UDR_Yelp_results/UDR-yelp-llama.csv"):
        '''
        Args:
            zero_yelp_df: frame of zero-shot outputs.
            few_yelp_df: frame of few-shot outputs.
            truth_path: CSV with the ground-truth labels; defaults to the
                location this notebook has always read from.
        '''
        self.zero_yelp_df = zero_yelp_df
        self.few_yelp_df = few_yelp_df
        self.truth_df = pd.read_csv(truth_path)
    
    def clean_data(self, df):
        '''
        Cleans the data by retrieving the label and dropping None entries.

        Works on a copy, so the frame passed in is left unchanged.

        Returns:
            A new frame whose "label" column holds integer class ids (0-4);
            rows containing missing values (including labels that could not
            be parsed) are removed.
        '''
        def get_response_yelp(text):
            '''
            Cleans the text of the label to just get the response.

            Takes whatever follows the last "Response:", strips '#' and
            spaces, lower-cases it and returns the id of the first known
            phrase it contains, or None when there is none.
            '''
            if not isinstance(text, str):
                # Missing values read from CSV arrive as NaN floats; treat as unparseable.
                return None
            query = text.split("Response:")[-1].strip("##").strip(" ").lower()
            for phrase, label_id in self._LABEL_IDS.items():
                if phrase in query:
                    return label_id
            return None

        cleaned = df.copy()
        cleaned["label"] = cleaned["label"].apply(get_response_yelp)
        orig_entries = cleaned.shape[0]
        # The explicit copy makes the assignment below act on an independent
        # frame, which avoids pandas' SettingWithCopyWarning.
        cleaned = cleaned.dropna().copy()
        print(f"Dropped {orig_entries - cleaned.shape[0]} \"None\" entries")
        cleaned["label"] = cleaned["label"].astype(int)
        return cleaned
    
    def gather_yelp_accuracy(self, df):
        '''
        Gets the overall accuracy of df.

        Inner-joins df with the ground truth on "sentence" and returns the
        percentage of joined rows whose labels agree. Returns NaN when no
        rows join, since the accuracy is undefined in that case.
        ''' 
        df_results = pd.merge(self.truth_df, df, on=['sentence'], how='inner')
        if len(df_results) == 0:
            return float("nan")
        # label_x comes from the ground truth, label_y from the predictions.
        n_correct = int((df_results["label_x"] == df_results["label_y"]).sum())
        return n_correct / len(df_results) * 100
    
    def run_results(self):
        '''
        Runs the results of the Yelp dataset.

        Replaces both stored frames with their cleaned versions and prints
        the zero-shot and few-shot accuracies.
        '''
        self.zero_yelp_df = self.clean_data(self.zero_yelp_df)
        self.few_yelp_df = self.clean_data(self.few_yelp_df)
        print(f"Llama-7b Prediction Accuracy (Zero-shot): {self.gather_yelp_accuracy(self.zero_yelp_df):.2f}%")
        print(f"Llama-7b Prediction Accuracy (Few-shot): {self.gather_yelp_accuracy(self.few_yelp_df):.2f}%")
        print()

# Load the saved result CSVs and report zero-shot vs. few-shot accuracy.
# NOTE(review): `test_yelp_df` is not referenced again in the visible cells;
# YelpResults reads the same ground-truth file itself by default.
test_yelp_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_Yelp_results/UDR-yelp-llama.csv")
zero_yelp_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_Yelp_results/UDR_Yelp-zeroshot-llama.csv")
few_yelp_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_Yelp_results/UDR_Yelp-fewshot-llama.csv")
yelpres = YelpResults(zero_yelp_df, few_yelp_df)
yelpres.run_results()


Dropped 8 "None" entries
Dropped 145 "None" entries
Llama-7b Prediction Accuracy (Zero-shot): 47.76%
Llama-7b Prediction Accuracy (Few-shot): 54.65%



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].astype(int)


In [11]:
# dataset = load_dataset("KaiLv/UDR_SNLI")
# data = dataset["test"]

# data = pd.DataFrame(data).drop(columns=["idx"])
# data.to_csv("/home/grads/hassledw/ICL_Research/UDR_SNLI_results/UDR-snli-llama.csv")

In [65]:
class SNLIResults:
    '''
    Scores zero-shot and few-shot model outputs for the SNLI task against a
    ground-truth CSV.

    Both result frames are expected to have a "sentence" column and a "label"
    column holding the raw model output text.
    '''
    # (substring, class id) pairs, tried in order. Ids: 0 = entailment,
    # 1 = inconclusive, 2 = contradiction. "in con" catches "inconclusive"
    # written with a space.
    _LABEL_PATTERNS = (
        ("entail", 0),
        ("inco", 1),
        ("contra", 2),
        ("in con", 1),
    )

    def __init__(self, zero_snli_df, few_snli_df,
                 truth_path="/home/grads/hassledw/ICL_Research/UDR_SNLI_results/UDR-snli-llama.csv"):
        '''
        Args:
            zero_snli_df: frame of zero-shot outputs.
            few_snli_df: frame of few-shot outputs.
            truth_path: CSV with the ground-truth labels; defaults to the
                location this notebook has always read from.
        '''
        self.zero_snli_df = zero_snli_df
        self.few_snli_df = few_snli_df
        self.truth_df = pd.read_csv(truth_path)
    
    def clean_data(self, df):
        '''
        Cleans the data by retrieving the label and dropping None entries.

        Works on a copy, so the frame passed in is left unchanged.

        Returns:
            A new frame whose "label" column holds integer class ids (0-2);
            rows containing missing values (including labels that could not
            be parsed) are removed.
        '''
        def get_response_snli(text):
            '''
            Cleans the text of the label to just get the response.

            Takes whatever follows the last "Response:", strips '#' and
            spaces, lower-cases it and returns the id of the first pattern
            it contains. Unmatched text is printed and mapped to None rather
            than being counted as "inconclusive".
            '''
            if not isinstance(text, str):
                # Missing values read from CSV arrive as NaN floats; treat as unparseable.
                return None
            query = text.split("Response:")[-1].strip("##").strip(" ").lower()
            for pattern, label_id in self._LABEL_PATTERNS:
                if pattern in query:
                    return label_id
            # Echo what could not be parsed so it can be inspected.
            print(query)
            return None
            
        cleaned = df.copy()
        cleaned["label"] = cleaned["label"].apply(get_response_snli)
        orig_entries = cleaned.shape[0]
        # The explicit copy makes the assignment below act on an independent
        # frame, which avoids pandas' SettingWithCopyWarning.
        cleaned = cleaned.dropna().copy()
        print(f"Dropped {orig_entries - cleaned.shape[0]} \"None\" entries")
        cleaned["label"] = cleaned["label"].astype(int)
        return cleaned
    
    def gather_snli_accuracy(self, df):
        '''
        Gets the overall accuracy of df.

        Inner-joins df with the ground truth on "sentence" and returns the
        percentage of joined rows whose labels agree. Returns NaN when no
        rows join, since the accuracy is undefined in that case.
        ''' 
        df_results = pd.merge(self.truth_df, df, on=['sentence'], how='inner')
        if len(df_results) == 0:
            return float("nan")
        # label_x comes from the ground truth, label_y from the predictions.
        n_correct = int((df_results["label_x"] == df_results["label_y"]).sum())
        return n_correct / len(df_results) * 100
    
    def run_results(self):
        '''
        Runs the results of the SNLI dataset.

        Replaces both stored frames with their cleaned versions and prints
        the zero-shot and few-shot accuracies.
        '''
        self.zero_snli_df = self.clean_data(self.zero_snli_df)
        self.few_snli_df = self.clean_data(self.few_snli_df)
        print(f"Llama-7b Prediction Accuracy (Zero-shot): {self.gather_snli_accuracy(self.zero_snli_df):.2f}%")
        print(f"Llama-7b Prediction Accuracy (Few-shot): {self.gather_snli_accuracy(self.few_snli_df):.2f}%")
        print()

# Load the saved SNLI outputs and report zero-shot vs. few-shot accuracy.
# NOTE(review): the result object is still called `yelpres`, which rebinds the
# name used for the Yelp results in the earlier cell — consider renaming.
zero_snli_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_SNLI_results/UDR_SNLI-zeroshot-llama.csv")
few_snli_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_SNLI_results/UDR_SNLI-fewshot-llama.csv")
yelpres = SNLIResults(zero_snli_df, few_snli_df)
yelpres.run_results()

Dropped 0 "None" entries
Dropped 0 "None" entries
Llama-7b Prediction Accuracy (Zero-shot): 41.80%
Llama-7b Prediction Accuracy (Few-shot): 46.80%

