# Semantic Similarity

In [3]:
import transformers
import pickle
import torch
# https://huggingface.co/KaiLv
from datasets import load_dataset
import numpy as np
import spacy
import pandas as pd
import time
from transformers import LlamaForCausalLM, LlamaTokenizer
import sys
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer, util

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # device object

  from .autonotebook import tqdm as notebook_tqdm


## Semantical Embeddings

1. https://huggingface.co/sentence-transformers
2. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks https://arxiv.org/pdf/1908.10084.pdf
3. https://www.sbert.net/docs/pretrained_models.html#model-overview
4. https://www.sbert.net/examples/applications/semantic-search/README.html#semantic-search

In [4]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Sentences we want to encode. Example:
sentence = ['This framework generates embeddings for each input sentence']
sentence2 = ['Bears are Really Cool']
# Sentences are encoded by calling model.encode()
embedding = embedder.encode(sentence)
embedding2 = embedder.encode(sentence2)
print(embedding.shape)
print(embedding2.shape)

(1, 384)
(1, 384)


### Grab the Data

In [36]:
def create_data_arrays(datasetname="KaiLv/UDR_Yelp"):
    '''
    Creates the X_* and y_* arrays.
    '''
    dataset = load_dataset(datasetname, 'ARC-Challenge')
    # X_train = np.array(dataset["train"]["question"])
    choices = [choice["text"] for choice in dataset["train"]["choices"]]
    df_X_train = pd.DataFrame(data={"question":dataset["train"]["question"], "choices": choices})
    df_X_train['prompt'] = ''
    labels = ['A', 'B', 'C', 'D']

    for index, row in df_X_train.iterrows():
        df_X_train.at[index,'prompt'] = f"Question: {row['question']}" + "\nChoices:\n"
        for i, (label, choice) in enumerate(zip(labels, row['choices'])):
            df_X_train.at[index,'prompt'] += f"{label}. {choice}\n"

    choices = [choice["text"] for choice in dataset["test"]["choices"]]
    df_X_test = pd.DataFrame(data={"question":dataset["test"]["question"], "choices": choices})
    df_X_test['prompt'] = ''
    labels = ['A', 'B', 'C', 'D']

    for index, row in df_X_test.iterrows():
        df_X_test.at[index,'prompt'] = f"Question: {row['question']}" + "\nChoices:\n"
        for i, (label, choice) in enumerate(zip(labels, row['choices'])):
            df_X_test.at[index,'prompt'] += f"{label}. {choice}\n"
    

    print(df_X_train.at[0, 'prompt'])
    print(df_X_test.at[0, 'prompt'])
    df_y_train = pd.DataFrame(data={"answerKey":dataset["train"]["answerKey"]})
    df_y_test = pd.DataFrame(data={"answerKey":dataset["test"]["answerKey"]})
    df_y_train["answerKey"].replace({"1": "A", "2": "B", "3": "C", "4": "D"})
    df_y_test["answerKey"].replace({"1": "A", "2": "B", "3": "C", "4": "D"})

    X_train = np.array(df_X_train["prompt"])
    y_train = np.array(df_y_train["answerKey"])
    X_test = np.array(df_X_test["prompt"])
    y_test = np.array(df_y_test["answerKey"])

    data = pd.DataFrame(data={"prompt": X_test, "answerKey": y_test})
    data.to_csv("/home/grads/hassledw/ICL_Research/ARC-Challenge_results/ARC-challenge-llama.csv")

    return X_train, y_train, X_test

### Embed the UDR_Yelp Dataset

In [37]:
# Corpus with example sentences
X_train, y_train, X_test = create_data_arrays(datasetname="ai2_arc")
corpus = X_train
queries = X_test

corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True).to(device)
corpus_embeddings = util.normalize_embeddings(corpus_embeddings)

query_embeddings = embedder.encode(queries, convert_to_tensor=True).to(device)
query_embeddings = util.normalize_embeddings(query_embeddings)

hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k=3, score_function=util.dot_score)

Question: George wants to warm his hands quickly by rubbing them. Which skin surface will produce the most heat?
Choices:
A. dry palms
B. wet palms
C. palms covered with oil
D. palms covered with lotion

Question: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?
Choices:
A. Planetary density will decrease.
B. Planetary years will become longer.
C. Planetary days will become shorter.
D. Planetary gravity will become stronger.



### Print Top-K Examples for Every Query

In [33]:
hits = np.array(hits)
for i, query in enumerate(hits[:3]):
    print(X_test[i])
    for x, top_k_entry in enumerate(query):
        train_idx = top_k_entry["corpus_id"]
        print(f"\t{x + 1}. {X_train[train_idx]} = {y_train[train_idx]}")
    

Question: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?
Choices:
A. Planetary density will decrease.
B. Planetary years will become longer.
C. Planetary days will become shorter.
D. Planetary gravity will become stronger.

	1. Question: Michael learned that the movement of Earth in the solar system causes changes that can be seen on the planet. Which change could be seen on Earth in the time it takes Earth to rotate once on its axis?
Choices:
A. day becoming night
B. winter changing to spring
C. January changing to February
D. a new moon becoming a full moon
 = A
	2. Question: Which event occurs on a daily cycle?
Choices:
A. The Sun rises and sets.
B. Earth tilts on its axis.
C. Earth revolves around the Sun.
D. The Moon revolves around Earth.
 = A
	3. Question: The gravitational force of the Sun affects the planets in our solar system. Which of these is influenced the most by this force?
Choi

## Llama Model

In [None]:
with open("./token.txt") as f:
    token = f.readline()
    tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=token)
    model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
        load_in_8bit=False,
        torch_dtype=torch.float16,
        device_map="auto",
        token=token)

In [None]:
def query_model(prompt, max_tokens=20):
    '''
    Queries the llama model.
    '''
    inputs = tokenizer(
        f"{prompt}",
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"].to("cuda")

    generation_config = transformers.GenerationConfig(
        do_sample=True,
        temperature=0.1,
        top_p=0.75,
        top_k=1,
        repetition_penalty=1.5,
        max_new_tokens=max_tokens,
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            generation_config=generation_config,
        )

    output_text = tokenizer.decode(
        generation_output[0].cuda(), skip_special_tokens=True
    ).strip()

    return output_text


In [None]:
df = pd.DataFrame(columns=["sentence", "label"])
count = 1
hits = np.array(hits)

for i, entry in enumerate(hits):
    query = X_test[i]
    train_idx_1 = entry[0]["corpus_id"]
    train_idx_2 = entry[1]["corpus_id"]
    train_idx_3 = entry[2]["corpus_id"]
    
    prompt = f"""
    Here are some examples of my task:
    1. {X_train[train_idx_1]} Response: {y_train[train_idx_1]}
    2. {X_train[train_idx_2]} Response: {y_train[train_idx_2]}
    3. {X_train[train_idx_3]} Response: {y_train[train_idx_3]}
    rate the sentiment of the below review: "very negative", "negative", "neutral", "positive", or "very positive".
    ###\"{query}\" Response: """

    output_text = query_model(prompt)

    print(output_text)
    print("\n")

    print(f"Finished {count}/{len(X_test)}\n")
    entry = [query, output_text]
    df_entry = pd.DataFrame(entry, index=['sentence', 'label']).T
    df = pd.concat((df, df_entry))
    count+=1
    torch.cuda.empty_cache()

df.to_csv(f"/home/grads/hassledw/ICL_Research/UDR_yelp_results/UDR-yelp-fewshot-llama.csv")

## Evaluation

### Yelp Result Class to Run Metrics

In [64]:
class YelpResults:
    def __init__(self, zero_yelp_df, few_yelp_df):
        self.zero_yelp_df = zero_yelp_df
        self.few_yelp_df = few_yelp_df
        self.truth_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_Yelp_results/UDR-yelp-llama.csv")
    
    def clean_data(self, df):
        '''
        Cleans the data by retrieving the label and dropping None entries. 
        '''
        def get_response_yelp(text):
            '''
            Cleans the text of the label to just get the response
            '''
            valid = ["very negative", "very positive", "negative", "positive", "neutral"]
            valid_dict = {"very negative": 0, "negative": 1, "neutral": 2, "positive": 3, "very positive": 4}
            sentences = text.split("Response:")
            query = sentences[-1].strip("##").strip(" ").lower()
            
            if len(sentences[-1].split(" ")) > 3 or query not in valid:
                for v in valid:
                    if v in query:
                        return valid_dict[v]
                return None
            else:
                return valid_dict[query]
            
        df["label"] = df["label"].apply(get_response_yelp)
        orig_entries = df.shape[0]
        df = df.dropna()
        print(f"Dropped {orig_entries - df.shape[0]} \"None\" entries")
        df["label"] = df["label"].astype(int)
        return df
    
    def gather_yelp_accuracy(self, df):
        '''
        Gets the overall accuracy of df.
        ''' 
        df_results = pd.merge(self.truth_df, df, on=['sentence'], how='inner')
        accurate_results = df_results[df_results["label_x"] == df_results["label_y"]]
        return len(accurate_results) / len(df_results) * 100
    
    def run_results(self):
        '''
        Runs the results of the Yelp dataset. 
        '''
        self.zero_yelp_df = self.clean_data(self.zero_yelp_df)
        self.few_yelp_df = self.clean_data(self.few_yelp_df)
        print(f"Llama-7b Prediction Accuracy (Zero-shot): {self.gather_yelp_accuracy(self.zero_yelp_df):.2f}%")
        print(f"Llama-7b Prediction Accuracy (Few-shot): {self.gather_yelp_accuracy(self.few_yelp_df):.2f}%")
        print()

test_yelp_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_Yelp_results/UDR-yelp-llama.csv")
zero_yelp_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_Yelp_results/UDR_Yelp-zeroshot-llama.csv")
few_yelp_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_Yelp_results/UDR_Yelp-fewshot-llama.csv")
yelpres = YelpResults(zero_yelp_df, few_yelp_df)
yelpres.run_results()


Dropped 8 "None" entries
Dropped 145 "None" entries
Llama-7b Prediction Accuracy (Zero-shot): 47.76%
Llama-7b Prediction Accuracy (Few-shot): 54.65%



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].astype(int)


### SNLI Result Class to Run Metrics

In [11]:
# dataset = load_dataset("KaiLv/UDR_SNLI")
# data = dataset["test"]

# data = pd.DataFrame(data).drop(columns=["idx"])
# data.to_csv("/home/grads/hassledw/ICL_Research/UDR_SNLI_results/UDR-snli-llama.csv")

In [65]:
class SNLIResults:
    def __init__(self, zero_snli_df, few_snli_df):
        self.zero_snli_df = zero_snli_df
        self.few_snli_df = few_snli_df
        self.truth_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_SNLI_results/UDR-snli-llama.csv")
    
    def clean_data(self, df):
        '''
        Cleans the data by retrieving the label and dropping None entries. 
        '''
        def get_response_snli(text):
            '''
            Cleans the text of the label to just get the response
            '''
            valid = ["entail", "inco", "contra", "in con"]
            valid_dict = {"entailment": 0, "inconclusive": 1, "contradiction": 2}

            sentences = text.split("Response:")
            query = sentences[-1].strip("##").strip(" ").lower()
            
            if query not in valid:
                for i, v in enumerate(valid):
                    if i == 3:
                        return valid_dict[list(valid_dict.keys())[1]]
                    if v in query:
                        return valid_dict[list(valid_dict.keys())[i]]
                print(query)
                return None
            else:
                return valid_dict[query]
            
        df["label"] = df["label"].apply(get_response_snli)
        orig_entries = df.shape[0]
        df = df.dropna()
        print(f"Dropped {orig_entries - df.shape[0]} \"None\" entries")
        df["label"] = df["label"].astype(int)
        return df
    
    def gather_snli_accuracy(self, df):
        '''
        Gets the overall accuracy of df.
        ''' 
        df_results = pd.merge(self.truth_df, df, on=['sentence'], how='inner')
        accurate_results = df_results[df_results["label_x"] == df_results["label_y"]]
        return len(accurate_results) / len(df_results) * 100
    
    def run_results(self):
        '''
        Runs the results of the Yelp dataset. 
        '''
        self.zero_snli_df = self.clean_data(self.zero_snli_df)
        self.few_snli_df = self.clean_data(self.few_snli_df)
        print(f"Llama-7b Prediction Accuracy (Zero-shot): {self.gather_snli_accuracy(self.zero_snli_df):.2f}%")
        print(f"Llama-7b Prediction Accuracy (Few-shot): {self.gather_snli_accuracy(self.few_snli_df):.2f}%")
        print()

zero_snli_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_SNLI_results/UDR_SNLI-zeroshot-llama.csv")
few_snli_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_SNLI_results/UDR_SNLI-fewshot-llama.csv")
yelpres = SNLIResults(zero_snli_df, few_snli_df)
yelpres.run_results()

Dropped 0 "None" entries
Dropped 0 "None" entries
Llama-7b Prediction Accuracy (Zero-shot): 41.80%
Llama-7b Prediction Accuracy (Few-shot): 46.80%



### ComE Result Class to Run Metrics

In [29]:
class ComEResults:
    def __init__(self, zero_come_df, few_come_df):
        self.zero_come_df = zero_come_df
        self.few_come_df = few_come_df
        self.truth_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_ComE_results/UDR-ComE-llama.csv")

    def clean_data(self, df):
        '''
        Cleans the data by retrieving the label and dropping None entries. 
        '''
        def get_response_come(text):
            '''
            Cleans the text of the label to just get the response
            '''
            valid = ["A", "B", "C"]

            valid_other = {1: "A", 2: "B", 3: "C"}

            sentences = text.split("Response:")
            query = sentences[-1][:5]

            for char in query:
                if char in valid:
                    return char
                elif char in ["1", "2", "3"]:
                    return valid_other[int(char)]
                
            return None
        
        df["label"] = df["label"].apply(get_response_come)
        orig_entries = df.shape[0]
        df = df.dropna()
        print(f"Dropped {orig_entries - df.shape[0]} \"None\" entries")
        return df
    
    def gather_come_accuracy(self, df):
        '''
        Gets the overall accuracy of df.
        ''' 
        df_results = pd.merge(self.truth_df, df, on=['question'], how='inner')
        accurate_results = df_results[df_results["label_x"] == df_results["label_y"]]
        return len(accurate_results) / len(df_results) * 100
    
    def run_results(self):
        '''
        Runs the results of the Yelp dataset. 
        '''
        # self.zero_come_df = self.clean_data(self.zero_come_df)
        self.zero_come_df = self.clean_data(self.zero_come_df)
        self.few_come_df = self.clean_data(self.few_come_df)
        print(f"Llama-7b Prediction Accuracy (Zero-shot): {self.gather_come_accuracy(self.zero_come_df):.2f}%")
        print(f"Llama-7b Prediction Accuracy (Few-shot): {self.gather_come_accuracy(self.few_come_df):.2f}%")
        print()

zero_come_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_ComE_results/UDR_ComE-zeroshot-llama.csv")
few_come_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_ComE_results/UDR_ComE-fewshot-llama.csv")
comeres = ComEResults(zero_come_df, few_come_df)
comeres.run_results()

Dropped 7 "None" entries
Dropped 8 "None" entries
Llama-7b Prediction Accuracy (Zero-shot): 59.03%
Llama-7b Prediction Accuracy (Few-shot): 47.15%



### CosmosQA Result Class to Run Metrics

In [68]:
class CosmosResults:
    def __init__(self, zero_cosmos_df, few_cosmos_df):
        self.zero_cosmos_df = zero_cosmos_df
        self.few_cosmos_df = few_cosmos_df
        self.truth_df = pd.read_csv("/home/grads/hassledw/ICL_Research/CosmosQA_results/CosmosQA-test-llama.csv")

    def clean_data(self, df):
        '''
        Cleans the data by retrieving the label and dropping None entries. 
        '''
        def get_response_cosmos(text):
            '''
            Cleans the text of the label to just get the response
            '''
            valid = ["1", "2", "3"]

            sentences = text.split("Response:")
            query = sentences[-1][:5]

            for char in query:
                if char in valid:
                    return char
                
            return None
        
        df["label"] = df["label"].apply(get_response_cosmos)
        orig_entries = df.shape[0]
        df = df.dropna()
        df["label"] = df["label"].astype(int)
        print(f"Dropped {orig_entries - df.shape[0]} \"None\" entries")
        return df
    
    def gather_cosmos_accuracy(self, df):
        '''
        Gets the overall accuracy of df.
        ''' 
        df_results = pd.merge(self.truth_df, df, on=['prompt'], how='inner')
        accurate_results = df_results[df_results["label_x"] == df_results["label_y"]]
        return len(accurate_results) / len(df_results) * 100
    
    def run_results(self):
        '''
        Runs the results of the Yelp dataset. 
        '''
        # self.zero_come_df = self.clean_data(self.zero_come_df)
        self.zero_cosmos_df = self.clean_data(self.zero_cosmos_df)
        self.few_cosmos_df = self.clean_data(self.few_cosmos_df)
        print(f"Llama-7b Prediction Accuracy (Zero-shot): {self.gather_cosmos_accuracy(self.zero_cosmos_df):.2f}%")
        print(f"Llama-7b Prediction Accuracy (Few-shot): {self.gather_cosmos_accuracy(self.few_cosmos_df):.2f}%")
        print()

zero_cosmos_df = pd.read_csv("/home/grads/hassledw/ICL_Research/CosmosQA_results/CosmosQA-zeroshot-llama.csv")
few_cosmos_df = pd.read_csv("/home/grads/hassledw/ICL_Research/CosmosQA_results/CosmosQA-fewshot-llama.csv")
cosmosres = CosmosResults(zero_cosmos_df, few_cosmos_df)
cosmosres.run_results()

Dropped 68 "None" entries
Dropped 56 "None" entries
Llama-7b Prediction Accuracy (Zero-shot): 42.82%
Llama-7b Prediction Accuracy (Few-shot): 47.30%



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].astype(int)


In [41]:
class ARCResults:
    def __init__(self, zero_arc_df, few_arc_df):
        self.zero_arc_df = zero_arc_df
        self.few_arc_df = few_arc_df
        self.truth_df = pd.read_csv("/home/grads/hassledw/ICL_Research/ARC-Challenge_results/ARC-challenge-llama.csv")

    def clean_data(self, df):
        '''
        Cleans the data by retrieving the label and dropping None entries. 
        '''
        def get_response_arc(text):
            '''
            Cleans the text of the label to just get the response
            '''
            valid = ["A", "B", "C", "D"]
            other_valid = ["(a)", "(b)", "(c)", "(d)"]

            sentences = text.split("Response:")
            query = sentences[-1][:10]

            for char in query:
                if char in valid:
                    return char
                
            for i, other in enumerate(other_valid):
                if other in query:
                    return valid[i]
                
            return None
        
        df["answerKey"] = df["answerKey"].apply(get_response_arc)
        orig_entries = df.shape[0]
        df = df.dropna()
        print(f"Dropped {orig_entries - df.shape[0]} \"None\" entries")
        return df
    
    def gather_arc_accuracy(self, df):
        '''
        Gets the overall accuracy of df.
        ''' 
        df_results = pd.merge(self.truth_df, df, on=['prompt'], how='inner')
        accurate_results = df_results[df_results["answerKey_x"] == df_results["answerKey_y"]]
        return len(accurate_results) / len(df_results) * 100
    
    def run_results(self):
        '''
        Runs the results of the Yelp dataset. 
        '''
        # self.zero_come_df = self.clean_data(self.zero_come_df)
        self.zero_arc_df = self.clean_data(self.zero_arc_df)
        self.few_arc_df = self.clean_data(self.few_arc_df)
        print(f"Llama-7b Prediction Accuracy (Zero-shot): {self.gather_arc_accuracy(self.zero_arc_df):.2f}%")
        print(f"Llama-7b Prediction Accuracy (Few-shot): {self.gather_arc_accuracy(self.few_arc_df):.2f}%")
        print()

zero_arc_df = pd.read_csv("/home/grads/hassledw/ICL_Research/ARC-Challenge_results/ARC-Challenge-zeroshot-llama.csv")
few_arc_df = pd.read_csv("/home/grads/hassledw/ICL_Research/ARC-Challenge_results/ARC-Challenge-fewshot-llama.csv")
arcres = ARCResults(zero_arc_df, few_arc_df)
arcres.run_results()

Dropped 5 "None" entries
Dropped 100 "None" entries
Llama-7b Prediction Accuracy (Zero-shot): 50.51%
Llama-7b Prediction Accuracy (Few-shot): 42.75%

