# Semantic Similarity

In [2]:
import transformers
import pickle
import torch
# https://huggingface.co/KaiLv
from datasets import load_dataset
import numpy as np
import spacy
import pandas as pd
import time
from transformers import LlamaForCausalLM, LlamaTokenizer
import sys
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer, util

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # device object

  from .autonotebook import tqdm as notebook_tqdm


## Semantical Embeddings

1. https://huggingface.co/sentence-transformers
2. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks https://arxiv.org/pdf/1908.10084.pdf
3. https://www.sbert.net/docs/pretrained_models.html#model-overview
4. https://www.sbert.net/examples/applications/semantic-search/README.html#semantic-search

In [3]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Sentences we want to encode. Example:
sentence = ['This framework generates embeddings for each input sentence']
sentence2 = ['Bears are Really Cool']
# Sentences are encoded by calling model.encode()
embedding = embedder.encode(sentence)
embedding2 = embedder.encode(sentence2)
print(embedding.shape)
print(embedding2.shape)

(1, 384)
(1, 384)


### Grab the Data

In [4]:
def create_data_arrays(datasetname="KaiLv/UDR_Yelp"):
    '''
    Creates the X_* and y_* arrays.
    '''
    dataset = load_dataset(datasetname)
    X_train = np.array(dataset["train"]["sentence"])
    y_train = np.array(dataset["train"]["label"])

    X_test = np.array(dataset["test"]["sentence"])
    # y_test = np.array(dataset["test"]["label"])

    # X_debug = np.array(dataset["debug"]["sentence"])
    # y_debug = np.array(dataset["debug"]["label"])
    
    return X_train, y_train, X_test

### Embed the UDR_Yelp Dataset

In [5]:
# Corpus with example sentences
X_train, y_train, X_test = create_data_arrays(datasetname="KaiLv/UDR_SNLI")

corpus = X_train
queries = X_test

corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True).to(device)
corpus_embeddings = util.normalize_embeddings(corpus_embeddings)

query_embeddings = embedder.encode(queries, convert_to_tensor=True).to(device)
query_embeddings = util.normalize_embeddings(query_embeddings)

hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k=3, score_function=util.dot_score)

Downloading readme: 100%|██████████| 760/760 [00:00<00:00, 6.24MB/s]
Downloading data: 100%|██████████| 9.64M/9.64M [00:00<00:00, 9.82MB/s]
Downloading data: 100%|██████████| 268k/268k [00:00<00:00, 1.35MB/s]]
Downloading data: 100%|██████████| 267k/267k [00:00<00:00, 1.20MB/s]]
Downloading data: 100%|██████████| 7.65M/7.65M [00:00<00:00, 15.1MB/s]
Downloading data files: 100%|██████████| 4/4 [00:01<00:00,  2.05it/s]
Extracting data files: 100%|██████████| 4/4 [00:00<00:00, 484.88it/s]
Generating train split: 100%|██████████| 131062/131062 [00:00<00:00, 573596.72 examples/s]
Generating validation split: 100%|██████████| 3272/3272 [00:00<00:00, 415632.29 examples/s]
Generating test split: 100%|██████████| 3262/3262 [00:00<00:00, 404225.47 examples/s]
Generating debug split: 100%|██████████| 100000/100000 [00:00<00:00, 605647.68 examples/s]


### Print Top-K Examples for Every Query

In [6]:
hits = np.array(hits)
for i, query in enumerate(hits[:3]):
    print(X_test[i])
    for x, top_k_entry in enumerate(query):
        train_idx = top_k_entry["corpus_id"]
        print(f"\t{x + 1}. {X_train[train_idx]} = {y_train[train_idx]}")
    

This church choir sings to the masses as they sing joyous songs from the book at a church. Based on that information, is the claim The church has cracks in the ceiling. "Entailment", "Contradiction", or "Inconclusive"?
	1. A choir singing in a church. Based on that information, is the claim The choir is singing a beautiful song. "Entailment", "Contradiction", or "Inconclusive"? = 1
	2. Choir in a church is singing to the crowd. Based on that information, is the claim People are singing in a church. "Entailment", "Contradiction", or "Inconclusive"? = 0
	3. A group of people are seen in a church singing under a yellow ceiling. Based on that information, is the claim The people are singing a church hymn. "Entailment", "Contradiction", or "Inconclusive"? = 1
A woman with a green headscarf, blue shirt and a very big grin. Based on that information, is the claim The woman is young. "Entailment", "Contradiction", or "Inconclusive"?
	1. A middle-aged oriental woman in a green headscarf and blu

## Llama Model

In [None]:
with open("./token.txt") as f:
    token = f.readline()
    tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=token)
    model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
        load_in_8bit=False,
        torch_dtype=torch.float16,
        device_map="auto",
        token=token)

In [None]:
def query_model(prompt, max_tokens=20):
    '''
    Queries the llama model.
    '''
    inputs = tokenizer(
        f"{prompt}",
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"].to("cuda")

    generation_config = transformers.GenerationConfig(
        do_sample=True,
        temperature=0.1,
        top_p=0.75,
        top_k=1,
        repetition_penalty=1.5,
        max_new_tokens=max_tokens,
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            generation_config=generation_config,
        )

    output_text = tokenizer.decode(
        generation_output[0].cuda(), skip_special_tokens=True
    ).strip()

    return output_text


In [None]:
df = pd.DataFrame(columns=["sentence", "label"])
count = 1
hits = np.array(hits)

for i, entry in enumerate(hits):
    query = X_test[i]
    train_idx_1 = entry[0]["corpus_id"]
    train_idx_2 = entry[1]["corpus_id"]
    train_idx_3 = entry[2]["corpus_id"]
    
    prompt = f"""
    Here are some examples of my task:
    1. {X_train[train_idx_1]} Response: {y_train[train_idx_1]}
    2. {X_train[train_idx_2]} Response: {y_train[train_idx_2]}
    3. {X_train[train_idx_3]} Response: {y_train[train_idx_3]}
    rate the sentiment of the below review: "very negative", "negative", "neutral", "positive", or "very positive".
    ###\"{query}\" Response: """

    output_text = query_model(prompt)

    print(output_text)
    print("\n")

    print(f"Finished {count}/{len(X_test)}\n")
    entry = [query, output_text]
    df_entry = pd.DataFrame(entry, index=['sentence', 'label']).T
    df = pd.concat((df, df_entry))
    count+=1
    torch.cuda.empty_cache()

df.to_csv(f"/home/grads/hassledw/ICL_Research/UDR_yelp_results/UDR-yelp-fewshot-llama.csv")

## Evaluation

### Yelp Result Class to Run Metrics

In [27]:
class YelpResults:
    def __init__(self, zero_yelp_df, few_yelp_df):
        self.zero_yelp_df = zero_yelp_df
        self.few_yelp_df = few_yelp_df
        self.truth_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_Yelp_results/UDR-yelp-llama.csv")
    
    def clean_data(self, df):
        '''
        Cleans the data by retrieving the label and dropping None entries. 
        '''
        def get_response_yelp(text):
            '''
            Cleans the text of the label to just get the response
            '''
            valid = ["very negative", "very positive", "negative", "positive", "neutral"]
            valid_dict = {"very negative": 0, "negative": 1, "neutral": 2, "positive": 3, "very positive": 4}
            sentences = text.split("Response:")
            query = sentences[-1].strip("##").strip(" ").lower()
            
            if len(sentences[-1].split(" ")) > 3 or query not in valid:
                for v in valid:
                    if v in query:
                        return valid_dict[v]
                return None
            else:
                return valid_dict[query]
            
        df["label"] = df["label"].apply(get_response_yelp)
        orig_entries = df.shape[0]
        df = df.dropna()
        print(f"Dropped {orig_entries - df.shape[0]} \"None\" entries")
        df["label"] = df["label"].astype(int)
        return df
    
    def gather_yelp_accuracy(self, df):
        '''
        Gets the overall accuracy of df.
        ''' 
        df_results = pd.merge(self.truth_df, df, on=['sentence'], how='inner')
        accurate_results = df_results[df_results["label_x"] == df_results["label_y"]]
        return len(accurate_results) / len(df_results) * 100
    
    def run_results(self):
        '''
        Runs the results of the Yelp dataset. 
        '''
        self.zero_yelp_df = self.clean_data(self.zero_yelp_df)
        self.few_yelp_df = self.clean_data(self.few_yelp_df)
        print(f"Llama-7b Prediction Accuracy (Zero-shot): {self.gather_yelp_accuracy(self.zero_yelp_df):.2f}%")
        print(f"Llama-7b Prediction Accuracy (Few-shot): {self.gather_yelp_accuracy(self.few_yelp_df):.2f}%")
        print()

test_yelp_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_Yelp_results/UDR-yelp-llama.csv")
zero_yelp_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_Yelp_results/UDR_Yelp-zeroshot-llama.csv")
few_yelp_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_Yelp_results/UDR_Yelp-fewshot-llama.csv")
yelpres = YelpResults(zero_yelp_df, few_yelp_df)
yelpres.run_results()


Dropped 399 "None" entries
Dropped 256 "None" entries
Llama-7b Prediction Accuracy (Zero-shot): 37.79%
Llama-7b Prediction Accuracy (Few-shot): 49.60%



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].astype(int)


In [11]:
dataset = load_dataset("KaiLv/UDR_SNLI")
data = dataset["test"]

data = pd.DataFrame(data).drop(columns=["idx"])
data.to_csv("/home/grads/hassledw/ICL_Research/UDR_SNLI_results/UDR-snli-llama.csv")

In [26]:
class SNLIResults:
    def __init__(self, zero_snli_df, few_snli_df):
        self.zero_snli_df = zero_snli_df
        self.few_snli_df = few_snli_df
        self.truth_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_SNLI_results/UDR-snli-llama.csv")
    
    def clean_data(self, df):
        '''
        Cleans the data by retrieving the label and dropping None entries. 
        '''
        def get_response_snli(text):
            '''
            Cleans the text of the label to just get the response
            '''
            valid = ["entail", "inco", "contra", "in con"]
            valid_dict = {"entailment": 0, "inconclusive": 1, "contradiction": 2}

            sentences = text.split("Response:")
            query = sentences[-1].strip("##").strip(" ").lower()
            
            if query not in valid:
                for i, v in enumerate(valid):
                    if i == 3:
                        return valid_dict[list(valid_dict.keys())[1]]
                    if v in query:
                        return valid_dict[list(valid_dict.keys())[i]]
                print(query)
                return None
            else:
                return valid_dict[query]
            
        df["label"] = df["label"].apply(get_response_snli)
        orig_entries = df.shape[0]
        df = df.dropna()
        print(f"Dropped {orig_entries - df.shape[0]} \"None\" entries")
        df["label"] = df["label"].astype(int)
        return df
    
    def gather_snli_accuracy(self, df):
        '''
        Gets the overall accuracy of df.
        ''' 
        df_results = pd.merge(self.truth_df, df, on=['sentence'], how='inner')
        accurate_results = df_results[df_results["label_x"] == df_results["label_y"]]
        return len(accurate_results) / len(df_results) * 100
    
    def run_results(self):
        '''
        Runs the results of the Yelp dataset. 
        '''
        self.few_snli_df = self.clean_data(self.few_snli_df)
        print(f"Llama-7b Prediction Accuracy (Zero-shot): {0:.2f}%")
        print(f"Llama-7b Prediction Accuracy (Few-shot): {self.gather_snli_accuracy(self.few_snli_df):.2f}%")
        print()

few_snli_df = pd.read_csv("/home/grads/hassledw/ICL_Research/UDR_SNLI_results/UDR_SNLI-fewshot-llama.csv")
yelpres = SNLIResults(None, few_snli_df)
yelpres.run_results()

Dropped 0 "None" entries
Llama-7b Prediction Accuracy (Zero-shot): 0.00%
Llama-7b Prediction Accuracy (Few-shot): 38.63%

