# Llama Model Analytics on Yelp Dataset

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datasets import load_dataset


## Create the Ground Truth CSV file

In [5]:
# One-time export of the UDR_Yelp "test" split (minus its "idx" column) to CSV.
# The code is left commented out, presumably because the CSV already exists.
# NOTE(review): the export path below is ICL_Research/UDR-yelp-llama.csv, while
# the cleaning cell reads ICL_Research/UDR_yelp_results/UDR-yelp-llama.csv;
# confirm the file was moved, or fix the path before re-enabling this cell.
# dataset = load_dataset("KaiLv/UDR_Yelp")
# data = dataset["test"]

# data = pd.DataFrame(data).drop(columns=["idx"])
# data.to_csv("/home/grads/hassledw/ICL_Research/UDR-yelp-llama.csv")

## Clean the Zero-Shot and Few-Shot Label Data and Run Metric

In [22]:
# Load, in order: the ground-truth CSV, the zero-shot outputs and the
# few-shot outputs. Each path is read into its own DataFrame.
truth_df, zero_df, few_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/home/grads/hassledw/ICL_Research/UDR_yelp_results/UDR-yelp-llama.csv",
        "/home/grads/hassledw/ICL_Research/UDR_yelp_results/UDR-yelp-zeroshot-llama.csv",
        "/home/grads/hassledw/ICL_Research/UDR_yelp_results/UDR-yelp-fewshot-llama.csv",
    )
)

def get_response(text):
    '''
    Extracts the sentiment label from a raw model completion.

    Only the text after the last "Response:" marker is considered. It is
    stripped of surrounding "#" and space characters, lower-cased, and then
    searched for each known label in the order given by `valid`. The two
    "very ..." labels are checked first so they are not shadowed by the
    shorter "negative" / "positive" substrings they contain.

    Parameters:
        text: the raw completion string. Non-string values (for example the
              NaN that pandas produces for an empty CSV cell) are treated as
              unparseable instead of raising.

    Returns:
        The integer code of the first matching label (0 = very negative,
        1 = negative, 2 = neutral, 3 = positive, 4 = very positive), or
        None when no label is found.
    '''
    valid = ["very negative", "very positive", "negative", "positive", "neutral"]
    valid_dict = {"very negative": 0, "negative": 1, "neutral": 2, "positive": 3, "very positive": 4}

    # Missing or non-text cells cannot contain a label.
    if not isinstance(text, str):
        return None

    query = text.split("Response:")[-1].strip("#").strip(" ").lower()

    # An exact label is also a substring of itself, so a single in-order
    # scan covers both clean answers and answers buried in longer text.
    for v in valid:
        if v in query:
            return valid_dict[v]
    return None
    

def _clean_labels(raw_df):
    '''
    Returns a cleaned copy of `raw_df` with its "label" column parsed to ints.

    Each raw label is passed through `get_response`; rows that then contain a
    missing value are dropped and the remaining labels are cast to int. The
    input frame is left unmodified.
    '''
    parsed = raw_df.assign(label=raw_df["label"].apply(get_response))
    # dropna() has no subset here, so a row is removed when ANY column is
    # missing, not only when the label could not be parsed.
    return parsed.dropna().astype({"label": int})


# Row counts before cleaning, used to report how many rows were discarded.
orig_entries = zero_df.shape[0]
orig_entries_few = few_df.shape[0]

zero_df = _clean_labels(zero_df)
few_df = _clean_labels(few_df)

# First line reports the zero-shot frame, second line the few-shot frame.
print(f"Dropped {orig_entries - zero_df.shape[0]} \"None\" entries")
print(f"Dropped {orig_entries_few - few_df.shape[0]} \"None\" entries")


Dropped 276 "None" entries
Dropped 146 "None" entries


In [23]:
# Persist the cleaned zero-shot and few-shot frames next to the raw results.
# NOTE(review): to_csv is called without index=False, so the DataFrame index
# is written as an extra unnamed column; confirm downstream readers expect it.
for cleaned_df, out_path in (
    (zero_df, "/home/grads/hassledw/ICL_Research/UDR_yelp_results/UDR-yelp-zeroshot-llama-cleaned.csv"),
    (few_df, "/home/grads/hassledw/ICL_Research/UDR_yelp_results/UDR-yelp-fewshot-llama-cleaned.csv"),
):
    cleaned_df.to_csv(out_path)

## Compare Ground Truth with Zero-Shot and Few-Shot Predictions
Here are the prompts used for the baseline tasks.

**Zero Shot**:
```
Please rate the sentiment of the following text # "very negative", "negative", "neutral", "positive", or "very positive"#:
"### \"{query}\""
"### Response:"""
```

**Few Shot**:

```
Here are some demonstration examples for the sentiment classification task:
    1. \"{similar_dict[(value + 1) % len(X_test)][0][:200]}...\" = \"{similar_dict[(value + 1) % len(X_test)][1]}\"
    2. \"{similar_dict[(value + 2) % len(X_test)][0][:200]}...\" = \"{similar_dict[(value + 2) % len(X_test)][1]}\"
Please rate the sentiment of the following text # "very negative", "negative", "neutral", "positive", or "very positive"#.
"### \"{query}\""
"### Response:"""
```
    

In [25]:
# Join each prediction frame to the ground truth on the review text. After the
# merge, label_x comes from truth_df and label_y from the prediction frame.
zero_results = truth_df.merge(zero_df, on=['sentence'], how='inner').drop(
    columns=["Unnamed: 0_x", "Unnamed: 0.1", "Unnamed: 0_y"]
)
few_results = truth_df.merge(few_df, on=['sentence'], how='inner')

# Keep only the rows whose two label columns agree.
accurate_results_zero = zero_results.loc[lambda merged: merged["label_x"] == merged["label_y"]]
accurate_results_few = few_results.loc[lambda merged: merged["label_x"] == merged["label_y"]]

# Accuracy is the share of merged rows with matching labels; rows missing from
# either side of the inner join are not part of the denominator.
print(f"Llama-7b Prediction Accuracy (Zero-shot): {len(accurate_results_zero) / len(zero_results) * 100:.2f}%")
print(f"Llama-7b Prediction Accuracy (Few-shot): {len(accurate_results_few) / len(few_results) * 100:.2f}%")

Llama-7b Prediction Accuracy (Zero-shot): 42.00%
Llama-7b Prediction Accuracy (Few-shot): 44.57%
