# Evaluate the LLM-only variant
This notebook was tested on two 24GB NVIDIA RTX A5000 GPUs.


## Requirements
```
pip install transformers==4.38.2
pip install accelerate==0.21.0

```

In [1]:
import time
from tqdm import tqdm
from IPython.display import clear_output

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, MistralForCausalLM

from utils import get_ego_schema, calc_loglikelihood

In [2]:
# Checkpoint under evaluation, and the device that input tensors are moved to
# by the evaluation cell.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
device = "cuda"

# Evaluation data comes from the project helper imported from utils.
dataset = get_ego_schema()

# Reuse the end-of-sequence token as the pad token so that several prompts
# can be padded into a single batch.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

# device_map="auto" leaves the placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Wipe the download / loading messages from the cell output.
clear_output(wait=False)

## Evaluation

In [3]:
# Evaluation loop: score every candidate prompt of a datum with
# calc_loglikelihood and predict the candidate with the smallest score.
device = "cuda"  # input tensors are moved here before the forward pass
save_dict = {}

correct, total = 0, 0
st = time.time()
for datum in tqdm(dataset):
    # Tokenize datum['only_prompts'] separately to learn how many trailing
    # tokens of each full prompt should be scored. The -1 presumably discounts
    # a special token added by the tokenizer -- TODO confirm.
    only_prompts = tokenizer(datum['only_prompts'])
    prompt_lengths = [len(x) - 1 for x in only_prompts['input_ids']]
    encoded = tokenizer(datum['prompts'], return_tensors="pt", padding=True)
    # Keep labels only for the last `length` positions of each row; everything
    # before them is set to -100 (presumably the ignore value expected by
    # calc_loglikelihood -- verify in utils). This assumes the scored tokens
    # sit at the end of each padded row -- TODO confirm the padding side.
    # NOTE: a length of 0 makes `:-length` an empty slice and masks nothing.
    labels = encoded['input_ids'].clone()
    for idx, length in enumerate(prompt_lengths):
        labels[idx, :-length] = -100
    model_inputs = {x: y.to(device) for x, y in encoded.items()}
    with torch.no_grad():
        model_outputs = model(**model_inputs)
    loss = calc_loglikelihood(model_outputs.logits.detach(), labels)
    pred = loss.argmin().item()
    correct += datum['ans'] == pred
    total += 1
    # Running accuracy; this fires after 99, 199, ... processed samples.
    if (total + 1) % 100 == 0:
        print(f"Accuracy: {correct / total}")
et = time.time()

# Normalise by the number of samples actually processed rather than a
# hard-coded dataset size; max(..., 1) avoids a ZeroDivisionError when the
# dataset is empty.
num_evaluated = max(total, 1)
print(f"Final Accuracy: {100 * correct / num_evaluated} %")
print(f"Time Taken Per Iteration: {(et - st) / num_evaluated}")

 20%|████████████████████████████▌                                                                                                                   | 99/500 [00:42<02:03,  3.26it/s]

Accuracy: 0.40404040404040403


 40%|████████████████████████████████████████████████████████▉                                                                                      | 199/500 [01:22<01:42,  2.93it/s]

Accuracy: 0.4020100502512563


 60%|█████████████████████████████████████████████████████████████████████████████████████▌                                                         | 299/500 [02:03<01:25,  2.36it/s]

Accuracy: 0.4414715719063545


 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 399/500 [02:43<00:46,  2.19it/s]

Accuracy: 0.44611528822055135


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 499/500 [03:25<00:00,  2.29it/s]

Accuracy: 0.4589178356713427


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:25<00:00,  2.43it/s]

Final Accuracy: 45.8 %
Time Taken Per Iteration: 0.41157476568222046





## Evaluating other model variants

You will need a Hugging Face access token and model permission (i.e., you must accept the terms and conditions on the model page) to access the gated models below. This can be done easily.
* LLAMA: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
* Gemma: https://huggingface.co/google/gemma-7b-it

Follow this [link](https://huggingface.co/docs/hub/en/security-tokens) to create a user access token.

In [4]:
# remove old model from memory
import gc
del model
gc.collect()
torch.cuda.empty_cache() # PyTorch thing

In [5]:
# Load the Llama-2 chat checkpoint and its tokenizer (gated model).
device = "cuda"  # the device that input tensors are moved to during evaluation
model_name = "meta-llama/Llama-2-7b-chat-hf"
token = "PLEASE ADD"  # add your hugging face token, also accept the terms & conditions for each model you use

# Forward the token to the Hub only once the placeholder has been replaced;
# None falls back to any credentials already stored on this machine
# (e.g. via `huggingface-cli login`).
hf_token = None if token == "PLEASE ADD" else token

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=hf_token)
# Reuse the end-of-sequence token as the pad token so prompts can be batched.
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Wipe the download / loading messages from the cell output.
clear_output(wait=False)

In [6]:
# Evaluation loop: score every candidate prompt of a datum with
# calc_loglikelihood and predict the candidate with the smallest score.
device = "cuda"  # input tensors are moved here before the forward pass
save_dict = {}

correct, total = 0, 0
st = time.time()
for datum in tqdm(dataset):
    # Tokenize datum['only_prompts'] separately to learn how many trailing
    # tokens of each full prompt should be scored. The -1 presumably discounts
    # a special token added by the tokenizer -- TODO confirm.
    only_prompts = tokenizer(datum['only_prompts'])
    prompt_lengths = [len(x) - 1 for x in only_prompts['input_ids']]
    encoded = tokenizer(datum['prompts'], return_tensors="pt", padding=True)
    # Keep labels only for the last `length` positions of each row; everything
    # before them is set to -100 (presumably the ignore value expected by
    # calc_loglikelihood -- verify in utils). This assumes the scored tokens
    # sit at the end of each padded row -- TODO confirm the padding side.
    # NOTE: a length of 0 makes `:-length` an empty slice and masks nothing.
    labels = encoded['input_ids'].clone()
    for idx, length in enumerate(prompt_lengths):
        labels[idx, :-length] = -100
    model_inputs = {x: y.to(device) for x, y in encoded.items()}
    with torch.no_grad():
        model_outputs = model(**model_inputs)
    loss = calc_loglikelihood(model_outputs.logits.detach(), labels)
    pred = loss.argmin().item()
    correct += datum['ans'] == pred
    total += 1
    # Running accuracy; this fires after 99, 199, ... processed samples.
    if (total + 1) % 100 == 0:
        print(f"Accuracy: {correct / total}")
et = time.time()

# Normalise by the number of samples actually processed rather than a
# hard-coded dataset size; max(..., 1) avoids a ZeroDivisionError when the
# dataset is empty.
num_evaluated = max(total, 1)
print(f"Final Accuracy: {100 * correct / num_evaluated} %")
print(f"Time Taken Per Iteration: {(et - st) / num_evaluated}")

 20%|████████████████████████████▌                                                                                                                   | 99/500 [01:09<04:08,  1.62it/s]

Accuracy: 0.1919191919191919


 40%|████████████████████████████████████████████████████████▉                                                                                      | 199/500 [02:20<03:18,  1.52it/s]

Accuracy: 0.17587939698492464


 60%|█████████████████████████████████████████████████████████████████████████████████████▌                                                         | 299/500 [03:33<02:27,  1.37it/s]

Accuracy: 0.1806020066889632


 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 399/500 [04:44<01:16,  1.32it/s]

Accuracy: 0.17543859649122806


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 499/500 [05:57<00:00,  1.30it/s]

Accuracy: 0.1743486973947896


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [05:58<00:00,  1.39it/s]

Final Accuracy: 17.4 %
Time Taken Per Iteration: 0.7169224119186401





In [7]:
# remove old model from memory
import gc
del model
gc.collect()
torch.cuda.empty_cache() # PyTorch thing

In [8]:
# Load the Gemma instruction-tuned checkpoint and its tokenizer (gated model).
device = "cuda"  # the device that input tensors are moved to during evaluation
model_name = "google/gemma-7b-it"
token = "PLEASE ADD"  # add your hugging face token, also accept the terms & conditions for each model you use

# Forward the token to the Hub only once the placeholder has been replaced;
# None falls back to any credentials already stored on this machine
# (e.g. via `huggingface-cli login`).
hf_token = None if token == "PLEASE ADD" else token

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=hf_token)
# Reuse the end-of-sequence token as the pad token so prompts can be batched.
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Wipe the download / loading messages from the cell output.
clear_output(wait=False)

In [9]:
# Evaluation loop: score every candidate prompt of a datum with
# calc_loglikelihood and predict the candidate with the smallest score.
device = "cuda"  # input tensors are moved here before the forward pass
save_dict = {}

correct, total = 0, 0
st = time.time()
for datum in tqdm(dataset):
    # Tokenize datum['only_prompts'] separately to learn how many trailing
    # tokens of each full prompt should be scored. The -1 presumably discounts
    # a special token added by the tokenizer -- TODO confirm.
    only_prompts = tokenizer(datum['only_prompts'])
    prompt_lengths = [len(x) - 1 for x in only_prompts['input_ids']]
    encoded = tokenizer(datum['prompts'], return_tensors="pt", padding=True)
    # Keep labels only for the last `length` positions of each row; everything
    # before them is set to -100 (presumably the ignore value expected by
    # calc_loglikelihood -- verify in utils). This assumes the scored tokens
    # sit at the end of each padded row -- TODO confirm the padding side.
    # NOTE: a length of 0 makes `:-length` an empty slice and masks nothing.
    labels = encoded['input_ids'].clone()
    for idx, length in enumerate(prompt_lengths):
        labels[idx, :-length] = -100
    model_inputs = {x: y.to(device) for x, y in encoded.items()}
    with torch.no_grad():
        model_outputs = model(**model_inputs)
    loss = calc_loglikelihood(model_outputs.logits.detach(), labels)
    pred = loss.argmin().item()
    correct += datum['ans'] == pred
    total += 1
    # Running accuracy; this fires after 99, 199, ... processed samples.
    if (total + 1) % 100 == 0:
        print(f"Accuracy: {correct / total}")
et = time.time()

# Normalise by the number of samples actually processed rather than a
# hard-coded dataset size; max(..., 1) avoids a ZeroDivisionError when the
# dataset is empty.
num_evaluated = max(total, 1)
print(f"Final Accuracy: {100 * correct / num_evaluated} %")
print(f"Time Taken Per Iteration: {(et - st) / num_evaluated}")

 20%|████████████████████████████▌                                                                                                                   | 99/500 [02:58<11:10,  1.67s/it]

Accuracy: 0.3838383838383838


 40%|████████████████████████████████████████████████████████▉                                                                                      | 199/500 [06:00<08:55,  1.78s/it]

Accuracy: 0.4221105527638191


 60%|█████████████████████████████████████████████████████████████████████████████████████▌                                                         | 299/500 [09:04<06:17,  1.88s/it]

Accuracy: 0.44481605351170567


 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                             | 399/500 [12:10<03:16,  1.94s/it]

Accuracy: 0.45614035087719296


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 499/500 [15:16<00:01,  1.89s/it]

Accuracy: 0.4589178356713427


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [15:18<00:00,  1.84s/it]

Final Accuracy: 45.8 %
Time Taken Per Iteration: 1.8373686685562134



