In [1]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install evaluate
!pip install hugginface_hub
!pip install accelerate --upgrade
!pip install bitsandbytes
!pip install -i https://pypi.org/simple/ bitsandbytes --upgrade
!pip install sentencepiece

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoTokenizer
import torch

import datasets
import evaluate
import numpy as np
import accelerate
import bitsandbytes

from google.colab import userdata
# BitsAndBytesConfig replaces the deprecated `load_in_4bit=...` / `bnb_4bit_*`
# keyword arguments that `from_pretrained` warns about.
from transformers import BitsAndBytesConfig

# Hugging Face access token read from the Colab secrets store; it is passed
# explicitly to both `from_pretrained` calls below instead of being left unused.
my_secret_key = userdata.get('HF_TOKEN')

tokenizer_13b = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-13b-hf",
    token=my_secret_key,
)

# 4-bit NF4 quantisation with double quantisation and float16 compute,
# i.e. the same settings as before, expressed through the supported config object.
quantization_config_13b = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_13b = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b-hf",
    quantization_config=quantization_config_13b,
    device_map="auto",
    token=my_secret_key,
)

# Device used later when moving tokenised inputs for log-probability scoring.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [3]:
import json
def load_dialogues_dataset(file_path):
    """Read the dialogue JSON file and flatten it into one record per turn.

    The file is expected to hold ``data -> dialogs -> [ {dialog: [turn, ...]} ]``.
    Each turn contributes a dict with three keys:

    * ``dialogue``: the question and answer rendered as
      ``"Question: ...\\nAnswer: ..."``;
    * ``explicit_answer``: the turn's ``explict_answer`` field (the key is
      spelled that way in the data file);
    * ``options``: the turn's options as a newline-separated, 1-based numbered list.

    Args:
        file_path: Path to the UTF-8 encoded JSON file.

    Returns:
        A list of per-turn dicts, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    def _turn_to_entry(turn):
        # Number the options starting at 1, one option per line.
        numbered_options = "\n".join(
            f"{number}. {text}" for number, text in enumerate(turn['option'], start=1)
        )
        return {
            'dialogue': 'Question: ' + turn['question'] + '\nAnswer: ' + turn['answer'],
            'explicit_answer': turn['explict_answer'],
            'options': numbered_options,
        }

    return [
        _turn_to_entry(turn)
        for dialogue_data in payload['data']['dialogs']
        for turn in dialogue_data['dialog']
    ]


file_path = "impl_dial_v0.1_cb.json"
dataset = load_dialogues_dataset(file_path)

# Sanity check: show the first three flattened entries, separated by a rule.
for entry in dataset[:3]:
    print(
        f"Dialogue: {entry['dialogue']}",
        f"Explicit Answer: {entry['explicit_answer']}",
        f"Options:{entry['options']}",
        "----------",
        sep="\n",
    )

Dialogue: Question: did you leave the cucumber in the den
Answer: I was in the dining_room
Explicit Answer: I didn't leave the cucumber in the den
Options:1. I didn't leave the cucumber in the den
2. the cucumber is in the dining_room
3. I left the cucumber in the dining_room
4. I leave the cucumber in the den
----------
Dialogue: Question: what about Mia
Answer: she placed the limes there
Explicit Answer: Mia didn't place the cucumber in the den
Options:1. I don't know if Mia place the cucumber in the den
2. I didn't place the cucumber in the den
3. Mia didn't place the cucumber in the den
4. Mia didn't place the limes in the den
----------
Dialogue: Question: was she there
Answer: she was there if not in the den
Explicit Answer: I am not sure if Mia was in the den
Options:1. I was not in the den
2. Mia was in the den
3. I am not sure if Mia was in the den
4. Mia was not in the den
----------


In [4]:
# Subset (at most the first 30 entries) used for the free-generation runs.
first_30_entries = dataset[:30]

def generate_answers_for_dataset(model, tokenizer, dataset):
    """Run the zero-shot prompt over every entry and collect the model output.

    Args:
        model: Model handed through to ``generate_answer_zero_shot``.
        tokenizer: Tokenizer handed through to ``generate_answer_zero_shot``.
        dataset: Iterable of dicts with ``dialogue`` and ``options`` keys
            (``explicit_answer`` is optional and defaults to ``None``).

    Returns:
        A list of dicts with keys ``dialogue``, ``options``,
        ``explicit_answer`` and ``predicted_score`` (the raw model response).
    """
    def _answer_entry(entry):
        # Query the model first, then assemble the output record.
        response = generate_answer_zero_shot(model, entry['dialogue'], entry['options'], tokenizer)
        return {
            'dialogue': entry['dialogue'],
            'options': entry['options'],
            'explicit_answer': entry.get('explicit_answer'),
            'predicted_score': response,
        }

    return [_answer_entry(entry) for entry in dataset]

def generate_answer_zero_shot(model, dialogue, options, tokenizer):
    """Build the instruction-only prompt for one dialogue and return the model's response.

    Args:
        model: Model forwarded to ``invoke_model_zero_shot``.
        dialogue: Dialogue text inserted after "The dialogue:".
        options: Pre-formatted option list inserted after "The options:".
        tokenizer: Tokenizer forwarded to ``invoke_model_zero_shot``.

    Returns:
        Whatever ``invoke_model_zero_shot`` returns for the prompt.
    """
    # The leading spaces on the continuation lines and the "\n" escapes are
    # part of the prompt text sent to the model; keep them exactly as written.
    prompt = f"""Given this dialogue, choose the correct explicit answer option that captures the implicature of the answer.
    The dialogue:\n{dialogue}\n
    The options: {options}\n
    The number of the option that captures the explicit answer: """
    return invoke_model_zero_shot(model, prompt, tokenizer)

def generate_answer_few_shot_2_examples(model, dialogue, options, tokenizer):
  """Build the two-example few-shot prompt for one dialogue and return the model's response.

  Args:
      model: Model forwarded to ``invoke_model_few_shot_2_examples``.
      dialogue: Dialogue text placed after the worked examples.
      options: Pre-formatted option list inserted after "Options:".
      tokenizer: Tokenizer forwarded to ``invoke_model_few_shot_2_examples``.

  Returns:
      Whatever ``invoke_model_few_shot_2_examples`` returns for the prompt.
  """
  # The indentation inside the literal and the "\n" escapes are part of the
  # prompt text; keep them exactly as written.
  # NOTE(review): the worked examples end with the answer *text*, while the
  # final cue asks for the option *number* - confirm this is intended.
  prompt = f"""Question: were you in the basement\n
  Answer: I journeyed to the laundry\n
  Options: 1. I was in the basement\n 2. I was in the playroom\n 3. I was not in the basement\n 4. I was not in the crawlspace\n
  Explict answer: I was not in the basement\n
  Question: was Emily there\n
  Answer: she was in the playroom\n
  Options: 1. Emily was in the laundry\n 2. Emily was not in the laundry\n 3. I was not in the laundry\n 4. I don't know\n
  Explict answer: Emily was not in the laundry\n
  Given this dialogue, choose the correct explicit answer option that captures the implicature of the answer.\n
  {dialogue}\n
  Options: {options}\n
  The number of the option that captures the explicit answer:
  """
  return invoke_model_few_shot_2_examples(model, prompt, tokenizer)

def generate_answer_few_shot_4_examples(model, dialogue, options, tokenizer):
  """Build the four-example few-shot prompt for one dialogue and return the model's response.

  Args:
      model: Model forwarded to ``invoke_model_few_shot_4_examples``.
      dialogue: Dialogue text placed after the worked examples.
      options: Pre-formatted option list inserted after "Options:".
      tokenizer: Tokenizer forwarded to ``invoke_model_few_shot_4_examples``.

  Returns:
      Whatever ``invoke_model_few_shot_4_examples`` returns for the prompt.
  """
  # The indentation inside the literal and the "\n" escapes are part of the
  # prompt text; keep them exactly as written.
  # NOTE(review): the worked examples end with the answer *text*, while the
  # final cue asks for the option *number* - confirm this is intended.
  prompt = f"""Question: were you in the basement\n
  Answer: I journeyed to the laundry\n
  Options: 1. I was in the basement\n 2. I was in the playroom\n 3. I was not in the basement\n 4. I was not in the crawlspace\n
  Explict answer: I was not in the basement\n
  Question: was Emily there\n
  Answer: she was in the playroom\n
  Options: 1. Emily was in the laundry\n 2. Emily was not in the laundry\n 3. I was not in the laundry\n 4. I don't know\n
  Explict answer: Emily was not in the laundry\n
  Question: were you in the sunroom\n
  Answer: I travelled to the kitchen\n
  Options: 1. I was in the sunroom\n 2. I was in the den\n 3. I was not in the den\n 4. I was not in the sunroom\n
  Explict answer: I was not in the sunroom\n
  Question: did you leave the peas there\n
  Answer: I left them in the sunroom\n
  Options: 1. I didn't leave the peas in the kitchen\n 2. I didn't leave the peas in the sunroom\n 3. I leave the peas in the kitchen\n 4. the peas are in the kitchen\n
  Explict answer: I didn't leave the peas in the kitchen\n
  Given this dialogue, choose the correct explicit answer option that captures the implicature of the answer.\n
  {dialogue}\n
  Options: {options}\n
  The number of the option that captures the explicit answer:
  """
  return invoke_model_few_shot_4_examples(model, prompt, tokenizer)

def _generate_text(model, prompt, tokenizer, max_length, **generate_kwargs):
    """Tokenise ``prompt``, run ``model.generate`` and decode the first sequence.

    Shared implementation behind the three ``invoke_model_*`` entry points,
    which previously repeated this body verbatim.

    Args:
        model: Object exposing ``.device`` and ``.generate(...)``.
        prompt: Prompt text.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors, and exposing ``.decode(...)``.
        max_length: Passed to ``generate`` as ``max_length``. Note that this
            limit is on the total sequence, prompt tokens included.
        **generate_kwargs: Extra keyword arguments forwarded to ``generate``.

    Returns:
        The decoded first output sequence with special tokens stripped. The
        whole output sequence is decoded, so the text includes the prompt.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].to(model.device)
    # Pass the attention mask explicitly so `generate` does not have to guess
    # it from the pad token.
    attention_mask = encoded["attention_mask"].to(model.device)
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        **generate_kwargs,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def invoke_model_zero_shot(model, prompt, tokenizer):
    """Generate a response for the zero-shot prompt (max_length=150, temperature=0.5)."""
    return _generate_text(model, prompt, tokenizer, max_length=150, temperature=0.5)


def invoke_model_few_shot_2_examples(model, prompt, tokenizer):
    """Generate a response for the two-example few-shot prompt (max_length=500)."""
    return _generate_text(model, prompt, tokenizer, max_length=500)


def invoke_model_few_shot_4_examples(model, prompt, tokenizer):
    """Generate a response for the four-example few-shot prompt (max_length=1000)."""
    return _generate_text(model, prompt, tokenizer, max_length=1000)

In [5]:
import pandas as pd
#free generation

def _predict_column(frame, answer_fn):
    """Apply ``answer_fn`` row by row with the 13B model and return the resulting Series."""
    return frame.apply(
        lambda row: answer_fn(model_13b, row['dialogue'], row['options'], tokenizer_13b),
        axis=1,
    )


# Zero-shot run.
df_zero_shot = pd.DataFrame(first_30_entries)
df_zero_shot['Predicted Score LLaMA-13b'] = _predict_column(df_zero_shot, generate_answer_zero_shot)

csv_file_path = "results_zero_shot.csv"
df_zero_shot.to_csv(csv_file_path, index=False)

# Few-shot run with two worked examples.
df_few_shot_2_examples = pd.DataFrame(first_30_entries)
df_few_shot_2_examples['Predicted Score LLaMA-13b'] = _predict_column(
    df_few_shot_2_examples, generate_answer_few_shot_2_examples
)

df_few_shot_2_examples.to_csv("results_few_shot_2_examples.csv", index=False)

# Few-shot run with four worked examples.
df_few_shot_4_examples = pd.DataFrame(first_30_entries)
df_few_shot_4_examples['Predicted Score LLaMA-13b'] = _predict_column(
    df_few_shot_4_examples, generate_answer_few_shot_4_examples
)

df_few_shot_4_examples.to_csv("results_few_shot_4_examples.csv", index=False)

In [6]:
#from RL hw2 ex3, adapted for the task
# Subset (at most the first 150 entries) scored by log-probability.
first_150_entries = dataset[:150]
def get_log_prob_of_similarity_score(model, tokenizer, dialogue, options, explicit_answer, device):
    """Return the mean per-token log-probability of ``explicit_answer`` after the prompt.

    The zero-shot prompt is built from ``dialogue`` and ``options``, the
    tokenised ``explicit_answer`` is appended to it, and a single forward pass
    gives the log-probability the model assigns to each answer token given
    everything before it. The mean over the answer tokens is returned.

    NOTE(review): the prompt ends by asking for the *number* of the option,
    whereas the scored continuation is the answer *text* - confirm this is the
    intended measurement.

    Args:
        model: Callable returning an object with ``.logits`` of shape
            (batch, sequence, vocab) when given ``input_ids``,
            ``attention_mask`` and ``position_ids``.
        tokenizer: Callable tokenizer returning ``input_ids`` tensors and
            exposing ``eos_token_id``.
        dialogue: Dialogue text inserted into the prompt.
        options: Pre-formatted option list inserted into the prompt.
        explicit_answer: Continuation to score; converted with ``str()``.
        device: Device the input tensors are moved to.

    Returns:
        Mean log-probability (float) of the continuation tokens, or NaN when
        the continuation tokenises to nothing.
    """
    prompt = f"""Given this dialogue, choose the correct explicit answer option that captures the implicature of the answer.
    The dialogue:\n{dialogue}\n
    The options: {options}\n
    The number of the option that captures the explicit answer: """
    score_str = str(explicit_answer)
    input_ids_prompt = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024)['input_ids'].to(device)
    # Tokenise the continuation without special tokens rather than slicing off
    # the first token: slicing silently drops a real token whenever the
    # tokenizer does not prepend BOS.
    input_ids_score = tokenizer(score_str, return_tensors='pt', add_special_tokens=False)['input_ids'].to(device)

    if input_ids_score.numel() == 0:
        # Nothing to score; the mean over zero tokens is undefined.
        return float('nan')

    input_ids = torch.cat((input_ids_prompt, input_ids_score), dim=-1)

    attention_mask = (input_ids != tokenizer.eos_token_id).long()
    position_ids = attention_mask.cumsum(-1) - 1

    with torch.no_grad():
        out = model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids)

    # The logits at position t predict token t + 1, so the rows that predict
    # the continuation start one step before the end of the prompt and stop
    # one step before the end of the sequence.
    prompt_len = input_ids_prompt.shape[-1]
    logits_score = out.logits[0, prompt_len - 1:-1]
    # Upcast so the softmax is not computed in half precision.
    log_probs = torch.nn.functional.log_softmax(logits_score.float(), dim=-1)
    target_ids = input_ids_score[0].to(log_probs.device).unsqueeze(-1)
    log_probs_score = log_probs.gather(dim=-1, index=target_ids).squeeze(-1)
    # mean log-probability across the score tokens
    mean_log_prob_score = log_probs_score.mean().item()

    return mean_log_prob_score

def evaluate_dataset(data, model, tokenizer, device):
    """Score every entry's explicit answer and return one result record per entry.

    Args:
        data: Iterable of dicts with ``dialogue``, ``options`` and
            ``explicit_answer`` keys.
        model: Model forwarded to ``get_log_prob_of_similarity_score``.
        tokenizer: Tokenizer forwarded to ``get_log_prob_of_similarity_score``.
        device: Device forwarded to ``get_log_prob_of_similarity_score``.

    Returns:
        A list of dicts with keys ``dialogue``, ``options``,
        ``explicit_answer`` and ``log_prob_score``, in input order.
    """
    def _score_entry(entry):
        dialogue = entry['dialogue']
        options = entry['options']
        explicit_answer = entry['explicit_answer']
        score = get_log_prob_of_similarity_score(
            model=model,
            tokenizer=tokenizer,
            dialogue=dialogue,
            options=options,
            explicit_answer=explicit_answer,
            device=device,
        )
        return {
            'dialogue': dialogue,
            'options': options,
            'explicit_answer': explicit_answer,
            'log_prob_score': score,
        }

    return [_score_entry(entry) for entry in data]

# Score the selected subset with the 13B model (one forward pass per entry).
evaluated_results_13b = evaluate_dataset(first_150_entries, model_13b, tokenizer_13b, device)


In [7]:
def results_to_csv(evaluated_results_13b, filepath):
    """Write a list of result records to ``filepath`` as CSV, without the index column.

    Args:
        evaluated_results_13b: Records accepted by ``pd.DataFrame`` (e.g. a
            list of dicts); dict keys become the CSV header.
        filepath: Destination path of the CSV file.
    """
    pd.DataFrame(evaluated_results_13b).to_csv(filepath, index=False)

# Persist the log-probability results to 'log_probs_13b.csv' so they can be read back from disk.
results_to_csv(evaluated_results_13b, 'log_probs_13b.csv')

In [8]:
# Reload the saved scores and summarise them.
df_log_probs = pd.read_csv('log_probs_13b.csv')

# Mean and sample variance of the per-entry mean log-probabilities.
average_log_prob_13b_chat = df_log_probs.loc[:, 'log_prob_score'].mean()
variance_log_prob_13b_chat = df_log_probs.loc[:, 'log_prob_score'].var()

print(
    "Average Log Probability Scores:",
    f"LLaMA 13b: {average_log_prob_13b_chat}",
    "\nVariance of Log Probability Scores:",
    f"LLaMA 13b: {variance_log_prob_13b_chat}",
    sep="\n",
)

Average Log Probability Scores:
LLaMA 13b: -1.2041990919907888

Variance of Log Probability Scores:
LLaMA 13b: 0.16077527765263938
