Reinforcement Learning for Large Language Models
Winter 23/24 Semester
Final Group Project
Kateryna Smykovska, Jakob Schmitter, Suvi Lehtosalo, Megan Horikawa

This notebook was made by **Megan Horikawa **

[QA 7b notebook](https://colab.research.google.com/drive/1yVwySytx9TTgjWTlME9PVbz6yraF93jw?usp=sharing)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers
!pip install accelerate
!pip install bitsandbytes
!pip install sentencepiece
!pip install torch
!pip install datasets
!pip install evaluate
!pip install huggingface_hub

Mounted at /content/drive
Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━

In [2]:
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoTokenizer, BitsAndBytesConfig
import torch


import datasets
import evaluate
import numpy as np

from google.colab import userdata
my_secret_key = userdata.get('HF_TOKEN')

# config for
BnB_config= BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type='nf4'
                )

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=my_secret_key, quantization_config=BnB_config)

model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    device_map="auto",
    token = my_secret_key
)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



In [3]:
# Load Dataset for NLI task. Superglue subdataset rte was chosen
copa_dataset = datasets.load_dataset("super_glue", "copa")

#train subset was selected as it had over 150 entries with a mixture of both labels

dataset = copa_dataset['train'].shuffle(seed=42).select(range(150))

Downloading data:   0%|          | 0.00/33.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/40.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

In [4]:
# define log likelihood function (taken from homework with small changes for llama)

def get_log_prob_of_completion(
        model,
        tokenizer,
        prompt,
        completion,
        device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
):
        """
        Convenience function for computing the log probability of a completion
        given a prompt.
        """
        # tokenize the prompt and the completion
        # truncate so as to fit into to maximal context window of llama2
        # which is 2048 tokens
        input_ids = tokenizer(
                prompt + completion,
                return_tensors='pt',
                truncation=True,
                max_length=2048,
        )['input_ids'].to(device)

        # separately tokenize prompt
        # so as to access the logits for the completion only
        # when scoring the completion
        input_ids_prompt = tokenizer(
                prompt,
                return_tensors='pt',
                truncation=True,
                max_length=2048
        )['input_ids'].to(device)

        # create attention mask and position ids
        attention_mask = (input_ids != tokenizer.eos_token_id).to(dtype=torch.int64)
        position_ids = attention_mask.cumsum(-1)-1
        # get the logits for the completion
        with torch.no_grad():
                out = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        position_ids=position_ids
                )

        # get the logits of the completion
        # for that, make a tensor of the logits
        # for the completion only
        # in particular, we shift the indices by one to the left to access logits of the
        # actual sequence tokens
        logits_completion = out.logits[:, :-1]
        logits_completion = logits_completion.squeeze()
        # get the log probabilities for the completion
        log_probs = torch.nn.functional.log_softmax(
                logits_completion,
                dim=-1
        )
        # retrieve the logit corresponding to the actual completion tokens
        try:
                log_completion_tokens = log_probs.gather(
                        dim=-1,
                        index=input_ids[:, 1:].squeeze().unsqueeze(-1)
                )
        except:
                log_completion_tokens = log_probs.gather(
                        dim=-1,
                        index=input_ids[:, 1:].unsqueeze(-1)
                )

        continuationConditionalLogProbs = log_completion_tokens[
                (input_ids_prompt.shape[-1]-1):
        ]
        completion_log_prob = torch.mean(
                continuationConditionalLogProbs
        ).cpu()

        return completion_log_prob

In [5]:
#iterate over the dataset:

results = []
prompt_list = []
correct_option = []
incorrect_option = []
log_ps_correct_option = []
log_ps_incorrect_option = []


def make_question(question):
    if question == 'cause':
        return "What is the cause of this?"
    if question == 'effect':
        return "What is the effect of this?"
idx = 1

for item in dataset:
  # concatenate the premise with each of the choices
    prompt = item['premise'] + make_question(item['question'])
  # get classification of item as either entailment (0) or non-entailment(1)
    category = item['label']

    correct = ''
    incorrect = ''

  # set correct classification based on label
    if category == 0:
        correct = item['choice1']
        incorrect = item['choice2']

    else:
        correct = item['choice2']
        incorrect = item['choice1']

    prompt_list.append(prompt)
    correct_option.append(correct)
    incorrect_option.append(incorrect)

  #compute the lob probabilities for both choices
    log_p_correct = get_log_prob_of_completion(
      model,
      tokenizer,
      prompt,
      correct
    )
  # append to the correct list
    log_ps_correct_option.append(log_p_correct)

    log_p_incorrect = get_log_prob_of_completion(
      model,
      tokenizer,
      prompt,
      incorrect
  )

  # append to the incorrect list
    log_ps_incorrect_option.append(log_p_incorrect)
    print(f'finished {idx} of {len(dataset)}')
    idx += 1

finished 1 of 150
finished 2 of 150
finished 3 of 150
finished 4 of 150
finished 5 of 150
finished 6 of 150
finished 7 of 150
finished 8 of 150
finished 9 of 150
finished 10 of 150
finished 11 of 150
finished 12 of 150
finished 13 of 150
finished 14 of 150
finished 15 of 150
finished 16 of 150
finished 17 of 150
finished 18 of 150
finished 19 of 150
finished 20 of 150
finished 21 of 150
finished 22 of 150
finished 23 of 150
finished 24 of 150
finished 25 of 150
finished 26 of 150
finished 27 of 150
finished 28 of 150
finished 29 of 150
finished 30 of 150
finished 31 of 150
finished 32 of 150
finished 33 of 150
finished 34 of 150
finished 35 of 150
finished 36 of 150
finished 37 of 150
finished 38 of 150
finished 39 of 150
finished 40 of 150
finished 41 of 150
finished 42 of 150
finished 43 of 150
finished 44 of 150
finished 45 of 150
finished 46 of 150
finished 47 of 150
finished 48 of 150
finished 49 of 150
finished 50 of 150
finished 51 of 150
finished 52 of 150
finished 53 of 150
fi

In [6]:
# lets load things into pandas
import pandas as pd

df = pd.DataFrame(list(zip(prompt_list,correct_option, log_ps_correct_option,incorrect_option, log_ps_incorrect_option)), columns = ['prompt','correct', 'log_prob_correct','incorrect', 'log_prob_incorrect'])


# evaluate the log probabilities
# check whether the log probaility of the correct answer is higher than the
# incorrect answer and append to the results list
df['correct_prediction'] = df['log_prob_correct']> df['log_prob_incorrect']


df.head(20)


Unnamed: 0,prompt,correct,log_prob_correct,incorrect,log_prob_incorrect,correct_prediction
0,The woman filed a restraining order against th...,The man stalked her.,tensor(-1.9905),The man called her.,tensor(-4.2807),True
1,I drank from the water fountain.What is the ca...,I was thirsty.,tensor(-1.5521),I felt nauseous.,tensor(-3.1465),True
2,The authorities vowed to protect the identity ...,They withheld the victim's name from the public.,tensor(-1.7438),The victim struggled to recall details about t...,tensor(-3.5983),True
3,My feet were blistered.What is the cause of this?,I went hiking.,tensor(-3.2480),I went swimming.,tensor(-3.5545),True
4,I burst into a fit of laughter.What is the eff...,My eyes welled up with tears.,tensor(-1.9152),My teeth began to chatter.,tensor(-2.6879),True
5,The bird couldn't fly.What is the cause of this?,It injured its wing.,tensor(-3.5410),It migrated for the winter.,tensor(-3.4035),False
6,The boy got a black eye.What is the cause of t...,The bully punched the boy.,tensor(-2.1832),The bully mocked the boy.,tensor(-3.8278),True
7,The woman received a diploma.What is the cause...,She graduated from college.,tensor(-2.2157),She enrolled in college.,tensor(-2.9497),True
8,The clouds looked dark.What is the effect of t...,I brought my umbrella to work.,tensor(-2.6807),I brought my laptop to work.,tensor(-3.7499),True
9,The truck crashed into the motorcycle on the b...,The motorcyclist died.,tensor(-1.3981),The bridge collapsed.,tensor(-1.5778),True


In [7]:
# export to csv

df.to_csv('/content/drive/My Drive/RLProject/QA_llama2_7bChat.csv', index=False)

Link to csv file: https://drive.google.com/file/d/10UDGy-K0BB-GC_nQo3jTCnHAJKmgHoL-/view?usp=drive_link

In [8]:
#import pandas as pd

#from google.colab import drive
#drive.mount('/content/drive')

#QA_7bChat_df = pd.read_csv('/content/drive/My Drive/RLProject/QA_llama2_7bChat.csv')

df.groupby('correct_prediction').count()


Unnamed: 0_level_0,prompt,correct,log_prob_correct,incorrect,log_prob_incorrect
correct_prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,28,28,28,28,28
True,122,122,122,122,122


In [None]:
print(100/150)

0.6666666666666666


In [None]:
from google.colab import runtime
runtime.unassign()

Accuracy, Recall, F1 Score