Reinforcement Learning for Large Language Models
Winter 23/24 Semester
Final Group Project
Kateryna Smykovska, Jakob Schmitter, Suvi Lehtosalo, Megan Horikawa

This notebook was made by **Megan Horikawa **

[NLI - 7b Chat Notebook](https://colab.research.google.com/drive/1f-O1WehtYnnr2msb0XTAVPUKyfaGIZAH?usp=sharingttps://)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers
!pip install accelerate
!pip install bitsandbytes
!pip install sentencepiece
!pip install torch
!pip install datasets
!pip install evaluate
!pip install huggingface_hub

Mounted at /content/drive
Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━

In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoTokenizer, BitsAndBytesConfig
import torch

import datasets
import evaluate
import numpy as np

from google.colab import userdata
my_secret_key = userdata.get('HF_TOKEN')

# config for
BnB_config= BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type='nf4'
                )

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", token=my_secret_key, quantization_config=BnB_config)

model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",
    token = my_secret_key
)



tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



In [None]:
# Load Dataset for NLI task. Superglue subdataset rte was chosen
rte_dataset = datasets.load_dataset("super_glue", "rte")

# will use validation subset as it contains cases of both entailment and non-entailment classes unlike test set which only has one category
# reduce size of dataset to 150
dataset = rte_dataset['validation'].shuffle(seed=42).select(range(150))

# not entailment = 1 entailment = 0

Downloading data:   0%|          | 0.00/586k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/622k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/277 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# define log likelihood function (taken from homework with small changes for llama)

def get_log_prob_of_completion(
        model,
        tokenizer,
        prompt,
        completion,
        device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
):
        """
        Convenience function for computing the log probability of a completion
        given a prompt.
        """
        # tokenize the prompt and the completion
        # truncate so as to fit into to maximal context window of llama2
        # which is 2048 tokens
        input_ids = tokenizer(
                prompt + completion,
                return_tensors='pt',
                truncation=True,
                max_length=2048,
        )['input_ids'].to(device)

        # separately tokenize prompt
        # so as to access the logits for the completion only
        # when scoring the completion
        input_ids_prompt = tokenizer(
                prompt,
                return_tensors='pt',
                truncation=True,
                max_length=2048
        )['input_ids'].to(device)

        # create attention mask and position ids
        attention_mask = (input_ids != tokenizer.eos_token_id).to(dtype=torch.int64)
        position_ids = attention_mask.cumsum(-1)-1
        # get the logits for the completion
        with torch.no_grad():
                out = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        position_ids=position_ids
                )

        # get the logits of the completion
        # for that, make a tensor of the logits
        # for the completion only
        # in particular, we shift the indices by one to the left to access logits of the
        # actual sequence tokens
        logits_completion = out.logits[:, :-1]
        logits_completion = logits_completion.squeeze()
        # get the log probabilities for the completion
        log_probs = torch.nn.functional.log_softmax(
                logits_completion,
                dim=-1
        )
        # retrieve the logit corresponding to the actual completion tokens
        try:
                log_completion_tokens = log_probs.gather(
                        dim=-1,
                        index=input_ids[:, 1:].squeeze().unsqueeze(-1)
                )
        except:
                log_completion_tokens = log_probs.gather(
                        dim=-1,
                        index=input_ids[:, 1:].unsqueeze(-1)
                )

        continuationConditionalLogProbs = log_completion_tokens[
                (input_ids_prompt.shape[-1]-1):
        ]
        completion_log_prob = torch.mean(
                continuationConditionalLogProbs
        ).cpu()

        return completion_log_prob

In [None]:
# using method of Homework 2 #3

#iterate over the dataset:

results = []
prompt_list = []
correct_answer_list =[]
log_ps_correct_option = []
log_ps_incorrect_option = []

item_index = 1
dataset_size = len(dataset)

for item in dataset:
  # as done in homework 2 the prompt is both premise and hypothesis concatenated.
  prompt = item['premise'] + " " + item['hypothesis']
  # get classification of item as either entailment (0) or non-entailment(1)
  category = item['label']

  correct = ''
  incorrect = ''

  # set correct classification based on label
  if category == 1:
    correct = 'non-entailment'
    incorrect = 'entailment'

  else:
    correct = 'entailment'
    incorrect = 'non-entailment'

  prompt_list.append(prompt)
  correct_answer_list.append(correct)

  #compute the lob probabilities for both entailment and non-entailment classes

  log_p_correct = get_log_prob_of_completion(
      model,
      tokenizer,
      prompt,
      correct
  )
  # append to the correct list
  log_ps_correct_option.append(log_p_correct)

  log_p_incorrect = get_log_prob_of_completion(
      model,
      tokenizer,
      prompt,
      incorrect
  )

  # append to the incorrect list
  log_ps_incorrect_option.append(log_p_incorrect)

  print(f'finished item number {item_index} of {dataset_size}')
  item_index += 1



finished item number 1 of 150
finished item number 2 of 150
finished item number 3 of 150
finished item number 4 of 150
finished item number 5 of 150
finished item number 6 of 150
finished item number 7 of 150
finished item number 8 of 150
finished item number 9 of 150
finished item number 10 of 150
finished item number 11 of 150
finished item number 12 of 150
finished item number 13 of 150
finished item number 14 of 150
finished item number 15 of 150
finished item number 16 of 150
finished item number 17 of 150
finished item number 18 of 150
finished item number 19 of 150
finished item number 20 of 150
finished item number 21 of 150
finished item number 22 of 150
finished item number 23 of 150
finished item number 24 of 150
finished item number 25 of 150
finished item number 26 of 150
finished item number 27 of 150
finished item number 28 of 150
finished item number 29 of 150
finished item number 30 of 150
finished item number 31 of 150
finished item number 32 of 150
finished item num

In [None]:
# lets load things into pandas
import pandas as pd

df = pd.DataFrame(list(zip(prompt_list,correct_answer_list,log_ps_correct_option,log_ps_incorrect_option)), columns = ['prompt','entailment/non-entailment','log_prob_correct', 'log_prob_incorrect'])


# evaluate the log probabilities
# check whether the log probaility of the correct answer is higher than the
# incorrect answer and append to the results list

def prediction_eval(entailment, correct_logprob, incorrect_logprob):
  prediction = prediction = np.where(entailment == "non-entailment", incorrect_logprob > correct_logprob, correct_logprob > incorrect_logprob)
  return prediction

df['correct_prediction'] = prediction_eval(df['entailment/non-entailment'],df['log_prob_correct'],df['log_prob_incorrect'])

df.head(20)


Unnamed: 0,prompt,entailment/non-entailment,log_prob_correct,log_prob_incorrect,correct_prediction
0,"As a result, peptic ulcer disease has been tra...",entailment,tensor(-8.2752),tensor(-7.2414),False
1,Ssangyong Motor was taken over by creditors af...,non-entailment,tensor(-5.4455),tensor(-8.0148),False
2,In support of the Earth Island Institute rebut...,entailment,tensor(-8.7378),tensor(-6.5280),False
3,Hadley said Jordan was chosen as the site of t...,non-entailment,tensor(-6.8414),tensor(-8.9038),False
4,A senior coalition official in Iraq said the b...,entailment,tensor(-8.0961),tensor(-6.8381),False
5,The Massachusetts Supreme Judicial Court has c...,entailment,tensor(-8.7285),tensor(-4.8469),False
6,"Ahern, who was travelling to Tokyo for an EU-J...",entailment,tensor(-10.1436),tensor(-6.6507),False
7,A closely divided U.S. Supreme Court said on T...,non-entailment,tensor(-6.5731),tensor(-8.7864),False
8,"Fujimori charged that on January 26, 1995, Ecu...",non-entailment,tensor(-6.8494),tensor(-9.7854),False
9,Four days of talks got underway with the US an...,entailment,tensor(-8.8077),tensor(-6.5658),False


In [None]:
# export to csv

df.to_csv('/content/drive/My Drive/RLProject/NLI_llama2_7b.csv', index=False)

Link to csv file: https://drive.google.com/file/d/1mh0xbsqduY8zDGX3THAnC0dPvfDpfj7S/view?usp=drive_link

Link to full dataset csv: https://drive.google.com/file/d/1GSp5hfmx9EYyq-yb7WrNPMaJ5o216SvC/view?usp=sharing


In [None]:
# group by the correct predictions for accuracy
df.groupby('correct_prediction').count()


Unnamed: 0_level_0,prompt,entailment/non-entailment,log_prob_correct,log_prob_incorrect
correct_prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,150,150,150,150


None of the predictions were correct T_T

Accuracy, Recall, F1 Score
