In [None]:
# Install the pinned fine-tuning / inference stack. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
%pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
%pip install -q scipy tensorboard huggingface_hub tqdm
# Authenticate against the Hugging Face Hub. The access token is read from the
# HF_TOKEN environment variable so that it is never stored in the notebook;
# `$$` makes IPython pass a literal `$` through to the shell.
!huggingface-cli login --token "$$HF_TOKEN"

In [None]:

from tqdm import tqdm
import os
import torch
from datasets import load_dataset, Dataset
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

import string
import re
import csv

In [None]:
def load_model_tokenizer(model_name, adapter_name, quantization=False):
    """Load a causal language model together with its tokenizer.

    Args:
        model_name: Identifier or path handed to ``from_pretrained`` for both
            the model and the tokenizer.
        adapter_name: Optional PEFT adapter identifier/path. When truthy, the
            base model is wrapped with ``PeftModel.from_pretrained``.
        quantization: When True, the weights are loaded in 4-bit NF4 with
            float16 compute and no double quantization (bitsandbytes).

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer pads on the right and
        reuses its EOS token as the padding token.

    Note:
        The model is loaded with ``device_map="auto"``, so it is already placed
        on its devices when returned. Do not call ``.to()`` on a quantized
        model afterwards; transformers rejects that for bitsandbytes models.
    """
    load_kwargs = {"device_map": "auto"}
    if quantization:
        # The bitsandbytes settings must be passed as `quantization_config`.
        # `config` is reserved for the model's own PretrainedConfig; an object
        # of another type passed there is replaced by the auto-loaded config,
        # which silently disables quantization.
        load_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype="float16",
            bnb_4bit_use_double_quant=False,
        )

    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
    if adapter_name:
        model = PeftModel.from_pretrained(model, adapter_name, device_map="auto")

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Reuse EOS for padding instead of registering a new '[PAD]' token: a new
    # token would get an id outside the model's embedding table, because the
    # embeddings are never resized here.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer


def predict_response(text):
    """Run generation for ``text`` and return the decoded output string.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device`` globals.
    The prompt is tokenized to "pt" tensors, moved to ``device``, and passed to
    ``model.generate`` with at most 50 new tokens. The first returned sequence
    is decoded with special tokens skipped.
    """
    encoded = tokenizer(text, return_tensors="pt")
    encoded = encoded.to(device)
    generated = model.generate(max_new_tokens=50, **encoded)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


def normalize_answer(text):
    """Lower-case ``text`` and drop every ``string.punctuation`` character.

    Returns None when ``text`` is falsy (e.g. ``None`` or ``""``).
    """
    if not text:
        return None
    strip_punctuation = str.maketrans("", "", string.punctuation)
    return text.lower().translate(strip_punctuation)

In [None]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "commonsense_qa"

# Load Base model
model, tokenizer = load_model_tokenizer(
    model_name=model_name,
    adapter_name=None,
    quantization=True,
)
# load_model_tokenizer loads the model with device_map="auto", so the weights
# are already placed and the model is not moved again here (calling `.to()` is
# redundant, and transformers rejects it for bitsandbytes-quantized models).
# Inputs are sent to the device that holds the model's first parameters.
device = next(model.parameters()).device

In [None]:
# Create validation dataset: take the validation split, shuffle it with a fixed
# seed so the sample is reproducible, and keep the first 200 shuffled examples.
dataset = (
    load_dataset(dataset_name, split="validation")
    .shuffle(seed=1279)
    .select(range(200))
)

In [None]:

# Benchmark with only context
# For each example: build a multiple-choice prompt, generate a completion,
# extract the predicted choice label from the text after "ANSWER:", and append
# one row to the CSV file.
save_path = 'benchmark_context_commonsenqa.csv'
answer_marker = "ANSWER:"
# A predicted label is a single word character written as "(A)" or "A:".
label_pattern = re.compile(r'\((\w)\)|(?:\b(\w):)')

# newline="" leaves line-ending handling to the csv module, as its docs
# require; the "Choices" field contains embedded newlines. The explicit
# encoding keeps the output independent of the platform default.
with open(save_path, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Question", "Question concept", "Answer key", "Choices", "Prediction", "Full prediction"])
    for i in tqdm(range(len(dataset))):
        # Fetch the example once instead of once per field.
        row = dataset[i]
        question = row['question']
        question_concept = row['question_concept']
        choices = row['choices']

        # Gold label, normalised the same way as the prediction (None if absent).
        answer_key = row['answerKey']
        answer_key = normalize_answer(str(answer_key)) if answer_key else None

        # One "<label>: <text>, " line per answer option.
        promt_choices = ''.join(
            f'{label}: {text}, \n'
            for label, text in zip(choices['label'], choices['text'])
        )

        # NOTE(review): the prompt spells out <s> / </s> and ends with </s>
        # right after the answer cue; the tokenizer may treat these as BOS/EOS.
        # Confirm this is intended.
        prompt = f"""\
<s>
{question}

{promt_choices}

ANSWER:
``` </s>"""
        full_prediction = predict_response(prompt)

        # Find answer: only the text after the first "ANSWER:" is searched. If
        # the marker is missing, no prediction is recorded (previously the text
        # was sliced from an arbitrary offset).
        _, found_marker, answer_text = full_prediction.partition(answer_marker)
        match = label_pattern.search(answer_text) if found_marker else None
        prediction = normalize_answer(match.group()) if match else None

        writer.writerow(
            [
                question,
                question_concept,
                answer_key,
                promt_choices,
                prediction,
                full_prediction,
            ]
        )
        # Flush after every row so partial results survive an interrupted run.
        file.flush()

