In [None]:
!pip -q install transformers datasets peft accelerate adapter-transformers bitsandbytes

In [None]:
from datasets import load_dataset
dataset = load_dataset("boolq")

In [None]:
def format_example(example):
    question = example['question']
    passage = example['passage']
    answer = 'Yes' if example['answer'] else 'No'
    prompt = f"Question: {question}\nPassage: {passage}\nAnswer:"
    return {'input_text': prompt, 'target_text': answer}
formatted_dataset = dataset.map(format_example, remove_columns=dataset['train'].column_names)

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# tokenizer = LlamaTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    inputs = [f"{input_text} {target_text}" for input_text, target_text in zip(examples['input_text'], examples['target_text'])]

    # Tokenize the combined text and return PyTorch tensors
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt',  # Ensure tensors are returned
    )

    # Create labels by cloning the input IDs
    labels = model_inputs['input_ids'].clone()

    # Determine the length of the input_text for each example
    input_lengths = []
    for input_text in examples['input_text']:
        input_ids = tokenizer(
            input_text,
            max_length=512,
            truncation=True,
            add_special_tokens=False,  # Do not add special tokens here
        )['input_ids']
        input_lengths.append(len(input_ids))

    # Mask the labels for the input_text portion
    for i, input_length in enumerate(input_lengths):
        labels[i, :input_length] = -100  # Use -100 to ignore the input tokens

    model_inputs['labels'] = labels

    return model_inputs

tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=formatted_dataset['train'].column_names,
)

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We are not using masked language modeling
)

In [None]:
# model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-hf', load_in_8bit=True, device_map='auto')
model = GPT2LMHeadModel.from_pretrained('gpt2')
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    # target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.1,
    bias='none',
    task_type='CAUSAL_LM'
)

model = get_peft_model(model, lora_config)



In [None]:
from transformers import TrainingArguments

# Set the pad_token to the eos_token
tokenizer.pad_token = tokenizer.eos_token

# Update model's config
model.config.pad_token_id = tokenizer.pad_token_id
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    # evaluation_strategy='steps',
    # eval_steps=1000,
    save_steps=3000,
    logging_dir='./logs',
    logging_steps=1000,
    learning_rate=1e-4,
    report_to='none',
)
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
trainer.train()

Step,Training Loss
1000,3.0937
2000,2.9834


TrainOutput(global_step=2950, training_loss=3.016124122748941, metrics={'train_runtime': 5156.7614, 'train_samples_per_second': 9.14, 'train_steps_per_second': 0.572, 'total_flos': 1.240140568854528e+16, 'train_loss': 3.016124122748941, 'epoch': 5.0})

In [None]:
from datasets import load_dataset

# Load the validation split
dataset = load_dataset("boolq", split='validation')
formatted_dataset = dataset.map(format_example)

In [None]:
import torch
model.eval()

def generate_answer(example):
    input_text = example['input_text']

    # Tokenize and move inputs to the model's device
    inputs = tokenizer(
        input_text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding='max_length',
    )
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)

    # Generate the model's output
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=15,
            do_sample=False,
            num_beams=1,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode the generated tokens
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Extract the answer by removing the input prompt
    answer = generated_text[len(input_text):].strip()

    # Handle empty outputs
    if not answer:
        answer = "No"
        return {'predicted_answer': answer}

    # # Keep only the first word (Yes or No)
    # answer = answer.split()[0]
    # print(answer)

    # # Handle unexpected outputs
    if 'yes' in answer.lower():
        answer = 'Yes'
    elif 'no' in answer.lower():
        answer = 'No'
    else:
        answer = 'No'

    return {'predicted_answer': answer}


# Map the generate_answer function over the validation set
predictions = formatted_dataset.map(generate_answer, batched=False)


Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

In [None]:
import numpy as np

def compute_accuracy(predictions):
    # Convert the true and predicted answers to lowercase for consistency
    true_answers = [answer.lower() for answer in predictions['target_text']]
    predicted_answers = [answer.lower() for answer in predictions['predicted_answer']]

    # Calculate the number of correct predictions
    correct_predictions = sum([
        true == pred for true, pred in zip(true_answers, predicted_answers)
    ])

    # Compute accuracy
    accuracy = correct_predictions / len(true_answers)
    return accuracy

# Calculate accuracy
accuracy = compute_accuracy(predictions)
print(f"Accuracy on the validation set: {accuracy * 100:.2f}%")

Accuracy on the validation set: 58.38%
