In [1]:
import random
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader
from tqdm import tqdm
from typing import List
import json




##################### Few-Shot Evaluation #####################

def load_jsonl(files):
    dataset = []
    for file in files:
        for line in open(file, "r"):
            dataset += [json.loads(line)]
        
    return dataset

# Few-Shot Prompt Creation

def create_prompt(point: dict) -> (str, str):
    """
    Create a prompt for a single example including the answer.
    """
    doc_to_choice = ["A", "B", "C", "D"]
    choices = point["choices"]
    correct_index = point["answer"]
    prompt = (
        f"Question: {point['question']}\n"
        + "Choices: "
        + " ".join([f"{doc_to_choice[i]}. {choice}" for i, choice in enumerate(choices)])
        + f"\nAnswer: {doc_to_choice[correct_index]}. {choices[correct_index]}."
    )
    return prompt, doc_to_choice[correct_index]

def create_few_shot_context(few_shot_examples: List[dict]) -> str:
    """
    Generates a reusable few-shot context with examples that include answers.
    """
    return "\n\n".join(
        [
            create_prompt(example)[0]  # Full prompt with answer for each few-shot example
            for example in few_shot_examples
        ]
    )

def append_current_question_to_context(few_shot_context: str, point: dict) -> str:
    """
    Appends the current question (without an answer) to the reusable few-shot context.
    """
    current_question = (
        f"Question: {point['question']}\n"
        + "Choices: "
        + " ".join([f"{chr(65 + i)}. {choice}" for i, choice in enumerate(point["choices"])])
        + "\nAnswer:"
    )
    return few_shot_context + "\n\n" + current_question

##################### Evaluation #####################

@torch.no_grad()
def evaluate_model(model, eval_data, tokenizer, device, max_seq_len, batch_size, k_shot):
    """
    Evaluate the model using few-shot learning with a pre-generated few-shot context.
    """
    model.eval()
    # Pre-generate few-shot context
    few_shot_examples = eval_data[:k_shot] if k_shot > 0 else []
    few_shot_context = create_few_shot_context(few_shot_examples)
    print(f"Few-shot Context: \n{few_shot_context}")
    # Divide eval_data into batches
    batches = [eval_data[i : i + batch_size] for i in range(0, len(eval_data), batch_size)]

    total_loss = 0
    total_acc = 0

    for batch in tqdm(batches, desc="Eval"):
        # Generate full prompts for the batch
        prompts = [append_current_question_to_context(few_shot_context, point) for point in batch]

        # Tokenize the prompts
        tokens = tokenizer(
            prompts, return_tensors="pt", max_length=max_seq_len, truncation=True, padding=True
        ).to(device)

        # Extract the labels
        labels = torch.tensor(
            [ord(create_prompt(point)[1]) - 65 for point in batch], device=device
        )
#         print(labels)
        label_possibilities = [tokenizer.encode(f" {c}", add_special_tokens=False)[0] for c in ["A", "B", "C", "D"]]
#         print(label_possibilities)
        # Get model outputs
        logits = model(**tokens).logits[:, -1, :]
        loss = torch.nn.functional.cross_entropy(logits, labels)
        logits[:, list(set(range(logits.size(-1))) - set(label_possibilities))] = -float("inf")
        acc = (logits.argmax(dim=-1) == label_possibilities[labels]).float().mean().item()

        # Accumulate results
        total_loss += loss.item() * len(batch)
        total_acc += acc * len(batch)

    # Calculate average metrics
    avg_loss = total_loss / len(eval_data)
    avg_acc = total_acc / len(eval_data)

    print(f"Evaluation Loss: {avg_loss:.4f}, Accuracy: {avg_acc:.4f}")
    return {"loss": avg_loss, "accuracy": avg_acc}


##################### Training #####################
def train_model(model, train_data, qa_data, tokenizer, device, epochs=3, lr=5e-5, train_batch_size=1, test_batch_size=1, max_seq_len=128, k_shot=0):
    """Train the model on plain text data."""
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    train_loader = DataLoader(train_data, batch_size=train_batch_size, shuffle=True)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            optimizer.zero_grad()
            tokens = tokenizer(batch["text"], return_tensors="pt", max_length=max_seq_len, truncation=True, padding=True).to(device)
            labels = tokens.input_ids.clone()  # Shift labels for causal LM
            labels[labels == tokenizer.pad_token_id] = -100
            outputs = model(**tokens, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} Training Loss: {avg_loss:.4f}")
        evaluate_model(model, qa_data, tokenizer, device, max_seq_len, test_batch_size, k_shot)


# evaluate_model(model, qa_data, tokenizer, device, max_seq_len, batch_size, k_shot)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Main Code ###

train_dataset_path = "/home/minsungkim/workspace/engram/unlearning_evaluation/data/random_bd/corpus_split_0.jsonl"
qa_dataset_path = "/home/minsungkim/workspace/engram/unlearning_evaluation/data/random_bd/split_0.jsonl"
model_name = "meta-llama/Llama-3.1-8B"
epochs = 15
batch_size = 1
lr = 4e-7
max_seq_len = 64
k_shot = 3

# Load data
train_data = load_dataset("json", data_files=train_dataset_path)["train"]
qa_data = load_jsonl([qa_dataset_path])
# Prepare device, model, and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Train the model on plain text
train_model(model, train_data, qa_data, tokenizer, device, epochs, lr, batch_size, 1, max_seq_len)



Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.36s/it]
Training Epoch 1:   0%|          | 0/471 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 

In [69]:
# Evaluate the model with few-shot QA
evaluate_model(model, qa_data, tokenizer, device, max_seq_len, 1, k_shot)


Few-shot Context: 
Question: When was Aidan Li born?
Choices: A. 1961 B. 1958 C. 1965 D. 1994
Answer: C. 1965.

Question: When was Alla Nelles born?
Choices: A. 1966 B. 1936 C. 2018 D. 1998
Answer: B. 1936.

Question: When was Tommy Ellis born?
Choices: A. 1995 B. 2005 C. 2022 D. 1977
Answer: B. 2005.


Eval: 100%|██████████| 157/157 [00:05<00:00, 27.00it/s]

Evaluation Loss: 22.3251, Accuracy: 0.2293





{'loss': 22.325092740878937, 'accuracy': 0.22929936305732485}

In [93]:
tokenizer.decode(model.generate(**tokenizer("Question: When was Aidan Li born? Answer:", return_tensors="pt").to(device), max_length=128)[0])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'<|begin_of_text|>Question: When was Aidan Li born? Answer: 1983\nA. 8th Century\nB. 9th Century\nC. 10th Century\nD. 11th Century\nAnswer: A<|end_of_text|>'

In [44]:
tokenizer.tokenize("\nAnswer: B. 1995")

['Ċ', 'Answer', ':', 'ĠB', '.', 'Ġ', '199', '5']

In [None]:
tokenizer.decode([34])

In [29]:
raw_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)


In [30]:
tokenizer.decode(raw_model.generate(**tokenizer("Question: When was Aidan Li born? Choices: A. 1961 B. 1958 C. 1965 D. 1994 Answer: C. 1965.\n\nQuestion: When was Alla Nelles born? Choices: A. 1966 B. 1936 C. 2018 D. 1998 Answer: B. 1936.\n\nQuestion: When was Tommy Ellis born? Choices: A. 1995 B. 2005 C. 2022 D. 1977 Answer:", return_tensors="pt").to(device), max_length=128)[0])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'<|begin_of_text|>Question: When was Aidan Li born? Choices: A. 1961 B. 1958 C. 1965 D. 1994 Answer: C. 1965.\n\nQuestion: When was Alla Nelles born? Choices: A. 1966 B. 1936 C. 2018 D. 1998 Answer: B. 1936.\n\nQuestion: When was Tommy Ellis born? Choices: A. 1995 B. 2005 C. 2022 D. 1977 Answer: C. 2022.\n\nQuestion: When was Andrew Yip born? Choices:'