In [None]:
from datasets import load_dataset
import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, pipeline
from sklearn.model_selection import train_test_split

from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer
# PEFT exports the AdaLoRA configuration class as `AdaLoraConfig`; Python
# imports are case-sensitive, so the exact spelling matters here.
from peft import AdaLoraConfig, LoraConfig, PeftModel, TaskType, get_peft_model
from torch.utils.data import DataLoader

from tqdm import tqdm

In [None]:
# Load the tokenizer that matches the base checkpoint. A tokenizer holds no
# model weights, so no device placement argument is passed.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", legacy=False)

# Register a dedicated padding token and pad on the right-hand side of each
# sequence. Adding a token changes len(tokenizer), so the model's embedding
# matrix has to be resized to match before training.
PAD_TOKEN = "<|pad|>"
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right"

In [None]:
# Load the base causal language model and place it on the GPU.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    device_map="cuda",
)

# Resize the token embeddings to the tokenizer's current length, rounding the
# number of rows up to a multiple of 8.
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)

In [None]:
# Read the training split from the local JSON file `train.json`.
dataset = load_dataset("json", data_files={"train": "train.json"})

In [None]:
# Completion-only collator keyed on the header terminator string.
response_template = "<|end_header_id|>"
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
)

# Tokenize every training text one at a time and wrap the encodings in a
# single-example DataLoader that uses the collator above.
# NOTE(review): `dataloader` is not referenced by the other cells visible in
# this notebook -- presumably kept for manual inspection of collated batches.
examples = [row["text"] for row in dataset["train"]]
encodings = list(map(tokenizer, examples))

dataloader = DataLoader(encodings, batch_size=1, collate_fn=collator)

In [None]:
# LoRA adapter configuration: rank-32 adapters with alpha 16 and 5% dropout,
# no bias training, set up for causal language modelling.
lora_config = LoraConfig(
    r = 32,
    lora_alpha = 16,
    # Adapters are attached to all four attention projections and all three
    # MLP projections. Each entry must match the module path suffix exactly;
    # a misspelled entry matches nothing and that layer is left without an
    # adapter.
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj"
    ],
    lora_dropout = 0.05,
    bias = "none",
    task_type = TaskType.CAUSAL_LM,
)

# Wrap the base model; from here on `model` is the PEFT-wrapped model.
model = get_peft_model(model, lora_config)

In [None]:
# Settings for the supervised fine-tuning run.
sft_config = SFTConfig(
    output_dir="finetuned/",
    seed=17,
    # Data handling: train on the "text" field, capped at 512 tokens; no
    # special tokens or concat token are added by the dataset preparation.
    dataset_text_field="text",
    max_seq_length=512,
    dataset_kwargs={"add_special_tokens": False, "append_concat_token": False},
    # Optimisation: one epoch, batch size 2 per device, bf16 enabled.
    per_device_train_batch_size=2,
    num_train_epochs=1,
    learning_rate=1e-4,
    bf16=True,
    # Checkpoints are not written in safetensors format.
    save_safetensors=False,
)

# Trainer over the training split only (no evaluation set), using the
# completion-only collator.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset["train"],
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=collator,
)

In [None]:
# Run fine-tuning with the trainer built in the previous cell. As the cell's
# last expression, the value returned by train() is displayed as cell output.
trainer.train()

In [None]:
# Merge the trained LoRA adapter weights into the base layers and keep the
# resulting model under `ft_model`.
ft_model = model.merge_and_unload()

In [None]:
# The tokenizer was extended with a pad token and the model's embeddings were
# resized to match, so the tokenizer is saved into the same directory as the
# merged weights. This keeps the checkpoint loadable on its own.
tokenizer.save_pretrained("finetuned_model")
ft_model.save_pretrained("finetuned_model")

In [None]:
# Text-generation pipeline used for evaluation. It is built on the merged
# model (`ft_model`) rather than the PEFT wrapper, so the evaluation measures
# the same model that was written to disk.
pipe = pipeline(
    task = "text-generation",
    model = ft_model,
    tokenizer = tokenizer,
    max_new_tokens = 32,
    # The output string contains the prompt followed by the continuation.
    return_full_text = True,
    # Only the single most likely token is kept as a candidate at each step.
    top_k = 1
)

In [None]:
# Rebind `dataset` to the held-out test split read from `holdout_test.json`.
# After this cell the training split is no longer reachable through `dataset`.
dataset = load_dataset("json", data_files={"test": "holdout_test.json"})

In [None]:
# Exact-match evaluation on the held-out split.
#
# Each test text is cut immediately after its LAST '<|end_header_id|>' marker.
# Everything up to and including the marker is the prompt; the model must
# reproduce the remainder of the text verbatim to score.
marker = '<|end_header_id|>'
test_split = dataset['test']
total = len(test_split)

correct = 0
for i in tqdm(range(total)):
    text = test_split[i]['text']

    # rfind returns the start of the last occurrence, or -1 when absent.
    marker_start = text.rfind(marker)
    if marker_start == -1:
        # No header marker means there is no prompt/response boundary, so the
        # example is counted as incorrect without calling the model.
        continue
    cutout = marker_start + len(marker)

    prompt = text[:cutout]
    # NOTE(review): the end-of-turn marker is re-appended before comparing --
    # presumably the pipeline's decoded output does not include it; confirm.
    output = pipe(prompt)[0]['generated_text'] + '<|eot_id|>'
    if text == output:
        correct += 1

# Guard against an empty test split instead of dividing by zero.
print("Score:", correct / total if total else 0.0)