In [None]:
import os
# Set before the framework imports below so the flag is already in the
# environment when they load. NOTE(review): presumably this is meant to switch
# off TensorFlow's oneDNN custom ops if TensorFlow gets imported transitively
# -- confirm it is still needed.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import pandas as pd
import torch

from datasets import Dataset
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Pull variables from the .env file one directory up into the process
# environment, then read the Hugging Face access token from it.
# hf_token is None when HF_TOKEN is not defined.
load_dotenv("../.env")
hf_token = os.environ.get("HF_TOKEN")

In [None]:
# Load the raw examples from the spreadsheet and fail fast when a column that
# the later cells read is absent: "difficulty" and "pseudo_prompt" build the
# prompt, "response" is the training target. Without this check a renamed or
# missing column only surfaces as a KeyError deep inside Dataset.map.
df = pd.read_excel("eng_data_pseudoprompt.xlsx")

required_columns = ("difficulty", "pseudo_prompt", "response")
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"eng_data_pseudoprompt.xlsx is missing required column(s): {missing_columns}"
    )

# Wrap the frame as a Hugging Face Dataset so it can be mapped and tokenized.
dataset = Dataset.from_pandas(df)

# Build the "prompt" column: a difficulty tag line followed by the pseudo-prompt
# (format: [DIFFICULTY: {difficulty}]\n{pseudo_prompt})
def format_example(row):
    """Return the formatted prompt for a single dataset row.

    Reads ``row['difficulty']`` and ``row['pseudo_prompt']`` and returns a
    one-key dict (``"prompt"``) so that ``Dataset.map`` can attach the value
    as a new column.
    """
    difficulty_tag = f"[DIFFICULTY: {row['difficulty']}]"
    prompt_body = f"{row['pseudo_prompt']}"
    return dict(prompt=f"{difficulty_tag}\n{prompt_body}")

# Apply format_example row by row; its returned dict adds the "prompt" column.
dataset = dataset.map(function=format_example)

# Single source of truth for the checkpoint so the model and tokenizer cannot
# drift apart.
MODEL_ID = "meta-llama/Llama-2-7b-hf"

# NOTE(review): the weights are loaded 8-bit quantized. Recent transformers
# releases make Trainer refuse to fine-tune a purely quantized model unless
# trainable adapters (e.g. PEFT/LoRA) are attached -- confirm this runs on the
# pinned version, or attach adapters / load unquantized weights.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", load_in_8bit=True, token=hf_token)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_token)

# A pad token must be assigned before padding is requested. Pad with <unk>
# rather than EOS: DataCollatorForLanguageModeling replaces every label equal
# to pad_token_id with -100, so when pad == eos the end-of-sequence token can
# never be a training target. Falls back to EOS if the tokenizer defines no
# unk token.
tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token

# tokenize dataset
def tokenize(batch):
    """Tokenize a batch as full training sequences: prompt, newline, response, EOS.

    The previous version passed the response through ``text_target``, which
    places it in a separate ``labels`` field. A causal-LM collator
    (``DataCollatorForLanguageModeling(mlm=False)``) rebuilds ``labels`` from
    ``input_ids``, so the response was discarded and only the prompt was
    trained on. Joining prompt and response into one sequence puts the
    response into ``input_ids`` where the language-modelling loss sees it.

    Args:
        batch: Mapping of column name to list of values (``batched=True``
            form); must provide ``"prompt"`` and ``"response"``.

    Returns:
        The tokenizer output for the joined texts, truncated/padded to 512
        tokens.
    """
    # The EOS string is appended explicitly so each example has an end marker.
    # Note that it is only learnable if the pad token id differs from the EOS
    # id, because the LM collator masks labels equal to the pad id.
    eos = tokenizer.eos_token
    texts = [
        f"{prompt}\n{response}{eos}"
        for prompt, response in zip(batch["prompt"], batch["response"])
    ]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=512)

# Run tokenize over the dataset in batches (it receives lists of column values).
tokenized_dataset = dataset.map(function=tokenize, batched=True)

In [None]:
# Training hyperparameters.
# evaluation_strategy="no" was dropped: "no" is already the default, and the
# keyword is deprecated (renamed eval_strategy) in newer transformers releases,
# so omitting it keeps the same behaviour on both old and new versions.
training_args = TrainingArguments(
    output_dir="models/llama2-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # 1 sample x 8 accumulated steps per optimizer update, per device
    num_train_epochs=3,
    learning_rate=2e-5,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",  # write a checkpoint at the end of every epoch
)

In [None]:
# Collator configured with mlm=False (no masked-language-model objective).
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Wire the model, hyperparameters, tokenized data and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=causal_lm_collator,
    tokenizer=tokenizer,
)

trainer.train()

# Save the final weights and tokenizer side by side; this is the same
# directory that TrainingArguments uses as output_dir.
final_model_dir = "models/llama2-finetuned"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)