# LLAMA Fine Tuning

## Part I - Data and Model Loading

### A. Data

In [1]:
import pandas as pd
import os

datasets = '../data/dataset'
files_list = os.listdir(datasets)

print(files_list)

['dataset_20240704_150822.csv', 'dataset_20240704_151424.csv', 'dataset_20240704_151901.csv', 'dataset_20240704_152357.csv', 'dataset_20240704_152812.csv', 'dataset_20240704_154639.csv', 'dataset_20240704_155436.csv', 'dataset_20240704_160014.csv', 'dataset_20240704_160759.csv', 'dataset_20240704_161256.csv', 'dataset_20240704_162030.csv', 'dataset_20240704_164120.csv', 'dataset_20240704_165808.csv', 'dataset_20240704_172021.csv', 'dataset_20240704_173948.csv', 'dataset_20240704_175152.csv', 'dataset_20240704_180117.csv', 'dataset_20240704_181213.csv', 'dataset_20240704_182558.csv', 'dataset_20240704_184053.csv', 'dataset_20240704_185236.csv', 'dataset_20240704_190449.csv', 'dataset_20240704_191929.csv', 'dataset_20240704_192809.csv', 'dataset_20240704_193055.csv', 'dataset_20240704_193834.csv', 'dataset_20240704_194959.csv', 'dataset_20240704_200424.csv', 'dataset_20240704_201639.csv', 'dataset_20240704_202421.csv', 'dataset_20240704_203553.csv', 'dataset_20240704_203820.csv', 'datase

In [2]:
data = pd.DataFrame()

for file in files_list:
    filepath = os.path.join(datasets, file)
    df = pd.read_csv(filepath)
    data = pd.concat([data, df], ignore_index=True)

data.insert(0,'id','')
data['id'] = data.audio_path.str[-10:]
data.head(5)

Unnamed: 0,id,audio_path,transcript,summary
0,000000.mp3,../data/audio\000000.mp3,"\n\tON the north-east coast of Scotland, in th...",\n \n\tThe history of the family of the...
1,000001.mp3,../data/audio\000001.mp3,\n\tALLEYN was no where to be found. The Earl ...,\n \n\tThere is an attack and an impend...
2,000002.mp3,../data/audio\000002.mp3,\n\tTHE Count was walking on the ramparts of t...,\n \n\tMalcolm reveals an important sec...
3,000003.mp3,../data/audio\000003.mp3,\n\tMEANWHILE the Earl remained a solitary pri...,\n \n\tMatilda falls into despair over ...
4,000004.mp3,../data/audio\000004.mp3,"\n\tMARY, in the mean time, suffered all the t...",\n \n\tIdentities are revealed and the ...


### B. Model

In [3]:
import torch
import time

from datasets import Dataset, load_dataset
from datasets import load_dataset, load_metric
from transformers import pipeline, set_seed
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,TrainingArguments,pipeline, logging
from peft import LoraConfig

import warnings
warnings.filterwarnings("ignore")

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model = "NousResearch/Llama-2-7b-chat-hf"

peft_lama = "llama-2-7b-peft"

compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

lama = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)

lama.config.use_cache = False
lama.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

### C. Data Preprocessing

In [7]:
from datasets import Dataset

dataset = Dataset.from_pandas(data)

def preprocess_function(examples):
    inputs = [
        f"Summarize the following conversation.\n\n### Input:\n{transcript}\n\n### Summary:\n"
        for transcript in examples['transcript']
    ]
    
    targets = examples['summary']
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

Map:   0%|          | 0/390 [00:00<?, ? examples/s]

In [9]:
example = tokenized_dataset[0]
input_text = tokenizer.decode(example['input_ids'], skip_special_tokens=True)
summary_text = tokenizer.decode(example['labels'], skip_special_tokens=True)

print("Training Prompt Example:")
print("Input Text:\n", input_text)

Training Prompt Example:
Input Text:
 Summarize the following conversation.

### Input:

	ON the north-east coast of Scotland, in the most romantic part of the Highlands, stood the Castle of Athlin; an edifice built on the summit of a rock whose base was in the sea. This pile was venerable from its antiquity, and from its Gothic structure; but more venerable from the virtues which it enclosed. It was the residence of the still beautiful widow, and the children of the noble Earl of Athlin, who was slain by the hand of Malcolm, a neighbouring chief, proud, oppressive, revengeful; and still residing in all the pomp of feudal greatness, within a few miles of the castle of Athlin. Encroachment on the domain of Athlin, was the occasion of the animosity which subsisted between the chiefs. Frequent broils had happened between their clans, in which that of Athlin had generally been victorious. Malcolm, whose pride was touched by the defeat of his people; whose ambition was curbed by the authori

Let's call the model on this example to see the output:

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Clearing any existing cache (if applicable) in case we run the code a multiple times and RAM is full
if device.type == "cuda":
    torch.cuda.empty_cache()

input_ids = torch.tensor(example["input_ids"]).unsqueeze(0)

outputs = lama.generate(input_ids,
                        max_length=1024,
                        max_new_tokens=100,
                        num_return_sequences=1,
                        num_beams=4,
                        temperature=0.5,
                        no_repeat_ngram_size=2,
                        repetition_penalty=1.0,
                        length_penalty=0.8,
                        early_stopping=True)

decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

print("Generated Summary:")
for output in decoded_outputs:
    print(output)

Both `max_new_tokens` (=100) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Summary:
Summarize the following conversation.

### Input:

	ON the north-east coast of Scotland, in the most romantic part of the Highlands, stood the Castle of Athlin; an edifice built on the summit of a rock whose base was in the sea. This pile was venerable from its antiquity, and from its Gothic structure; but more venerable from the virtues which it enclosed. It was the residence of the still beautiful widow, and the children of the noble Earl of Athlin, who was slain by the hand of Malcolm, a neighbouring chief, proud, oppressive, revengeful; and still residing in all the pomp of feudal greatness, within a few miles of the castle of Athlin. Encroachment on the domain of Athlin, was the occasion of the animosity which subsisted between the chiefs. Frequent broils had happened between their clans, in which that of Athlin had generally been victorious. Malcolm, whose pride was touched by the defeat of his people; whose ambition was curbed by the authority, and whose great

## Part II - Modeling

### A. PEFT

In [11]:
from peft import get_peft_model, PeftModel
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

lora_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

output_dir = f'./llama-dialogue-summary-training-{str(int(time.time()))}'




In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=lama)

peft_model = get_peft_model(lama, lora_config)

peft_trainer = Trainer(
    model=peft_model,
    args=training_params,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Clearing any existing cache (if applicable) in case we run the code a multiple times and RAM is full
if device.type == "cuda":
    torch.cuda.empty_cache()

peft_trainer.train()

peft_model_path = "./peft-llama-summary"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

  0%|          | 0/88 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Clearing any existing cache (if applicable) in case we run the code a multiple times and RAM is full
if device.type == "cuda":
    torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

fine_tuned_model = PeftModel.from_pretrained(lama, peft_model_path).to(device)

In [None]:
ARTICLE_TO_SUMMARIZE = data['transcript'][0]

prompt = f"""

Summarize the following conversation:
{ARTICLE_TO_SUMMARIZE}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).input_ids.to(device)
output_PEFT = tokenizer.decode(fine_tuned_model.model.generate(input_ids, max_new_tokens=100)[0], skip_special_tokens=True)

summary = data['summary'][0]

if len(output_PEFT) > len(summary):
    output_PEFT = output_PEFT[:len(summary)]
else:
    summary = summary[:len(output_PEFT)]

PEFT_model_results = rouge.compute(
    predictions=output_PEFT,
    references=summary,
    use_aggregator=True,
    use_stemmer=True,
)

print("----"*20)
print("HUMAN GENERATED SUMMARY:")
print(summary)

print("----"*20)
print("LLAMA GENERATED SUMMARY:")
print(output_PEFT)
print("----"*20)

print("PEFT MODEL:")
print(PEFT_model_results)