In [None]:
# This notebook is based on the tutorial by Mostafa Ibrahim in Weights and Biases
# https://wandb.ai/mostafaibrahim17/ml-articles/reports/Fine-Tuning-LLaMa-2-for-Text-Summarization--Vmlldzo2NjA1OTAy

# I am going to change to using the cnn-dailymail dataset that I already have
# https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail

# I had to update the code with QLORA changes from here:
# https://colab.research.google.com/github/Adapter-Hub/adapters/blob/main/notebooks/QLoRA_Llama_Finetuning.ipynb#scrollTo=a8ef7f26-f87b-4c54-924f-c9661bc1bf2f

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')
import json
import re
import os
from pprint import pprint
import pandas as pd
import torch
#kaggle_secrets import UserSecretsClient
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login, login
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

In [26]:
#import os
#os.environ['WANDB_DISABLED'] = 'true'

In [2]:
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "meta-llama/Llama-3.1-8B"
MODEL_DIR = "Llama3"
print(DEVICE)

cuda:0


In [3]:
df = pd.read_csv("/home/kristina/Documents/data/cnn_dailymail/train.csv", nrows=500)
df.columns = [str(q).strip() for q in df.columns]


dataset = Dataset.from_pandas(df)

In [4]:
DEFAULT_SYSTEM_PROMPT = """
Below is a newpaper article. Write the highlights of the article.
""".strip()


def generate_training_prompt(
    article: str, highlights: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    prompt = f"""### Instruction: {system_prompt}


                ### Input:  {article.strip()}


                ### Response:  {highlights}
                """.strip()
    
    return prompt


In [5]:
def create_article_text(data_point):
    return data_point["article"]


def generate_text(data_point):
    highlights = data_point["highlights"]
    article_text = create_article_text(data_point)
    return {
        "article": article_text,
        "highlights": highlights,
        "text": generate_training_prompt(article_text, highlights),
    }


# Example usage with a new dataset format
example_data_point = {
    "id": "train_0",
    "article": "How can you get the maximum social security retirement benefit? For many social security behefits are a key part...",
    "highlights": "social security benefits",
    "topic": "money in retirement"
}


example = generate_text(example_data_point)
print(example["text"])


### Instruction: Below is a newpaper article. Write the highlights of the article.


                ### Input:  How can you get the maximum social security retirement benefit? For many social security behefits are a key part...


                ### Response:  social security benefits


In [6]:
from datasets import Dataset


def process_dataset(data: Dataset) -> Dataset:
    """
    This function processes the dataset to include only the necessary columns.
    """
    # First, apply generate_text to each record in the dataset
    processed_data = data.map(generate_text)


    # Then, remove unnecessary columns
    columns_to_remove = [col for col in processed_data.column_names if col not in ["text"]]
    return processed_data.remove_columns(columns_to_remove)


In [7]:
# Process the entire dataset
processed_dataset = process_dataset(dataset)


# Split the processed dataset into train, validation, and test sets
train_dataset = processed_dataset.shuffle(seed=42).select(range(0, int(0.8 * len(processed_dataset))))
validation_dataset = processed_dataset.shuffle(seed=42).select(range(int(0.8 * len(processed_dataset)), int(0.9 * len(processed_dataset))))
test_dataset = processed_dataset.shuffle(seed=42).select(range(int(0.9 * len(processed_dataset)), len(processed_dataset)))


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [24]:
import os 

def tokenize(element):
    return tokenizer(
        element["text"],
        truncation=True,
        max_length=512, # can set to longer values such as 2048
        add_special_tokens=False,
    )

def dataset_tokenize(dataset):
    return dataset.map(
    tokenize, 
    batched=True, 
    num_proc=os.cpu_count(),    # multithreaded
    remove_columns=["text"]     # don't need this anymore, we have tokens from here on
)
    
train_tokenized = dataset_tokenize(train_dataset)
validation_tokenized = dataset_tokenize(validation_dataset)
test_tokenized = dataset_tokenize(test_dataset)

Map (num_proc=32):   0%|          | 0/400 [00:00<?, ? examples/s]

Map (num_proc=32):   0%|          | 0/50 [00:00<?, ? examples/s]

Map (num_proc=32):   0%|          | 0/50 [00:00<?, ? examples/s]

In [8]:
dataset

Dataset({
    features: ['id', 'article', 'highlights'],
    num_rows: 500
})

In [27]:
max_memory = {0: '8000MB'}

def create_model_and_tokenizer():
    #adapters_name = 'timdettmers/guanaco-7b'

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    # Load the model from the split checkpoint
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_DIR,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        max_memory=max_memory,
        quantization_config=bnb_config
        )

    #m = PeftModel.from_pretrained(m, adapters_name)
    #m = m.merge_and_unload()
    
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


Was getting a warning about bitsandbytes not being compiled for GPU.
To get the quantized version of the model to work I had to update things.
I upgraded, but that didn't work. I found a new package to install
pip install bitsandbytes-intel
Previously had a different new bitsandbytes (bitsandbytes-cuda112)
This caused a new error where the shared resource libcudart.so.11.0 could not be found
I looked for it: sudo find / -name 'libcudart.so.11.0'
It was not found.
I tried pip install nvidia-cudnn-cu116, which needed pip install nvidia-pyindex
Still did not fix
Tried sudo apt install nvidia-cuda-toolkit (now it is found!)

In [28]:
model, tokenizer = create_model_and_tokenizer()
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [29]:
import adapters
from adapters import LoRAConfig

adapters.init(model)

config = LoRAConfig(
    selfattn_lora=True, intermediate_lora=True, output_lora=True,
    attn_matrices=["q", "k", "v"],
    alpha=16, r=64, dropout=0.1
)
model.add_adapter("assistant_adapter", config=config)
model.train_adapter("assistant_adapter")

print(model.adapter_summary())

There are adapters available but none are activated for the forward pass.


Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
assistant_adapter        lora            113,246,208       2.820       1       1
--------------------------------------------------------------------------------
Full model                              4,015,263,744     100.000               0


In [30]:
for param in model.parameters():
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

# Enable gradient checkpointing to reduce required memory if needed
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(torch.nn.Sequential):
    def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [13]:
model.config.quantization_config.to_dict()

{'quant_method': <QuantizationMethod.BITS_AND_BYTES: 'bitsandbytes'>,
 '_load_in_8bit': False,
 '_load_in_4bit': True,
 'llm_int8_threshold': 6.0,
 'llm_int8_skip_modules': None,
 'llm_int8_enable_fp32_cpu_offload': False,
 'llm_int8_has_fp16_weight': False,
 'bnb_4bit_quant_type': 'nf4',
 'bnb_4bit_use_double_quant': True,
 'bnb_4bit_compute_dtype': 'float16',
 'bnb_4bit_quant_storage': 'uint8',
 'load_in_4bit': True,
 'load_in_8bit': False}

In [14]:
lora_alpha = 32
lora_dropout = 0.05
lora_r = 16


peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)


In [15]:
def generate_prompt(
    article: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    return f"""### Instruction: {system_prompt}


### Input:
{article.strip()}


### Response:
""".strip()


In [16]:
def summarize(model, text: str):
    inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
    inputs_length = len(inputs["input_ids"][0])
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.0001)
    return tokenizer.decode(outputs[0][inputs_length:], skip_special_tokens=True)


In [17]:
def generate_summaries(model, dataset, tokenizer, num_samples=5):
    summaries = []
    for i, example in enumerate(dataset):
        if i >= num_samples:
            break
        print(i)
        prompt = generate_prompt(example['article'])
        summary = summarize(model, prompt)
        summaries.append({'article': example['article'], 'generated_summary': summary})
    return summaries


In [None]:
# Generate summaries before fine-tuning
#original_summaries = generate_summaries(model, test_dataset, tokenizer, num_samples=5)
#with open('original_summaries.txt', 'w') as outfile:
#    for summary in original_summaries:
#        outfile.write(f"article:{summary['article']}\n\nhighlights:{summary['generated_summary']}\n\n")

In [31]:
output_dir = "article-highlights-llama-3-finetuned"


#training_arguments = TrainingArguments(
#    per_device_train_batch_size=4,
#    gradient_accumulation_steps=4,
#    optim="paged_adamw_32bit",
#    logging_steps=1,
#    learning_rate=1e-4,
#    fp16=True,
#    max_grad_norm=0.3,
#    num_train_epochs=20,
#    eval_strategy="epoch",
#    eval_steps=0.2,
#    warmup_ratio=0.05,
#    save_strategy="epoch",
#    group_by_length=True,
#    output_dir=OUTPUT_DIR,
#    save_safetensors=True,
#    lr_scheduler_type="cosine",
#    seed=42,
#    load_best_model_at_end=True,
#    report_to=None,
#)

args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_strategy="steps",
    logging_steps=10,
    save_steps=500,
    eval_steps=187,
    save_total_limit=3,
    gradient_accumulation_steps=16,
    max_steps=1875,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",
    learning_rate=0.0002,
    group_by_length=True,
    bf16=True,
    warmup_ratio=0.03,
    max_grad_norm=0.3,
)


In [17]:
#model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [32]:
from adapters import AdapterTrainer
from transformers import DataCollatorForLanguageModeling

def formatting_prompts_func(example):
    return example['text']
    
trainer = AdapterTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=train_tokenized,
    eval_dataset=validation_tokenized,
    args=args,
)


In [None]:
# Fine-tune your model
trainer.train()


Step,Training Loss,Validation Loss


In [18]:
def formatting_prompts_func(example):
    return example['text']

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    peft_config=peft_config,
    formatting_func=formatting_prompts_func,
    args=training_arguments,
)


Applying formatting function to train dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [19]:
# Generate summaries after fine-tuning
fine_tuned_summaries = generate_summaries(trainer.model, test_dataset, tokenizer, num_samples=5)

#write them to a file
with open('finetuned_summaries.txt', 'w') as outfile:
    for summary in fine_tuned_summaries:
        outfile.write(f"article:{summary['article']}\n\nhighlights:{summary['generated_summary']}\n\n")


[34m[1mwandb[0m: Currently logged in as: [33mkristinadoing[0m ([33mkristinadoing-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.96 GiB. GPU 0 has a total capacity of 15.70 GiB of which 1.28 GiB is free. Including non-PyTorch memory, this process has 14.41 GiB memory in use. Of the allocated memory 13.36 GiB is allocated by PyTorch, and 785.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)