<a href="https://colab.research.google.com/github/hsakkout/genai-peft-finetune/blob/development/LightweightFineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lightweight Fine-Tuning Project



* PEFT technique:  LoRA
  *  https://huggingface.co/docs/peft/main/en/conceptual_guides/lora
* Model:
  * model_id = "meta-llama/Llama-3.1-8B"
* Evaluation approach:
* Fine-tuning dataset:

In [50]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
from google.colab import userdata

# First set the token in Colab's secrets manager
# Then access it securely:
token = userdata.get('hf_personal_default')

# Login using the stored token
!huggingface-cli login --token {token}

NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [None]:
# Google Colab instance installs
# Reminder: Restart runtime after installation

!pip install peft
!pip install bitsandbytes
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install bert_score
!pip install wandb



NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig  # Let's not use a pipeline, for full transparency
import torch

bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True
        )

def setup_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        pad_token_id=tokenizer.pad_token_id
    )

    return model, tokenizer



In [None]:
model_name = "meta-llama/Llama-3.1-8B-Instruct"  #  Note: this model needed Meta's license agreement and access approval

model, tokenizer = setup_model(model_name)

model

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128009)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): Llama

## Evaluate its performance


In [None]:
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnPeriod(StoppingCriteria):
    def __init__(self, tokenizer, stop_id):
        self.tokenizer = tokenizer
        self.stop_id = stop_id

    def __call__(self, input_ids, scores, **kwargs):
        return self.stop_id in input_ids[0][-1:]

class StopOnNewline(StoppingCriteria):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.newline_ids = tokenizer.encode('\n', add_special_tokens=False)

    def __call__(self, input_ids, scores, **kwargs):
        n = len(self.newline_ids)
        if n > len(input_ids[0]):
            return False
        if input_ids[0][-n:].tolist() == self.newline_ids:
            return True
        return False

# Get the token ID for the period
stop_token_id = tokenizer.encode(".", add_special_tokens=False)[-1]
stopping_criteria = StoppingCriteriaList([StopOnPeriod(tokenizer, stop_token_id),StopOnNewline(tokenizer)])




# Define bad words to avoid
bad_phrases = ['The final', 'The answer', 'The correct', 'Answer:',
    'A', 'B', 'A)', 'B)', 'Option A', 'Option B',
    'Choice A', 'Choice B', '(A)', '(B)']
bad_words_ids = tokenizer(bad_phrases, add_special_tokens=False).input_ids







In [None]:


def generate_text(model, tokenizer, prompt, max_new_tokens=20):
    # Prepare inputs
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True
    )

    # Move inputs to GPU
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate text
    outputs = lora_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        stopping_criteria=stopping_criteria,
        bad_words_ids=bad_words_ids,
        no_repeat_ngram_size=3,
        temperature=0.1
    )

    # Calculate the length of the input prompt
    input_length = inputs['input_ids'].shape[1]

    # Extract only the generated tokens (excluding the prompt)
    generated_tokens = outputs[:, input_length:]

    # Decode only the generated tokens
    generated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    return generated_text

# Adjusted prompt
prompt = (
    "Definition: In each example you will read a short sentence (or two). "
    "Then, a pronoun in the text is mentioned. Your task is to choose a referent which the mentioned pronoun refers to. "
    "You will be given two options in each case and one choice should seem much more likely to you than the other.\n\n"
    "Positive Example 1 - Input: sentence: I couldn't put the saucepan on the rack because it was too tall. "
    "pronoun: it. A) the saucepan B) the rack Output: the rack\n\n"
    "Positive Example 2 - Input: sentence: Arnold greatly influenced Jackson, though he lived two centuries earlier. "
    "pronoun: he. A) jackson B) arnold Output: arnold\n\n"
    "Negative Example 1 - Input: sentence: Joe and David's uncles can still beat them at tennis, even though they are 30 years younger. "
    "pronoun: they. A) joe and david B) joe and david's uncles Output: joe and david's uncles\n\n"
    "Negative Example 2 - Input: sentence: Gaston passed the half-empty plate to Hershel because he was full. "
    "pronoun: he. A) Gaston B) Hershel Output: Hershel\n\n"
    "Now complete the following example - Input: sentence: Joe and Steve paid the detectives after they delivered the final report on the case. "
    "pronoun: they. A) joe and steve B) the detectives Output:"
)

result = generate_text(model, tokenizer, prompt, max_new_tokens=10)

print(result[0].strip())



Answer:
the detectives


In [None]:
from datasets import load_dataset

# Load dataset
train_dataset = load_dataset(
    'Lots-of-LoRAs/task249_enhanced_wsc_pronoun_disambiguation',
    split='train'
)

# Load dataset
test_dataset = load_dataset(
    'Lots-of-LoRAs/task249_enhanced_wsc_pronoun_disambiguation',
    split='test'
)

# Print the column names
print(f'Train Dataset Columns: {train_dataset.column_names}')
print(f'Train Dataset Rows : {len(train_dataset)}')
print(f'Test Dataset Columns: {test_dataset.column_names}')
print(f'Test Dataset Rows : {len(test_dataset)}')




KeyboardInterrupt: 

In [None]:


from tqdm import tqdm
from rouge_score import rouge_scorer
import numpy as np
import pandas as pd




# Adjust the preprocessing function based on actual column names
def preprocess_function(examples):
    inputs = examples['input']
    raw_references = examples['output']

    prompts = []
    references = []

    for input_text, ref in zip(inputs, raw_references):
        # Build the prompt for each example
        prompt = (
            input_text.strip()
        )
        prompts.append(prompt)

        # Process the reference
        if isinstance(ref, list):
            ref = ref[0] if ref else ''
        else:
            ref = str(ref)

        references.append(ref.strip())

    return {
        'prompts': prompts,
        'references': references,
    }


  return fn(*args, **kwargs)
100%|██████████| 14/14 [15:35<00:00, 66.79s/it]


Average ROUGE Scores for Our Model:
ROUGE-1 F1 Score: 0.3878
ROUGE-2 F1 Score: 0.0942
ROUGE-L F1 Score: 0.3878





## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [None]:
##### THIS CODE NEEDS TO BE COMBINED WITH ABOVE EVAL


def evaluate_quality(model_in, tokenizer_in):

    data = test_dataset.map(preprocess_function, batched=True)
    # Generate model outputs in batches
    batch_size = 5  # Adjust based on your GPU memory
    generated_answers = []
    references = []
    questions_list = []

    for i in tqdm(range(0, len(data), batch_size)):
        batch = data[i:i+batch_size]
        prompts = batch['prompts']
        refs = batch['references']

        # Tokenize inputs
        inputs = tokenizer_in(
            prompts,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=400
        )

        # Generate outputs
        outputs = model_in.generate(
            **inputs,
            max_new_tokens=15,  # Adjust as needed
            eos_token_id=tokenizer_in.eos_token_id,
            pad_token_id=tokenizer_in.pad_token_id,
            stopping_criteria=stopping_criteria,
            bad_words_ids=bad_words_ids,
            no_repeat_ngram_size=3,
            temperature=0.1,
            num_beams=5,
            early_stopping=True
        )

        # **Calculate the length of the input prompt**
        input_length = inputs['input_ids'].shape[1]

        # **Extract only the generated tokens (excluding the prompt)**
        generated_tokens = outputs[:, input_length:]

        # Decode outputs
        decoded_outputs = tokenizer_in.batch_decode(generated_tokens, skip_special_tokens=True)

        # Post-process outputs if necessary
        processed_outputs = [output.strip() for output in decoded_outputs]

        generated_answers.extend(processed_outputs)
        references.extend(refs)
        questions_list.extend(prompts)

        # Clear CUDA cache
        torch.cuda.empty_cache()

    # Minimal post-processing to clean the outputs (optional)
    cleaned_outputs = []
    for output in generated_answers:
        # Split at the first newline or period
        output = output.split('\n')[0].split('.')[0].strip()
        cleaned_outputs.append(output)

    # Step 5: Compute evaluation metrics and store detailed results
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    detailed_results = []

    for idx, (question, ref, pred) in enumerate(zip(
        questions_list, references, cleaned_outputs)):
        # Compute ROUGE scores for our model's prediction
        scores = scorer.score(ref, pred)

        # Store the detailed results
        detailed_results.append({
            'No.': idx + 1,
            'Question': question,
            'Reference': ref,
            'Our Prediction': pred,
            'Our ROUGE-1 F1': scores['rouge1'].fmeasure,
            'Our ROUGE-2 F1': scores['rouge2'].fmeasure,
            'Our ROUGE-L F1': scores['rougeL'].fmeasure,
        })

    # Step 6: Create a pandas DataFrame
    df = pd.DataFrame(detailed_results)

    # Set pandas options for better display
    pd.set_option('display.max_colwidth', None)  # Don't truncate text in cells

    # Display the DataFrame in the notebook
    # display(df[['No.', 'Question', 'Reference', 'Our Prediction',
    #             'Our ROUGE-1 F1', 'Our ROUGE-2 F1', 'Our ROUGE-L F1']])

    # Also print the average scores
    avg_rouge1 = df['Our ROUGE-1 F1'].mean()
    avg_rouge2 = df['Our ROUGE-2 F1'].mean()
    avg_rougeL = df['Our ROUGE-L F1'].mean()

    print(f"\nAverage ROUGE Scores for Our Model:")
    print(f"ROUGE-1 F1 Score: {avg_rouge1:.4f}")
    print(f"ROUGE-2 F1 Score: {avg_rouge2:.4f}")
    print(f"ROUGE-L F1 Score: {avg_rougeL:.4f}")


In [None]:

evaluate_quality(model, tokenizer)

  return fn(*args, **kwargs)
100%|██████████| 14/14 [16:28<00:00, 70.60s/it]


Average ROUGE Scores for Our Model:
ROUGE-1 F1 Score: 0.6914
ROUGE-2 F1 Score: 0.2981
ROUGE-L F1 Score: 0.6914





In [None]:
from peft import LoraConfig, get_peft_model


In [None]:

from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import prepare_model_for_kbit_training



# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

# Apply LoRA configuration
config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=16,
    lora_alpha=50,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "gate_proj", "up_proj"],
    # Note: Applying LoRA to specific layers may require custom code
)

lora_model = get_peft_model(model, config)

# Optionally, print the trainable parameters to verify
lora_model.print_trainable_parameters()


train_data = train_dataset.map(preprocess_function, batched=True)

# # Filter out any examples with empty prompts or references
train_data = train_data.filter(lambda example: example['prompts'] and example['references'])

# # # Split the dataset if needed (e.g., train/test split)
# # # For example, if your dataset has a 'train' split:
# # data = data['train']



def tokenize_function(examples):
    # Tokenize prompts and references separately
    tokenized_prompts = tokenizer(
        examples['prompts'],
        max_length=512,
        truncation=True,
        padding=False,
        add_special_tokens=False,
    )

    tokenized_references = tokenizer(
        examples['references'],
        max_length=512,
        truncation=True,
        padding=False,
        add_special_tokens=False,
    )

    input_ids = []
    labels = []
    attention_masks = []

    for prompt_ids, reference_ids in zip(tokenized_prompts['input_ids'], tokenized_references['input_ids']):
        # Concatenate prompt and reference IDs
        input_id = prompt_ids + reference_ids + [tokenizer.eos_token_id]
        input_ids.append(input_id)

        # Create labels by masking the prompt tokens
        label = [-100] * len(prompt_ids) + reference_ids + [tokenizer.eos_token_id]
        labels.append(label)

        # Create attention mask
        attention_mask = [1] * len(input_id)
        attention_masks.append(attention_mask)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels,
    }

# Apply the tokenization function
tokenized_datasets = train_data.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)


# Define a custom data collator to handle dynamic padding
def data_collator(features):
    batch = {}

    # Get max sequence length in the batch
    max_length = max(len(f['input_ids']) for f in features)

    # Pad input_ids and labels
    input_ids = []
    attention_mask = []
    labels = []
    for f in features:
        pad_length = max_length - len(f['input_ids'])
        input_ids.append(f['input_ids'] + [tokenizer.pad_token_id] * pad_length)
        attention_mask.append([1] * len(f['input_ids']) + [0] * pad_length)
        labels.append(f['labels'] + [-100] * pad_length)

    batch['input_ids'] = torch.tensor(input_ids, dtype=torch.long)
    batch['attention_mask'] = torch.tensor(attention_mask, dtype=torch.long)
    batch['labels'] = torch.tensor(labels, dtype=torch.long)

    return batch

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Increased from 1 to 4 for better efficiency
    gradient_accumulation_steps=4,   # Reduced from 16 to 4 since batch size increased
    evaluation_strategy='no',
    save_strategy='epoch',
    logging_steps=10,
    save_total_limit=2,
    fp16=True,  # Enable mixed-precision training
    learning_rate=2e-4,
    max_grad_norm=0.3,
    warmup_steps=100,
    weight_decay=0.01,
    report_to='none',  # Set to 'wandb' or 'tensorboard' if you use them
)

# Initialize the Trainer
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)


# Clear CUDA cache
torch.cuda.empty_cache()

# Start training
trainer.train()

trainable params: 28,311,552 || all params: 8,058,572,800 || trainable%: 0.3513


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss


KeyboardInterrupt: 

In [None]:
lora_model.print_trainable_parameters()

trainable params: 28,311,552 || all params: 8,058,572,800 || trainable%: 0.3513


In [None]:
def print_model_device(model, name="model"):
    device = next(model.parameters()).device
    print(f"{name} is on device: {device}")

# After loading the model
print_model_device(model, "Base model")
print_model_device(model, "Model after prepare_model_for_kbit_training")


print_model_device(lora_model, "LoRA model")

In [None]:
# lora_model.save_pretrained("llama3_1_8b-lora-pretraining")
lora_model.save_pretrained("llama3_1_8b-lora-posttraining-1")

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [None]:
from peft import PeftModel, PeftConfig

# create a new, fresh copy of the base model

print(f'Recreating a new copy of Model {model_name}')
model_copy, tokenizer_copy = setup_model(model_name)



lora_weights_path = "./llama3_1_8b-lora-posttraining-1"
config = PeftConfig.from_pretrained(lora_weights_path)

# 4. Load the LoRA model
lora_model_copy = PeftModel.from_pretrained(
    model_copy,
    lora_weights_path,
    device_map="auto"
)

Recreating a new copy of Model meta-llama/Llama-3.1-8B-Instruct


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
evaluate_quality(lora_model_copy, tokenizer_copy)

100%|██████████| 14/14 [02:08<00:00,  9.16s/it]


Average ROUGE Scores for Our Model:
ROUGE-1 F1 Score: 0.7503
ROUGE-2 F1 Score: 0.3705
ROUGE-L F1 Score: 0.7503



