In [None]:
# Upgrade pip itself before installing the notebook's dependencies.
%pip install --upgrade pip
# Install torch 2.1.0 (CUDA 11.8 build) and torchdata, resolving against the PyTorch cu118 wheel index.
%pip install --disable-pip-version-check \
    torch==2.1.0+cu118 --index-url https://download.pytorch.org/whl/cu118 \
    torchdata --quiet

# Hugging Face stack (transformers, datasets, evaluate), ROUGE scoring and the LoRA/PEFT libraries.
# NOTE(review): these packages are unpinned — consider pinning versions for reproducibility.
%pip install \
    transformers \
    datasets \
    evaluate \
    rouge_score \
    loralib \
    peft --quiet


In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [None]:
# Dataset identifier handed to load_dataset; later cells read its 'dialogue' and 'summary' columns.
huggingface_dataset_name="knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)

# Bare expression: the notebook renders the loaded object as the cell output.
dataset

In [None]:
# Checkpoint shared by the tokenizer and the seq2seq model.
model_name = 'google/flan-t5-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights in bfloat16, tolerating size mismatches between checkpoint and model.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    ignore_mismatched_sizes=True,
    torch_dtype=torch.bfloat16,
)

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Despite its name, this function prints nothing: it returns the report
    and leaves printing to the caller.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs whose parameters provide ``numel()``
            and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Returns:
        A newline-terminated, three-line string with the trainable parameter
        count, the total parameter count and the trainable percentage.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # A model without parameters would otherwise raise ZeroDivisionError.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model params: {trainable_percentage}\n"
    )

# Show the parameter report for the freshly loaded base model.
parameter_report = print_number_of_trainable_model_parameters(original_model)
print(parameter_report)

Test the model with zero-shot inference

In [None]:
# Pick one test example together with its human-written reference summary.
index = 200
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
summary = test_example['summary']

# Zero-shot prompt: an instruction, the dialogue, then a cue for the summary.
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:

"""

# Tokenize, generate up to 200 new tokens, and decode the first returned sequence.
inputs = tokenizer(prompt, return_tensors='pt')
generated_ids = original_model.generate(inputs["input_ids"], max_new_tokens=200)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# 99-character horizontal rule printed before each section.
dash_line = '-' * 99
print(dash_line, f'INPUT PROMPT:\n{prompt}', sep='\n')
print(dash_line, f'BASELINE HUMAN SUMMARY:\n{summary}', sep='\n')
print(dash_line, f'MODEL_GENERATION - ZERO SHOT:\n{output}', sep='\n')



# 2 - Perform Full Fine-Tuning

2.1 Preprocess the Dialog-Summary Dataset

You need to convert the dialogue-summary (prompt-response) pairs into explicit instructions for the LLM. Prepend the instruction `Summarize the following conversation.` to the start of the dialogue and end the prompt with `Summary:`, so that it immediately precedes the summary, as follows:

Training prompt (dialogue):
Summarize the following conversation.
Chris: This is his part of the conversation.
Antje: This is her part of the conversation.
Summary:

Training response (summary):
Both Chris and Antje participated in the conversation.

Then preprocess the prompt-response dataset into tokens and pull out their input_ids (1 per token).


In [None]:

def tokenize_function(example):
    """Turn a batch of dialogue/summary pairs into model-ready token ids.

    Designed for ``Dataset.map(..., batched=True)``. Uses the notebook-level
    ``tokenizer``.

    Args:
        example: Batch mapping with a ``"dialogue"`` and a ``"summary"``
            column, each a list of strings.

    Returns:
        The same batch with three added columns:
        ``input_ids`` (instruction-wrapped dialogue, padded/truncated to the
        tokenizer's maximum length), ``attention_mask`` (marks the non-padding
        input positions) and ``labels`` (summary token ids, with padding
        positions set to -100).
    """
    start_prompt = 'Summarize the following conversation. \n\n'
    end_prompt = '\n\nSummary: '
    prompts = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

    model_inputs = tokenizer(prompts, padding="max_length", truncation=True)
    example['input_ids'] = model_inputs["input_ids"]
    # Inputs are padded to a fixed length; keep the mask so the model does not
    # attend to the padding positions.
    example['attention_mask'] = model_inputs["attention_mask"]

    target_ids = tokenizer(example["summary"], padding="max_length", truncation=True)["input_ids"]
    # -100 is the label value the loss ignores, so padded label positions
    # no longer contribute to the training objective.
    pad_token_id = tokenizer.pad_token_id
    example['labels'] = [
        [-100 if token_id == pad_token_id else token_id for token_id in sequence]
        for sequence in target_ids
    ]

    return example

# The dataset holds three splits (train, validation, test); map() runs
# tokenize_function over every split in batches. The original id/topic/text
# columns are dropped afterwards.
raw_columns = ['id', 'topic', 'dialogue', 'summary']
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(raw_columns)

In [None]:
# Subsample the dataset to save time: keep rows whose index is a multiple of 100.
# Note: this rebinds tokenized_datasets, so re-running the cell subsamples again.
def _is_hundredth_row(example, index):
    """Return True when the row index is a multiple of 100."""
    return index % 100 == 0


tokenized_datasets = tokenized_datasets.filter(_is_hundredth_row, with_indices=True)

In [None]:
from pprint import pprint

# Report the shape of every split after subsampling.
print(
    "Shapes of the datasets:",
    f"Training: {tokenized_datasets['train'].shape}",
    f"Validation: {tokenized_datasets['validation'].shape}",
    f"Test: {tokenized_datasets['test'].shape}",
    sep="\n",
)

# Pretty-print one tokenized training record as a sanity check.
pprint(tokenized_datasets['train'][1])

The output dataset is ready for fine-tuning.

2.2 - Fine-tune the model with the preprocessed dataset.
Now utilize the built-in Hugging Face Trainer class. Pass the preprocessed dataset with reference to the original model. Other training parameters are found experimentally and there is no need to go into details about those at the moment.




In [None]:

# Timestamped output directory, so each run writes to its own folder.
run_stamp = str(int(time.time()))
output_dir = f'./dialogue-summary-training-{run_stamp}'

training_args = TrainingArguments(
    output_dir=output_dir,
    # Optimisation settings.
    learning_rate=1e-4,
    weight_decay=0.01,
    # Run length: per the TrainingArguments docs a positive max_steps overrides
    # num_train_epochs, so this run stops after a single step.
    num_train_epochs=1,
    max_steps=1,
    logging_steps=1,
)

# The Trainer updates original_model itself (no copy is made here).
trainer = Trainer(
    args=training_args,
    model=original_model,
    eval_dataset=tokenized_datasets['validation'],
    train_dataset=tokenized_datasets['train'],
)

In [None]:
# Run training with the Trainer built in the previous cell; as the cell's last
# expression, the value returned by train() is shown as the cell output.
trainer.train()


... skipping


3.1 Setup PEFT/LoRA model for fine-tuning
A new layer/adapter is added to the original model.
The rank (`r`) is the hyperparameter that defines the rank/dimension of the adapter to be trained.

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA hyper-parameters for the adapter that is trained in place of the full model.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
    target_modules=["q", "v"],  # module names that receive adapters (presumably attention query/value)
    r=32,  # rank of the adapter matrices
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)


Add LoRA adapter layers/parameters to the original LLM to be trained.

In [None]:
# Environment check, one value per line: CUDA availability, the cuDNN enabled
# flag, and the CUDA version string reported by torch.
print(
    torch.cuda.is_available(),
    torch.backends.cudnn.enabled,
    torch.version.cuda,
    sep="\n",
)

In [None]:
# Wrap the base model with the LoRA adapters described by lora_config.
peft_model = get_peft_model(original_model, lora_config)

# Use the GPU when one is visible, otherwise stay on the CPU.
# (Author's note, translated: `device = 0 if torch.cuda.is_available() else torch.device("cpu")`
# is the same as the selection below.)
cuda_available = torch.cuda.is_available()
print(cuda_available)
device = torch.device("cuda" if cuda_available else "cpu")
peft_model.to(device)

# Only a fraction of the parameters should now be reported as trainable.
print(print_number_of_trainable_model_parameters(peft_model))

3.2 Train PEFT adapter
Define training arguments and create Trainer instance

In [None]:
# Timestamped output directory for the PEFT run.
run_stamp = str(int(time.time()))
output_dir = f'./peft-dialogue-summary-training-{run_stamp}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-4,
    auto_find_batch_size=True,
    # Per the TrainingArguments docs a positive max_steps overrides
    # num_train_epochs, so this run stops after a single step.
    num_train_epochs=1,
    max_steps=1,
    logging_steps=1,
)

# No eval_dataset is passed for the PEFT run; only the training split is used.
peft_trainer = Trainer(
    args=peft_training_args,
    model=peft_model,
    train_dataset=tokenized_datasets["train"],
)

In [None]:

# Train the PEFT model with the Trainer configured in the previous cell.
peft_trainer.train()

# Save the trained PEFT model and the tokenizer into the same local directory.
peft_model_path="./peft-dialogue-summary-checkpoint-local"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

# The steps above took 127 minutes for me on CPU, running in the background.
#####################################################3
Prepare this model by adding an adapter to the original FlanT5 model. 
It will not be trainable because the plan is only to perform inference with this PEFT model.

In [None]:
# Optional (left disabled): save the model through the trainer after a long training run.
# peft_trainer.save_model("./trained-for-long-time")

In [None]:

from peft import PeftModel

# Reload the base checkpoint and its tokenizer from the Hub.
base_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint, torch_dtype=torch.bfloat16)

# Attach the adapter saved in the local checkpoint directory, for inference only.
peft_model_2 = PeftModel.from_pretrained(
    peft_model_base,
    './peft-dialogue-summary-checkpoint-local',
    is_trainable=False,
    torch_dtype=torch.bfloat16,
)

print(print_number_of_trainable_model_parameters(peft_model_2))


# 3.3 Evaluate the model
Make inferences for the same example as in sections 1.3 and 2.3 


In [None]:
# Test example used for the qualitative check.
# NOTE(review): the zero-shot cell uses index 200 — use the same index in both
# cells if a like-for-like comparison is intended.
index = 203
dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:

"""

# Generate with the adapter model reloaded for inference (peft_model_2) rather
# than the training-time peft_model, which may have been moved to the GPU.
inference_model = peft_model_2
model_device = next(inference_model.parameters()).device

# Tokenize once and put the ids on the same device as the model weights;
# generate() fails when inputs and weights live on different devices.
inputs = tokenizer(prompt, return_tensors='pt')
input_ids = inputs.input_ids.to(model_device)

peft_model_outputs = inference_model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(max_new_tokens=200, num_beams=1),
)
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

# 99-character horizontal rule printed before each section.
dash_line = '-' * 99
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'PEFT SUMMARY:\n{peft_model_text_output}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}')

