In [15]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1. \
    torchdata==0.5.1 --quiet
%pip install  \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [16]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer,GenerationConfig,TrainingArguments,Trainer
import time
import torch
import evaluate
import pandas as pd
import numpy as np

In [17]:
# Hub identifier of the dataset to fine-tune on; it provides train / test /
# validation splits with 'id', 'dialogue', 'summary' and 'topic' columns.
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(path=huggingface_dataset_name)

# Bare last expression so the notebook renders the split overview.
dataset

Found cached dataset csv (/home/outbreakkp/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 3/3 [00:00<00:00, 504.43it/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [18]:
# Checkpoint shared by the model and its tokenizer.
model_name = 'google/flan-t5-base'

# Load the seq2seq model with bfloat16 weights.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

# Tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [19]:
# How the counts are computed:
#   * original_model.parameters() returns an iterator over the model's tensors.
#   * tensor.numel() is the number of elements in one tensor; those elements
#     are the individual parameters.
#   * sum(...) adds the per-tensor counts together.
#   * p.requires_grad tells whether a tensor belongs to the trainable set.

# Every parameter, trainable or not.
pytorch_total_params = sum(
    param.numel() for param in original_model.parameters()
)
print(pytorch_total_params)

# Only the parameters that will receive gradients.
pytorch_total_params_traiable = sum(
    param.numel()
    for param in original_model.parameters()
    if param.requires_grad
)
print(pytorch_total_params_traiable)

247577856
247577856


In [None]:
def tokenize_dataset(example):
    """Add tokenized prompt and summary ids to a batch of examples.

    Each dialogue in ``example["dialogue"]`` is wrapped in a fixed
    "Summarize ... Summary:" instruction prompt. The prompts and the entries
    of ``example['summary']`` are then tokenized with the notebook-level
    ``tokenizer`` (padding to max length, truncation on, PyTorch tensors).

    Args:
        example: Batch mapping with a "dialogue" sequence of strings and a
            'summary' sequence of strings.

    Returns:
        The same ``example`` object, with 'input_ids' (tokenized prompts) and
        'labels' (tokenized summaries) added.
    """
    prefix = "Summarize the following Conversation \n\n"
    suffix = "\n\nSummary:"

    prompts = []
    for dialogue in example["dialogue"]:
        prompts.append(prefix + dialogue + suffix)

    encoded_prompts = tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    example['input_ids'] = encoded_prompts.input_ids

    encoded_summaries = tokenizer(
        example['summary'],
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    example['labels'] = encoded_summaries.input_ids

    return example

# map() runs tokenize_dataset over the dataset; with batched=True the function
# receives a batch of examples per call rather than one example at a time.
tokenized_dataset = dataset.map(
    tokenize_dataset,
    batched=True,
)

In [None]:
# Keep only a tiny, reproducibly shuffled subset of each split to save time:
# 1 train row, 2 test rows, 1 validation row.
for split_name, n_rows in (("train", 1), ("test", 2), ("validation", 1)):
    shuffled_split = tokenized_dataset[split_name].shuffle(seed=42)
    tokenized_dataset[split_name] = shuffled_split.select(range(n_rows))


In [None]:
# Columns before dropping the raw text fields.
column_names = tokenized_dataset['train'].column_names
print(column_names)

# Drop the original (non-tokenized) columns from every split.
tokenized_dataset = tokenized_dataset.remove_columns(
    ['id', 'dialogue', 'summary', 'topic']
)

# Columns after the drop.
column_names = tokenized_dataset['train'].column_names
print(column_names)

# Displayed as the cell output: shape of the validation split.
tokenized_dataset['validation'].shape

In [None]:
# Directory where the Trainer stores its output (model files are generally GBs in size).
output_dir = 'output-dir-folder'

# Hyperparameters for a minimal full fine-tuning smoke run.
trainingargs = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    max_steps=1,  # a positive max_steps takes precedence over num_train_epochs
    logging_steps=1,
)

# Trainer comes from the Hugging Face transformers library; it is only built
# here and is run in the next cell.
# The datasets must come from `tokenized_dataset` (the mapped DatasetDict);
# `tokenize_dataset` is the tokenizing function and cannot be indexed.
trainer = Trainer(
    model=original_model,
    args=trainingargs,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

In [None]:
# Run full fine-tuning with the Trainer instance built in the previous cell.
trainer.train()

In [None]:
# Load the 'rouge' metric through the evaluate library; used below to score summaries.
rouge=evaluate.load('rouge')

In [None]:
# ROUGE compares generated text against reference text, so first generate
# summaries for a small slice of the raw test split and collect the matching
# human-written summaries. (`dataset` is a DatasetDict, so the split must be
# selected before the 'summary' column.)
eval_sample_count = 10
eval_batch = dataset['test'][0:eval_sample_count]
eval_dialogues = eval_batch['dialogue']
human_baseline_summaries = eval_batch['summary']

original_model.eval()  # disable dropout while generating
generation_config = GenerationConfig(max_new_tokens=200)

full_fine_tuned_model_summaries = []
for dialogue in eval_dialogues:
    # Same prompt template that tokenize_dataset builds for training.
    prompt = "Summarize the following Conversation \n\n" + dialogue + "\n\nSummary:"
    input_ids = tokenizer(prompt, truncation=True, return_tensors="pt").input_ids
    output_ids = original_model.generate(
        # Inputs must live on the same device as the model weights.
        input_ids=input_ids.to(original_model.device),
        generation_config=generation_config,
    )
    full_fine_tuned_model_summaries.append(
        tokenizer.decode(output_ids[0], skip_special_tokens=True)
    )

full_fine_tuned_model_results = rouge.compute(
    predictions=full_fine_tuned_model_summaries,
    references=human_baseline_summaries,
    use_aggregator=True,
    use_stemmer=True,  # treats "running", "run", "ran" as the same word
)
full_fine_tuned_model_results

PEFT

In [None]:
# pip install -U peft

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter configuration for the FLAN-T5 model.
lora_config = LoraConfig(
    r=32,                            # rank of the low-rank update matrices
    lora_alpha=32,                   # LoRA scaling factor
    target_modules=["q", "v"],       # module names that receive LoRA adapters
    lora_dropout=0.05,
    bias="none",                     # must be lowercase: 'none', 'all' or 'lora_only'
    task_type=TaskType.SEQ_2_SEQ_LM  # FLAN-T5
)

In [None]:
# Wrap the base model with the LoRA adapters described by lora_config.
peft_model = get_peft_model(original_model, lora_config)

# Count every parameter of the wrapped model (no requires_grad filter here).
pytorch_total_params2 = sum(
    param.numel() for param in peft_model.parameters()
)
print(pytorch_total_params2)

In [None]:
# Directory where the Trainer stores its output (model files are generally GBs in size).
output_dir = 'output-dir-folder'

# Hyperparameters for a minimal PEFT (LoRA) fine-tuning smoke run.
trainingargsofpeft = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    max_steps=1,  # a positive max_steps takes precedence over num_train_epochs
    logging_steps=1,
)

# Trainer comes from the Hugging Face transformers library; it is only built
# here and is run in the next cell.
# The datasets must come from `tokenized_dataset` (the mapped DatasetDict);
# `tokenize_dataset` is the tokenizing function and cannot be indexed.
peft_trainer = Trainer(
    model=peft_model,
    args=trainingargsofpeft,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

In [None]:
# Train the LoRA-wrapped model.
peft_trainer.train()

# Save the trained model held by the trainer, then the tokenizer, into the
# same folder.
peft_trainer.model.save_pretrained(
    'output-dir-folder'
)
tokenizer.save_pretrained(
    'output-dir-folder'
)