In [1]:
import torch
from transformers import Trainer, TrainingArguments
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os
from datasets import load_dataset,load_metric
# from evaluate import load_metric
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# device = 'cuda:0'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import emoji
emoji_list = emoji.EMOJI_DATA.keys()
emoji_descriptions = [emoji.demojize(e, delimiters=("<", ">")) for e in emoji_list]


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /shared/centos7/cuda/12.1/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 6.0
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /home/sampgaon.h/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so...


  warn(msg)


### Loading the base models from Huggingface


In [2]:
load_16_bit = True

model_name = "google/flan-t5-base"
if load_16_bit:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype=torch.float16).to(device)
else:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Embedding(35888, 768)

### Updating Tokenizer and model embeddings


In [None]:
new_tokens = ['<file_photo>','<file_picture>','<file_other>','<file_video>','<file_image>','<file_gif>']
new_tokens = new_tokens+emoji_descriptions
tokenizer.add_special_tokens({'additional_special_tokens': new_tokens})
model.resize_token_embeddings(len(tokenizer))

### Loading SAMSum Data

In [3]:
# Load the Samsum dataset
dataset = load_dataset("samsum")

train_data = dataset["train"]
validation_data = dataset["validation"] 
test_data = dataset["test"]

### Tokenizing Function

In [4]:
# Tokenzing
def tokenize_function(tokenizer,examples):
    input_dialogues = ["Summarize dialogue >>\n " + emoji.demojize(dialogue, delimiters=("<", ">")) for dialogue in examples["dialogue"]]
    inputs = tokenizer(input_dialogues, padding="max_length", truncation=True, max_length=1000)
    targets = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=100)
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}


In [5]:
train_data_tokenized = train_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)
validation_data_tokenized = validation_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)
test_data_tokenized = test_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)


### Sample summarization code

In [6]:
def summarize(tokenizer,model,text):
    print(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}")
    inputs = tokenizer(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)
    
    # Generate summary
    summary_ids = model.generate(inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
    # Decode the summary
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
    return summary[0]


### Model - FLAN-T5

In [7]:
# for name, param in bart_base_model.named_parameters():
#     if param.requires_grad:
#         print(name)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Trainable parameters:", trainable_params)        
#247577856/142329600

Trainable parameters: 253353216


### LoRA config for FLAN T5, target modules on self and cross attention layers

In [9]:
lora_config = LoraConfig(
 r=16, #16
 lora_alpha=32,
 target_modules=["q","v","SelfAttention.k","EncDecAttention.k"],
 lora_dropout=0.05,
 bias="lora_only",
 task_type=TaskType.SEQ_2_SEQ_LM
)


lora_model = get_peft_model(model, lora_config)
# lora_model.print_trainable_parameters()
lora_model.print_trainable_parameters()

trainable params: 2654208 || all params: 256007424 || trainable%: 1.0367699336719236


In [10]:
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name)
output_dir = "./flan_t5_base_lora_finetune"
training_args = TrainingArguments(
    overwrite_output_dir = False,
    output_dir=output_dir,
    per_device_train_batch_size=4,
#     per_device_train_batch_size=8,
    
#     per_device_eval_batch_size=4,
#     evaluation_strategy="epoch",
    logging_dir=f"{output_dir}/logs",
    logging_steps=500,
    save_strategy="epoch",
    save_total_limit=3,
    num_train_epochs=10,
    learning_rate=1e-4,
)

rouge_metric = load_metric("rouge")
trainer = Trainer( 
    model=lora_model,
    args=training_args,
    train_dataset=train_data_tokenized
)

  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [11]:
trainer.train(resume_from_checkpoint = True)



Step,Training Loss
18500,0.5224
19000,0.518
19500,0.5286
20000,0.5259
20500,0.5388
21000,0.5039
21500,0.5115
22000,0.516
22500,0.5155
23000,0.5113


TrainOutput(global_step=36830, training_loss=0.25522235225084, metrics={'train_runtime': 13452.4693, 'train_samples_per_second': 10.951, 'train_steps_per_second': 2.738, 'total_flos': 2.0192681447424e+17, 'train_loss': 0.25522235225084, 'epoch': 10.0})

In [12]:
trainer.model.save_pretrained('./flan_t5_base_lora_finetune_emoji_save_adapter')


In [13]:
trainer.save_model('./flan_t5_base_lora_finetune_finetune_emoji_save')
tokenizer.save_pretrained('./tokenizer-emoji_t5')

model_tok_save_directory = "./flan_t5_base_lora_finetune_model_tokenizer"
model.save_pretrained(model_tok_save_directory)
tokenizer.save_pretrained(model_tok_save_directory)


eval_trainer = Trainer(
    model=model,
    eval_dataset=validation_data_tokenized,
    compute_metrics=lambda pred: rouge_metric.compute(predictions=pred.predictions, references=pred.label_ids)
)

# Evaluate the model on the test set
results = eval_trainer.evaluate(test_data)
# results = trainer.evaluate(test_data)
print("ROUGE scores:", results)

ROUGE scores: {'eval_runtime': 0.0053, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0}


#### ROUGE SCORE

In [14]:
# def summarize(tokenizer,model,text):
#     inputs = tokenizer(f"Summarize dialogue >>\n {text}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)

#     # Generate summary
#     summary_ids = model.generate(inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
# #     print(len(summary_ids[0]))
#     # Decode the summary
#     summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
#     return summary
def summarize(tokenizer,model,text):
#     print(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}")
    inputs = tokenizer(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)
    
    # Generate summary
    summary_ids = model.generate(input_ids=inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
    # Decode the summary
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
    return summary


generated_summaries = []
actual_summaries = []
generated_summary_orignal = []
dialogue_list = []
# SAVED_MODEL_PATH = './flan_t5_base_lora_finetune_finetune_model_tokenizer'
SAVED_TOK_PATH = 'tokenizer-emoji_t5'
SAVED_MODEL_TOK = AutoTokenizer.from_pretrained(SAVED_TOK_PATH)#.to(device)
# SAVED_MODEL = BartForConditionalGeneration.from_pretrained(SAVED_MODEL_PATH).to(device)
from peft import PeftModel, PeftConfig
# Load peft config for pre-trained checkpoint etc.
peft_model_id = "./flan_t5_base_lora_finetune_emoji_save_adapter"
config = PeftConfig.from_pretrained(peft_model_id)#.to(device)
combined_model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path).to(device)
combined_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
combined_model = PeftModel.from_pretrained(combined_model, peft_model_id).to(device)
combined_model.resize_token_embeddings(len(SAVED_MODEL_TOK))


# model_name = "facebook/bart-large-cnn"
# orignal_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
load_16_bit=True
model_name = "google/flan-t5-base"
if load_16_bit:
    orignal_model = AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype=torch.float16).to(device)
else:
    orignal_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

orignal_model_tok = AutoTokenizer.from_pretrained(model_name)


In [None]:
combined_tokenizer

In [15]:
import time
start_time = time.time()
i=1
j=0
for example in test_data:
    if i%10==0:
#         print()
        j+=10
        print(f"samples summarized:{j}\ttime:{time.time()-start_time}")
#         print(f"",)
#     print(example['dialogue'])
#     generated_summary = summarize(SAVED_MODEL_TOK,SAVED_MODEL,example['dialogue'])
#     combined_model
    generated_summary = summarize(SAVED_MODEL_TOK,combined_model,example['dialogue'])
    
    generated_summaries.append(generated_summary[0])
    generated_summary_o = summarize(orignal_model_tok,orignal_model,example['dialogue'])
    generated_summary_orignal.append(generated_summary_o[0])
    actual_summaries.append(example["summary"])
    dialogue_list.append(example['dialogue'])
    
    i+=1
    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

samples summarized:10	time:17.960561513900757
samples summarized:20	time:38.24000096321106
samples summarized:30	time:57.9053909778595
samples summarized:40	time:73.06195616722107
samples summarized:50	time:91.4104585647583
samples summarized:60	time:106.27889966964722
samples summarized:70	time:118.80991220474243
samples summarized:80	time:135.4886450767517
samples summarized:90	time:153.16648769378662
samples summarized:100	time:170.85572266578674
samples summarized:110	time:183.02719831466675
samples summarized:120	time:202.11659622192383
samples summarized:130	time:217.3009798526764
samples summarized:140	time:237.12749218940735
samples summarized:150	time:254.061181306839
samples summarized:160	time:274.43682980537415
samples summarized:170	time:290.65347170829773
samples summarized:180	time:309.2243883609772
samples summarized:190	time:328.2917890548706
samples summarized:200	time:348.1879472732544
samples summarized:210	time:369.6892349720001
samples summarized:220	time:389.8104

In [16]:
import pandas as pd


temp_df = pd.DataFrame({'finetune_summary':generated_summaries,'original_summary':generated_summary_orignal,'human_summary':actual_summaries})
temp_df.to_csv('results_10_epoch_lora_additional_tokenizer-kqv.csv')#.loc[3][0]

In [17]:
i=-58
#-11,-17,-19,-20,-25,-9,-32,-49
generated_summaries[i],generated_summary_orignal[i],actual_summaries[i],print(dialogue_list[i])

Colin: Hey I have some news that you would be really interested!(´･ω･`)
Ava: What is it?
Colin: How tall are you you said?
Ava: 158...cm.. why?(-_-メ)
Colin: Congratulations! You are defeated by penguin(☞ﾟヮﾟ)☞
Ava: What are you talking about?
Colin: Penguins' average  height is 162cm when they stand up （‐＾▽＾‐）オーホッホ
Colin: So you are officially shorter than penguins☜(⌒▽⌒)☞


("Ava is 158 cm tall. Penguins' average height is 162 cm when they stand up.",
 'Ava is 158 cm tall. Penguins have an average height of 162 cm when they stand up.',
 'Colin informs Ava that with her height of 158 cm she is shorter than an average penguin.',
 None)

In [18]:
from rouge_score import rouge_scorer

start_time = time.time()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = [scorer.score(actual_summary, gen_summary) for actual_summary, gen_summary in zip(actual_summaries, generated_summary_orignal)]
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

start_time = time.time()

# To calculate average scores
average_scores = {}
for key in scores[0].keys():
    average_scores[key] = sum(score[key].fmeasure for score in scores) / len(scores)

    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")    
print("Average ROUGE scores:")
for key, value in average_scores.items():
    print(f"{key}: {value}")


Total time taken: 0.6378993988037109 seconds
Total time taken: 0.0005116462707519531 seconds
Average ROUGE scores:
rouge1: 0.49812536710527633
rouge2: 0.25241980365961003
rougeL: 0.41388172587554395


In [19]:
from rouge_score import rouge_scorer

start_time = time.time()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = [scorer.score(actual_summary, gen_summary) for actual_summary, gen_summary in zip(actual_summaries, generated_summaries)]
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

start_time = time.time()

# To calculate average scores
average_scores = {}
for key in scores[0].keys():
    average_scores[key] = sum(score[key].fmeasure for score in scores) / len(scores)

    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")    
print("Average ROUGE scores:")
for key, value in average_scores.items():
    print(f"{key}: {value}")


Total time taken: 0.6891534328460693 seconds
Total time taken: 0.00047898292541503906 seconds
Average ROUGE scores:
rouge1: 0.5130892939915732
rouge2: 0.26610476725256216
rougeL: 0.4275032360927456
