In [1]:
import torch
from transformers import Trainer, TrainingArguments
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os

from datasets import load_dataset,load_metric
# from evaluate import load_metric
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# device = 'cuda:0'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import emoji
emoji_list = emoji.EMOJI_DATA.keys()
emoji_descriptions = [emoji.demojize(e, delimiters=("<", ">")) for e in emoji_list]


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /shared/centos7/cuda/12.1/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 6.0
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /home/sampgaon.h/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so...


  warn(msg)


### Loading the base models from Huggingface


In [2]:
load_16_bit = False

model_name = "google/flan-t5-small"
if load_16_bit:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype=torch.float16).to(device)
else:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Embedding(35888, 512)

### Updating Tokenizer and model embeddings


In [None]:
new_tokens = ['<file_photo>','<file_picture>','<file_other>','<file_video>','<file_image>','<file_gif>']
new_tokens = new_tokens+emoji_descriptions
tokenizer.add_special_tokens({'additional_special_tokens': new_tokens})
model.resize_token_embeddings(len(tokenizer))

### Loading SAMSum Data

In [2]:
# Load the Samsum dataset
dataset = load_dataset("samsum")

train_data = dataset["train"]
validation_data = dataset["validation"] 
test_data = dataset["test"]

### Tokenizing Function

In [4]:
# Tokenzing
def tokenize_function(tokenizer,examples):
    input_dialogues = ["Summarize dialogue >>\n " + emoji.demojize(dialogue, delimiters=("<", ">")) for dialogue in examples["dialogue"]]
    inputs = tokenizer(input_dialogues, padding="max_length", truncation=True, max_length=1000)
    targets = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=100)
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}


In [5]:
train_data_tokenized = train_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)
validation_data_tokenized = validation_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)
test_data_tokenized = test_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)


### Sample summarization code

In [6]:
def summarize(tokenizer,model,text):
    print(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}")
    inputs = tokenizer(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)
    
    # Generate summary
    summary_ids = model.generate(inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
    # Decode the summary
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
    return summary[0]

text = test_data['dialogue'][-1]
print("summary:",summarize(tokenizer,model,text))

Summarize dialogue >>
 Rachel: <file_other>
Rachel: Top 50 Best Films of 2018
Rachel: :)
Janice: Omg, I've watched almost all 50... xDD
Spencer: Hahah, Deadpool 2 also??
Janice: Yep
Spencer: Really??
Janice: My bf forced me to watch it xD
Rachel: Hahah
Janice: It wasn't that bad
Janice: I thought it'd be worse
Rachel: And Avengers? :D
Janice: 2 times
Rachel: Omg
Janice: xP
Rachel: You are the best gf in the world
Rachel: Your bf should appreciate that ;-)
Janice: He does
Janice: x)
summary: Janice has watched almost all films of 2018.


### Model - FLAN-T5

In [7]:
# for name, param in bart_base_model.named_parameters():
#     if param.requires_grad:
#         print(name)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Trainable parameters:", trainable_params)        
#247577856/142329600

Trainable parameters: 80811392


### Training

In [15]:
output_dir = "./flan_t5_small_full_finetune"

training_args = TrainingArguments(
#     eval_device='cpu',
    overwrite_output_dir = False,
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
#     evaluation_strategy="steps",
    logging_dir=f"{output_dir}/logs",
    logging_steps=500,
    save_strategy="epoch",
    save_total_limit=3,
    num_train_epochs=40,
    learning_rate=1e-4,
)

rouge_metric = load_metric("rouge")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_tokenized,
    eval_dataset=validation_data_tokenized,
#     compute_metrics=lambda pred: rouge_metric.compute(predictions=pred.predictions, references=pred.label_ids)
)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [16]:
trainer.train(resume_from_checkpoint = True)



Step,Training Loss
110500,0.2882
111000,0.3103
111500,0.2915
112000,0.2924
112500,0.2939
113000,0.2976
113500,0.2976
114000,0.302
114500,0.3041
115000,0.2896


TrainOutput(global_step=147320, training_loss=0.0716666152125754, metrics={'train_runtime': 10619.9787, 'train_samples_per_second': 55.488, 'train_steps_per_second': 13.872, 'total_flos': 2.2075496103936e+17, 'train_loss': 0.0716666152125754, 'epoch': 40.0})

In [17]:
trainer.model.save_pretrained('./flan_t5_small_full_finetune')


In [18]:
trainer.save_model('./flan_t5_small_full_finetune_save')
tokenizer.save_pretrained('./tokenizer-emoji')

model_tok_save_directory = "./flan_t5_small_full_finetune_model_tok"
model.save_pretrained(model_tok_save_directory)
tokenizer.save_pretrained(model_tok_save_directory)


eval_trainer = Trainer(
    model=model,
    eval_dataset=validation_data_tokenized,
    compute_metrics=lambda pred: rouge_metric.compute(predictions=pred.predictions, references=pred.label_ids)
)

# Evaluate the model on the test set
results = eval_trainer.evaluate(test_data)
# results = trainer.evaluate(test_data)
print("ROUGE scores:", results)

ROUGE scores: {'eval_runtime': 0.0016, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0}


### ROUGE SCORE -- inference


In [7]:
def summarize(tokenizer,model,text):
    """
    Summarizes the given text using the provided tokenizer and model.

    Args:
        tokenizer (Tokenizer): The tokenizer used to tokenize the input text.
        model (Model): The model used for summarization.
        text (str): The text to be summarized.

    Returns:
        list: A list containing the summarized text.    
    """        
#     print(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}")
    inputs = tokenizer(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)
    
    # Generate summary
    summary_ids = model.generate(inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
    # Decode the summary
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
    return summary


generated_summaries = []
actual_summaries = []
generated_summary_orignal = []
dialogue_list = []
SAVED_MODEL_PATH = './flan_t5_small_full_finetune_save-30'
SAVED_TOK_PATH = "./flan_t5_small_full_finetune_model_tok-30"
SAVED_MODEL_TOK = AutoTokenizer.from_pretrained(SAVED_TOK_PATH)#.to(device)
SAVED_MODEL = AutoModelForSeq2SeqLM.from_pretrained(SAVED_MODEL_PATH).to(device)


model_name = "google/flan-t5-small"
orignal_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
orignal_model_tok = AutoTokenizer.from_pretrained(model_name)


### Inference loop to generate summaries on test dataset for ROUGE metrics

In [8]:
import time
start_time = time.time()
i=1
j=0
for example in test_data:
    if i%10==0:
#         print()
        j+=10
        print(f"samples summarized:{j}\ttime:{time.time()-start_time}")
#         print(f"",)
#     print(example['dialogue'])
#     generated_summary = summarize(SAVED_MODEL_TOK,SAVED_MODEL,example['dialogue'])
#     combined_model
    generated_summary = summarize(SAVED_MODEL_TOK,SAVED_MODEL,example['dialogue'])
    
    generated_summaries.append(generated_summary[0])
    generated_summary_o = summarize(orignal_model_tok,orignal_model,example['dialogue'])
    generated_summary_orignal.append(generated_summary_o[0])
    actual_summaries.append(example["summary"])
    dialogue_list.append(example['dialogue'])
    
    i+=1
    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

samples summarized:10	time:8.748680353164673
samples summarized:20	time:18.305094480514526
samples summarized:30	time:28.15091037750244
samples summarized:40	time:36.07048511505127
samples summarized:50	time:45.00730299949646
samples summarized:60	time:53.23375463485718
samples summarized:70	time:61.01580595970154
samples summarized:80	time:70.02505016326904
samples summarized:90	time:78.31338310241699
samples summarized:100	time:87.04405760765076
samples summarized:110	time:93.28603911399841
samples summarized:120	time:103.0273072719574
samples summarized:130	time:111.52047419548035
samples summarized:140	time:121.4963366985321
samples summarized:150	time:129.78151988983154
samples summarized:160	time:139.65385794639587
samples summarized:170	time:148.27914929389954
samples summarized:180	time:156.5907347202301
samples summarized:190	time:166.19008612632751
samples summarized:200	time:175.50325345993042
samples summarized:210	time:185.2167510986328
samples summarized:220	time:194.6976

In [None]:
import pandas as pd


temp_df = pd.DataFrame({'finetune_summary':generated_summaries,'original_summary':generated_summary_orignal,'human_summary':actual_summaries,'dialog':dialogue_list})
temp_df.to_csv('full_flan_t5_small_results_30_epoch.csv')#.loc[3][0]

In [3]:
import pandas as pd
df = pd.read_csv('full_flan_t5_small_results_20_epoch.csv')

### ROUGE for fine-tuned model

In [5]:
generated_summary_orignal = df['original_summary']
actual_summaries = df['human_summary']
generated_summaries = df['finetune_summary']
from rouge_score import rouge_scorer
import time
start_time = time.time()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = [scorer.score(actual_summary, gen_summary) for actual_summary, gen_summary in zip(actual_summaries, generated_summary_orignal)]
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

start_time = time.time()

# To calculate average scores
average_scores = {}
for key in scores[0].keys():
    average_scores[key] = sum(score[key].fmeasure for score in scores) / len(scores)

    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")    
print("Average ROUGE scores:")
for key, value in average_scores.items():
    print(f"{key}: {value}")


Total time taken: 0.6044666767120361 seconds
Total time taken: 0.0003504753112792969 seconds
Average ROUGE scores:
rouge1: 0.4512592182549751
rouge2: 0.2085953067706064
rougeL: 0.37440036871614574


### ROUGE for fine-tuned model

In [None]:
generated_summary_orignal = df['original_summary']
actual_summaries = df['human_summary']
generated_summaries = df['finetune_summary']
from rouge_score import rouge_scorer
import time
start_time = time.time()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = [scorer.score(actual_summary, gen_summary) for actual_summary, gen_summary in zip(actual_summaries, generated_summaries)]
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

start_time = time.time()

# To calculate average scores
average_scores = {}
for key in scores[0].keys():
    average_scores[key] = sum(score[key].fmeasure for score in scores) / len(scores)

    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")    
print("Average ROUGE scores:")
for key, value in average_scores.items():
    print(f"{key}: {value}")
