In [1]:
import torch
from transformers import Trainer, TrainingArguments
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os
from datasets import load_dataset,load_metric
# from evaluate import load_metric
from sklearn.model_selection import train_test_split

device = 'cuda:0'
import emoji
emoji_list = emoji.EMOJI_DATA.keys()
emoji_descriptions = [emoji.demojize(e, delimiters=("<", ">")) for e in emoji_list]

### Loading the base models from Huggingface


In [2]:
model_name = "facebook/bart-large-cnn"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['model.shared.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Updating the tokenizer and resizing model embeddings

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

new_tokens = ['<file_photo>','<file_picture>','<file_other>','<file_video>','<file_image>','<file_gif>']
new_tokens = new_tokens+emoji_descriptions
tokenizer.add_special_tokens({'additional_special_tokens': new_tokens})
model.resize_token_embeddings(len(tokenizer))

### Loading the SAMSum dataset

In [4]:
dataset = load_dataset("samsum")
train_data = dataset["train"]
validation_data = dataset["validation"] 
test_data = dataset["test"]

### Tokenizing Function

In [5]:
def tokenize_function(tokenizer,examples):
    input_dialogues = ["Summarize dialogue >>\n " + emoji.demojize(dialogue, delimiters=("<", ">")) for dialogue in examples["dialogue"]]
    inputs = tokenizer(input_dialogues, padding="max_length", truncation=True, max_length=1000)
    targets = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=100)
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}


In [6]:
train_data_tokenized = train_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)
validation_data_tokenized = validation_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)
test_data_tokenized = test_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)

### Sample summarization code

In [None]:
def summarize(tokenizer,model,text):
    inputs = tokenizer(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)    
    summary_ids = model.generate(inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
    return summary[0]

text = test_data['dialogue'][0]
summary = summarize(tokenizer,model,text)
print("Generated Summary:"+ summary,'\n\n',train_data['summary'][0])

In [None]:
# Getting trainabale parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Trainable parameters:", trainable_params)        

### Freezing all layers and keeping encoder weights as trainable

In [9]:
# Freezing model weights/Setting them as trainable based on approach 
for param in model.parameters():
    param.requires_grad = False

for param in model.model.encoder.parameters():
    param.requires_grad = True

for name, param in model.named_parameters():
    print(f"{name} is {'trainable' if param.requires_grad else 'frozen'}")


model.shared.weight is trainable
model.encoder.embed_positions.weight is trainable
model.encoder.layers.0.self_attn.k_proj.weight is trainable
model.encoder.layers.0.self_attn.k_proj.bias is trainable
model.encoder.layers.0.self_attn.v_proj.weight is trainable
model.encoder.layers.0.self_attn.v_proj.bias is trainable
model.encoder.layers.0.self_attn.q_proj.weight is trainable
model.encoder.layers.0.self_attn.q_proj.bias is trainable
model.encoder.layers.0.self_attn.out_proj.weight is trainable
model.encoder.layers.0.self_attn.out_proj.bias is trainable
model.encoder.layers.0.self_attn_layer_norm.weight is trainable
model.encoder.layers.0.self_attn_layer_norm.bias is trainable
model.encoder.layers.0.fc1.weight is trainable
model.encoder.layers.0.fc1.bias is trainable
model.encoder.layers.0.fc2.weight is trainable
model.encoder.layers.0.fc2.bias is trainable
model.encoder.layers.0.final_layer_norm.weight is trainable
model.encoder.layers.0.final_layer_norm.bias is trainable
model.encoder

In [10]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Trainable parameters:", trainable_params)        

Trainable parameters: 203677696


### Training

In [11]:
output_dir = "./bart_large_encoder_finetune_emoji"


In [12]:
training_args = TrainingArguments(
#     eval_device='cpu',
    overwrite_output_dir = False,
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
#     evaluation_strategy="steps",
    logging_dir=f"{output_dir}/logs",
    logging_steps=500,
    save_strategy="epoch",
    save_total_limit=3,
    num_train_epochs=10,
    learning_rate=1e-4,
)

rouge_metric = load_metric("rouge")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_tokenized,
    eval_dataset=validation_data_tokenized,
    compute_metrics=lambda pred: rouge_metric.compute(predictions=pred.predictions, references=pred.label_ids)
)


  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [13]:
trainer.train(resume_from_checkpoint = True)



Step,Training Loss
29500,0.0531
30000,0.0562
30500,0.0564
31000,0.0558
31500,0.0554
32000,0.0554
32500,0.0569
33000,0.0543
33500,0.0452
34000,0.0383


TrainOutput(global_step=36830, training_loss=0.00954918435433621, metrics={'train_runtime': 7515.1438, 'train_samples_per_second': 19.603, 'train_steps_per_second': 4.901, 'total_flos': 3.11961479675904e+17, 'train_loss': 0.00954918435433621, 'epoch': 10.0})

In [14]:
# trainer.save_model('./bart_large_encoder_decoder2_finetune_emoji_save_10')
# tokenizer.save_pretrained('./tokenizer-emoji_large')

model_tok_save_directory = "./bart_large_encoder_finetune_model_tokenizer"
model.save_pretrained(model_tok_save_directory)
tokenizer.save_pretrained(model_tok_save_directory)


eval_trainer = Trainer(
    model=model,
    eval_dataset=validation_data_tokenized,
    compute_metrics=lambda pred: rouge_metric.compute(predictions=pred.predictions, references=pred.label_ids)
)

results = eval_trainer.evaluate(test_data)
print("ROUGE scores:", results)

ROUGE scores: {'eval_runtime': 0.0017, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0}


In [15]:
model_tok_save_directory = "./bart_large_encoder_finetune_model_tokenizer"

The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.


In [15]:

model_name = "facebook/bart-base"
orignal_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
orignal_model_tok = AutoTokenizer.from_pretrained(model_name)


### ROUGE SCORE

In [16]:
def summarize(tokenizer,model,text):
    """
    Summarizes the given text using the provided tokenizer and model.

    Args:
        tokenizer (Tokenizer): The tokenizer used to tokenize the input text.
        model (Model): The model used for summarization.
        text (str): The text to be summarized.

    Returns:
        list: A list containing the summarized text.    
    """
#     print(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}")
    inputs = tokenizer(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)
    
    # Generate summary output with max 100 token limit from the model using beam search with 4 beams
    summary_ids = model.generate(inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
    
    # Decode the generated token ids to human-readable text
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
    return summary

# Initializing lists to collect summaries
generated_summaries = []
actual_summaries = []
generated_summary_orignal = []
dialogue_list = []

# Loading saved model
SAVED_MODEL_PATH = './bart_large_encoder_finetune_model_tokenizer'
SAVED_TOK_PATH = 'tokenizer-emoji'
SAVED_MODEL_TOK = BartTokenizer.from_pretrained(SAVED_MODEL_PATH)#.to(device)
SAVED_MODEL = BartForConditionalGeneration.from_pretrained(SAVED_MODEL_PATH).to(device)

# Loading base model
model_name = "facebook/bart-large-cnn"
orignal_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
orignal_model_tok = AutoTokenizer.from_pretrained(model_name)


Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['model.shared.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Inference loop to generate summaries on test dataset for ROUGE metrics

In [17]:
import time
start_time = time.time()
i=1
j=0
for example in test_data:
    if i%10==0:
        j+=10
        print(f"samples summarized:{j}\ttime:{time.time()-start_time}")
    generated_summary = summarize(SAVED_MODEL_TOK,SAVED_MODEL,example['dialogue'])
    generated_summaries.append(generated_summary[0])
    generated_summary_o = summarize(orignal_model_tok,orignal_model,example['dialogue'])
    generated_summary_orignal.append(generated_summary_o[0])
    actual_summaries.append(example["summary"])
    dialogue_list.append(example['dialogue'])
    
    i+=1
    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

samples summarized:10	time:20.571375131607056
samples summarized:20	time:44.63774013519287
samples summarized:30	time:68.42378640174866
samples summarized:40	time:93.21911549568176
samples summarized:50	time:117.08969044685364
samples summarized:60	time:140.388281583786
samples summarized:70	time:163.61993598937988
samples summarized:80	time:187.44166350364685
samples summarized:90	time:210.59677004814148
samples summarized:100	time:233.6715476512909
samples summarized:110	time:256.4484496116638
samples summarized:120	time:279.9146966934204
samples summarized:130	time:303.4612007141113
samples summarized:140	time:326.7504403591156
samples summarized:150	time:349.6736373901367
samples summarized:160	time:373.33531188964844
samples summarized:170	time:396.267516374588
samples summarized:180	time:420.04986119270325
samples summarized:190	time:443.95052194595337
samples summarized:200	time:467.72225856781006
samples summarized:210	time:490.8303346633911
samples summarized:220	time:514.2575

### ROUGE for orignal base model

In [20]:
from rouge_score import rouge_scorer

start_time = time.time()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = [scorer.score(actual_summary, gen_summary) for actual_summary, gen_summary in zip(actual_summaries, generated_summary_orignal)]
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

start_time = time.time()

# To calculate average scores
average_scores = {}
for key in scores[0].keys():
    average_scores[key] = sum(score[key].fmeasure for score in scores) / len(scores)

    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")    
print("Average ROUGE scores:")
for key, value in average_scores.items():
    print(f"{key}: {value}")


Total time taken: 1.124314308166504 seconds
Total time taken: 0.0004980564117431641 seconds
Average ROUGE scores:
rouge1: 0.30412814809078337
rouge2: 0.10309376001724774
rougeL: 0.22742836389781787


### ROUGE for fine-tuned model

In [21]:
from rouge_score import rouge_scorer

start_time = time.time()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = [scorer.score(actual_summary, gen_summary) for actual_summary, gen_summary in zip(actual_summaries, generated_summaries)]
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

start_time = time.time()

# To calculate average scores
average_scores = {}
for key in scores[0].keys():
    average_scores[key] = sum(score[key].fmeasure for score in scores) / len(scores)

    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")    
print("Average ROUGE scores:")
for key, value in average_scores.items():
    print(f"{key}: {value}")


Total time taken: 1.0781407356262207 seconds
Total time taken: 0.0004909038543701172 seconds
Average ROUGE scores:
rouge1: 0.38902052351633026
rouge2: 0.18001246106848157
rougeL: 0.2922479755135327


### Saving the generated summarries

In [18]:
import pandas as pd


temp_df = pd.DataFrame({'finetune_summary':generated_summaries,'original_summary':generated_summary_orignal,'human_summary':actual_summaries})

In [19]:
temp_df.to_csv('results.csv')#.loc[3][0]