In [4]:
import torch
from transformers import Trainer, TrainingArguments
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os
from datasets import load_dataset,load_metric
# from evaluate import load_metric
from sklearn.model_selection import train_test_split

device = 'cuda:0'
import emoji
emoji_list = emoji.EMOJI_DATA.keys()
emoji_descriptions = [emoji.demojize(e, delimiters=("<", ">")) for e in emoji_list]

### Loading the base models from Huggingface


In [6]:
model_name = "facebook/bart-large-cnn"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)


Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['model.shared.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Updating the tokenizer and resizing model embeddings

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

new_tokens = ['<file_photo>','<file_picture>','<file_other>','<file_video>','<file_image>','<file_gif>']
new_tokens = new_tokens+emoji_descriptions
tokenizer.add_special_tokens({'additional_special_tokens': new_tokens})
model.resize_token_embeddings(len(tokenizer))

Embedding(54053, 1024)

### Loading the SAMSum dataset

In [2]:
dataset = load_dataset("samsum")

train_data = dataset["train"]
validation_data = dataset["validation"] 
test_data = dataset["test"]

### Tokenizing Function

In [5]:
def tokenize_function(tokenizer,examples):
    input_dialogues = ["Summarize dialogue >>\n " + emoji.demojize(dialogue, delimiters=("<", ">")) for dialogue in examples["dialogue"]]
    inputs = tokenizer(input_dialogues, padding="max_length", truncation=True, max_length=1000)
    targets = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=100)
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}


In [6]:
train_data_tokenized = train_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)
validation_data_tokenized = validation_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)
test_data_tokenized = test_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)


### Sample summarization code

In [None]:
def summarize(tokenizer,model,text):
#     print(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}")
    inputs = tokenizer(f"Summarize dialogue >>\n {text}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)
    
    # Generate summary
    summary_ids = model.generate(inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
    # Decode the summary
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
    return summary[0]

### Model - BART

In [8]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Trainable parameters:", trainable_params)        

Trainable parameters: 406290432


### Freezing all layers and keeping 7 encoder-decoder layer weights as trainable

In [9]:
for param in model.parameters():
    param.requires_grad = False

    
encoder_layers = model.model.encoder.layers
num_layers = len(encoder_layers)

print(f'{num_layers}')
for layer in encoder_layers[num_layers - 7: num_layers]:
#     print(layer)
    for param in layer.parameters():
        param.requires_grad = True
        
    
decoder_layers = model.model.decoder.layers
num_layers = len(decoder_layers)

for layer in decoder_layers[num_layers - 7: num_layers]:
    for param in layer.parameters():
        param.requires_grad = True
    
    
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)


12
model.encoder.layers.5.self_attn.k_proj.weight
model.encoder.layers.5.self_attn.k_proj.bias
model.encoder.layers.5.self_attn.v_proj.weight
model.encoder.layers.5.self_attn.v_proj.bias
model.encoder.layers.5.self_attn.q_proj.weight
model.encoder.layers.5.self_attn.q_proj.bias
model.encoder.layers.5.self_attn.out_proj.weight
model.encoder.layers.5.self_attn.out_proj.bias
model.encoder.layers.5.self_attn_layer_norm.weight
model.encoder.layers.5.self_attn_layer_norm.bias
model.encoder.layers.5.fc1.weight
model.encoder.layers.5.fc1.bias
model.encoder.layers.5.fc2.weight
model.encoder.layers.5.fc2.bias
model.encoder.layers.5.final_layer_norm.weight
model.encoder.layers.5.final_layer_norm.bias
model.encoder.layers.6.self_attn.k_proj.weight
model.encoder.layers.6.self_attn.k_proj.bias
model.encoder.layers.6.self_attn.v_proj.weight
model.encoder.layers.6.self_attn.v_proj.bias
model.encoder.layers.6.self_attn.q_proj.weight
model.encoder.layers.6.self_attn.q_proj.bias
model.encoder.layers.6.se

#### Training

In [10]:
output_dir = "./bart_large_encoder_decoder_7_finetune_emoji"

In [11]:
training_args = TrainingArguments(
#     eval_device='cpu',
    overwrite_output_dir = False,
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
#     evaluation_strategy="steps",
    logging_dir=f"{output_dir}/logs",
    logging_steps=500,
    save_strategy="epoch",
    save_total_limit=3,
    num_train_epochs=10,
    learning_rate=1e-4,
)

rouge_metric = load_metric("rouge")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_tokenized,
    eval_dataset=validation_data_tokenized,
    compute_metrics=lambda pred: rouge_metric.compute(predictions=pred.predictions, references=pred.label_ids)
)


  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [12]:
trainer.train(resume_from_checkpoint = True)



Step,Training Loss
7500,0.2742
8000,0.2717
8500,0.2846
9000,0.2819
9500,0.2866
10000,0.2906
10500,0.2844
11000,0.2956
11500,0.1998
12000,0.1934


TrainOutput(global_step=36830, training_loss=0.08120856374429355, metrics={'train_runtime': 24979.8308, 'train_samples_per_second': 5.898, 'train_steps_per_second': 1.474, 'total_flos': 3.1177524412416e+17, 'train_loss': 0.08120856374429355, 'epoch': 10.0})

In [None]:
trainer.save_model('./bart_large_encoder_decoder7_finetune_emoji_save')
tokenizer.save_pretrained('./tokenizer-emoji_large')

model_tok_save_directory = "./bart_large_encoder_decoder7_finetune_model_tokenizer"
model.save_pretrained(model_tok_save_directory)
tokenizer.save_pretrained(model_tok_save_directory)


eval_trainer = Trainer(
    model=model,
    eval_dataset=validation_data_tokenized,
    compute_metrics=lambda pred: rouge_metric.compute(predictions=pred.predictions, references=pred.label_ids)
)

results = eval_trainer.evaluate(test_data)
print("ROUGE scores:", results)

In [14]:
model_tok_save_directory = "./bart_large_encoder_decoder7_finetune_model_tokenizer"

In [None]:
results = eval_trainer.evaluate(test_data)
print("ROUGE scores:", results)

In [15]:
# orignal_model = 

model_name = "facebook/bart-base"
orignal_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
orignal_model_tok = AutoTokenizer.from_pretrained(model_name)


### ROUGE SCORE

In [3]:
def summarize(tokenizer,model,text):
    """
    Summarizes the given text using the provided tokenizer and model.

    Args:
        tokenizer (Tokenizer): The tokenizer used to tokenize the input text.
        model (Model): The model used for summarization.
        text (str): The text to be summarized.

    Returns:
        list: A list containing the summarized text.    
    """    
#     print(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}")
    inputs = tokenizer(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)
    
    # Generate summary
    summary_ids = model.generate(inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
    # Decode the summary
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
    return summary


generated_summaries = []
actual_summaries = []
generated_summary_orignal = []
dialogue_list = []
SAVED_MODEL_PATH = './bart_large_encoder_decoder7_finetune_model_tokenizer'
SAVED_TOK_PATH = 'tokenizer-emoji'
SAVED_MODEL_TOK = BartTokenizer.from_pretrained(SAVED_MODEL_PATH)#.to(device)
SAVED_MODEL = BartForConditionalGeneration.from_pretrained(SAVED_MODEL_PATH).to(device)


model_name = "facebook/bart-large-cnn"
orignal_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
orignal_model_tok = AutoTokenizer.from_pretrained(model_name)


Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['model.shared.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Inference loop to generate summaries on test dataset for ROUGE metrics

In [None]:
import time
start_time = time.time()
i=1
j=0
for example in test_data:
    if i%10==0:
#         print()
        j+=10
        print(f"samples summarized:{j}\ttime:{time.time()-start_time}")
    generated_summary = summarize(SAVED_MODEL_TOK,SAVED_MODEL,example['dialogue'])
    generated_summaries.append(generated_summary[0])
    generated_summary_o = summarize(orignal_model_tok,orignal_model,example['dialogue'])
    generated_summary_orignal.append(generated_summary_o[0])
    actual_summaries.append(example["summary"])
    dialogue_list.append(example['dialogue'])
    
    i+=1
    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

samples summarized:10	time:20.438345670700073
samples summarized:20	time:43.52337026596069
samples summarized:30	time:66.67061853408813
samples summarized:40	time:90.54976654052734
samples summarized:50	time:113.56971764564514
samples summarized:60	time:136.5130205154419
samples summarized:70	time:158.99224591255188
samples summarized:80	time:182.63577580451965
samples summarized:90	time:204.75320672988892
samples summarized:100	time:226.78264594078064
samples summarized:110	time:249.09976172447205
samples summarized:120	time:270.2389323711395
samples summarized:130	time:291.31582474708557
samples summarized:140	time:312.41668915748596
samples summarized:150	time:332.9368689060211
samples summarized:160	time:353.9031503200531
samples summarized:170	time:374.4392397403717
samples summarized:180	time:395.3759307861328
samples summarized:190	time:416.3876643180847
samples summarized:200	time:437.83767580986023
samples summarized:210	time:458.85663890838623
samples summarized:220	time:479.

In [18]:
sum(p.numel() for p in bart_base_model.parameters() if p.requires_grad),sum(p.numel() for p in SAVED_MODEL.parameters() if p.requires_grad)

(139420416, 139420416)

## ROUGE for orignal base model

In [2]:
import pandas as pd
df = pd.read_csv("bart_large_7enc_dec_epoch_10.csv")
generated_summary_orignal = df['original_summary']
generated_summaries = df['finetune_summary']
actual_summaries = df['human_summary']


from rouge_score import rouge_scorer
import time
start_time = time.time()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = [scorer.score(actual_summary, gen_summary) for actual_summary, gen_summary in zip(actual_summaries, generated_summary_orignal)]
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

start_time = time.time()

# To calculate average scores
average_scores = {}
for key in scores[0].keys():
    average_scores[key] = sum(score[key].fmeasure for score in scores) / len(scores)

    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")    
print("Average ROUGE scores:")
for key, value in average_scores.items():
    print(f"{key}: {value}")


Total time taken: 0.995619535446167 seconds
Total time taken: 0.0003535747528076172 seconds
Average ROUGE scores:
rouge1: 0.30412814809078337
rouge2: 0.10309376001724774
rougeL: 0.22742836389781787


### ROUGE for fine-tuned model

In [3]:
from rouge_score import rouge_scorer

start_time = time.time()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = [scorer.score(actual_summary, gen_summary) for actual_summary, gen_summary in zip(actual_summaries, generated_summaries)]
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

start_time = time.time()

# To calculate average scores
average_scores = {}
for key in scores[0].keys():
    average_scores[key] = sum(score[key].fmeasure for score in scores) / len(scores)

    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")    
print("Average ROUGE scores:")
for key, value in average_scores.items():
    print(f"{key}: {value}")


Total time taken: 0.9880452156066895 seconds
Total time taken: 0.0003993511199951172 seconds
Average ROUGE scores:
rouge1: 0.38719640138682615
rouge2: 0.1759602830108665
rougeL: 0.2902752085168637


In [41]:
import pandas as pd


temp_df = pd.DataFrame({'finetune_summary':generated_summaries,'original_summary':generated_summary_orignal,'human_summary':actual_summaries})

In [None]:
temp_df.to_csv('results.csv')#.loc[3][0]