In [1]:
import torch
from transformers import Trainer, TrainingArguments
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os
from datasets import load_dataset,load_metric
# from evaluate import11111 load_metric
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# device = 'cuda:0'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import emoji
emoji_list = emoji.EMOJI_DATA.keys()
emoji_descriptions = [emoji.demojize(e, delimiters=("<", ">")) for e in emoji_list]


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /shared/centos7/cuda/12.1/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 6.0
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /home/sampgaon.h/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so...


  warn(msg)


### Loading the base models from Huggingface


In [2]:
model_name = "facebook/bart-large-cnn"
load_16_bit = False

if load_16_bit:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype=torch.float16).to(device)
else:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)



Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['model.shared.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(54053, 1024)

### Updating the tokenizer and resizing model embeddings

In [None]:
new_tokens = ['<file_photo>','<file_picture>','<file_other>','<file_video>','<file_image>','<file_gif>']
new_tokens = new_tokens+emoji_descriptions
tokenizer.add_special_tokens({'additional_special_tokens': new_tokens})
model.resize_token_embeddings(len(tokenizer))

### Loading the SAMSum dataset

In [3]:
dataset = load_dataset("samsum")

train_data = dataset["train"]
validation_data = dataset["validation"] 
test_data = dataset["test"]

### Tokenizing Function

In [4]:
def tokenize_function(tokenizer,examples):
    input_dialogues = ["Summarize dialogue >>\n " + emoji.demojize(dialogue, delimiters=("<", ">")) for dialogue in examples["dialogue"]]
    inputs = tokenizer(input_dialogues, padding="max_length", truncation=True, max_length=1000)
    targets = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=100)
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}


In [5]:
train_data_tokenized = train_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)
validation_data_tokenized = validation_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)
test_data_tokenized = test_data.map(lambda examples: tokenize_function(tokenizer, examples), batched=True)


### Sample summarization code

In [None]:
def summarize(tokenizer,model,text):
#     print(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}")
    inputs = tokenizer(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)
    
    # Generate summary
    summary_ids = model.generate(inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
    # Decode the summary
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
    return summary[0]


text = test_data['dialogue'][-1]
summarize(tokenizer,model,text)

### Model - BART

In [7]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Trainable parameters:", trainable_params)        
#247577856/142329600

Trainable parameters: 410170368


In [8]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)


model.shared.weight
model.encoder.embed_positions.weight
model.encoder.layers.0.self_attn.k_proj.weight
model.encoder.layers.0.self_attn.k_proj.bias
model.encoder.layers.0.self_attn.v_proj.weight
model.encoder.layers.0.self_attn.v_proj.bias
model.encoder.layers.0.self_attn.q_proj.weight
model.encoder.layers.0.self_attn.q_proj.bias
model.encoder.layers.0.self_attn.out_proj.weight
model.encoder.layers.0.self_attn.out_proj.bias
model.encoder.layers.0.self_attn_layer_norm.weight
model.encoder.layers.0.self_attn_layer_norm.bias
model.encoder.layers.0.fc1.weight
model.encoder.layers.0.fc1.bias
model.encoder.layers.0.fc2.weight
model.encoder.layers.0.fc2.bias
model.encoder.layers.0.final_layer_norm.weight
model.encoder.layers.0.final_layer_norm.bias
model.encoder.layers.1.self_attn.k_proj.weight
model.encoder.layers.1.self_attn.k_proj.bias
model.encoder.layers.1.self_attn.v_proj.weight
model.encoder.layers.1.self_attn.v_proj.bias
model.encoder.layers.1.self_attn.q_proj.weight
model.encoder.la

### Loading LoRA adaptor config

In [9]:
lora_config = LoraConfig(
 r=16, #16
 lora_alpha=32,
 target_modules=["q_proj","v_proj","k_proj"]
 lora_dropout=0.05,
 bias="lora_only",
 task_type=TaskType.SEQ_2_SEQ_LM
)


lora_model = get_peft_model(model, lora_config)
# lora_model.print_trainable_parameters()
lora_model.print_trainable_parameters()

trainable params: 3649536 || all params: 413709312 || trainable%: 0.8821498318123426


In [10]:

output_dir = "./bart_large_cnn_lora_finetune"
training_args = TrainingArguments(
    overwrite_output_dir = False,
    output_dir=output_dir,
    per_device_train_batch_size=4,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=4,
#     evaluation_strategy="epoch",
    logging_dir=f"{output_dir}/logs",
    logging_steps=500,
    save_strategy="epoch",
    save_total_limit=3,
    num_train_epochs=10,
    learning_rate=1e-4,
)

rouge_metric = load_metric("rouge")
trainer = Trainer( 
    model=lora_model,
    args=training_args,
    train_dataset=train_data_tokenized
)

  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
trainer.train(resume_from_checkpoint = True)

In [12]:
trainer.model.save_pretrained('./bart_large_cnn_lora_finetune_finetune_emoji_adapter-2')

In [13]:
trainer.save_model('./bart_large_cnn_lora_finetune_finetune_emoji_save-2')
# tokenizer.save_pretrained('./tokenizer-emoji_t5-2')

model_tok_save_directory = "./bart_large_cnn_lora_finetune_finetune_model_tokenizer-1"
trainer.model.save_pretrained(model_tok_save_directory)
tokenizer.save_pretrained(model_tok_save_directory)


eval_trainer = Trainer(
    model=model,
    eval_dataset=validation_data_tokenized,
    compute_metrics=lambda pred: rouge_metric.compute(predictions=pred.predictions, references=pred.label_ids)
)

# Evaluate the model on the test set
results = eval_trainer.evaluate(test_data)
# results = trainer.evaluate(test_data)
print("ROUGE scores:", results)

ROUGE scores: {'eval_runtime': 0.0046, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0}


In [14]:
def summarize(tokenizer,model,text):
    """
    Summarizes the given text using the provided tokenizer and model.

    Args:
        tokenizer (Tokenizer): The tokenizer used to tokenize the input text.
        model (Model): The model used for summarization.
        text (str): The text to be summarized.

    Returns:
        list: A list containing the summarized text.    
    """    
    inputs = tokenizer(f"Summarize dialogue >>\n {emoji.demojize(text, delimiters=('<', '>'))}", return_tensors="pt", max_length=1000, truncation=True, padding="max_length").to(device)
    
    # Generate summary
    summary_ids = model.generate(input_ids=inputs.input_ids, num_beams=4, max_length=100, early_stopping=True)
    
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    
    return summary


generated_summaries = []
actual_summaries = []
generated_summary_orignal = []
dialogue_list = []
# SAVED_MODEL_PATH = './flan_t5_base_lora_finetune_finetune_model_tokenizer'
SAVED_TOK_PATH = 'tokenizer-emoji_t5'
SAVED_MODEL_TOK = AutoTokenizer.from_pretrained(SAVED_TOK_PATH)#.to(device)
# SAVED_MODEL = BartForConditionalGeneration.from_pretrained(SAVED_MODEL_PATH).to(device)
from peft import PeftModel, PeftConfig
# Load peft config for pre-trained checkpoint etc.
peft_model_id = "./bart_large_cnn_lora_finetune_finetune_emoji_adapter"
config = PeftConfig.from_pretrained(peft_model_id)#.to(device)
combined_model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path).to(device)
combined_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
combined_model = PeftModel.from_pretrained(combined_model, peft_model_id).to(device)
combined_model.resize_token_embeddings(len(SAVED_MODEL_TOK))



model_name = "facebook/bart-large-cnn"
orignal_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
orignal_model_tok = AutoTokenizer.from_pretrained(model_name)


Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['model.shared.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['model.shared.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Inference loop to generate summaries on test dataset for ROUGE metrics

In [17]:
import time
start_time = time.time()
i=1
j=0
for example in test_data:
    if i%10==0:
#         print()
        j+=10
        print(f"samples summarized:{j}\ttime:{time.time()-start_time}")
#         print(f"",)
    generated_summary = summarize(tokenizer,combined_model,example['dialogue'])
    generated_summaries.append(generated_summary[0])
    generated_summary_o = summarize(orignal_model_tok,orignal_model,example['dialogue'])
    generated_summary_orignal.append(generated_summary_o[0])
    actual_summaries.append(example["summary"])
    dialogue_list.append(example['dialogue'])
    
    i+=1
    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

samples summarized:10	time:23.539096117019653
samples summarized:20	time:49.81668257713318
samples summarized:30	time:76.82163143157959
samples summarized:40	time:104.08896851539612
samples summarized:50	time:131.2598729133606
samples summarized:60	time:157.56858205795288
samples summarized:70	time:183.6284577846527
samples summarized:80	time:210.43648266792297
samples summarized:90	time:236.52878737449646
samples summarized:100	time:262.535347700119
samples summarized:110	time:288.7534658908844
samples summarized:120	time:315.3142204284668
samples summarized:130	time:342.4812831878662
samples summarized:140	time:368.9851257801056
samples summarized:150	time:395.5611734390259
samples summarized:160	time:422.50930643081665
samples summarized:170	time:448.36493825912476
samples summarized:180	time:475.2925953865051
samples summarized:190	time:501.7594850063324
samples summarized:200	time:528.6763501167297
samples summarized:210	time:555.0239236354828
samples summarized:220	time:581.89521

## ROUGE for orignal base model

In [18]:
from rouge_score import rouge_scorer

start_time = time.time()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = [scorer.score(actual_summary, gen_summary) for actual_summary, gen_summary in zip(actual_summaries, generated_summary_orignal)]
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

start_time = time.time()

# To calculate average scores
average_scores = {}
for key in scores[0].keys():
    average_scores[key] = sum(score[key].fmeasure for score in scores) / len(scores)

    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")    
print("Average ROUGE scores:")
for key, value in average_scores.items():
    print(f"{key}: {value}")


Total time taken: 1.1680960655212402 seconds
Total time taken: 0.0005173683166503906 seconds
Average ROUGE scores:
rouge1: 0.30412814809078337
rouge2: 0.10309376001724774
rougeL: 0.22742836389781787


### ROUGE for fine-tuned model

In [19]:
from rouge_score import rouge_scorer

start_time = time.time()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = [scorer.score(actual_summary, gen_summary) for actual_summary, gen_summary in zip(actual_summaries, generated_summaries)]
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")

start_time = time.time()

# To calculate average scores
average_scores = {}
for key in scores[0].keys():
    average_scores[key] = sum(score[key].fmeasure for score in scores) / len(scores)

    
end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time} seconds")    
print("Average ROUGE scores:")
for key, value in average_scores.items():
    print(f"{key}: {value}")


Total time taken: 1.0902247428894043 seconds
Total time taken: 0.0005009174346923828 seconds
Average ROUGE scores:
rouge1: 0.40145980938327025
rouge2: 0.19708442791076364
rougeL: 0.3053241917792587
