In [2]:
# !pip install -q transformers datasets rouge_score
# !pip install -q sentencepiece accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m58.3 MB/s[0m 

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import nltk

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    BartConfig,
    TrainingArguments,
    Trainer,
    default_data_collator,
    EarlyStoppingCallback
)

## Preparing Dataset

In [2]:
# importing datasets 
data_path = '/content/drive/MyDrive/datasets/sample_findsum_v1/'
train_data_path = data_path + "sample_findsum_train_v1.csv"
test_data_path = data_path + "sample_findsum_test_v1.csv"
val_data_path = data_path + "sample_findsum_val_v1.csv"

In [3]:
# Load datasets
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
val_data = pd.read_csv(val_data_path)

In [4]:
# checking the shape of datasets
train_data.shape, test_data.shape, val_data.shape

((5000, 2), (300, 2), (300, 2))

In [5]:
train_data.head()

Unnamed: 0,document,summary
0,gross profit for plant nutrition north america...,capital resources we believe our primary sourc...
1,"the european commission , or ec , has granted ...",liquidity and capital resources from our incep...
2,in 2012 the fulfill program generated $ 1.4 mi...,liquidity and capital resources cash flows pro...
3,combined sales to academic and governmental cu...,cash flow from operating activities net cash p...
4,the timing and amount of these investments var...,cash flows the following table sets forth data...


In [6]:
# Define your custom dataset class
class MyDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        document = row['document']
        summary = row['summary']
        
        inputs = self.tokenizer.batch_encode_plus(
            [document],
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors='pt'
            )
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        
        summary_inputs = self.tokenizer.batch_encode_plus(
            [summary],
            padding='max_length',
            truncation=True,
            max_length=64,
            return_tensors='pt'
            )
        summary_ids = summary_inputs['input_ids'].squeeze()
        summary_attention_mask = summary_inputs['attention_mask'].squeeze()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': summary_ids,
            'decoder_attention_mask': summary_attention_mask
        }

In [7]:
# Load the BART tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

In [8]:
# Create datasets
train_dataset = MyDataset(train_data, tokenizer)
val_dataset = MyDataset(val_data, tokenizer)
test_dataset = MyDataset(test_data, tokenizer)

In [9]:
# Set device (GPU if available, else CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move model to the device
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [10]:
# Set your training parameters
batch_size = 16
epochs = 4
learning_rate = 1e-5

In [11]:
# # Define the optimizer and loss function
# optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# criterion = torch.nn.CrossEntropyLoss()

In [12]:
# Define the TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    save_total_limit=4,
    optim='adamw_torch',
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_steps=50,
    save_steps=100,
    report_to='none'
)

In [13]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=default_data_collator,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [14]:
# Start the fine-tuning process
trainer.train()



Epoch,Training Loss,Validation Loss
1,2.3874,2.146931
2,2.2528,2.057034
3,2.3248,2.014448
4,2.2384,2.003169


TrainOutput(global_step=1252, training_loss=2.3518184556747777, metrics={'train_runtime': 1926.5193, 'train_samples_per_second': 10.381, 'train_steps_per_second': 0.65, 'total_flos': 6097364582400000.0, 'train_loss': 2.3518184556747777, 'epoch': 4.0})

In [None]:
# free up GPU memory
# del train_data, val_data, trainer, model, tokenizer
# del train_data, val_data, model, tokenizer
# torch.cuda.empty_cache()

In [15]:
# Evaluate on the test set
results = trainer.evaluate(test_dataset)
print(f"Test set evaluation: {results}")

Test set evaluation: {'eval_loss': 2.11981201171875, 'eval_runtime': 13.2295, 'eval_samples_per_second': 22.677, 'eval_steps_per_second': 1.436, 'epoch': 4.0}


In [16]:
# Save the model and tokenizer
tokenizer.save_pretrained("/content/drive/MyDrive/test/bart")
model.save_pretrained("/content/drive/MyDrive/test/bart")

## Check BLEU Score of test dataset
* We will use BLEU Score and see how good our model is generating summarization.
* I will use `corpus_bleu` from `nltk` so, that I will get one `bleu_score` of whole test dataset.

In [2]:
# !pip install -q rouge-score

In [3]:
import torch
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import load_dataset
from rouge_score import rouge_scorer

In [4]:
# Load the BART tokenizer and model
model_path = '/content/drive/MyDrive/test/bart'
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

# Set device (GPU if available, else CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move model to the device
model.to(device)


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [9]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def compute_bleu_score(preds, targets):
    bleu_scores = []
    smoothing_func = SmoothingFunction().method1
    for pred, target in zip(preds, targets):
        pred = pred.replace('<s>', '').replace('</s>', '').strip()
        target = target.replace('<s>', '').replace('</s>', '').strip()
        pred_tokens = pred.split()
        target_tokens = target.split()
        bleu_score = sentence_bleu([target_tokens], pred_tokens, smoothing_function=smoothing_func)
        bleu_scores.append(bleu_score)
    avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
    return {'bleu_score': avg_bleu_score}

In [21]:
test_data.head()

Unnamed: 0,document,summary
0,"million from the issuance of 4,419,641 shares ...",source and uses of cash 65 our primary sources...
1,we continue to be optimistic about our growth ...,liquidity and capital resources overview our p...
2,usa today network ventures was designed to cel...,loss on early extinguishment of debt : for the...
3,"· changes in the financial condition , results...",liquidity we are required to have enough cash ...
4,our customers ' usage of our websites increase...,liquidity our principal internal sources of li...


In [25]:
input_text = test_data.loc[4, 'document']
input_summary = test_data.loc[4, 'summary']

In [26]:
# input_text = '''
# "on september 5 , 2012 , we acquired tog , a precision machined metal and alloy parts provider to original 
# equipment manufacturers for the steam and natural gas turbine power generation market.
# the addition of koontz-wagner 's engineered packaged control house solutions expanded our 
# products portfolio to our current customers , and supports the global expansion into 
# adjacent markets such as oil and gas pipelines . the acquisition of tog expanded our 
# products portfolio to serve the steam turbine market and , combined with our consolidated 
# fabricators business unit , established a growth platform for aftermarket energy parts sales .
# the tog repair and replacement parts business provides a relatively stable revenue stream .
# the financial results of the koontz-wagner acquisition and the tog acquisition have been included
# in our product solutions segment .
# '''

In [28]:
input_ids = tokenizer.encode(
    input_text,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='pt').to(device)

In [29]:
summary_ids = model.generate(input_ids, max_length=64, do_sample=False).to(device)

In [30]:
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [32]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [35]:
print("BLEU Score: ", compute_bleu_score(input_text, summary)["bleu_score"], "\n")
print("Paragraph: \n", input_text)

BLEU Score:  0.007271774539192622 

Paragraph: 
 our customers ' usage of our websites increased , as demonstrated through an increase in average monthly revenue per recruitment package customer of $ 1,044 for the year ended december 31 , 2014 to $ 1,094 for the year ended december 31 , 2015 . deferred revenue is a key metric of our business as it indicates a level of sales already made that will be recognized as revenue in the future . deferred revenue reflects the impact of our ability to sign customers to longer term contracts . we recorded deferred revenue of $ 84.3 million at december 31 , 2015 , including $ 969,000 of slashdot media deferred revenue classified as held for sale as of december 31 , 2015 , and $ 86.4 million at december 31 , 2014 . we also generate revenue from advertising on our various websites or from lead generation and marketing solutions provided to our customers . advertisements include various forms of rich media and banner advertising , text links , sponsor

In [36]:
print("Summary: \n", summary)

Summary: 
 liquidity and capital resources as of december 31, 2015, we had cash, cash equivalents and short-term investments of $ 84.3 million, compared to cash and cash equivalents of $ 86.4 million as of september 31, 2014. the decrease in cash and equivalents was primarily due


In [37]:
input_summary

'liquidity our principal internal sources of liquidity is cash , as well as the cash flow that we generate from our operations . in addition , externally , we had $ 149.0 million in borrowing capacity under our credit agreement at december 31 , 2015 . we believe that our existing cash , cash generated from operations and available borrowings under our credit agreement will be sufficient to satisfy our currently anticipated cash requirements through at least the next 12 months and the foreseeable future thereafter . however , it is possible that one or more lenders under the revolving credit facility may refuse or be unable to satisfy their commitment to lend to us or we may need to refinance our debt and be unable to do so . in addition , our liquidity could be negatively affected by a decrease in demand for our products and services . we may also make acquisitions and may need to raise additional capital through future debt financings or equity offerings to the extent necessary to fun