In [1]:
# !pip install -q transformers datasets rouge_score
# !pip install -q sentencepiece accelerate

## Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import nltk

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    BartConfig,
    TrainingArguments,
    Trainer
)

2023-05-16 17:50:35.194206: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-16 17:50:35.346610: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load Dataset

In [5]:
# importing datasets 
data_path = 'assets/datasets/sample_findsum_v1/'
train_data_path = data_path + "sample_findsum_train_v1.csv"
test_data_path = data_path + "sample_findsum_test_v1.csv"
val_data_path = data_path + "sample_findsum_val_v1.csv"

In [6]:
# Load datasets
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
val_data = pd.read_csv(val_data_path)

In [7]:
# checking the shape of datasets
train_data.shape, test_data.shape, val_data.shape

((5000, 2), (300, 2), (300, 2))

In [8]:
train_data.head()

Unnamed: 0,document,summary
0,gross profit for plant nutrition north america...,capital resources we believe our primary sourc...
1,"the european commission , or ec , has granted ...",liquidity and capital resources from our incep...
2,in 2012 the fulfill program generated $ 1.4 mi...,liquidity and capital resources cash flows pro...
3,combined sales to academic and governmental cu...,cash flow from operating activities net cash p...
4,the timing and amount of these investments var...,cash flows the following table sets forth data...


In [9]:
# convert columns to list
train_texts=list(train_data["document"])
train_summaries=list(train_data["summary"])

val_texts=list(val_data["document"])
val_summaries=list(val_data["summary"])

test_texts=list(test_data["document"])
test_summaries=list(test_data["summary"])

## Load tokenizer and model

In [10]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# Load BART model and adjust the last layer
config = BartConfig.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base', config=config)
model.resize_token_embeddings(len(tokenizer))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

Embedding(50265, 768, padding_idx=1)

## Build custom dataset class
* The __init__ method initializes the CustomDataset class and takes in the following parameters:

    * texts: A list of input texts.
    * summaries: A list of target summaries corresponding to the input texts.
    * tokenizer: The tokenizer used to tokenize the texts and summaries.
    * max_length: The maximum length of the tokenized sequences.

* The __len__ method returns the total number of samples in the dataset, which is the length of the texts list.

* The __getitem__ method returns a single sample from the dataset at the given index idx. It performs the following steps:
    * Retrieves the input text and target summary at the specified index.
    * Tokenizes the input text and summary using the provided tokenizer. It uses encode_plus to add special tokens, truncate or pad sequences, and return tensors in PyTorch format ('pt').
    * Returns a dictionary containing the tokenized input and summary, with keys 'input_ids', 'attention_mask', 'decoder_input_ids', and 'decoder_attention_mask'.
* By implementing the CustomDataset class, you can create instances of this class, pass in your training and validation texts/summaries, and use them in data loaders for training and evaluation.

In [11]:
class CustomDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        summary = str(self.summaries[idx])

        # Tokenize input text and summary
        inputs = self.tokenizer.encode_plus(
            text, add_special_tokens=True, truncation=True, 
            max_length=self.max_length, padding='max_length', 
            return_tensors='pt'
            )
        labels = self.tokenizer.encode_plus(
            summary, add_special_tokens=True, 
            truncation=True, max_length=self.max_length, 
            padding='max_length', return_tensors='pt'
            )

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'decoder_input_ids': labels['input_ids'].squeeze(),
            'decoder_attention_mask': labels['attention_mask'].squeeze()
            }


In [12]:
max_length = 512
batch_size = 8

In [13]:
train_dataset = CustomDataset(train_texts, train_summaries, tokenizer, max_length)
val_dataset = CustomDataset(val_texts, val_summaries, tokenizer, max_length)
test_dataset = CustomDataset(test_texts, test_summaries, tokenizer, max_length)

In [14]:
train_dataset

<__main__.CustomDataset at 0x7fd4d571f520>

In [15]:
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

## Preparing for finetuning

In [16]:
# Set the model to training mode
model.train()
# model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [17]:
lr=1e-5
patience = 2

# Set optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=patience)

In [18]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='assets/model_BART',  # Directory where checkpoints and logs will be saved
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_dir='/assets/logs_BART',  # Directory where training logs will be saved
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    save_total_limit=3,
    load_best_model_at_end=True
)

In [19]:
# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=None,
    tokenizer=tokenizer
)

In [20]:
# Train the model
trainer.train()



PermissionDeniedError: /assets; Permission denied

In [None]:
# Save the fine-tuned model
trainer.save_model(output_dir)