In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Specify the file path in Google Drive
file_path = '/content/drive/MyDrive/Diss/roberta xlnet/combined_results.csv'

# Read the CSV file into a Data
df = pd.read_csv(file_path)


In [None]:
#reference and modified via - https://github.com/CurationCorp/curation-corpus
#  https://huggingface.co/datasets/viewer/?dataset=cnn_dailymail&config=3.0.0
# https://keras.io/examples/nlp/abstractive_summarization_with_bart/
# https://towardsdatascience.com/fine-tuning-the-bart-large-model-for-text-summarization-3c69e4c04582

In [6]:
!pip install transformers
!pip install pytorch_lightning
!pip install torch
!pip install scikit-learn
!pip install pandas

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.1.2-py3-none-any.whl (776 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.10.0 pytorch_lightning-2.1.2 torchmetrics-1.2.1


In [4]:
import torch
from torch.nn import functional as F
from torch import nn
import pytorch_lightning as pl
# https://www.pytorchlightning.ai/

from transformers import BartForConditionalGeneration, BartTokenizer
from sklearn.model_selection import train_test_split
import pandas as pd

from transformers import (
    AdamW,
    get_linear_schedule_with_warmup
)
from torch.utils.data import DataLoader

In [5]:
# Checking out the GPU we have access to. This is output is from the google colab version.
!nvidia-smi

Sat Dec  9 01:01:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    23W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [51]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from transformers import BartForConditionalGeneration, BartTokenizer, AdamW

class BARTDataset(Dataset):
    def __init__(self, summaries, tokenizer, summ_len):
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.summ_len = summ_len

    def __len__(self):
        return len(self.summaries)

    def __getitem__(self, index):
        if isinstance(self.summaries[index], str):
            summary = ' '.join(str(self.summaries[index]).split())

            # Summary Target pre-processing
            target = self.tokenizer.batch_encode_plus(
                [summary],
                max_length=self.summ_len,
                pad_to_max_length=True,
                return_tensors='pt'
            )

            return (
                target['input_ids'].squeeze(),
                target['attention_mask'].squeeze()
            )
        else:
            # Handle the case where input is already tokenized (not a string)
            return (
                self.summaries[index]['input_ids'].squeeze(),
                self.summaries[index]['attention_mask'].squeeze()
            )

class BARTDataLoader(pl.LightningDataModule):
    def __init__(self, tokenizer, summarized_len, df, train_split_size, batch_size):
        super().__init__()
        self.tokenizer = tokenizer
        self.summarized_len = summarized_len
        self.input_text_length = summarized_len
        self.df = df
        self.train_split_size = train_split_size
        self.batch_size = batch_size

    def prepare_data(self):
        # Read data directly from df
        data = self.df
        data['Summary'] = 'summarize: ' + data['Summary']

        # Convert the "Summary" column to a list of texts
        self.summary = list(data['Summary'].values)

        # Convert each summary to a string (if not already)
        self.summary = [' '.join(str(summary).split()) for summary in self.summary]

    def setup(self, stage=None):
        X_train, X_val = train_test_split(
            self.summary, train_size=self.train_split_size
        )

        self.train_dataset = BARTDataset(
            summaries=X_train,
            tokenizer=self.tokenizer,
            summ_len=self.summarized_len
        )
        self.val_dataset = BARTDataset(
            summaries=X_val,
            tokenizer=self.tokenizer,
            summ_len=self.summarized_len
        )

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

class AbstractiveSummarizationBARTFineTuning(pl.core.LightningModule):
    def __init__(self, model, tokenizer):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer

    def forward(self, input_ids, attention_mask, decoder_input_ids,
                decoder_attention_mask=None, lm_labels=None):
        outputs = self.model.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            labels=decoder_input_ids
        )
        return outputs

    def preprocess_batch(self, batch):
        input_ids, source_attention_mask = batch

        # For BART, decoder_input_ids and decoder_attention_mask are not needed during training
        return input_ids, source_attention_mask, None, None, None

    def training_step(self, batch, batch_idx):
        input_ids, source_attention_mask, decoder_input_ids, \
        decoder_attention_mask, lm_labels = self.preprocess_batch(batch)

        outputs = self.forward(input_ids=input_ids, attention_mask=source_attention_mask,
                               decoder_input_ids=decoder_input_ids,
                               decoder_attention_mask=decoder_attention_mask,
                               lm_labels=lm_labels
                       )
        loss = outputs.loss

        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, source_attention_mask, decoder_input_ids, \
        decoder_attention_mask, lm_labels = self.preprocess_batch(batch)

        outputs = self.forward(input_ids=input_ids, attention_mask=source_attention_mask,
                               decoder_input_ids=decoder_input_ids,
                               decoder_attention_mask=decoder_attention_mask,
                               lm_labels=lm_labels
                       )
        loss = outputs.loss

        return loss

    def on_training_epoch_end(self, outputs):
        avg_loss = torch.stack([x["loss"] for x in outputs]).mean()
        self.log('Epoch', self.trainer.current_epoch)
        self.log('avg_epoch_loss', {'train': avg_loss})

    def on_validation_epoch_end(self):
        # Print available keys for debugging
        print("Available keys:", self.trainer.callback_metrics.keys())

        # Comment out the following line after identifying the correct key
        return

        # Modify the key based on the available keys
        val_loss = self.trainer.callback_metrics['??']  # replace '??' with the correct key
        self.log('avg_epoch_loss', {'val': val_loss})

    def configure_optimizers(self):
        model = self.model
        optimizer = AdamW(model.parameters())
        self.opt = optimizer
        return [optimizer]

# Tokenizer
model_ = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Dataloader
dataloader = BARTDataLoader(tokenizer=tokenizer, summarized_len=150,
                            df=df, train_split_size=0.8, batch_size=2)
# Read and pre-process data
dataloader.prepare_data()

# Train-test Split
dataloader.setup()

# Main Model class
model = AbstractiveSummarizationBARTFineTuning(model=model_, tokenizer=tokenizer)

# Trainer Class
trainer = pl.Trainer(check_val_every_n_epoch=1, max_epochs=5)

# Fit model
trainer.fit(model, dataloader)


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                         | Params
-------------------------------------------------------
0 | model | BartForConditionalGeneration | 139 M 
-------------------------------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
557.682   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Available keys: dict_keys([])


/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Available keys: dict_keys([])


Validation: |          | 0/? [00:00<?, ?it/s]

Available keys: dict_keys([])


Validation: |          | 0/? [00:00<?, ?it/s]

Available keys: dict_keys([])


Validation: |          | 0/? [00:00<?, ?it/s]

Available keys: dict_keys([])


Validation: |          | 0/? [00:00<?, ?it/s]

Available keys: dict_keys([])


INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [53]:


# Save the trained model
trainer.save_checkpoint("/content/drive/MyDrive/Diss/roberta xlnet/model.ckpt")


In [17]:
# Main Model class
model = AbstractiveSummarizationBARTFineTuning(model=model_, tokenizer=tokenizer)

In [8]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig


In [8]:
from transformers import BartForConditionalGeneration

# Load the model from the saved checkpoint
loaded_model = BartForConditionalGeneration.from_pretrained("/content/drive/MyDrive/Diss/roberta xlnet/model.ckpt")




In [68]:
def summarize_article(article):
    # Load BART model and tokenizer
    model_name = 'facebook/bart-large-cnn'
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)

    # Tokenize and encode the article
    inputs = tokenizer.encode(article, return_tensors='pt',
max_length=1024, truncation=True)

    # Generate summary
    summary_ids = model.generate(inputs, num_beams=4, max_length=150,
early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

# Example usage
article = """
Putin is war criminal
"""

summary = summarize_article(article)
print("Summary:")
print(summary)

Summary:
Putin is war criminal and should be tried for war crimes, says U.S. senator. U.N. Security Council resolution calls for Russia to be punished for its war crimes. Russia has been accused of war crimes in the past, including in Afghanistan. Russia denies this, saying the allegations are false.
