In [None]:
# !pip install transformers -Uqq

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
import numpy as np
import pandas as pd
import json
import argparse

# Importing some tools for flattening 2d arrays to 1d
from functools import reduce
from operator import add
# Importing hugging face library for getting the transformers and tokenizers
from transformers import AutoTokenizer, BartForConditionalGeneration,PegasusForConditionalGeneration,AdamW,get_linear_schedule_with_warmup

# Importing some pytorch classes and functions
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
from torch import cuda 

In [None]:
%%capture
# !pip install pytorch-lightning

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
from pytorch_lightning.plugins import DeepSpeedPlugin

In [None]:
%%capture
# !pip install deepspeed

In [None]:
# from deepspeed.ops.adam import FusedAdam

In [None]:
!pip install rouge
!pip install bert_score

In [None]:
# Importing libraries for evaluation
from rouge import Rouge
from bert_score import score


# Importing library for progress bar GUI
#from fastai.text.core import progress_bar
from tqdm import tqdm

# Importing library for parallization
from joblib import Parallel,delayed,Memory

In [None]:
device = 'cuda' if cuda.is_available() else 'cpu'
device

In [None]:
data = pd.read_csv('/kaggle/input/mtpdataset/dataset1920.csv')
data

In [None]:
# import string
# string.punctuation

In [None]:
# def remove_punctuation(text):
#     punctuationfree="".join([i for i in text if i not in string.punctuation])
#     return punctuationfree
# #storing the puntuation free text
# data['Cleaned_OriginalTxt']= data['Original_Text'].apply(lambda x:remove_punctuation(x))
# data['Cleaned_Summary']= data['Summary'].apply(lambda x:remove_punctuation(x))
# data.head()

In [None]:
# data['Cleaned_OriginalTxt']= data['Cleaned_OriginalTxt'].apply(lambda x: x.lower())
# data['Cleaned_Summary']= data['Cleaned_Summary'].apply(lambda x: x.lower())
# data.head()

In [None]:
# import re
# def remove_urls(text):
#     url_pattern = re.compile(r'https?://\S+|www\.\S+')
#     return url_pattern.sub(r'', text)
# data['Cleaned_OriginalTxt']= data['Cleaned_OriginalTxt'].apply(lambda x:remove_urls(x))
# data['Cleaned_Summary']= data['Cleaned_Summary'].apply(lambda x:remove_urls(x))
# data.head()

In [None]:
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Displays a copy of train_data in which every cell has been converted to str.
# NOTE(review): the converted frame is not assigned to anything, so
# train_data itself is left unchanged - confirm whether the conversion was
# meant to be kept.
train_data.applymap(str)

In [None]:
train_summ = list(train_data['Summary'])

In [None]:
# Build the text / summary lists for every split directly from the DataFrames,
# so this cell gives the same result no matter how often it is re-run.
VAL_SIZE = 100  # number of training rows held out for validation

all_train_text = list(train_data['Original_Text'])
all_train_summ = list(train_data['Summary'])
test_text = list(test_data['Original_Text'])

# Never hold out more than half of the available rows on a tiny dataset.
n_val = min(VAL_SIZE, len(all_train_text) // 2)

# The validation rows are removed from the training lists. Previously the
# first 100 training rows were reused as the validation set, so the
# validation loss was measured on examples the model had been trained on.
val_text, val_summ = all_train_text[:n_val], all_train_summ[:n_val]
train_text, train_summ = all_train_text[n_val:], all_train_summ[n_val:]

In [None]:
print(len(train_text))
print(len(train_summ))
print(len(test_text))
print(len(val_text))
print(len(val_summ))

## Tokenization
Tokenization Class

In [None]:
from joblib import Parallel, delayed
from functools import reduce
from operator import add
from tqdm import tqdm
import torch


class TransformersBaseTokenizer:
    """Batch encode / decode helper built around a Hugging Face tokenizer.

    Args:
        pretrained_tokenizer: tokenizer object; its ``model_max_length`` and
            ``pad_token_id`` are copied onto this wrapper.
        model_type: ``'bart'`` or ``'pegasus'``; only consulted by
            :meth:`tokenizer` to pick the special tokens.
        **kwargs: accepted for call-site compatibility and ignored.
    """

    _SUPPORTED_MODEL_TYPES = ('bart', 'pegasus')

    def __init__(
        self,
        pretrained_tokenizer,
        model_type='bart',
        **kwargs
        ):
        self._pretrained_tokenizer = pretrained_tokenizer
        self.max_seq_len = pretrained_tokenizer.model_max_length
        self.model_type = model_type
        self.pad_token_id = pretrained_tokenizer.pad_token_id

    def __call__(self, *args, **kwargs):
        """Return the wrapper itself; all arguments are ignored."""
        return self

    def tokenizer(self, t):
        """Tokenize ``t``, truncate it and add the model's special tokens.

        The token list is cut to ``max_seq_len - 2`` entries. BART gets the
        CLS token in front and the SEP token at the end; Pegasus only gets
        the EOS token at the end.

        Raises:
            ValueError: if ``model_type`` is neither ``'bart'`` nor
                ``'pegasus'`` (this used to surface as an
                ``UnboundLocalError`` on the return statement).
        """
        if self.model_type not in self._SUPPORTED_MODEL_TYPES:
            raise ValueError(
                'Unsupported model_type %r; expected one of %r'
                % (self.model_type, self._SUPPORTED_MODEL_TYPES))

        tokens = self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2]

        if self.model_type == 'bart':
            CLS = self._pretrained_tokenizer.cls_token
            SEP = self._pretrained_tokenizer.sep_token
            return [CLS] + tokens + [SEP]

        eos = self._pretrained_tokenizer.eos_token
        return tokens + [eos]

    def _numercalise(self, t):
        """Encode one batch of texts, padded/truncated to ``max_seq_len``.

        Returns whatever the wrapped tokenizer returns for
        ``return_tensors='pt'``; callers read its ``'input_ids'`` and
        ``'attention_mask'`` entries.
        """
        tokenized_text = self._pretrained_tokenizer(
                t,
                max_length=self.max_seq_len,
                return_tensors='pt',
                padding='max_length',
                truncation=True,
                add_special_tokens=True,
                is_split_into_words=False,
                )
        return tokenized_text

    def _textify(self, input_ids):
        """Decode one batch of id sequences to text, dropping special tokens."""
        text = self._pretrained_tokenizer.batch_decode(input_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False)
        return text

    def _chunks(self, lst, n):
        """Yield consecutive slices of ``lst`` holding at most ``n`` items."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    def numercalise(self, t, batch_size=4):
        """Encode text(s) into ``input_ids`` and ``attention_mask`` tensors.

        Args:
            t: a single string or a list of strings.
            batch_size: number of texts encoded per job; it also caps the
                number of parallel workers.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'``, each the
            concatenation of the per-batch tensors.
        """
        # Imported here so the method does not depend on a later notebook
        # cell having executed ``import os`` first.
        import os

        if isinstance(t, str):
            t = [t]  # convert str to list of str

        # os.cpu_count() may return None; always run with at least one worker.
        n_cores = max(1, min(batch_size, os.cpu_count() or 1))
        results = Parallel(n_jobs=n_cores)(delayed(self._numercalise)(batch)
                for batch in tqdm(list(self._chunks(t,
                batch_size))))

        input_ids = [encoded['input_ids'] for encoded in results]
        attention_masks = [encoded['attention_mask'] for encoded in results]

        return {'input_ids': torch.cat(input_ids),
                'attention_mask': torch.cat(attention_masks)}

    def textify(self, tensors, batch_size):
        """Decode id tensors back to a flat list of strings.

        Args:
            tensors: object with a ``.shape``; a 1-d input is treated as a
                single sequence.
            batch_size: number of sequences decoded per job.

        Returns:
            list of decoded strings, in input order (empty for empty input).
        """
        if len(tensors.shape) == 1:
            tensors = [tensors]  # convert 1d tensor to 2d

        results = Parallel(n_jobs=-1, backend='threading'
                           )(delayed(self._textify)(summary_ids)
                             for summary_ids in
                             tqdm(list(self._chunks(tensors,
                             batch_size))))

        # The explicit initial value keeps reduce() from raising when there
        # were no batches to decode.
        return reduce(add, results, [])

In [None]:
# # ORIGINAL TransformersBaseTokenizer FUNCTION
# class TransformersBaseTokenizer:

#     """Class for encoding and decoding given texts for transformers"""

#     def __init__(
#         self,
#         pretrained_tokenizer,
#         model_type='bart',
#         **kwargs
#         ):
#         self._pretrained_tokenizer = pretrained_tokenizer
#         self.max_seq_len = pretrained_tokenizer.model_max_length
#         self.model_type = model_type
#         self.pad_token_id = pretrained_tokenizer.pad_token_id

#     def __call__(self, *args, **kwargs):
#         return self

#     def tokenizer(self, t):
#         """Limits the maximum sequence length and add the special tokens"""

#         if self.model_type == 'bart':
            
#             CLS = self._pretrained_tokenizer.cls_token
#             SEP = self._pretrained_tokenizer.sep_token
            
#             tokens = \
#                 self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len
#                 - 2]
#             tokens = [CLS] + tokens + [SEP]

#         elif self.model_type == 'pegasus':
#             eos = self._pretrained_tokenizer.eos_token
#             tokens = \
#                 self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len
#                 - 2]
#             tokens = tokens + [eos]
                    

#         return tokens

#     def _numercalise(self, t):
#         """Convert text to there coressponding ids"""
        
#         tokenized_text = self._pretrained_tokenizer(
#                 t,
#                 max_length=self.max_seq_len,
#                 return_tensors='pt',
#                 padding='max_length',
#                 truncation=True,
#                 add_special_tokens=True,
#                 is_split_into_words=False,
#                 )
#         return tokenized_text

#     def _textify(self, input_ids):
#         """Convert ids to thier coressponding text"""

#         text = self._pretrained_tokenizer.batch_decode(input_ids,
#                 skip_special_tokens=True,
#                 clean_up_tokenization_spaces=False)
#         return text

#     def _chunks(self, lst, n):
#         """splitting the text into batches"""

#         for i in range(0, len(lst), n):
#             yield lst[i:i + n]

#     def numercalise(self, t, batch_size=4):
#         """Convert text to thier coressponding ids and get the attention mask to differentiate between pad and input texts"""

#         if isinstance(t, str):
#             t = [t]  # convert str to list of str

#         results = Parallel(n_jobs=-1)(delayed(self._numercalise)(batch)
#                 for batch in tqdm(list(self._chunks(t,
#                 batch_size))))
#         input_ids = []
#         attention_masks = []
#         for i in results:
#             input_ids.append(i['input_ids'])
#             attention_masks.append(i['attention_mask'])

#         return {'input_ids': torch.cat(input_ids),
#                 'attention_mask': torch.cat(attention_masks)}

#     def textify(self, tensors, batch_size):
#         """Convert ids to thier coressponding text"""

#         if len(tensors.shape) == 1:
#             tensors = [tensors]  # convert 1d tensor to 2d

#         results = Parallel(n_jobs=-1, backend='threading'
#                            )(delayed(self._textify)(summary_ids)
#                              for summary_ids in
#                              tqdm(list(self._chunks(tensors,
#                              batch_size))))

#         return reduce(add, results)

## Tokenizing data for BART Model

In [None]:
# Download the bart tokenizer from hugging face api
bart_tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-xsum')

In [None]:
# Passing the bart tokenizer to our TransformersBaseTokenizer wrapper
tokenizer = TransformersBaseTokenizer(bart_tokenizer)

In [None]:
import os

In [None]:
# Convert the texts into their corresponding input_ids and attention_mask
# tensors so they can be interpreted by the model (16 texts per batch).

train_inputs = tokenizer.numercalise(train_text,16)

val_inputs = tokenizer.numercalise(val_text,16)

test_inputs = tokenizer.numercalise(test_text,16)

#train_outputs = tokenizer.numercalise(reduce(add,train_summ),16)

#val_outputs = tokenizer.numercalise(reduce(add,val_summ),16)

# The reference summaries are encoded the same way; their input_ids are used
# as labels further down.
val_outputs = tokenizer.numercalise(val_summ,16)
train_outputs = tokenizer.numercalise(train_summ,16)


In [None]:
print(train_inputs['input_ids'].shape)
print(train_inputs['attention_mask'].shape)

In [None]:
# Getting the labels from train and val
labels = train_outputs['input_ids']
val_labels = val_outputs['input_ids']

In [None]:
train_inputs['labels'] = labels

In [None]:
val_inputs['labels'] = val_labels

## Model Finetuning

In [None]:
hparams = argparse.Namespace()

hparams.freeze_encoder = True
hparams.freeze_embeds = True
hparams.eval_beams = 3

In [None]:
def shift_tokens_right(input_ids, pad_token_id):
    """Build decoder inputs by rotating each row one position to the right.

    Every row is shifted right by one token and the row's last non-pad token
    (usually <eos>) is wrapped around into position 0. The input tensor is
    not modified. Adapted from modeling_bart.py.
    """
    # Column of the last non-pad token in each row, shaped (rows, 1) for gather.
    last_token_pos = input_ids.ne(pad_token_id).sum(dim=1).sub(1).unsqueeze(-1)
    wrapped_tokens = input_ids.gather(1, last_token_pos).squeeze()

    shifted = input_ids.clone()
    shifted[:, 1:] = input_ids[:, :-1]
    shifted[:, 0] = wrapped_tokens
    return shifted

In [None]:
def freeze_params(model):
    """Mark every parameter of ``model`` as non-trainable.

    Sets ``requires_grad = False`` on all parameters returned by
    ``model.parameters()``.

    The earlier version skipped the last parameter (``[:-1]``). For modules
    with a single parameter, such as an embedding table, that slice is empty,
    so the module was not frozen at all.
    """
    for param in model.parameters():
        param.requires_grad = False

In [None]:
with open('/kaggle/input/q-learningtry/final_tokens.json', 'r') as f:
    data1 = json.load(f)

In [None]:
forced_tokens11 = data1
len(forced_tokens11)

### Preprocessing list of tokens

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# The stop-word corpus is a separate NLTK download; fetch it here so this
# cell works on a fresh kernel (the other nltk.download calls in this
# notebook only run much later).
nltk.download('stopwords', quiet=True)

# convert to lowercase
words = [word.lower() for word in forced_tokens11]

# remove punctuation (the translation table is built once, not per word)
import string
punctuation_table = str.maketrans("", "", string.punctuation)
words = [word.translate(punctuation_table) for word in words]

# remove stop words
stop_words = set(stopwords.words('english'))
forced_tokens1 = [word for word in words if word not in stop_words]

In [None]:
import re
def preprocess_tokens(tokens):
    """Keep only tokens made of word characters, stripped of any non-word characters."""
    word_only = re.compile(r'^\w+$')
    non_word = re.compile(r'\W+')

    cleaned = []
    for token in tokens:
        if not word_only.match(token):
            continue
        cleaned.append(non_word.sub('', token))
    return cleaned
forced_tokens1 = preprocess_tokens(forced_tokens1)
len(forced_tokens1)

In [None]:
class Model(pl.LightningModule):
    """Lightning wrapper that fine-tunes a seq2seq summarisation model.

    The reported loss is ``0.5 * cross_entropy + 0.5 * penalty`` where the
    penalty counts how many forced (keyword) tokens are missing from the
    target ids of the batch.

    NOTE(review): the penalty is computed from the target ids, not from the
    model output, so it does not depend on the model weights and only shifts
    the logged loss value - confirm this is the intended objective.

    Args:
        lr: learning rate handed to the optimizer.
        tokenizer: Hugging Face tokenizer matching ``model``.
        model: seq2seq model; must provide ``get_encoder()``, ``generate()``
            and ``model.shared`` / ``model.encoder`` / ``model.decoder``.
        params: namespace with boolean ``freeze_encoder`` and
            ``freeze_embeds`` attributes.
        n_warmup_steps: warm-up steps of the linear LR schedule.
        n_training_steps: total steps of the linear LR schedule.
        forced_tokens: optional list of keyword strings for the penalty;
            when ``None`` the notebook-level ``forced_tokens1`` list is used.
    """

    def __init__(
        self,
        lr,
        tokenizer,
        model,
        params,
        n_warmup_steps=None,
        n_training_steps =None,
        forced_tokens=None
        ):

        super().__init__()

        self.tokenizer = tokenizer
        self.model = model
        self.lr = lr
        self.params = params
        self.n_warmup_steps = n_warmup_steps
        self.n_training_steps = n_training_steps
        self.forced_tokens = forced_tokens
        # Filled lazily on the first loss computation.
        self._forced_token_ids = None

        if self.params.freeze_encoder:
            freeze_params(self.model.get_encoder())

        if self.params.freeze_embeds:
            self.freeze_embeds()

    def freeze_embeds(self):
        ''' Freeze the shared, positional and token embeddings of the model '''

        freeze_params(self.model.model.shared)
        for d in [self.model.model.encoder, self.model.model.decoder]:
            freeze_params(d.embed_positions)
            freeze_params(d.embed_tokens)

    def forward(self, input_ids, **kwargs):
        """Delegate to the wrapped model."""
        return self.model(input_ids, **kwargs)

    def configure_optimizers(self):
        """Create the optimizer and, when step counts are known, the LR schedule.

        ``FusedAdam`` was referenced here although its import is commented
        out in this notebook, which raised ``NameError`` as soon as training
        started; ``torch.optim.AdamW`` is used instead. ``weight_decay=0.0``
        switches off AdamW's default decay of 0.01.
        """
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr,
                                      weight_decay=0.0)

        # A module built only for inference has no step counts; return the
        # bare optimizer instead of failing inside the scheduler factory.
        if self.n_warmup_steps is None or self.n_training_steps is None:
            return optimizer

        scheduler = get_linear_schedule_with_warmup(
                                                      optimizer,
                                                      num_warmup_steps=self.n_warmup_steps,
                                                      num_training_steps=self.n_training_steps
                                                    )

        return dict(optimizer=optimizer,lr_scheduler=dict(scheduler=scheduler,interval='step'))

    def _get_forced_token_ids(self):
        """Return the set of vocabulary ids standing for the forced tokens.

        Each keyword is encoded without special tokens and represented by its
        first sub-token id. (With special tokens enabled, element 0 of a BART
        encoding is the BOS id for every keyword.) The set is built once and
        cached instead of being re-encoded on every step.

        NOTE(review): a keyword that splits into several sub-tokens is only
        matched on its first piece - confirm that is acceptable.
        """
        if self._forced_token_ids is None:
            tokens = self.forced_tokens
            if tokens is None:
                tokens = forced_tokens1
            ids = set()
            for token in tokens:
                piece_ids = self.tokenizer.encode(token, add_special_tokens=False)
                if piece_ids:
                    ids.add(piece_ids[0])
            self._forced_token_ids = ids
        return self._forced_token_ids

    def _forced_token_penalty(self, token_ids):
        """Count the forced token ids that do not occur anywhere in ``token_ids``.

        The previous implementation tested the keyword's position in the
        list (the enumerate index) for membership instead of its token id.
        """
        # One transfer to Python instead of one tensor lookup per keyword.
        present = set(torch.unique(token_ids).tolist())
        return float(len(self._get_forced_token_ids() - present))

    def _compute_loss(self, batch):
        """Shared train/validation loss: weighted cross-entropy plus penalty."""
        # Load the data into variables
        src_ids, src_mask, tgt_ids = batch[0], batch[1], batch[2]

        # Shift the decoder tokens right (but NOT the tgt_ids)
        decoder_input_ids = shift_tokens_right(tgt_ids,
                    self.tokenizer.pad_token_id)

        # Run the model and get the logits
        outputs = self(src_ids, attention_mask=src_mask,
                       decoder_input_ids=decoder_input_ids,
                       use_cache=False)
        labels_logits = outputs[0]

        # Calculate the loss on the un-shifted tokens, ignoring padding
        ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        ce_loss = ce_loss_fct(labels_logits.view(-1, labels_logits.shape[-1]), tgt_ids.view(-1))

        penalty = self._forced_token_penalty(tgt_ids)

        # Combine the loss and penalty terms with a weighting factor
        lambda_weight = 0.5
        return lambda_weight * ce_loss + (1 - lambda_weight) * penalty

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._compute_loss(batch)

        # Log the loss values
        self.log('train_loss/epoch', loss, prog_bar=True, logger=True, on_epoch=True, on_step=False)
        self.log('train_loss/step', loss, prog_bar=True, logger=True, on_epoch=False, on_step=True)

        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._compute_loss(batch)

        # Log the loss values
        self.log('val_loss/epoch', loss, prog_bar=True, logger=True, on_epoch=True, on_step=False)
        self.log('val_loss/step', loss, prog_bar=True, logger=True, on_epoch=False, on_step=True)

        return {'loss': loss}

    def _chunks(self, lst, n):
        """Yield (input_ids, attention_mask) slices of at most ``n`` rows."""

        for i in range(0, len(lst['input_ids']), n):
            yield lst['input_ids'][i:i + n],lst['attention_mask'][i:i + n]

    def _generate_text(
        self,
        text,
        mask,
        eval_beams,
        early_stopping=True,
        max_len=150,
        penalty_length = 0.2
    ):
        """Generate and decode summaries for one batch of encoded inputs.

        The inputs are moved to the device this module currently lives on,
        rather than to a notebook-global ``device`` variable.
        """
        target_device = self.device

        generated_ids = self.model.generate(
            text.to(target_device),
            attention_mask=mask.to(target_device),
            use_cache=True,
            decoder_start_token_id=self.tokenizer.pad_token_id,
            num_beams=eval_beams,
            max_length=max_len,
            early_stopping=early_stopping,
            length_penalty = penalty_length,
            no_repeat_ngram_size=3

            )

        return [self.tokenizer.decode(w, skip_special_tokens=True,
                clean_up_tokenization_spaces=True) for w in
                generated_ids]

    def generate_text(
        self,
        text,
        eval_beams,
        early_stopping=True,
        max_len=250,
        batch_size=2,
        length_penalty = 0.2
        ):
        ''' Generate summaries for every row of ``text``.

        ``text`` is a mapping with ``'input_ids'`` and ``'attention_mask'``;
        it is processed ``batch_size`` rows at a time and each batch of
        summaries is printed as it is produced. Returns the flat list of
        decoded summaries.
        '''
        summaries = []

        for ids,mask in tqdm(list(self._chunks(text,batch_size))):
            txt = self._generate_text(ids,mask=mask,eval_beams = eval_beams,early_stopping = early_stopping,max_len = max_len,penalty_length = length_penalty)
            print("Summary gen\n")
            print(txt)
            print("\n")
            summaries.extend(txt)

        return summaries

    def save(self, model_name):
        """Pickle the wrapped model to ``<model_name>.h5`` and return its relative path."""
        model_extension =  model_name + '.h5'
        torch.save(self.model,model_extension)
        print ('Model is saved')
        return './'+  model_extension

In [None]:
# Create a dataloading module as per the PyTorch Lightning Docs

class SummaryDataModule(pl.LightningDataModule):
    """Serves the pre-tokenised train / val / test splits as DataLoaders.

    Each split is a mapping holding ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'`` tensors.
    """

    def __init__(
        self,
        train,
        val=None,
        test=None,
        batch_size=2,
        ):
        super().__init__()
        self.train, self.val, self.test = train, val, test
        self.batch_size = batch_size

    @staticmethod
    def _as_dataset(split):
        """Wrap one split's three tensors in a TensorDataset."""
        return TensorDataset(split['input_ids'],
                             split['attention_mask'],
                             split['labels'])

    def train_dataloader(self):
        """Training batches, drawn in random order."""
        dataset = self._as_dataset(self.train)
        return DataLoader(dataset,
                          sampler=RandomSampler(dataset),
                          batch_size=self.batch_size)

    def val_dataloader(self):
        """Validation batches, in stored order."""
        return DataLoader(self._as_dataset(self.val),
                          batch_size=self.batch_size)

    def test_dataloader(self):
        """Test batches, in stored order."""
        return DataLoader(self._as_dataset(self.test),
                          batch_size=self.batch_size)

## BART Finetuning

In [None]:
# Load the data into the model for training
summary_data = SummaryDataModule(train = train_inputs, val=val_inputs,
                                 batch_size=  2)

In [None]:
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-xsum")

In [None]:
# One optimizer step is taken per training batch, so the steps per epoch is
# the number of batches (ceil division counts a final partial batch).
# The previous `len(train_text) // 1` assumed a batch size of 1 while the
# data module uses 2, which made the LR schedule twice as long as training.
N_EPOCHS = 5  # keep in sync with max_epochs passed to pl.Trainer
steps_per_epoch = -(-len(train_text) // summary_data.batch_size)
total_training_steps = steps_per_epoch * N_EPOCHS
warmup_steps = total_training_steps // 5

In [None]:
model = Model(lr = 2e-5, tokenizer = bart_tokenizer, model = bart_model, params = hparams,  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps)

In [None]:
# early_stopping_callback = EarlyStopping(monitor='val_loss/epoch', patience=3)

In [None]:
from pytorch_lightning.callbacks import TQDMProgressBar
from pytorch_lightning.loggers import CSVLogger

In [None]:
# Trainer for 5 epochs at 32-bit precision on one GPU; checkpoint /
# early-stopping callbacks and the W&B logger are currently disabled.
# NOTE(review): `gpus` and `auto_lr_find` are version-dependent Trainer
# arguments - confirm they exist in the installed pytorch-lightning release.
# NOTE(review): presumably `auto_lr_find` only has an effect when
# trainer.tune() is called, which this notebook never does - verify.
trainer = pl.Trainer(gpus=1,
                     precision=32,
                     max_epochs = 5,
                     auto_lr_find = True,
                     #callbacks=[checkpoint,early_stopping_callback],
                     #logger = wandb_logger
                    )

In [None]:
trainer.fit(model,summary_data)

In [None]:
#path = model.save('/content/drive/MyDrive/NLP Project/Results/bart_model_5_epoch_unfreeze')
path = model.save('bart_model_with_loss_penalty')

In [None]:
def generate_summary(seed_line, model_,num_beam = 4,penalty_length =0.2):
    """Generate summaries for ``seed_line`` with ``model_`` in eval mode.

    Args:
        seed_line: encoded inputs, passed unchanged to ``model_.generate_text``.
        model_: object providing ``to()``, ``eval()`` and ``generate_text()``.
        num_beam: beam width forwarded as ``eval_beams``.
        penalty_length: forwarded as ``length_penalty``.

    Returns:
        whatever ``model_.generate_text`` returns.
    """
    # Use the GPU when one is present and fall back to the CPU otherwise;
    # the device used to be hard-coded to 'cuda'.
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Put the model on eval mode
    model_.to(target_device)
    model_.eval()

    return model_.generate_text(seed_line, eval_beams=num_beam,
                                length_penalty=penalty_length)

## Evaluation

In [None]:
!pip install transformers

In [None]:
# Reload the pickled BART model written by model.save() above.
# NOTE(review): this rebinds `model`, which so far named the Lightning
# wrapper, to the bare underlying model - a distinct name would avoid
# confusion when cells are re-run out of order.
model = torch.load('bart_model_with_loss_penalty.h5')

In [None]:
b_model = Model(lr = 2e-5, tokenizer = bart_tokenizer, model = model, params = hparams)

In [None]:
num_beams = [3]
p_l = [0.2]

In [None]:
!pip install nltk

In [None]:
import nltk
nltk.download('all')

In [None]:
val_inputs

In [None]:
nltk.download('punkt')

In [None]:
nltk.download('wordnet')

In [None]:
from nltk import word_tokenize
nltk.download('omw-1.4')

In [None]:
from rouge import Rouge
scorer = Rouge()

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
# from nltk.translate.meteor_score import meteor_score

# Running BLEU-1..4 totals over every scored (reference, generated) pair.
bleu_1 = 0
bleu_2 = 0
bleu_3 = 0
bleu_4 = 0
count = 0
# Uniform n-gram weights for BLEU-1 .. BLEU-4.
weights_1 = (1./1.,)
weights_2 = (1./2. , 1./2.)
weights_3 = (1./3., 1./3., 1./3.)
weights_4 = (1./4., 1./4., 1./4., 1./4.)
# met = 0

for beam in num_beams:
    for length in p_l:
        print("Actual summ")
        bart_summ = generate_summary(val_inputs,b_model,num_beam=beam,penalty_length=length)
        print("Val summ\n")
        print(val_summ)
        print("Actual Summ\n")
        print(bart_summ)
        # Rouge.get_scores takes (hypotheses, references). The arguments
        # used to be passed the other way round, which swaps the reported
        # precision and recall.
        bart_scorer = scorer.get_scores(bart_summ, val_summ, avg=True)
        print(bart_scorer)

        # Sentence-level BLEU for each (reference, generated) pair.
        # Previously the whole lists were turned into one string with str(),
        # so brackets, quotes and commas of the list repr were scored as
        # tokens of a single giant "sentence".
        for reference, hypothesis in zip(val_summ, bart_summ):
            reference = reference.split()
            hypothesis = hypothesis.split()
            bleu_1 += sentence_bleu([reference], hypothesis, weights_1)
            bleu_2 += sentence_bleu([reference], hypothesis, weights_2)
            bleu_3 += sentence_bleu([reference], hypothesis, weights_3)
            bleu_4 += sentence_bleu([reference], hypothesis, weights_4)
#             ref = word_tokenize(" ".join(reference))
#             hyp = word_tokenize(" ".join(hypothesis))
#             met += nltk.translate.meteor_score.meteor_score([ref], hyp)
            count += 1

# Average over all scored pairs; skipped when nothing was scored so an empty
# configuration grid does not end in a ZeroDivisionError.
if count:
    bleu_1 /= count
    bleu_2 /= count
    bleu_3 /= count
    bleu_4 /= count
    # met /= count

print("BLEU-1:", bleu_1)
print("BLEU-2:", bleu_2)
print("BLEU-3:", bleu_3)
print("BLEU-4:", bleu_4)
# print("METEOR:", met)

### Run on test data

In [None]:
# Take the last configured beam width and length penalty explicitly instead
# of relying on the `beam` / `length` loop variables left behind by the
# validation cell (same values, but no NameError if that cell was skipped).
bart_summ_test = generate_summary(test_inputs,b_model,num_beam=num_beams[-1],penalty_length=p_l[-1])

In [None]:
len(bart_summ_test)

In [None]:
bart_summ_test

## Saving the BART summaries for the test data

In [None]:
# Copy the generated test summaries and pair each one with its position.
list_of_summ = list(bart_summ_test)
list_of_idx = list(range(len(list_of_summ)))

In [None]:
len(list_of_idx)

In [None]:
res = pd.DataFrame({'Generated_Summary':list_of_summ,'id':list_of_idx})

In [None]:
res

In [None]:
datatext = pd.DataFrame().assign(Original_Text=test_data['Original_Text'], Gold_Summary=test_data['Summary'])
datatext.reset_index(inplace = True)

In [None]:
# Remove the old row labels that reset_index() stored in an 'index' column.
# drop() returns a new frame, so the result has to be assigned back - the
# bare call only displayed the result and left datatext unchanged.
# errors='ignore' keeps this cell re-runnable once the column is gone.
datatext = datatext.drop(columns=['index'], errors='ignore')
datatext

In [None]:
result = pd.concat([datatext, res], axis=1, join='inner')
result

In [None]:
result['Generated_Summary'][30]

In [None]:
result['Gold_Summary'][30]

In [None]:
result.to_csv('predictions.csv')

In [None]:
from rouge import Rouge
rouge = Rouge()

In [None]:
scores = rouge.get_scores(list(result['Generated_Summary']), list(result['Gold_Summary']), avg=True)

print(scores)

In [None]:
y_pred = list(result['Generated_Summary'])
y_true = list(result['Gold_Summary'])

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from scipy import spatial
nltk.download('wordnet')
from nltk.translate.meteor_score import meteor_score

In [None]:
nltk.download('punkt')

In [None]:
from nltk import word_tokenize
nltk.download('omw-1.4')

In [None]:
# Running BLEU-1..4 totals over the test set.
bleu_1 = 0
bleu_2 = 0
bleu_3 = 0
bleu_4 = 0
count = 0
# Uniform n-gram weights for BLEU-1 .. BLEU-4.
weights_1 = (1./1.,)
weights_2 = (1./2. , 1./2.)
weights_3 = (1./3., 1./3., 1./3.)
weights_4 = (1./4., 1./4., 1./4., 1./4.)
# met = 0

for reference, hypothesis in zip(y_true, y_pred):
    # The word_tokenize() results were only consumed by the disabled METEOR
    # line, so they are no longer computed for every pair.
#     ref = word_tokenize(reference)
#     hyp = word_tokenize(hypothesis)
#     met += nltk.translate.meteor_score.meteor_score([ref], hyp)

    # BLEU is scored on whitespace tokens.
    reference = reference.split()
    hypothesis = hypothesis.split()
    bleu_1 += sentence_bleu([reference], hypothesis, weights_1)
    bleu_2 += sentence_bleu([reference], hypothesis, weights_2)
    bleu_3 += sentence_bleu([reference], hypothesis, weights_3)
    bleu_4 += sentence_bleu([reference], hypothesis, weights_4)
    count += 1

# Average over the scored pairs; skipped for an empty test set so the cell
# does not end in a ZeroDivisionError.
if count:
    bleu_1 = bleu_1/count
    bleu_2 = bleu_2/count
    bleu_3 = bleu_3/count
    bleu_4 = bleu_4/count

    # met = met/count

print("BLEU-1:", bleu_1)
print("BLEU-2:", bleu_2)
print("BLEU-3:", bleu_3)
print("BLEU-4:", bleu_4)
# print("METEOR:", met)

### 