In [3]:
!pip install --quiet transformers
!pip install --quiet sentencepiece
!pip install --quiet datasets
!pip install --quiet rouge_score

# Importing libraries

In [4]:
import pandas as pd#toread csv file
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM#for model
from transformers import AdamW, get_scheduler
from datasets import load_metric

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
rcParams['figure.figsize'] = 16, 10


# Creating the tokenizer and model

In [5]:
# Checkpoint identifier shared by the tokenizer and the seq2seq model.
MODEL_CHECKPOINT = "csebuetnlp/mT5_multilingual_XLSum"

# Both objects are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# Importing the dataset

In [8]:
# Read the news-summary CSV, decoding the file as ISO-8859-1.
df = pd.read_csv(
    "../input/news-summary/news_summary.csv",
    encoding="ISO-8859-1",
)
# Preview the first five rows as the cell output.
df.head()

In [10]:
#defining the class summary 
class summary_dataset(Dataset):
    """Map-style dataset yielding tokenized (source text, target summary) pairs.

    Each item is a dict of 1-D tensors:

    * ``input_ids`` / ``attention_mask`` -- the tokenized source text,
      padded/truncated to ``text_max_token_len``.
    * ``labels`` -- the tokenized target summary, padded/truncated to
      ``summary_max_token_len``, with padding positions replaced by -100.
    * ``decoder_attention_mask`` -- attention mask of the target summary.

    Args:
        data: DataFrame with one example per row. When omitted, the
            notebook-level ``df`` is used (looked up at construction time).
        tokenizer: callable tokenizer exposing ``pad_token_id``. When
            omitted, the notebook-level ``tokenizer`` is used.
        text_max_token_len: token length of the encoded source text.
        summary_max_token_len: token length of the encoded summary.
        text_column: name of the column holding the model input.
        summary_column: name of the column holding the target summary.
            NOTE(review): 'headlines' is assumed to be the target column of
            the news-summary CSV (it matches the 12-token budget) -- confirm
            against the actual schema and override if needed.
    """

    def __init__(self,
                 data=None,
                 tokenizer=None,
                 text_max_token_len=200,
                 summary_max_token_len=12,
                 text_column='text',
                 summary_column='headlines'):
        # Resolve the notebook-level fallbacks lazily instead of in the
        # signature, so the defaults always reflect the current `df` and
        # `tokenizer` rather than whatever existed when the class was defined.
        self.data = df if data is None else data
        self.tokenizer = globals()['tokenizer'] if tokenizer is None else tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len
        self.text_column = text_column
        self.summary_column = summary_column

    def __len__(self):
        """Return the number of rows (examples) in the underlying frame."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text`` into fixed-length PyTorch tensors of ``max_length`` tokens."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt',
        )

    def __getitem__(self, index: int):
        """Return the tokenized example stored at positional row ``index``."""
        data_row = self.data.iloc[index]

        # The source and the target must come from different columns;
        # encoding the same text twice would train the model to copy its input.
        text_encoding = self._encode(data_row[self.text_column],
                                     self.text_max_token_len)
        summary_encoding = self._encode(data_row[self.summary_column],
                                        self.summary_max_token_len)

        # Replace padding ids by -100 in the labels (out-of-place, so the
        # tokenizer output itself is left untouched).
        summary_ids = summary_encoding['input_ids']
        labels = summary_ids.masked_fill(
            summary_ids == self.tokenizer.pad_token_id, -100)

        return dict(
            input_ids=text_encoding['input_ids'].flatten(),
            attention_mask=text_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
            decoder_attention_mask=summary_encoding['attention_mask'].flatten())

In [11]:


# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.2)

# Wrap each split in the tokenizing dataset class.
train_dataset = summary_dataset(df_train)
test_dataset = summary_dataset(df_test)



In [12]:


# Display the training dataset object as this cell's output (sanity check).
train_dataset



In [14]:
# Serve both datasets one example per batch; only the training loader shuffles.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
)
eval_dataloader = DataLoader(
    test_dataset,
    batch_size=1,
)

In [15]:
# Three epochs keep the compute budget small.
num_epochs = 3

# One optimizer/scheduler step is taken per batch, so the total number of
# steps is epochs x batches-per-epoch.
num_training_steps = num_epochs * len(train_dataloader)

# AdamW over all model parameters, with the optimizer's default hyper-parameters.
optimizer = AdamW(model.parameters())

# "linear" learning-rate schedule spanning the whole run, without warm-up steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# One progress-bar tick per training step.
progress_bar = tqdm(range(num_training_steps))

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

for epoch in range(num_epochs):
    # from_pretrained() hands the model back in evaluation mode (dropout
    # disabled). Switch to training mode explicitly, and do it every epoch so
    # a model.eval() call elsewhere in the notebook cannot leak into training.
    model.train()

    for batch in train_dataloader:
        # Move every tensor of the batch to the training device.
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass; the model computes the loss itself because the
        # batch contains `labels`.
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass, parameter update, learning-rate update.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Clear the gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        progress_bar.update()

    # Checkpoint the model and optimizer after every epoch.
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, f'./t5-epoch-{epoch}.pth')

    # Report the loss of the LAST batch of the epoch (not an epoch average),
    # as a plain number rather than a tensor repr.
    print(f'epoch: {epoch + 1} -- loss: {loss.item()}')

In [16]:
# ROUGE compares pieces of text, so both predictions and references have to be
# decoded back into strings before they are handed to the metric.
metric = load_metric("rouge")

# Evaluation mode: disables dropout.
model.eval()

for batch in eval_dataloader:
    # Move every tensor of the batch to the evaluation device.
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Most likely token at each target position.
    # NOTE(review): these come from a forward pass conditioned on the gold
    # labels, not from free-running generation, so the scores are likely
    # optimistic compared with `model.generate`.
    predictions = torch.argmax(outputs.logits, dim=-1)

    # -100 marks label positions that are ignored by the loss; it is not a
    # real token id, so substitute the pad id before decoding. Predictions at
    # those same positions carry no meaning and are blanked out as well.
    ignored = batch["labels"] == -100
    label_ids = batch["labels"].masked_fill(ignored, tokenizer.pad_token_id)
    pred_ids = predictions.masked_fill(ignored, tokenizer.pad_token_id)

    # .tolist() also moves the ids off the GPU before decoding.
    decoded_preds = tokenizer.batch_decode(pred_ids.tolist(), skip_special_tokens=True)
    decoded_refs = tokenizer.batch_decode(label_ids.tolist(), skip_special_tokens=True)

    metric.add_batch(predictions=decoded_preds, references=decoded_refs)

# Final ROUGE scores over the whole evaluation set (shown as the cell output).
metric.compute()

In [20]:
#Now we have our model so we can summarise our summary 
def summarizeText(text, model=model, max_input_length=1000,
                  max_summary_length=150, num_beams=4):
    """Generate a summary of ``text`` with beam search.

    Args:
        text: the text to summarize.
        model: seq2seq model used for generation; defaults to the
            notebook-level ``model``.
        max_input_length: inputs longer than this many tokens are truncated.
        max_summary_length: ``max_length`` passed to ``model.generate``.
        num_beams: number of beams used by the beam search.

    Returns:
        The decoded summary as a string. If several sequences are generated,
        their decoded texts are concatenated without a separator.
    """
    # Send the inputs to wherever the given model lives, instead of relying on
    # the notebook-level `device`, which may not match a model passed in
    # explicitly.
    model_device = next(model.parameters()).device

    # padding=True pads only up to the longest sequence in the call, so a
    # single short text is not blown up to `max_input_length` tokens.
    text_encoding = tokenizer(
        text,
        max_length=max_input_length,
        padding=True,
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # Beam-search generation of the summary token ids.
    generated_ids = model.generate(
        input_ids=text_encoding['input_ids'].to(model_device),
        attention_mask=text_encoding['attention_mask'].to(model_device),
        max_length=max_summary_length,
        num_beams=num_beams,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    # Decode every generated sequence back to text, dropping special tokens.
    preds = [
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]
    return "".join(preds)

In [21]:


# Sample English news sentence to run through the summarizer.
text = """Russia has cut off natural gas supplies to Poland and Bulgaria, dramatically escalating its response to Western sanctions imposed on Moscow over the war in Ukraine. """

# Summarize it with the notebook's model and print the result.
summary = summarizeText(text, model=model)

print(summary)

