### Download, and Load the Libraries



In [None]:
!pip install transformers
!pip install datasets    

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import pandas as pd
from datasets import Dataset, DatasetDict

### Load the Model/Tokenizer

In [None]:
# Retrieves a pre-trained tokenizer trained on the "bart-large-cnn" model
# The tokenizer is responsible for converting raw text into tokens
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Instance of pre-trained BART model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Freeze all layers except last 2 layers

# Freeze all layers
for name, param in model.named_parameters():
    param.requires_grad = False

# Unfreeze last two layers in the encoder
unfrozen_params = list(model.model.encoder.layers[-2:].parameters())
for param in unfrozen_params:
    param.requires_grad = True

# Unfreeze last two layers in the decoder
unfrozen_decoder_params = list(model.model.decoder.layers[-2:].parameters())
for param in unfrozen_decoder_params:
    param.requires_grad = True

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [None]:
# Put the model on GPU if available.
if torch.cuda.is_available():
  model = model.to("cuda")

In [None]:
# Mounting files
import os
import pandas as pd
import random
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

###**Pre-processing InShorts data**

1. We use the raw Inshorts dataset which is a .csv file with 55,000+ rows to randomly create pr-dev-small.csv, pr-test-small.csv, pr-train-small.csv. Each csv contains a column for article and a column for summary.

2. train, test, development datasets are converted from .csv to pandas dataframes

3. Text processing the train, test and development datasets

In [None]:
## Load the Excel file into a pandas DataFrame
df = pd.read_excel("/gdrive/MyDrive/datasets/Inshorts_Cleaned_Data.xlsx")

# Keep only 'Headline' and 'Short' columns and discard remaining columns
df = df[['Headline', 'Short']]

# Count the total number of rows in the dataframe
num_rows = len(df)

# Randomly sample 50000 rows without replacement from the dataframe
df_sample = df.sample(n=5000, replace=False)

# Randomly allocate the 50000 rows into 3 different groups with the specified proportions
train_size = 3000
test_size = 1000
dev_size = 1000

train_set = df_sample.sample(n=train_size, replace=False)
df_sample = df_sample.drop(train_set.index)

test_set = df_sample.sample(n=test_size, replace=False)
dev_set = df_sample.drop(test_set.index)

# Export each group to a separate Excel file
train_set.to_excel('/gdrive/MyDrive/datasets/pr-train-small.xlsx', index=False)
test_set.to_excel('/gdrive/MyDrive/datasets/pr-test-small.xlsx', index=False)
dev_set.to_excel('/gdrive/MyDrive/datasets/pr-dev-small.xlsx', index=False)

In [None]:
# Read smaller sample datasets as Pandas dataframes
data_train = pd.read_excel('/gdrive/MyDrive/datasets/pr-train-small.xlsx')
data_train.head()

Unnamed: 0,Headline,Short
0,Russia reveals plans to send cosmonauts to Mars,Russia recently announced that in 2018 it will...
1,"Sensex dips 465 points, Nifty down by 153 points",The BSE Sensex on Thursday plunged 465.28 poin...
2,Cabinet nod to be sought for setting up ITIs i...,Skill Development and Entrepreneurship Ministr...
3,10 terrorists killed by Indian forces in Uri,As many as 10 of the 15 terrorists trying to i...
4,Minister urges PM to bring back Udham Singh&#3...,Haryana state minister Karan Dev Kamboj has wr...


In [None]:
data_test = pd.read_excel('/gdrive/MyDrive/datasets/pr-test-small.xlsx')
data_test.head()

Unnamed: 0,Headline,Short
0,Chennai restaurant introduces &#39;Donald Trum...,A photograph of a banner from casual dining re...
1,Europe&#39;s 1st space-based data satellite la...,The first satellite of the European Space Agen...
2,Mathura clashes: SP died of brain haemorrhage,Following the death of two police officers dur...
3,PM&#39;s step against black money a political ...,"Following the demonetisation of ₹500 and ₹1,00..."
4,Bihar: 25 cops suspended over suspected hooch ...,As many as 25 policemen were suspended and six...


In [None]:
data_validate = pd.read_excel('/gdrive/MyDrive/datasets/pr-dev-small.xlsx')
data_validate.head()

Unnamed: 0,Headline,Short
0,Grievance redressal system for Jaya constituency,In an effort to ensure faster grievance redres...
1,Ibrahimović scored a 30-yard bicycle kick goal...,Ex-Sweden football team captain Zlatan Ibrahim...
2,Mexico chefs make 216 feet long &#39;torta&#39...,Chefs and volunteers in Mexico City on Wednesd...
3,Amul to sell camel milk in next 3 months,Dairy major Gujarat Cooperative Milk Marketing...
4,Political cartoonist Sudhir Tailang passes away,"Renowned political cartoonist Sudhir Tailang, ..."


In [None]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}

In [None]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = stopwords.words('english')

def preprocess(text):
    '''
    Returns a cleaned string called 'text'
    '''
    text = text.lower() # lowercase
    # convert have'nt -> have not
    text = text.split()
    for i in range(len(text)):
        word = text[i]
        if word in contraction_mapping:
            text[i] = contraction_mapping[word]
    # join entire text after contraction mapping is completed
    text = " ".join(text)
    text = text.split()
    newtext = []
    # create list of words after removing stopwords
    for word in text:
        if word not in stop_words:
            newtext.append(word)
    # create string of same words
    text = " ".join(newtext)
    text = text.replace("'s",'') # convert your's -> your
    text = re.sub(r'\(.*\)','',text) # remove (words inside a parenthesis)
    text = re.sub(r'[^a-zA-Z0-9. ]','',text) # remove punctuations
    text = re.sub(r'\.',' . ',text)
    return text

sample = "(hello) hi there .man tiger caller who's that isn't it ? WALL-E"
print(preprocess(sample))

 hi  . man tiger caller  walle


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Preprocess training data
data_train['Headline'] = data_train['Headline'].apply(lambda x:preprocess(x))
data_train['Short'] = data_train['Short'].apply(lambda x:preprocess(x))

# TRAINING DATA: instance of a dataset object from the datasets library
dataset_train = Dataset.from_pandas(data_train)

# Preprocess test data
data_test['Headline'] = data_test['Headline'].apply(lambda x:preprocess(x))
data_test['Short'] = data_test['Short'].apply(lambda x:preprocess(x))

# TEST DATA: instance of a dataset object from the datasets library
dataset_test = Dataset.from_pandas(data_test)

# Preprocess validation data
data_validate['Headline'] = data_validate['Headline'].apply(lambda x:preprocess(x))
data_validate['Short'] = data_validate['Short'].apply(lambda x:preprocess(x))

# VALIDATION DATA: instance of a dataset object from the datasets library
dataset_validate = Dataset.from_pandas(data_validate)

In [None]:
# Access the data within dataset_train using keys corresponding to the column names in data_train. 
# For example, dataset_train['Headline'] would give you the preprocessed headlines, and 
# dataset_train['Short'] would give you the preprocessed short texts
dataset_train[10]

{'Headline': 'syria39s aleppo slaughterhouse unhr chief',
 'Short': 'un human rights chief zeid ra39ad alhussein friday said bombing syria39s aleppo constituted 34crimes historic proportions34 calling 34a slaughterhouse34 .  alhussein called involved conflict set aside 34political disagreements34 refer situation international criminal court .  comes russia extended ceasefire around aleppo 24 hours . '}

In [None]:
dataset_train['Headline'][:5]

['russia reveals plans send cosmonauts mars',
 'sensex dips 465 points nifty 153 points',
 'cabinet nod sought setting itis 2500 blocks',
 '10 terrorists killed indian forces uri',
 'minister urges pm bring back udham singh39s pistol uk']

In [None]:
dataset_train['Short'][:5]

['russia recently announced 2018 test nuclear engine could help cosmonauts reach mars six weeks compared 18 months spacecraft currently needs travel mars .  274 million project uses nuclear propulsion rockets weighs around half much chemical rocket without reducing thrust . ',
 'bse sensex thursday plunged 465 . 28 points close 27827 . 53 nse nifty fell 153 . 90 points close 8591 . 25 .  fall came amid indian army confirming conducted 34surgical strikes34 pakistani territory wednesday night .  tcs top gainer adani ports sez major loser day . ',
 'skill development entrepreneurship ministry soon seek approval cabinet establish industrial training institutes  2500 blocks country union minister rajiv pratap rudy said wednesday .  pointing disparity courses offered itis said nearly 61 students going itis trained two trades despite 127 trades . ',
 'many 10 15 terrorists trying infiltrate indian side pakistan killed tuesday indian security forces jammu kashmir39s lachipura area uri sector .

### Prepare The Dataset
Writing a function to handle the tokenization and change the format of the data to an acceptable structure for the model. The dataset by default has Short, and Headline columns that needed to be changed to input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, labels using the set_format() function.

In [None]:
short_length=512
headline_length=64

def process_data_to_model_inputs(batch):
  '''
  Takes a data batch (a dictionary) as input and processes it to create model inputs
  to return an updated batch (a dictionary)
  '''
  # Tokenize the inputs and labels by splitting the text into tokens and encoding them
  # inputs = instance of tokenizer() object -- for Article/"Short"
  # outputs = instance of tokenizer() object -- for Summary/"Headline"
  inputs = tokenizer(batch["Short"], padding="max_length", truncation=True, max_length=short_length)
  outputs = tokenizer(batch["Headline"], padding="max_length", truncation=True, max_length=headline_length)

  # Update batch dictionary with new keys, with values assigned from inputs object's attributes
  # and outputs object's attributes
  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # We have to make sure that the PAD token is ignored for calculating the loss
  # Replacing the PAD token IDs with -100, a special value that indicates they should be ignored during training
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

In [None]:
# Train data
# Returns a transformed instance of DatasetDict class for dataset_train instance by applying
# process_data_to_model_inputs() function

dataset_train_dict = dataset_train.map(
    process_data_to_model_inputs, 
    batched=True,
    remove_columns=["Short", "Headline"]
)

dataset_train_dict.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids",
                           "decoder_attention_mask", "labels"],
)

# Validation data
# Returns a transformed instance of DatasetDict class for dataset_validate instance by applying
# process_data_to_model_inputs() function

dataset_validate_dict = dataset_validate.map(
    process_data_to_model_inputs, 
    batched=True,
    remove_columns=["Short", "Headline"]
)

dataset_validate_dict.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids",
                           "decoder_attention_mask", "labels"],
)

# Test data
# Returns a transformed instance of DatasetDict class for dataset_test instance by applying
# process_data_to_model_inputs() function

dataset_test_dict = dataset_test.map(
    process_data_to_model_inputs, 
    batched=True,
    remove_columns=["Short", "Headline"]
)

dataset_test_dict.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids",
                           "decoder_attention_mask", "labels"],
)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
dataset_train_dict[0]

Based on the provided `process_data_to_model_inputs` function, the output of `dataset_train_dict[0]` will be a dictionary-like object containing the processed example at index 0 of the "train" split.

The output dictionary will have the following keys and values:

- `"input_ids"`: A list of token IDs representing the tokenized and encoded "Short" text sequence, with a maximum length of 512 tokens after padding or truncation.
- `"attention_mask"`: A list of binary values indicating which tokens in the "Short" text sequence should be attended to (1) and which ones should be ignored (0) during model training or inference.
- `"decoder_input_ids"`: A list of token IDs representing the tokenized and encoded "Headline" text sequence, with a maximum length of 64 tokens after padding or truncation.
- `"decoder_attention_mask"`: A list of binary values indicating which tokens in the "Headline" text sequence should be attended to (1) and which ones should be ignored (0) during model training or inference.
- `"labels"`: A list of token IDs representing the encoded "Headline" text sequence, with the same maximum length as `"decoder_input_ids"`. This serves as the target labels for the model's training.

The `"labels"` are adjusted to ignore the padding tokens by replacing them with `-100` to ensure they are not considered in the loss calculation.

Overall, `dataset_train_dict[0]` will contain the processed example with the input and target information necessary for training the model.

The dataset is not batched yet. We can use PyTorch's DataLoader function to take care of batching the data. Consider using a larger batch_size if you do not have hardware limitation.

In [None]:
from torch.utils.data import DataLoader

batch_size      = 10
# Instantiate dataloader objects for train, test, validation
train_data      = DataLoader(dataset_train_dict, batch_size=batch_size)
validation_data = DataLoader(dataset_validate_dict, batch_size=batch_size)
test_data = DataLoader(dataset_test_dict, batch_size=batch_size)

In [None]:
first_element = next(iter(train_data))
print(first_element)


### Loss function

In [None]:
from torch.nn import CrossEntropyLoss
loss_fct = CrossEntropyLoss()

### Optimizer

In [None]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=5e-4)



### Learning Rate Scheduler

- The learning rate scheduler adjusts the learning rate during training to optimize the model's performance.
- A linear scheduler increases the learning rate linearly over the training steps.
- The num_warmup_steps and num_training_steps help determine the learning rate schedule.
- The lr_scheduler object can be used during model training to update the learning rate based on the schedule.

In [None]:
from transformers import get_scheduler

num_epochs = 10
num_training_steps = num_epochs * len(train_data)
num_validation_steps = num_epochs * len(validation_data)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

### Encoder, Decoder, Last Linear Layer

The encoder component of the `BartForConditionalGeneration` model is called. We will use it to process input sequences and extract their contextual representations. 

`the_encoder` is an instance of the `BartEncoder` class, which is composed of multiple layers of self-attention and feed-forward neural networks.

The `BartEncoder` class typically contains the following key attributes:

- `embed_tokens`: The embedding layer that maps input tokens to continuous representations.
- `encoder`: A stack of Transformer encoder layers that process the input sequence and generate contextual representations.
- `embed_positions`: Positional embeddings that encode the position information of the input tokens.
- `layernorm_embedding`: A layer normalization module applied to the output of the embedding layer.
- `layer_norm`: A layer normalization module applied to the output of each encoder layer.

In [None]:
# accessing the modified model object's encoder which has last 2 layers unfrozen
the_encoder = model.model.encoder  

The decoder component of the `BartForConditionalGeneration` model is called.  We use it to generate output sequences, conditioned on the contextual representations. This is done in autoregressive manner, where the decoder generates tokens one at a time, taking into account the previously generated tokens.

The decoder is designed to take the encoder's contextual representations as input and produce the output sequence step by step.

`the_decoder` object is typically an instance of the `BartDecoder` class. The `BartDecoder` class typically contains the following key attributes:

- `embed_tokens`: The embedding layer that maps output tokens to continuous representations.
- `decoder`: A stack of Transformer decoder layers that generate the output sequence based on the contextual representations from the encoder.
- `embed_positions`: Positional embeddings that encode the position information of the output tokens.
- `layernorm_embedding`: A layer normalization module applied to the output of the embedding layer.
- `layer_norm`: A layer normalization module applied to the output of each decoder layer.

In [None]:
# accessing the modified model object's decoder which has last 2 layers unfrozen
the_decoder = model.model.decoder

`last_linear_layer` is an instance of a linear layer that performs a linear transformation on the input data. the `last_linear_layer` is an instance of the `torch.nn.Linear` class. 

The attribute `lm_head` refers to the last linear layer of the model, which is responsible for generating the output logits or scores for each token in the output sequence.

The last linear layer is typically a linear transformation that takes the contextual representations from the decoder and maps them to the vocabulary space. It essentially serves as a language model head, providing probabilities or scores for each token in the vocabulary, indicating the likelihood of that token being the next token in the generated sequence.

We can use the last linear layer to generate the output logits or scores for each token in the output sequence. These logits can then be used for various purposes, such as calculating the loss, generating the next token using sampling or beam search, or evaluating the model's performance.

The `torch.nn.Linear` class represents a linear transformation that maps an input tensor of shape `(batch_size, input_size)` to an output tensor of shape `(batch_size, output_size)` through a matrix multiplication followed by a bias addition.

The `torch.nn.Linear` class typically contains the following key attributes:

- `weight`: The weight matrix of shape `(output_size, input_size)` that is used for the linear transformation.
- `bias`: The bias vector of shape `(output_size)` that is added to the output of the linear transformation.

In [None]:
last_linear_layer = model.lm_head

### Training + Validation Loop

In [None]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps + num_validation_steps))
val_losses = []

for epoch in range(num_epochs):
    # The Training Loop for One Epoch
    model.train()
    training_loss = 0.0
    validation_loss = 0.0
    print("Training...")
    for batch in train_data:
        if torch.cuda.is_available():
          # k = input_id, v = tensor for input_id
          # k = attention_mask, v = tensor for attention_mask
          # k  = decoder_input_ids, v = tensort ...
          batch = {k: v.to('cuda') for k, v in batch.items()}

        encoder_output = the_encoder(input_ids = batch['input_ids'],
                                    attention_mask = batch['attention_mask'])
        
        decoder_output = the_decoder(input_ids=batch['decoder_input_ids'],
                                    attention_mask=batch['decoder_attention_mask'],
                                    encoder_hidden_states=encoder_output[0],
                                    encoder_attention_mask=batch['attention_mask'])

        decoder_output = decoder_output.last_hidden_state
        lm_head_output = last_linear_layer(decoder_output)

        loss = loss_fct(lm_head_output.view(-1, model.config.vocab_size),
                        batch['labels'].view(-1))
        training_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    
    # Evaluate the Model performance on Validation set
    # after the 1 epoch Training.
    model.eval()
    print("Validating...")
    for batch in validation_data:
        if torch.cuda.is_available():
          batch = {k: v.to('cuda') for k, v in batch.items()}
        
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        validation_loss += loss
        progress_bar.update(1)
    
    training_loss = training_loss / len( train_data )
    validation_loss = validation_loss / len( validation_data )
    print("Epoch {}:\tTraining Loss {:.2f}\t/\tValidation Loss {:.2f}".format(epoch+1, training_loss, validation_loss))
    val_losses.append(validation_loss)
    if validation_loss <= min(val_losses):
        torch.save(model.state_dict(), "/gdrive/MyDrive/AML/FinalProject/Best_BART_V2.pt")

  0%|          | 0/4000 [00:00<?, ?it/s]

Training...
Validating...
Epoch 1:	Training Loss 1.14	/	Validation Loss 0.05
Training...
Validating...
Epoch 2:	Training Loss 0.05	/	Validation Loss 0.05
Training...
Validating...
Epoch 3:	Training Loss 0.03	/	Validation Loss 0.04
Training...
Validating...
Epoch 4:	Training Loss 0.02	/	Validation Loss 0.03
Training...
Validating...
Epoch 5:	Training Loss 0.01	/	Validation Loss 0.02
Training...
Validating...
Epoch 6:	Training Loss 0.01	/	Validation Loss 0.02
Training...
Validating...
Epoch 7:	Training Loss 0.00	/	Validation Loss 0.03
Training...
Validating...
Epoch 8:	Training Loss 0.01	/	Validation Loss 0.01
Training...
Validating...
Epoch 9:	Training Loss 0.01	/	Validation Loss 0.01
Training...
Validating...
Epoch 10:	Training Loss 0.00	/	Validation Loss 0.01


### Observing the outputs

In [None]:
from tqdm.auto import tqdm

# Instance of pre-trained BART model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# Load the trained model
model.load_state_dict(torch.load("/gdrive/MyDrive/AML/FinalProject/Best_BART_V2.pt"))
if torch.cuda.is_available():
    model.cuda()
# Set the model to evaluation mode:
model.eval()
predictions = []

# Iterate over the test_dataset and generate predictions using the model:
with tqdm(total=len(test_data), desc="Generating Predictions") as progress_bar:
    for batch in test_data:
        if torch.cuda.is_available():
            batch = {k: v.to('cuda') for k, v in batch.items()}

        with torch.no_grad():
            outputs = model.generate(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=headline_length,
                num_beams=4,
                early_stopping=True
            )

        # Convert the generated output ids back to text
        generated_headlines = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(generated_headlines)

        progress_bar.update(1)

# Get the actual headlines from data_test
actual_headlines = data_test["Headline"]

Generating Predictions:   0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Compare the predictions with actual headlines
for prediction, actual in zip(predictions, actual_headlines):
    print("Prediction:", prediction)
    print("Actual:", actual)
    print()


Prediction:  the the thethethethetototobutbutbutnotnotnotandandandbutnoteveneveneventhethenoteventhenotnottotonotevennotnoteventotothethebutnottonotnotbutnotthethehownottobutnotbutbuttonot
Actual: chennai restaurant introduces 39donald trump white dosa39

Prediction:  the the thethethethetototobutbutbutnotnotandandandbutnoteveneveneven even even even right right now now now that there is now now also now now not not not even even now now actually now now currently currently currently also now currently also not now currently actually
Actual: europe39s 1st spacebased data satellite launched

Prediction:  the the thethethethetototobutbutbutnotnotnotandandandnotnotbutnotbutbutwhenbutbutandandbutbutaccordingaccordingaccordingbutnotevenevenbutbuteveneveneventhethebutnottobutnotyetyetyetnotnoteventhenotnot
Actual: mathura clashes sp died brain haemorrhage

Prediction:  the the thethethethetototobutbutbutnotnotnotandandandbutnoteveneveneventhethenoteventhenotnottotonotevennotnoteventotothethe

### BLEU Scores

In [None]:
! pip install evaluate
import evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


In [None]:
bleu = evaluate.load("google_bleu")
total_bleu_score = 0
for prediction, actual in zip(predictions, actual_headlines):
    total_bleu_score = total_bleu_score + bleu.compute(predictions=[prediction], references=[actual])['google_bleu']
avg_bleu_score = total_bleu_score/len(predictions)
print(avg_bleu_score)

Downloading builder script:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

4.347826086956522e-06
