# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**Make sure to install the required libraries first ('nltk', 'rouge-score', 'bert-score') using 'pip'. Here's the command:**<br>
*!pip install nltk rouge-score bert-score*

In [2]:
!pip install nltk rouge-score bert-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l- \ done
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=0b3a5a79e737bb149f03b72127c29ec57c1e60315b41fc2404b146a93c33b6cd
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score, bert-score
Successfully installed bert-score-0.3.13 rouge-score-0.1.2


In [3]:
import nltk
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import score

In [4]:
import warnings

# Suppress the warning
warnings.filterwarnings("ignore", message="Your max_length is set to *")

# Reading the Dataset

In [5]:
dataset = load_dataset("ccdv/cnn_dailymail","3.0.0")

Downloading builder script:   0%|          | 0.00/9.27k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 to /root/.cache/huggingface/datasets/ccdv___cnn_dailymail/3.0.0/3.0.0/0107f7388b5c6fae455a5661bcd134fc22da53ea75852027040d8d1e997f101f...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/ccdv___cnn_dailymail/3.0.0/3.0.0/0107f7388b5c6fae455a5661bcd134fc22da53ea75852027040d8d1e997f101f. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

**Here I have used 'ccdv/cnn_dailymail' dataset from HuggingFace.**

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [7]:
df= dataset['test'].to_pandas()

# Analyzing the Dataset

In [8]:
df= df.sample(n= 150, replace= False).reset_index(drop= True)          # you may proceed with the complete test data 

In [9]:
df.head()

Unnamed: 0,article,highlights,id
0,A large gas pipeline exploded into a tower of ...,The explosion at the Fresno County Sherrif's g...,95a942ed60852ec257f145d8bafd0206295dab7b
1,Chip shop boss John Clarkson has created a new...,John Clarkson's chocolate pie is covered in pa...,63b642d8ef28195623c688424a8481525ba342f0
2,A luxury Knightsbridge flat which is brushed w...,The Knightsbridge flat is located in the same ...,84455a9fb5977a2450a827cba4408c12c8c502c8
3,"Boston (CNN)When the bomb went off, Steve Wool...","Tsarnaev family members arrive in Boston, but ...",3dbabe067edd51229516912001c3d7b212f64f1b
4,The owner of a takeaway where pizza boxes were...,Inspectors found Pizza Plus Fried Chicken in G...,bc8ec02c90c8a2058e1b69148f03959d8c6ef5f0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   article     150 non-null    object
 1   highlights  150 non-null    object
 2   id          150 non-null    object
dtypes: object(3)
memory usage: 3.6+ KB


In [11]:
print('Article:\n',df['article'][0])
print('\nSummary:\n',df['highlights'][0])

Article:
 A large gas pipeline exploded into a tower of fire on Friday in Central California, closing both directions of a major highway and injuring at least 15 people, four of them critically, authorities said. It was not clear what caused the explosion at the Fresno County Sheriff's gun range that brought traffic in the area to a halt. The explosion on a Pacific Gas & Electric Co. pipe carrying natural gas happened while an equipment operator and a group of county jail inmates were expanding a road alongside Highway 99, according to Fresno County Sheriff Margaret Mims. Scroll down for video . A fireball erupts after a large gas pipeline exploded in Fresno, California, on Friday. The explosion and fire closed both directions of Highway 99, authorities said . A firefighter watches the blaze shot well over 100 feet into the air. Eleven people were injured, three critically, in the explosion . The flames shot well over 100 feet into the air, witnesses said. Traffic heading north and sou

# Model Metrics Evaluation

**Here's a brief description of the metrics I have used to compare the different text summarization models:**<br><br>
**1) BLEU Score (Bilingual Evaluation Understudy):** BLEU evaluates the quality of a generated summary by comparing it to one or more reference summaries. It calculates precision by comparing n-grams (sequences of n consecutive words) between the generated and reference summaries. BLEU scores range between 0 and 1, where 1 indicates perfect similarity between the candidate and reference summaries. <br>
<br>
**2) BERT Score:** BERTScore is a metric for evaluating the quality of text generation tasks by leveraging pre-trained BERT embeddings. It measures the similarity between the model-generated summary and the reference summary at the token level. BERTScore typically ranges between 0 and 1, with higher values indicating better agreement between the model-generated summary and the reference summary.<br>
<br>
**3) Rouge Scores (Recall-Oriented Understudy for Gisting Evaluation):** It is a set of metrics commonly used for evaluating automatic summarization tasks. It measures the overlap between the model-generated summary and the reference summary in terms of n-grams (sequences of n consecutive words). The three main variants of ROUGE are ROUGE-N (measuring n-gram overlap), ROUGE-L (measuring the longest common subsequence), and ROUGE-W (measuring the weighted longest common subsequence).ROUGE scores typically range between 0 and 1, with higher values indicating better overlap and similarity between the model-generated summary and the reference summary.<br>
<br>
**4) Redundancy Score:** The redundancy score quantifies the extent to which a summary unnecessarily repeats information present in the original text or across multiple summaries. It measures the ratio of unique words to the total number of words in the summaries, where lower scores indicate less repetition and higher efficiency in conveying information.

In [12]:
def calculate_redundancy(summaries):
    
    total_tokens = sum(len(summary.split()) for summary in summaries)
    unique_tokens = len(set(token for summary in summaries for token in summary.split()))
    redundancy_score = 1 - (unique_tokens / total_tokens)
    
    return redundancy_score

In [13]:
def calc_metrics(actual_summaries, pred_summaries):
    
    # Calculate BLEU Score
    actual_summaries_tokenized = [[ref.split()] for ref in actual_summaries]         # tokenizing the actual summary
    pred_summaries_tokenized = [output.split() for output in pred_summaries]         # tokenizing the predicted summary
    bleu_score = corpus_bleu(actual_summaries_tokenized, pred_summaries_tokenized)   # comparing the tokens to calculate BLEU score
    
    
    # Calculate BERT Score
    P, R, F1 = score(actual_summaries, pred_summaries, lang='en', verbose=False)     # returns Precision, Recall and F1 score
    bert_score = F1.mean().item()                                                    # takes the mean of F1 scores across all examples (.item() used to convert PyTorch tensor into scalar value) 
    
    
    # Calculate ROUGE Scores
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)          # creates an object 'rouge' that will be used to compute ROUGE scores with ROUGE-1, ROUGE-2, and ROUGE-L metrics, using stemming
             
    rouge_1_scores = []
    rouge_2_scores = []
    rouge_L_scores = []
    for pred, actual in zip(pred_summaries, actual_summaries):
        rouge_scores = rouge.score(pred, actual)                                                # returns a dictionary of mentioned ROUGE scores each of which contain precison, recall and F1 score
        rouge_1_scores.append(rouge_scores['rouge1'].fmeasure)
        rouge_2_scores.append(rouge_scores['rouge2'].fmeasure)
        rouge_L_scores.append(rouge_scores['rougeL'].fmeasure)
    
    rouge_1_f1 = sum(rouge_1_scores) / len(rouge_1_scores)                                      # calculating the average rouge scores (considering F1 score)
    rouge_2_f1 = sum(rouge_2_scores) / len(rouge_2_scores)
    rouge_L_f1 = sum(rouge_L_scores) / len(rouge_L_scores)
    
    # Calculate Redundancy Score
    redundancy_score = calculate_redundancy(pred_summaries)
    
    
    return bleu_score, bert_score, rouge_1_f1, rouge_2_f1, rouge_L_f1, redundancy_score        # returning all the calculated metrics

# Testing on Different Models

### 1. facebook/bart-large-cnn

In [14]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

2024-02-01 07:29:35.054864: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-01 07:29:35.055042: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-01 07:29:35.285569: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [15]:
print(summarizer(df['article'][0], max_length= 130, min_length=30, truncation= True))  # truncation used to truncate input text while max_length for output text

[{'summary_text': 'Explosion on a Pacific Gas & Electric Co. pipe carrying natural gas happened at the gun range while an equipment operator and a group of county jail inmates were expanding a road. The flames shot well over 100 feet into the air, witnesses said. Traffic heading north and south on Highway 99 in Fresno was halted by the explosion about 2.30pm as flames towered over the roadway. The highway was reopened three hours later, the CHP said.'}]


In [16]:
summarizer(df['article'][0], max_length= 130, min_length=30, truncation= True)[0]['summary_text']       # setting max output length to 130

'Explosion on a Pacific Gas & Electric Co. pipe carrying natural gas happened at the gun range while an equipment operator and a group of county jail inmates were expanding a road. The flames shot well over 100 feet into the air, witnesses said. Traffic heading north and south on Highway 99 in Fresno was halted by the explosion about 2.30pm as flames towered over the roadway. The highway was reopened three hours later, the CHP said.'

In [17]:
predictions= []
for i in range(0,len(df)):
    pred= summarizer(df['article'][i], max_length=130, min_length=30, truncation= True)[0]['summary_text']
    predictions.append(pred)

In [18]:
BLEU,BERT,Rouge_1,Rouge_2,Rouge_L,Redundancy = calc_metrics(df['highlights'].tolist(), predictions)

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# initializing empty lists to store metrics of all the models

BLEU_scores= []
BERT_scores= []
Rouge_1_scores= []
Rouge_2_scores= []
Rouge_L_scores= []
Redundancy_scores= []

In [20]:
print('BLEU Score: ',BLEU)
print('BERT Score: ',BERT)
print('Rouge-1 Score: ',Rouge_1)
print('Rouge-2 Score: ',Rouge_2)
print('Rouge-L Score: ',Rouge_L)
print('Redundancy Score: ',Redundancy)

BLEU Score:  0.1285869547069145
BERT Score:  0.877794086933136
Rouge-1 Score:  0.43767660912738987
Rouge-2 Score:  0.2042084676561586
Rouge-L Score:  0.30111026012099273
Redundancy Score:  0.5657821694237815


In [21]:
BLEU_scores.append(BLEU)
BERT_scores.append(BERT)
Rouge_1_scores.append(Rouge_1)
Rouge_2_scores.append(Rouge_2)
Rouge_L_scores.append(Rouge_L)
Redundancy_scores.append(Redundancy)

### 2. sshleifer/distilbart-cnn-12-6

In [22]:
from transformers import pipeline

pipe = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [23]:
pipe(df['article'][0], max_length= 130, min_length=30, truncation= True)[0]['summary_text']       # setting max output length to 130

' A large gas pipeline exploded into a tower of fire on Friday in Central California, closing both directions of a major highway and injuring at least 15 people, four of them critically . The explosion on a Pacific Gas & Electric Co. pipe carrying natural gas happened at the gun range while an equipment operator and a group of county jail inmates were expanding a road alongside Highway 99 . The flames shot well over 100 feet into the air, witnesses said .'

In [24]:
predictions= []
for i in range(0,len(df)):
    pred= pipe(df['article'][i], max_length=130, min_length=30, truncation= True)[0]['summary_text']
    predictions.append(pred)

In [25]:
BLEU,BERT,Rouge_1,Rouge_2,Rouge_L,Redundancy = calc_metrics(df['highlights'].tolist(), predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
print('BLEU Score: ',BLEU)
print('BERT Score: ',BERT)
print('Rouge-1 Score: ',Rouge_1)
print('Rouge-2 Score: ',Rouge_2)
print('Rouge-L Score: ',Rouge_L)
print('Redundancy Score: ',Redundancy)

BLEU Score:  0.13890381126708942
BERT Score:  0.8815963864326477
Rouge-1 Score:  0.432002219801714
Rouge-2 Score:  0.19704600474371883
Rouge-L Score:  0.29118396511639155
Redundancy Score:  0.6242317822651449


In [27]:
BLEU_scores.append(BLEU)
BERT_scores.append(BERT)
Rouge_1_scores.append(Rouge_1)
Rouge_2_scores.append(Rouge_2)
Rouge_L_scores.append(Rouge_L)
Redundancy_scores.append(Redundancy)

### 3. philschmid/bart-large-cnn-samsum

In [28]:
from transformers import pipeline

pipe = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [29]:
print(pipe(df['article'][0], max_length= 130, min_length=30, truncation= True))  # truncation used to truncate input text while max_length for output text

[{'summary_text': "A large gas pipeline exploded on Friday in Central California, closing both directions of a major highway and injuring at least 15 people, four of them critically. It was not clear what caused the explosion at the Fresno County Sheriff's gun range. It happened while an equipment operator and a group of county jail inmates were expanding a road alongside Highway 99. The driver of the front-loader was a county public works employee who had been working at the shooting range all day working on a tall berm that confines gunfire to the range. The flames shot well over 100 feet into the air. Traffic heading north and south on Highway 99 in Fresno"}]


In [30]:
pipe(df['article'][0], max_length= 130, min_length=30, truncation= True)[0]['summary_text']       # setting max output length to 130

"A large gas pipeline exploded on Friday in Central California, closing both directions of a major highway and injuring at least 15 people, four of them critically. It was not clear what caused the explosion at the Fresno County Sheriff's gun range. It happened while an equipment operator and a group of county jail inmates were expanding a road alongside Highway 99. The driver of the front-loader was a county public works employee who had been working at the shooting range all day working on a tall berm that confines gunfire to the range. The flames shot well over 100 feet into the air. Traffic heading north and south on Highway 99 in Fresno"

In [31]:
predictions= []
for i in range(0,len(df)):
    pred= pipe(df['article'][i], max_length=130, min_length=30, truncation= True)[0]['summary_text']
    predictions.append(pred)

In [32]:
BLEU,BERT,Rouge_1,Rouge_2,Rouge_L,Redundancy = calc_metrics(df['highlights'].tolist(), predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
print('BLEU Score: ',BLEU)
print('BERT Score: ',BERT)
print('Rouge-1 Score: ',Rouge_1)
print('Rouge-2 Score: ',Rouge_2)
print('Rouge-L Score: ',Rouge_L)
print('Redundancy Score: ',Redundancy)

BLEU Score:  0.114001718584896
BERT Score:  0.8751322627067566
Rouge-1 Score:  0.4191393887900409
Rouge-2 Score:  0.18219041751332063
Rouge-L Score:  0.27514953597129865
Redundancy Score:  0.5833051706657655


In [34]:
BLEU_scores.append(BLEU)
BERT_scores.append(BERT)
Rouge_1_scores.append(Rouge_1)
Rouge_2_scores.append(Rouge_2)
Rouge_L_scores.append(Rouge_L)
Redundancy_scores.append(Redundancy)

### 4. google/pegasus-cnn_dailymail

In [35]:
from transformers import pipeline

pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [36]:
print(pipe(df['article'][0], max_length= 130, min_length=30, truncation= True))  # truncation used to truncate input text while max_length for output text

[{'summary_text': "The explosion on a Pacific Gas & Electric Co. pipe happened while an equipment operator and a group of county jail inmates were expanding a road .<n>It was not clear what caused the explosion at the Fresno County Sheriff's gun range that brought traffic in the area to a halt .<n>The flames shot well over 100 feet into the air, witnesses said .<n>PG&E's natural-gas operations have been under scrutiny following a fiery 2010 PG&E pipeline blast that killed eight people in the San Francisco suburb of San Bruno ."}]


In [37]:
pipe(df['article'][0], max_length= 130, min_length=30, truncation= True)[0]['summary_text']       # setting max output length to 130

"The explosion on a Pacific Gas & Electric Co. pipe happened while an equipment operator and a group of county jail inmates were expanding a road .<n>It was not clear what caused the explosion at the Fresno County Sheriff's gun range that brought traffic in the area to a halt .<n>The flames shot well over 100 feet into the air, witnesses said .<n>PG&E's natural-gas operations have been under scrutiny following a fiery 2010 PG&E pipeline blast that killed eight people in the San Francisco suburb of San Bruno ."

In [38]:
predictions= []
for i in range(0,len(df)):
    pred= pipe(df['article'][i], max_length=130, min_length=30, truncation= True)[0]['summary_text']
    predictions.append(pred)

In [39]:
BLEU,BERT,Rouge_1,Rouge_2,Rouge_L,Redundancy = calc_metrics(df['highlights'].tolist(), predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
print('BLEU Score: ',BLEU)
print('BERT Score: ',BERT)
print('Rouge-1 Score: ',Rouge_1)
print('Rouge-2 Score: ',Rouge_2)
print('Rouge-L Score: ',Rouge_L)
print('Redundancy Score: ',Redundancy)

BLEU Score:  0.13457708739583674
BERT Score:  0.8745781183242798
Rouge-1 Score:  0.4276316502842483
Rouge-2 Score:  0.199597059988215
Rouge-L Score:  0.2912140850728996
Redundancy Score:  0.5800445016980911


In [41]:
BLEU_scores.append(BLEU)
BERT_scores.append(BERT)
Rouge_1_scores.append(Rouge_1)
Rouge_2_scores.append(Rouge_2)
Rouge_L_scores.append(Rouge_L)
Redundancy_scores.append(Redundancy)

### 5. knkarthick/MEETING_SUMMARY

In [42]:
from transformers import pipeline

pipe = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")

config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [43]:
print(pipe(df['article'][0], max_length= 130, min_length=30, truncation= True))  # truncation used to truncate input text while max_length for output text

[{'summary_text': 'Eleven people were injured, three critically, when a gas pipeline exploded in Central California, closing a major highway and injuring at least 15 others.'}]


In [44]:
pipe(df['article'][0], max_length= 130, min_length=30, truncation= True)[0]['summary_text']       # setting max output length to 130

'Eleven people were injured, three critically, when a gas pipeline exploded in Central California, closing a major highway and injuring at least 15 others.'

In [45]:
predictions= []
for i in range(0,len(df)):
    pred= pipe(df['article'][i], max_length=130, min_length=30, truncation= True)[0]['summary_text']
    predictions.append(pred)

In [46]:
BLEU,BERT,Rouge_1,Rouge_2,Rouge_L,Redundancy = calc_metrics(df['highlights'].tolist(), predictions)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
print('BLEU Score: ',BLEU)
print('BERT Score: ',BERT)
print('Rouge-1 Score: ',Rouge_1)
print('Rouge-2 Score: ',Rouge_2)
print('Rouge-L Score: ',Rouge_L)
print('Redundancy Score: ',Redundancy)

BLEU Score:  0.06398575282637979
BERT Score:  0.8719260692596436
Rouge-1 Score:  0.35280306328842403
Rouge-2 Score:  0.13373615800088431
Rouge-L Score:  0.2374173758717094
Redundancy Score:  0.5459575227556666


In [48]:
BLEU_scores.append(BLEU)
BERT_scores.append(BERT)
Rouge_1_scores.append(Rouge_1)
Rouge_2_scores.append(Rouge_2)
Rouge_L_scores.append(Rouge_L)
Redundancy_scores.append(Redundancy)

# TOPSIS to find out best model

In [49]:
models= ['facebook/bart-large-cnn','sshleifer/distilbart-cnn-12-6','philschmid/bart-large-cnn-samsum','google/pegasus-cnn_dailymail','knkarthick/MEETING_SUMMARY']

In [50]:
scores= [BLEU_scores,BERT_scores,Rouge_1_scores,Rouge_2_scores,Rouge_L_scores,Redundancy_scores]
for score in scores:
    for i in range(len(score)):
        score[i]= np.round(score[i],3)

In [51]:
df_topsis= pd.DataFrame({
    'Model': models,
    'BLEU': BLEU_scores,
    'BERT': BERT_scores,
    'Rouge-1': Rouge_1_scores,
    'Rouge-2': Rouge_2_scores,
    'Rouge-L': Rouge_L_scores,
    'Redundancy': Redundancy_scores
})

In [52]:
weights= [1,1,1,1,1,1]             # assuming equal weights (you may choose weights according to your priorities)        
impacts= ['+','+','+','+','+','-']

In [53]:
def normalize(matrix):
    norm_matrix = matrix / np.sqrt(np.sum(matrix**2, axis=0))                    # normalize the matrix
    return norm_matrix

def weighted_normalize(norm_matrix, weights):
    weighted_norm_matrix = norm_matrix * weights                                 # calculate the weighted normalized matrix
    return weighted_norm_matrix

def ideal_best_worst(weighted_norm_matrix, impacts):
    ideal_solution = np.max(weighted_norm_matrix, axis=0) * impacts              # calculate the ideal_best and ideal_worst solutions
    ideal_worst_solution = np.min(weighted_norm_matrix, axis=0) * impacts
    return ideal_solution, ideal_worst_solution

def euclidean_distances(weighted_norm_matrix, ideal_solution, ideal_worst_solution):
    dist_to_ideal = np.sqrt(np.sum((weighted_norm_matrix - ideal_solution)**2, axis=1))           # Calculate the Euclidean distances to the ideal_best and ideal_worst solutions.
    dist_to_ideal_worst = np.sqrt(np.sum((weighted_norm_matrix - ideal_worst_solution)**2, axis=1))
    return dist_to_ideal, dist_to_ideal_worst

def performance_score(dist_to_ideal, dist_to_ideal_worst):
    score = dist_to_ideal_worst / (dist_to_ideal + dist_to_ideal_worst)            # calculate the topsis score for each model
    return score

def topsis(matrix, weights, impacts):                                              # perform TOPSIS analysis
    # Step 1: Normalize the decision matrix
    norm_matrix = normalize(matrix)
    
    # Step 2: Calculate the weighted normalized decision matrix
    weighted_norm_matrix = weighted_normalize(norm_matrix, weights)
    
    # Step 3: Determine the ideal_best and ideal_worst solutions
    ideal_solution, ideal_worst_solution = ideal_best_worst(weighted_norm_matrix, impacts)
    
    # Step 4: Calculate the Euclidean distances to the ideal_best and ideal_worst solutions
    dist_to_ideal, dist_to_ideal_worst = euclidean_distances(weighted_norm_matrix, ideal_solution, ideal_worst_solution)
    
    # Step 5: Calculate the performance score for each alternative/model
    score = performance_score(dist_to_ideal, dist_to_ideal_worst)
    
    # Step 6: Rank the alternatives/models based on their performance scores
    sorted_indices = np.argsort(score)[::-1]                                       # Indices of scores sorted in descending order
    rankings = np.empty_like(sorted_indices)                                       # Create an empty array to store rankings
    rankings[sorted_indices] = np.arange(len(score)) + 1                           # Assign ranks
    
    return score, rankings

In [54]:
df_metrics= df_topsis.drop('Model',axis=1)
impacts_as_integers = [1 if impact == '+' else -1 for impact in impacts]

In [55]:
topsis_score, rankings = topsis(df_metrics, weights, impacts_as_integers)

In [56]:
for i in range(len(topsis_score)):
    topsis_score[i] = np.round(topsis_score[i], 3)

In [57]:
df_topsis['TOPSIS Score'] = topsis_score
df_topsis['TOPSIS Rank'] = rankings

In [58]:
df_topsis

Unnamed: 0,Model,BLEU,BERT,Rouge-1,Rouge-2,Rouge-L,Redundancy,TOPSIS Score,TOPSIS Rank
0,facebook/bart-large-cnn,0.129,0.878,0.438,0.204,0.301,0.566,0.5,3
1,sshleifer/distilbart-cnn-12-6,0.139,0.882,0.432,0.197,0.291,0.624,0.501,1
2,philschmid/bart-large-cnn-samsum,0.114,0.875,0.419,0.182,0.275,0.583,0.49,4
3,google/pegasus-cnn_dailymail,0.135,0.875,0.428,0.2,0.291,0.58,0.5,2
4,knkarthick/MEETING_SUMMARY,0.064,0.872,0.353,0.134,0.237,0.546,0.465,5


# Thank You !!