In [2]:
import pandas as pd
import numpy as np
import time
import os
import pickle
import logging
import time
import datetime
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize
import tensorflow as tf
from openai import OpenAI,AzureOpenAI
from summarizer import Summarizer
from datasets import load_dataset, load_from_disk, load_metric
from transformers import pipeline, set_seed,AutoModelForSeq2SeqLM, AutoTokenizer, BertTokenizer,EncoderDecoderModel
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import torch
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!





In [3]:
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO, 
    filename='./logs/Abstractive Text Summary Generation.log')    

In [4]:
logging.info("==========================================================================================================")
logging.info("Abstractive Text Summarization Start ")

In [5]:
NO_OF_TEST_RECORDS = 100
logging.info(f"No of test records - {NO_OF_TEST_RECORDS}")

In [6]:
start_time_ats=time.time()

sample_test_df = pd.read_csv('./input/test_cleaned.csv', nrows=NO_OF_TEST_RECORDS)
sample_test_df.head(2)

Unnamed: 0.1,Unnamed: 0,id,highlights,article
0,0,92c514c913c0bdfe25341af9fd72b29db544099b,Experts question if packed out planes are put...,Ever noticed how plane seats appear to be gett...
1,1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,Drunk teenage boy climbed into lion enclosure ...,A drunk teenage boy had to be rescued by secur...


In [7]:
summaries = {}

In [8]:
duration={}

# Zero Shot Abstractive Summarization

### Baseline Model Bi LSTM based ENC-DEC Model

In [9]:
def roundTS(startTime, endTime):
    return round((endTime -startTime),4)

def avgTimePerRecord(startTime, endTime, no_of_recs):
     return round((endTime -startTime)/no_of_recs ,4)

In [10]:
def generate_abs_baseline(text):
    return "\n".join(sent_tokenize(text)[:4]) # First 4 sentences

In [11]:
# Load the model
enc_model = tf.keras.models.load_model('./models/baseline_abs/encoder_modelBiLSTM.h5', compile=True)
dec_model = tf.keras.models.load_model('./models/baseline_abs/decoder_modelBiLSTM.h5', compile=True)
enc_model.compile()
dec_model.compile()





In [12]:
s_tokenizer = ""
with open('./models/baseline_abs/s_tokenizerBiLSTM.pkl', 'rb') as f:
    s_tokenizer = pickle.load(f)

def generate_summary(input_text):

    # Tokenize the input text
    input_seq = s_tokenizer.texts_to_sequences([input_text])
    input_seq = tf.keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=300, padding='post')

    # Generate the summary
    h, c = enc_model.predict(input_seq,verbose=0)

    next_token = np.zeros((1, 1))
    next_token[0, 0] = s_tokenizer.word_index['sostok']
    output_seq = ''

    stop = False
    count = 0

    while not stop:
        if count > 100:
            break
        decoder_out, state_h, state_c = dec_model.predict([next_token]+[h, c],verbose=0)
        token_idx = np.argmax(decoder_out[0, -1, :])

        if token_idx == s_tokenizer.word_index['eostok']:
            stop = True
        elif token_idx > 0 and token_idx != s_tokenizer.word_index['sostok']:
            token = s_tokenizer.index_word[token_idx]
            output_seq = output_seq + ' ' + token

        next_token = np.zeros((1, 1))
        next_token[0, 0] = token_idx
        h, c = state_h, state_c
        count += 1

    return output_seq.strip()

In [13]:
t_tokenizer = Tokenizer(num_words=77024)
maxlen_text = 300
maxlen_summ = 50

In [14]:
test_data = sample_test_df.drop(['id'], axis=1)
test_data = test_data.reset_index(drop=True)

In [15]:
test_final_story =[]
test_final_summary =[]

for i, sty in enumerate(test_data['article']):
    sty_len = len(sty.split())
    sumy_len = len(test_data['highlights'][i].split())
    if (sty_len > sumy_len):
        test_final_story.append(sty)
        test_final_summary.append(test_data['highlights'][i])
print(len(test_final_story), len(test_final_summary))

df_testfinal=pd.DataFrame({'story':test_final_story,'summary':test_final_summary})
#df_testfinal.to_csv("test863.csv")
df_testfinal.head(2)

100 100


Unnamed: 0,story,summary
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...


In [16]:
test_inputs = [sent for sent in df_testfinal['story']]
test_inputs = t_tokenizer.texts_to_sequences(list(test_inputs))
test_inputs = pad_sequences(test_inputs, maxlen=maxlen_text, padding='post')

In [17]:
logging.info("Generating Baseline Abstractive Summaries...")
st_baseline_ats=time.time()
df_testfinal['baseline-abs'] = df_testfinal['story'].apply(generate_summary)
sample_test_df['baseline-abs'] = df_testfinal['baseline-abs'].tolist()
end_baseline_ats=time.time()
logging.info(f"Baseline ATS Duration - {roundTS(st_baseline_ats, end_baseline_ats)} seconds")  
df_testfinal.head(3)

Unnamed: 0,story,summary,baseline-abs
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,police say they are not to be a new police say...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,police say they are not to be a new york polic...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,the new york s office says the new york is the...


In [18]:
"""
logging.info("Generating Baseline Abstractive Summaries...")
st_baseline_ats=time.time()
sample_test_df['baseline-abs'] = sample_test_df['article'].apply(generate_abs_baseline)
end_baseline_ats=time.time()
logging.info(f"Baseline ATS Duration - {roundTS(st_baseline_ats, end_baseline_ats)} seconds")  
sample_test_df.head()
"""

'\nlogging.info("Generating Baseline Abstractive Summaries...")\nst_baseline_ats=time.time()\nsample_test_df[\'baseline-abs\'] = sample_test_df[\'article\'].apply(generate_abs_baseline)\nend_baseline_ats=time.time()\nlogging.info(f"Baseline ATS Duration - {roundTS(st_baseline_ats, end_baseline_ats)} seconds")  \nsample_test_df.head()\n'

In [19]:
duration['Baseline'] = avgTimePerRecord(st_baseline_ats, end_baseline_ats, NO_OF_TEST_RECORDS)

In [20]:
print('article: \n', sample_test_df.iloc[0]['article'][:512])
print('baseline-ext: \n', sample_test_df.iloc[0]['baseline-abs'])

article: 
 Ever noticed how plane seats appear to be getting smaller and smaller With increasing numbers of people taking to the skies some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable it s putting our health and safety in danger. More than squabbling over the arm rest shrinking space on planes putting our health and safety in danger This week a U.S consumer advisory group set up by the Department of Tr
baseline-ext: 
 police say they are not to be a new police say they are not to be a police say they are not to be a police say they are not to be a police say they are not to be a man


#### BERT Abstractive Summarization

In [21]:
# bert-base-uncased model zero shot summarization

def generate_abs_bert_zs(text):
    pipe = pipeline("summarization", model="bert-base-uncased", max_new_tokens=80)
    pipe_out = pipe(text[:512])
    #print(pipe_out)
    return "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

logging.info("Generating BERT Abstractive Summaries (Before Fine Tuning) ...")    
st_bert_zs_ats=time.time()
sample_test_df['bert-base-abs-zs'] = sample_test_df['article'].apply(generate_abs_bert_zs)
end_bert_zs_ats=time.time()
logging.info(f"BERT ZS ATS Duration - {roundTS(st_bert_zs_ats,end_bert_zs_ats)} seconds")  
sample_test_df.head(3)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseFo

The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
Some weights of the model checkpoint at bert-base-uncased were no

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseFo

The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
Some weights of the model checkpoint at bert-base-uncased were no

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseFo

The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
Some weights of the model checkpoint at bert-base-uncased were no

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseFo

The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
Some weights of the model checkpoint at bert-base-uncased were no

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseFo

The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
Some weights of the model checkpoint at bert-base-uncased were no

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseFo

The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
Some weights of the model checkpoint at bert-base-uncased were no

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseFo

The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
Some weights of the model checkpoint at bert-base-uncased were no

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseFo

The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
Some weights of the model checkpoint at bert-base-uncased were no

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseFo

The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
Some weights of the model checkpoint at bert-base-uncased were no

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The model 'BertForMaskedLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseFo

Unnamed: 0.1,Unnamed: 0,id,highlights,article,baseline-abs,bert-base-abs-zs
0,0,92c514c913c0bdfe25341af9fd72b29db544099b,Experts question if packed out planes are put...,Ever noticed how plane seats appear to be gett...,police say they are not to be a new police say...,ever noticed how plane seats appear to be gett...
1,1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,Drunk teenage boy climbed into lion enclosure ...,A drunk teenage boy had to be rescued by secur...,police say they are not to be a new york polic...,a drunk teenage boy had to be rescued by secur...
2,2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Nottingham Forest are close to extending Dougi...,Dougie Freedman is on the verge of agreeing a ...,the new york s office says the new york is the...,dougie freedman is on the verge of agreeing a ...


In [22]:
duration['BERT'] = avgTimePerRecord(st_bert_zs_ats, end_bert_zs_ats, NO_OF_TEST_RECORDS)

In [23]:
print('article: \n', sample_test_df.iloc[0]['article'])
print('bert-base-abs-zs: \n', sample_test_df.iloc[0]['bert-base-abs-zs'])

article: 
 Ever noticed how plane seats appear to be getting smaller and smaller With increasing numbers of people taking to the skies some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable it s putting our health and safety in danger. More than squabbling over the arm rest shrinking space on planes putting our health and safety in danger This week a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes it does not stipulate a minimum amount of space for humans. In a world where animals have more rights to space and food than humans said Charlie Leocha consumer representative on the committee. It is time that the DOT and FAA take a stand for humane treatment of passengers. But could crowding on planes lead to more serious issues than fighting for space in

#### T5 Abstractive Summarization

In [24]:
def generate_abs_t5_zs(text):
    pipe = pipeline('summarization', model = 't5-small' )
    pipe_out = pipe(text)
    return 'n'.join(sent_tokenize(pipe_out[0]['summary_text']))

In [25]:
logging.info("Generating T5 Abstrctive Summaries ...")    
st_t5_zs_ats=time.time()
sample_test_df['t5-small-abs-zs'] = sample_test_df['article'].apply(generate_abs_t5_zs)
end_t5_zs_ats=time.time()
logging.info(f"T5 ZS ATS Duration - {roundTS(st_t5_zs_ats, end_t5_zs_ats) } seconds")    
sample_test_df.head(3)

Your max_length is set to 200, but your input_length is only 162. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=81)
Token indices sequence length is longer than the specified maximum sequence length for this model (970 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (569 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1120 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1125 > 512). Running this sequence through the model will result in indexing errors
Token indices sequen

Token indices sequence length is longer than the specified maximum sequence length for this model (1750 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (835 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2034 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (990 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

Unnamed: 0.1,Unnamed: 0,id,highlights,article,baseline-abs,bert-base-abs-zs,t5-small-abs-zs
0,0,92c514c913c0bdfe25341af9fd72b29db544099b,Experts question if packed out planes are put...,Ever noticed how plane seats appear to be gett...,police say they are not to be a new police say...,ever noticed how plane seats appear to be gett...,some experts are questioning if shrinking spac...
1,1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,Drunk teenage boy climbed into lion enclosure ...,A drunk teenage boy had to be rescued by secur...,police say they are not to be a new york polic...,a drunk teenage boy had to be rescued by secur...,Rahul Kumar 17 climbed into the enclosure fenc...
2,2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Nottingham Forest are close to extending Dougi...,Dougie Freedman is on the verge of agreeing a ...,the new york s office says the new york is the...,dougie freedman is on the verge of agreeing a ...,Dougie Freedman is set to sign a new two year ...


In [26]:
duration['T5'] = avgTimePerRecord(st_t5_zs_ats, end_t5_zs_ats, NO_OF_TEST_RECORDS)

In [27]:
print('article: \n', sample_test_df.iloc[0]['article'])
print('t5-small-abs-zs: \n', sample_test_df.iloc[0]['t5-small-abs-zs'])

article: 
 Ever noticed how plane seats appear to be getting smaller and smaller With increasing numbers of people taking to the skies some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable it s putting our health and safety in danger. More than squabbling over the arm rest shrinking space on planes putting our health and safety in danger This week a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes it does not stipulate a minimum amount of space for humans. In a world where animals have more rights to space and food than humans said Charlie Leocha consumer representative on the committee. It is time that the DOT and FAA take a stand for humane treatment of passengers. But could crowding on planes lead to more serious issues than fighting for space in

#### GPT 3.5 Abstractive Summarization

In [28]:
client = OpenAI(
    # This is` the default and can be omitted
    api_key='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',
)
AZURE_OPENAI_KEY = "ee16f3418f0740ce8a1a21e262a839a3"
AZURE_OPENAI_ENDPOINT = "https://tmap-openai.openai.azure.com"

In [29]:
client = AzureOpenAI(
  azure_endpoint = "https://tmap-openai.openai.azure.com/", 
  api_key="ee16f3418f0740ce8a1a21e262a839a3",  
  api_version="2023-09-15-preview")

In [30]:
def generate_abs_gpt_zs(record, model, temp, max_tokens):
    summary = ""
    
    abs_pmt = f"""
                Your task is to create a concise, factual summary, 
                
                by selecting and combining key sentences from  the original text. 
                
                Text is delimited by triple backticks. 

                TEXT: ```{record['article']}```
            """
    try:
        response = client.chat.completions.create(
        model=model, 
        messages=[
            {"role": "system", "content": "You are a LLM trained by OpenAI."},
            {"role": "user", "content": abs_pmt}

        ],
        max_tokens = max_tokens,
        temperature = temp,
        n=1,        
        )
        summary = response.choices[0].message.content
    except Exception as e:
        logging.info(f"Exception occured - {e}")
        summary = "ERROR"
    
    return summary

In [31]:
logging.info("Generating GPT3.5 Abstractive Summaries (Before Prompt Tuning) ...") 
st_gpt35_zs_ats=time.time()
sample_test_df['gpt-3.5-abs-zs'] = sample_test_df.apply(lambda rec: generate_abs_gpt_zs(rec, 'gpt35', 0, 60), axis=1)
end_gpt35_zs_ats=time.time()
logging.info(f"GPT35 ZS ETS Duration - {roundTS(st_gpt35_zs_ats, end_gpt35_zs_ats)} seconds")    

In [32]:
sample_test_df.head(3)

Unnamed: 0.1,Unnamed: 0,id,highlights,article,baseline-abs,bert-base-abs-zs,t5-small-abs-zs,gpt-3.5-abs-zs
0,0,92c514c913c0bdfe25341af9fd72b29db544099b,Experts question if packed out planes are put...,Ever noticed how plane seats appear to be gett...,police say they are not to be a new police say...,ever noticed how plane seats appear to be gett...,some experts are questioning if shrinking spac...,Experts are concerned that the shrinking space...
1,1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,Drunk teenage boy climbed into lion enclosure ...,A drunk teenage boy had to be rescued by secur...,police say they are not to be a new york polic...,a drunk teenage boy had to be rescued by secur...,Rahul Kumar 17 climbed into the enclosure fenc...,
2,2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Nottingham Forest are close to extending Dougi...,Dougie Freedman is on the verge of agreeing a ...,the new york s office says the new york is the...,dougie freedman is on the verge of agreeing a ...,Dougie Freedman is set to sign a new two year ...,Dougie Freedman is close to signing a new two-...


In [33]:
duration['GPT35'] = avgTimePerRecord(st_gpt35_zs_ats, end_gpt35_zs_ats, NO_OF_TEST_RECORDS)

In [34]:
print('article: \n', sample_test_df.iloc[0]['article'][:200])
print('gpt-3.5-abs-zs: \n', sample_test_df.iloc[0]['gpt-3.5-abs-zs'])

article: 
 Ever noticed how plane seats appear to be getting smaller and smaller With increasing numbers of people taking to the skies some experts are questioning if having such packed out planes is putting pas
gpt-3.5-abs-zs: 
 Experts are concerned that the shrinking space on aeroplanes is not only uncomfortable but also putting passengers' health and safety in danger. A US consumer advisory group has said that the government does not stipulate a minimum amount of space for humans on planes, despite setting standards for animals. Tests conducted by the


#### GPT 4 Abstractive Summarization

In [35]:

client = AzureOpenAI(
  azure_endpoint = "https://tmap-openai.openai.azure.com/", 
  api_key="ee16f3418f0740ce8a1a21e262a839a3",  
  api_version="2023-09-15-preview")

In [36]:
logging.info("Generating GPT4 Abstractive Summaries (Before Prompt Tuning) ...") 
st_gpt4_zs_ats=time.time()
#sample_test_df['gpt-4-abs-zs'] = sample_test_df['article'].apply(generate_abs_gpt_4_zs)
sample_test_df['gpt-4-abs-zs'] = sample_test_df.apply(lambda rec: generate_abs_gpt_zs(rec, 'gpt4', 0, 60), axis=1)
end_gpt4_zs_ats=time.time()
logging.info(f"GPT4 ZS ETS Duration - {roundTS(st_gpt4_zs_ats, end_gpt4_zs_ats)} seconds")    
sample_test_df.head(3)

Unnamed: 0.1,Unnamed: 0,id,highlights,article,baseline-abs,bert-base-abs-zs,t5-small-abs-zs,gpt-3.5-abs-zs,gpt-4-abs-zs
0,0,92c514c913c0bdfe25341af9fd72b29db544099b,Experts question if packed out planes are put...,Ever noticed how plane seats appear to be gett...,police say they are not to be a new police say...,ever noticed how plane seats appear to be gett...,some experts are questioning if shrinking spac...,Experts are concerned that the shrinking space...,Experts are raising concerns that the decreasi...
1,1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,Drunk teenage boy climbed into lion enclosure ...,A drunk teenage boy had to be rescued by secur...,police say they are not to be a new york polic...,a drunk teenage boy had to be rescued by secur...,Rahul Kumar 17 climbed into the enclosure fenc...,,"Rahul Kumar, a 17-year-old intoxicated teenage..."
2,2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Nottingham Forest are close to extending Dougi...,Dougie Freedman is on the verge of agreeing a ...,the new york s office says the new york is the...,dougie freedman is on the verge of agreeing a ...,Dougie Freedman is set to sign a new two year ...,Dougie Freedman is close to signing a new two-...,Dougie Freedman is close to signing a new two-...


In [37]:
duration['GPT4'] = avgTimePerRecord(st_gpt4_zs_ats, end_gpt4_zs_ats, NO_OF_TEST_RECORDS)

In [38]:
print('article: \n', sample_test_df.iloc[0]['article'])
print('gpt-4-abs-zs: \n', sample_test_df.iloc[0]['gpt-4-abs-zs'])

article: 
 Ever noticed how plane seats appear to be getting smaller and smaller With increasing numbers of people taking to the skies some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable it s putting our health and safety in danger. More than squabbling over the arm rest shrinking space on planes putting our health and safety in danger This week a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes it does not stipulate a minimum amount of space for humans. In a world where animals have more rights to space and food than humans said Charlie Leocha consumer representative on the committee. It is time that the DOT and FAA take a stand for humane treatment of passengers. But could crowding on planes lead to more serious issues than fighting for space in

In [39]:
sample_test_df.head(3)

Unnamed: 0.1,Unnamed: 0,id,highlights,article,baseline-abs,bert-base-abs-zs,t5-small-abs-zs,gpt-3.5-abs-zs,gpt-4-abs-zs
0,0,92c514c913c0bdfe25341af9fd72b29db544099b,Experts question if packed out planes are put...,Ever noticed how plane seats appear to be gett...,police say they are not to be a new police say...,ever noticed how plane seats appear to be gett...,some experts are questioning if shrinking spac...,Experts are concerned that the shrinking space...,Experts are raising concerns that the decreasi...
1,1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,Drunk teenage boy climbed into lion enclosure ...,A drunk teenage boy had to be rescued by secur...,police say they are not to be a new york polic...,a drunk teenage boy had to be rescued by secur...,Rahul Kumar 17 climbed into the enclosure fenc...,,"Rahul Kumar, a 17-year-old intoxicated teenage..."
2,2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Nottingham Forest are close to extending Dougi...,Dougie Freedman is on the verge of agreeing a ...,the new york s office says the new york is the...,dougie freedman is on the verge of agreeing a ...,Dougie Freedman is set to sign a new two year ...,Dougie Freedman is close to signing a new two-...,Dougie Freedman is close to signing a new two-...


# Fine Tuned - Abstractive Text Summarization

###### Prerequisite - Fine Tuned BERT, T5 Pretrained Models 

#### BERT Abstractive Summarization

In [40]:
sample_test_df

Unnamed: 0.1,Unnamed: 0,id,highlights,article,baseline-abs,bert-base-abs-zs,t5-small-abs-zs,gpt-3.5-abs-zs,gpt-4-abs-zs
0,0,92c514c913c0bdfe25341af9fd72b29db544099b,Experts question if packed out planes are put...,Ever noticed how plane seats appear to be gett...,police say they are not to be a new police say...,ever noticed how plane seats appear to be gett...,some experts are questioning if shrinking spac...,Experts are concerned that the shrinking space...,Experts are raising concerns that the decreasi...
1,1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,Drunk teenage boy climbed into lion enclosure ...,A drunk teenage boy had to be rescued by secur...,police say they are not to be a new york polic...,a drunk teenage boy had to be rescued by secur...,Rahul Kumar 17 climbed into the enclosure fenc...,,"Rahul Kumar, a 17-year-old intoxicated teenage..."
2,2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Nottingham Forest are close to extending Dougi...,Dougie Freedman is on the verge of agreeing a ...,the new york s office says the new york is the...,dougie freedman is on the verge of agreeing a ...,Dougie Freedman is set to sign a new two year ...,Dougie Freedman is close to signing a new two-...,Dougie Freedman is close to signing a new two-...
3,3,caabf9cbdf96eb1410295a673e953d304391bfbb,Fiorentina goalkeeper Neto has been linked wit...,Liverpool target Neto is also wanted by PSG an...,the man is a new york city of the city of the ...,liverpool target neto is also wanted by psg an...,neto is wanted by a number of top european clu...,Liverpool faces competition from PSG and Spani...,Liverpool is interested in signing Fiorentina ...
4,4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,"Tell-all interview with the reality TV star, 6...",Bruce Jenner will break his silence in a two h...,the woman s mother says the girl s mother says...,bruce jenner will break his silence in a two h...,the former Olympian will speak in a two hour i...,"Bruce Jenner, the former Olympian and reality ...",Bruce Jenner will discuss his life in a two-ho...
...,...,...,...,...,...,...,...,...,...
95,95,64ee7c9eb9f1efbb7da0ce80498434c623615b84,Zlatan Ibrahimovic will line up against former...,As Zlatan Ibrahimovic famously believes the Wo...,the man says the woman is a woman to be a new ...,as zlatan ibrahimovic famously believes the wo...,Zlatan Ibrahimovic will take centre stage agai...,"Zlatan Ibrahimovic, the PSG striker, is set to...","Zlatan Ibrahimovic, known for his confidence a..."
96,96,5cf4682cd03238d5867027ce9492b626cd1ed011,"Jameela Jamil, 29, is convinced dental work tr...",Jameela spent GBP3 000 on having all her amalg...,police say they are not to be a new york polic...,jameela spent gbp3 000 on having all her amalg...,dental amalgam has been used for more than 150...,"Jameela Jamil, a television presenter and form...","Jameela Jamil, a 29-year-old television presen..."
97,97,3815d19af18ff22be6ad6095722d7367bb7271af,"Christopher Bridger, 25, attacked three women ...",A paramedic who pretended he was gay to get cl...,the man s body was found in the hospital in th...,a paramedic who pretended he was gay to get cl...,"Christopher Bridger 25, from Stevenage Hertfor...","Christopher Bridger, a paramedic from Stevenag...",
98,98,fb207604ffa7e8371c622840445825db8993d4d2,Paris Saint-Germain captain Thiago Silva suffe...,Paris Saint Germain face Nice on Saturday hopi...,police say they are not to be a new york city ...,paris saint germain face nice on saturday hopi...,Thiago Silva is recovering at home from a thig...,Paris Saint Germain will face Nice on Saturday...,"Paris Saint Germain, missing key players inclu..."


In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [42]:
bert_tokenizer = BertTokenizer.from_pretrained("./models/bert-base-cnn-finetuned")
bert_model_ft = EncoderDecoderModel.from_pretrained("./models/bert-base-cnn-finetuned")
bert_model_ft.to(device)


def generate_abs_bert_ft(text):
    # cut off at BERT max length 512
    inputs = bert_tokenizer([text], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    output = bert_model_ft.generate(input_ids, attention_mask=attention_mask)
    return bert_tokenizer.decode(output[0], skip_special_tokens=True)

logging.info("Generating BERT Abstractive Summaries (After Fine Tuning) ...")
st_bert_ft_ats=time.time()                   
sample_test_df['bert-base-abs-ft'] = sample_test_df['article'].apply(generate_abs_bert_ft)
end_bert_ft_ats=time.time()
logging.info(f"BERT FT ATS Duration - {roundTS(st_bert_ft_ats, end_bert_ft_ats)} seconds")  
sample_test_df.head(3)

Unnamed: 0.1,Unnamed: 0,id,highlights,article,baseline-abs,bert-base-abs-zs,t5-small-abs-zs,gpt-3.5-abs-zs,gpt-4-abs-zs,bert-base-abs-ft
0,0,92c514c913c0bdfe25341af9fd72b29db544099b,Experts question if packed out planes are put...,Ever noticed how plane seats appear to be gett...,police say they are not to be a new police say...,ever noticed how plane seats appear to be gett...,some experts are questioning if shrinking spac...,Experts are concerned that the shrinking space...,Experts are raising concerns that the decreasi...,the dot and faa are happy to set standards for...
1,1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,Drunk teenage boy climbed into lion enclosure ...,A drunk teenage boy had to be rescued by secur...,police say they are not to be a new york polic...,a drunk teenage boy had to be rescued by secur...,Rahul Kumar 17 climbed into the enclosure fenc...,,"Rahul Kumar, a 17-year-old intoxicated teenage...",rahul kumar 17 climbed into the enclosure fenc...
2,2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Nottingham Forest are close to extending Dougi...,Dougie Freedman is on the verge of agreeing a ...,the new york s office says the new york is the...,dougie freedman is on the verge of agreeing a ...,Dougie Freedman is set to sign a new two year ...,Dougie Freedman is close to signing a new two-...,Dougie Freedman is close to signing a new two-...,dougie freedman is set to sign a new two year ...


In [43]:
duration['BERT-FT'] = avgTimePerRecord(st_bert_ft_ats, end_bert_ft_ats,NO_OF_TEST_RECORDS)

#### T5 Abstractive Summarization

In [44]:
#Invoking HuggingFace custom finetuned T5 model from local folder
from transformers import pipeline
summarizer = pipeline("summarization", model="./models/t5-small-cnn-hf-finetuned")
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

def generate_abs_t5_ft(text):
    response = summarizer(text)
    return response[0]['summary_text']


In [45]:
logging.info(f"T5 Model Fine Tuned Arch: \n{summarizer.model}")

In [46]:
logging.info("Generating T5 Abstract Summaries (After Prompt Tuning) ...") 
st_t5_ft_ats=time.time()
sample_test_df['t5-small-abs-ft'] = sample_test_df['article'].apply(generate_abs_t5_ft)
end_t5_ft_ats=time.time()
logging.info(f"GPT35 FT ETS Duration - {roundTS(st_t5_ft_ats, end_t5_ft_ats)} seconds")
sample_test_df.head(3)

Your max_length is set to 200, but your input_length is only 162. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=81)
Token indices sequence length is longer than the specified maximum sequence length for this model (970 > 512). Running this sequence through the model will result in indexing errors
Your max_length is set to 200, but your input_length is only 188. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=94)


Unnamed: 0.1,Unnamed: 0,id,highlights,article,baseline-abs,bert-base-abs-zs,t5-small-abs-zs,gpt-3.5-abs-zs,gpt-4-abs-zs,bert-base-abs-ft,t5-small-abs-ft
0,0,92c514c913c0bdfe25341af9fd72b29db544099b,Experts question if packed out planes are put...,Ever noticed how plane seats appear to be gett...,police say they are not to be a new police say...,ever noticed how plane seats appear to be gett...,some experts are questioning if shrinking spac...,Experts are concerned that the shrinking space...,Experts are raising concerns that the decreasi...,the dot and faa are happy to set standards for...,A U.S consumer advisory group set up by the De...
1,1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,Drunk teenage boy climbed into lion enclosure ...,A drunk teenage boy had to be rescued by secur...,police say they are not to be a new york polic...,a drunk teenage boy had to be rescued by secur...,Rahul Kumar 17 climbed into the enclosure fenc...,,"Rahul Kumar, a 17-year-old intoxicated teenage...",rahul kumar 17 climbed into the enclosure fenc...,Rahul Kumar 17 climbed into a lions enclosure ...
2,2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Nottingham Forest are close to extending Dougi...,Dougie Freedman is on the verge of agreeing a ...,the new york s office says the new york is the...,dougie freedman is on the verge of agreeing a ...,Dougie Freedman is set to sign a new two year ...,Dougie Freedman is close to signing a new two-...,Dougie Freedman is close to signing a new two-...,dougie freedman is set to sign a new two year ...,Dougie Freedman is set to sign a new two year ...


In [47]:
duration['T5-FT'] = avgTimePerRecord(st_t5_ft_ats, end_t5_ft_ats,NO_OF_TEST_RECORDS)

In [48]:
sample_test_df.iloc[0]['t5-small-abs-ft']

'A U.S consumer advisory group set up by the Department of Transportation said that while the government is happy to set standards for animals flying on planes it does not stipulate a minimum amount of space for humans . It is time that the DOT and FAA take a stand for humane treatment of passengers . But these tests are conducted using planes with a 31 inch pitch a standard which on some airlines has decreased .'

#### GPT 3.5 Abstractive Summarization

In [49]:
# Computation of Length of article and hughlights
def article_len(row):
    return len(row['article'].split())

def highlights_len(row):
    return len(row['highlights'].split())


sample_test_df['article_len'] = sample_test_df.apply(lambda r: article_len(r), axis= 1)
sample_test_df['highlights_len'] = sample_test_df.apply(lambda r: highlights_len(r), axis= 1)

In [50]:
sorted_by_article_size_df = sample_test_df.sort_values('article_len')

In [51]:
def generate_GPT_Abs_1_Shot(record, model, temp, max_tokens):
    summary = ""
    abs_pmt = f"""
                Your task is to create a concise, factual summary, 
                
                by selecting and combining key sentences from  the original text. 
                
                Text is delimited by triple backticks. 

                TEXT: ```{record['article']}```
            """
    
    #create a concise summary by selecting and combining key sentences from the original text : {text}"
    try:
        response = client.chat.completions.create(
        model=model, # model = "deployment_name".
        messages=[
            {"role": "system", "content": "You are a LLM trained by OpenAI."},
            {"role":"user","content":sorted_by_article_size_df.iloc[0]['article']},
            {"role":"assistant","content":sorted_by_article_size_df.iloc[0]['highlights']},
            {"role": "user", "content": abs_pmt},
            ],
        max_tokens = max_tokens,
        temperature = temp, #top_p=.9
        n=1,
        )
        summary = response.choices[0].message.content
    except Exception as e:
        logging.info("Exception occured - {e}") 
        summary = "ERROR"
    
    return summary

def generate_GPT_Abs_3_Shots(record, model, temp, max_tokens):
    summary = ""
    
    abs_pmt = f"""
                Your task is to create a concise, factual summary, 
                
                by selecting and combining key sentences from  the original text. 
                
                Text is delimited by triple backticks. 

                TEXT: ```{record['article']}```
            """
    try:
        response = client.chat.completions.create(
        model=model, 
        messages=[
            {"role": "system", "content": "You are a LLM trained by OpenAI."},

            {"role":"user","content":sorted_by_article_size_df.iloc[0]['article']},
            {"role":"assistant","content":sorted_by_article_size_df.iloc[0]['highlights']},
            
            {"role":"user","content":sorted_by_article_size_df.iloc[1]['article']},
            {"role":"assistant","content":sorted_by_article_size_df.iloc[1]['highlights']},

            {"role":"user","content":sorted_by_article_size_df.iloc[2]['article']},
            {"role":"assistant","content":sorted_by_article_size_df.iloc[2]['highlights']},
            
            {"role": "user", "content": abs_pmt},
            
        ],
        max_tokens = max_tokens,
        temperature = temp,
        n=1,
        )
        summary = response.choices[0].message.content
    except Exception as e:
        logging.info("Exception occured - {e}") 
        summary = "ERROR"
    
    return summary

def generate_GPT_Abs_5_Shots(record, model, temp, max_tokens):
    summary = ""
       
    abs_pmt = f"""
                Your task is to create a concise, factual summary, 

                by selecting and combining key sentences from  the original text. 

                Text is delimited by triple backticks. 

                TEXT: ```{record['article']}```
            """
    try:
        response = client.chat.completions.create(
            model= model, # model = "deployment_name".
            messages=[
                {"role": "system", "content": "You are a LLM trained by OpenAI."},

                {"role":"user","content":sorted_by_article_size_df.iloc[0]['article']},
                {"role":"assistant","content":sorted_by_article_size_df.iloc[0]['highlights']},

                {"role":"user","content":sorted_by_article_size_df.iloc[1]['article']},
                {"role":"assistant","content":sorted_by_article_size_df.iloc[1]['highlights']},

                {"role":"user","content":sorted_by_article_size_df.iloc[2]['article']},
                {"role":"assistant","content":sorted_by_article_size_df.iloc[2]['highlights']},

                {"role":"user","content":sorted_by_article_size_df.iloc[3]['article']},
                {"role":"assistant","content":sorted_by_article_size_df.iloc[3]['highlights']},

                {"role":"user","content":sorted_by_article_size_df.iloc[4]['article']},
                {"role":"assistant","content":sorted_by_article_size_df.iloc[4]['highlights']},

                {"role": "user", "content": abs_pmt},

            ],
            max_tokens = max_tokens,
            temperature = temp, #top_p=.9
            n=1,
            )
        summary = response.choices[0].message.content
    except Exception as e:
        logging.info("Exception occured - {e}") 
        summary = "ERROR"
  
    return summary

In [52]:
logging.info("Generating GPT35 Extractive Summaries (After Prompt Tuning) ...") 
st_gpt35_ft_ats=time.time()
sample_test_df['gpt-3.5-abs-ft'] = sample_test_df.apply(lambda rec: generate_GPT_Abs_1_Shot(rec, 'gpt35', 0, 60), axis=1)
end_gpt35_ft_ats=time.time()
logging.info(f"GPT35 FT ATS Duration - {roundTS(st_gpt35_ft_ats, end_gpt35_ft_ats)} seconds")   
sample_test_df.tail(3)

Unnamed: 0.1,Unnamed: 0,id,highlights,article,baseline-abs,bert-base-abs-zs,t5-small-abs-zs,gpt-3.5-abs-zs,gpt-4-abs-zs,bert-base-abs-ft,t5-small-abs-ft,article_len,highlights_len,gpt-3.5-abs-ft
97,97,3815d19af18ff22be6ad6095722d7367bb7271af,"Christopher Bridger, 25, attacked three women ...",A paramedic who pretended he was gay to get cl...,the man s body was found in the hospital in th...,a paramedic who pretended he was gay to get cl...,"Christopher Bridger 25, from Stevenage Hertfor...","Christopher Bridger, a paramedic from Stevenag...",,christopher bridger 25 attacked three women af...,Christopher Bridger 25 from Stevenage Hertford...,738,60,"Christopher Bridger, a paramedic from Stevenag..."
98,98,fb207604ffa7e8371c622840445825db8993d4d2,Paris Saint-Germain captain Thiago Silva suffe...,Paris Saint Germain face Nice on Saturday hopi...,police say they are not to be a new york city ...,paris saint germain face nice on saturday hopi...,Thiago Silva is recovering at home from a thig...,Paris Saint Germain will face Nice on Saturday...,"Paris Saint Germain, missing key players inclu...",thiago silva is recuperating at home from a th...,Thiago Silva is recuperating at home from a th...,565,40,Paris Saint Germain will face Nice on Saturday...
99,99,d25d52c434a13c1df5faa593e8a097d2f501a2b6,.50-caliber bullets equipped with optical sens...,CNN You know the phrase dodging a bullet Forge...,the man was arrested in the scene of the incid...,cnn you know the phrase dodging a bullet forge...,the smart bullets .50 caliber projectiles pass...,The US military has made significant progress ...,The U.S. military has made significant advance...,the u. s. military has made great progress in ...,In February the smart bullets .50 caliber proj...,331,39,The US military has made progress in developin...


In [53]:
duration['GPT35-FT'] = avgTimePerRecord(st_gpt35_ft_ats, end_gpt35_ft_ats,NO_OF_TEST_RECORDS)

#### GPT 4 Abstractive Summarization

In [54]:
logging.info("Generating GPT4 Extractive Summaries (After Prompt Tuning) ...") 
st_gpt4_ft_ats=time.time()
sample_test_df['gpt-4-abs-ft'] = sample_test_df.apply(lambda rec: generate_GPT_Abs_1_Shot(rec, 'gpt4', 1, 60), axis=1)
end_gpt4_ft_ats=time.time()
logging.info(f"GPT4 FT ETS Duration - {roundTS(st_gpt4_ft_ats, end_gpt4_ft_ats)} seconds")   
sample_test_df.tail(3)

Unnamed: 0.1,Unnamed: 0,id,highlights,article,baseline-abs,bert-base-abs-zs,t5-small-abs-zs,gpt-3.5-abs-zs,gpt-4-abs-zs,bert-base-abs-ft,t5-small-abs-ft,article_len,highlights_len,gpt-3.5-abs-ft,gpt-4-abs-ft
97,97,3815d19af18ff22be6ad6095722d7367bb7271af,"Christopher Bridger, 25, attacked three women ...",A paramedic who pretended he was gay to get cl...,the man s body was found in the hospital in th...,a paramedic who pretended he was gay to get cl...,"Christopher Bridger 25, from Stevenage Hertfor...","Christopher Bridger, a paramedic from Stevenag...",,christopher bridger 25 attacked three women af...,Christopher Bridger 25 from Stevenage Hertford...,738,60,"Christopher Bridger, a paramedic from Stevenag...",
98,98,fb207604ffa7e8371c622840445825db8993d4d2,Paris Saint-Germain captain Thiago Silva suffe...,Paris Saint Germain face Nice on Saturday hopi...,police say they are not to be a new york city ...,paris saint germain face nice on saturday hopi...,Thiago Silva is recovering at home from a thig...,Paris Saint Germain will face Nice on Saturday...,"Paris Saint Germain, missing key players inclu...",thiago silva is recuperating at home from a th...,Thiago Silva is recuperating at home from a th...,565,40,Paris Saint Germain will face Nice on Saturday...,"Paris Saint Germain, missing key players inclu..."
99,99,d25d52c434a13c1df5faa593e8a097d2f501a2b6,.50-caliber bullets equipped with optical sens...,CNN You know the phrase dodging a bullet Forge...,the man was arrested in the scene of the incid...,cnn you know the phrase dodging a bullet forge...,the smart bullets .50 caliber projectiles pass...,The US military has made significant progress ...,The U.S. military has made significant advance...,the u. s. military has made great progress in ...,In February the smart bullets .50 caliber proj...,331,39,The US military has made progress in developin...,The U.S. military has made significant advance...


In [55]:
duration['GPT4-FT'] = avgTimePerRecord(st_gpt4_ft_ats, end_gpt4_ft_ats,NO_OF_TEST_RECORDS)

#### Saving the generated summaries for evaluation

In [56]:
end_time_ats=time.time()
logging.info(f"Total ATS Duration - {roundTS(start_time_ats, end_time_ats)} seconds")

In [57]:
duration

{'Baseline': 1.6643,
 'BERT': 9.1136,
 'T5': 2.3688,
 'GPT35': 1.2674,
 'GPT4': 6.3076,
 'BERT-FT': 0.471,
 'T5-FT': 1.8913,
 'GPT35-FT': 1.1947,
 'GPT4-FT': 5.7775}

In [58]:
duration_df = pd.DataFrame(duration.items(), columns=['models', 'avg_inf_time'])
duration_df

Unnamed: 0,models,avg_inf_time
0,Baseline,1.6643
1,BERT,9.1136
2,T5,2.3688
3,GPT35,1.2674
4,GPT4,6.3076
5,BERT-FT,0.471
6,T5-FT,1.8913
7,GPT35-FT,1.1947
8,GPT4-FT,5.7775


In [59]:
logging.info(f"MAP - ATS Average Inference Time Per Request - {duration}")

In [60]:
duration_file_path = './output/abs-ts-duration-final.csv'
logging.info(f"Duration will be saved in this file : {duration_file_path}")
duration_df.to_csv(duration_file_path,  mode="w+")

In [61]:
file_path = './output/abs-ts-final.csv'
print("Summaries will be saved in this file : ", file_path)

Summaries will be saved in this file :  ./output/abs-ts-final.csv


In [62]:
logging.info("Abstractive Text Summarization End ")

In [63]:
sample_test_df.head(3)

Unnamed: 0.1,Unnamed: 0,id,highlights,article,baseline-abs,bert-base-abs-zs,t5-small-abs-zs,gpt-3.5-abs-zs,gpt-4-abs-zs,bert-base-abs-ft,t5-small-abs-ft,article_len,highlights_len,gpt-3.5-abs-ft,gpt-4-abs-ft
0,0,92c514c913c0bdfe25341af9fd72b29db544099b,Experts question if packed out planes are put...,Ever noticed how plane seats appear to be gett...,police say they are not to be a new police say...,ever noticed how plane seats appear to be gett...,some experts are questioning if shrinking spac...,Experts are concerned that the shrinking space...,Experts are raising concerns that the decreasi...,the dot and faa are happy to set standards for...,A U.S consumer advisory group set up by the De...,374,36,Experts are concerned that the shrinking space...,Experts are warning that the decreasing space ...
1,1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,Drunk teenage boy climbed into lion enclosure ...,A drunk teenage boy had to be rescued by secur...,police say they are not to be a new york polic...,a drunk teenage boy had to be rescued by secur...,Rahul Kumar 17 climbed into the enclosure fenc...,,"Rahul Kumar, a 17-year-old intoxicated teenage...",rahul kumar 17 climbed into the enclosure fenc...,Rahul Kumar 17 climbed into a lions enclosure ...,317,38,,"A drunk teenager, Rahul Kumar, 17, jumped into..."
2,2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Nottingham Forest are close to extending Dougi...,Dougie Freedman is on the verge of agreeing a ...,the new york s office says the new york is the...,dougie freedman is on the verge of agreeing a ...,Dougie Freedman is set to sign a new two year ...,Dougie Freedman is close to signing a new two-...,Dougie Freedman is close to signing a new two-...,dougie freedman is set to sign a new two year ...,Dougie Freedman is set to sign a new two year ...,114,35,"Nottingham Forest's manager, Dougie Freedman, ...",Dougie Freedman is close to signing a new two-...


In [64]:
sample_test_df.to_csv(file_path,  mode="w+")

In [65]:
read_content = pd.read_csv(file_path)
read_content.head(3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,highlights,article,baseline-abs,bert-base-abs-zs,t5-small-abs-zs,gpt-3.5-abs-zs,gpt-4-abs-zs,bert-base-abs-ft,t5-small-abs-ft,article_len,highlights_len,gpt-3.5-abs-ft,gpt-4-abs-ft
0,0,0,92c514c913c0bdfe25341af9fd72b29db544099b,Experts question if packed out planes are put...,Ever noticed how plane seats appear to be gett...,police say they are not to be a new police say...,ever noticed how plane seats appear to be gett...,some experts are questioning if shrinking spac...,Experts are concerned that the shrinking space...,Experts are raising concerns that the decreasi...,the dot and faa are happy to set standards for...,A U.S consumer advisory group set up by the De...,374,36,Experts are concerned that the shrinking space...,Experts are warning that the decreasing space ...
1,1,1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,Drunk teenage boy climbed into lion enclosure ...,A drunk teenage boy had to be rescued by secur...,police say they are not to be a new york polic...,a drunk teenage boy had to be rescued by secur...,Rahul Kumar 17 climbed into the enclosure fenc...,,"Rahul Kumar, a 17-year-old intoxicated teenage...",rahul kumar 17 climbed into the enclosure fenc...,Rahul Kumar 17 climbed into a lions enclosure ...,317,38,,"A drunk teenager, Rahul Kumar, 17, jumped into..."
2,2,2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Nottingham Forest are close to extending Dougi...,Dougie Freedman is on the verge of agreeing a ...,the new york s office says the new york is the...,dougie freedman is on the verge of agreeing a ...,Dougie Freedman is set to sign a new two year ...,Dougie Freedman is close to signing a new two-...,Dougie Freedman is close to signing a new two-...,dougie freedman is set to sign a new two year ...,Dougie Freedman is set to sign a new two year ...,114,35,"Nottingham Forest's manager, Dougie Freedman, ...",Dougie Freedman is close to signing a new two-...
