# Import Libraries & Dataset, and Instantiate Constant Variables

In [1]:
# Import libraries ****************************************************************************

# Preemptive Packages ----------------------------------------
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import sent_tokenize, word_tokenize
import networkx as nx
import re
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from string import punctuation
punctuation = list(punctuation)
from nltk.stem import WordNetLemmatizer
from sumy.summarizers.lsa import LsaSummarizer

# Extractive Summarization --------------------------
from summa.summarizer import summarize

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# Abstractive Summarization ------------------------------------------------------------------------------------
from transformers import BartTokenizer, BartForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration

# Constant Variables **********************************************************************************************
# Input workbook (read below) and output workbook name (used when the summaries are saved).
rawdata_filename = "RawData.xlsx"
summarydata_filename = "SummaryData.xlsx"

# BERT ----------------------------------------------------------------------
# Tokenizer and masked-LM model loaded from the same checkpoint name.
model_name_BERT = "bert-base-uncased"
tokenizer_BERT = BertTokenizer.from_pretrained(model_name_BERT)
model_BERT = BertForMaskedLM.from_pretrained(model_name_BERT)

# BART ----------------------------------------------------------------------
# Tokenizer and conditional-generation model loaded from the same checkpoint name.
model_name_BART = "facebook/bart-large-cnn"
tokenizer_BART = BartTokenizer.from_pretrained(model_name_BART)
model_BART = BartForConditionalGeneration.from_pretrained(model_name_BART)

# T5 -------------------------------------------------------------------------
# Tokenizer and conditional-generation model loaded from the same checkpoint name.
model_name_T5 = "t5-small"
tokenizer_T5 = T5Tokenizer.from_pretrained(model_name_T5)
model_T5 = T5ForConditionalGeneration.from_pretrained(model_name_T5)

# Import and show raw dataset ****************************************************************************************
# Load the raw workbook into a dataframe.
df_raw = pd.read_excel(rawdata_filename)
# df_raw

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if yo

> The following sequence of summarization techniques is applied to each section of the PDF report.

|Section|Summarization Type|Package|Technique|
|--|--|--|--|
|Section 1|Extractive|summa|Textrank|
|Section 1|Extractive|transformers|BERT|
|Section 1|Extractive|sumy|Lexrank|
|Section 1|Extractive|sumy|Luhn|
|Section 1|Extractive|sumy|LSA|
|Section 1|Abstractive|transformers|BART|
|Section 1|Abstractive|transformers|T5|
|Section 2|Extractive|summa|Textrank|
|Section 2|Extractive|transformers|BERT|
|Section 2|Extractive|sumy|Lexrank|
|Section 2|Extractive|sumy|Luhn|
|Section 2|Extractive|sumy|LSA|
|Section 2|Abstractive|transformers|BART|
|Section 2|Abstractive|transformers|T5|
|Section 3|Extractive|summa|Textrank|
|Section 3|Extractive|transformers|BERT|
|Section 3|Extractive|sumy|Lexrank|
|Section 3|Extractive|sumy|Luhn|
|Section 3|Extractive|sumy|LSA|
|Section 3|Abstractive|transformers|BART|
|Section 3|Abstractive|transformers|T5|
|Section 4|Extractive|summa|Textrank|
|Section 4|Extractive|transformers|BERT|
|Section 4|Extractive|sumy|Lexrank|
|Section 4|Extractive|sumy|Luhn|
|Section 4|Extractive|sumy|LSA|
|Section 4|Abstractive|transformers|BART|
|Section 4|Abstractive|transformers|T5|
|Section 5|Extractive|summa|Textrank|
|Section 5|Extractive|transformers|BERT|
|Section 5|Extractive|sumy|Lexrank|
|Section 5|Extractive|sumy|Luhn|
|Section 5|Extractive|sumy|LSA|
|Section 5|Abstractive|transformers|BART|
|Section 5|Abstractive|transformers|T5|
|Section 6|Extractive|summa|Textrank|
|Section 6|Extractive|transformers|BERT|
|Section 6|Extractive|sumy|Lexrank|
|Section 6|Extractive|sumy|Luhn|
|Section 6|Extractive|sumy|LSA|
|Section 6|Abstractive|transformers|BART|
|Section 6|Abstractive|transformers|T5|
|Section 7|Extractive|summa|Textrank|
|Section 7|Extractive|transformers|BERT|
|Section 7|Extractive|sumy|Lexrank|
|Section 7|Extractive|sumy|Luhn|
|Section 7|Extractive|sumy|LSA|
|Section 7|Abstractive|transformers|BART|
|Section 7|Abstractive|transformers|T5|

In [2]:
# Parallel lists used to walk through the dataframe with the parameters specified above:
# position i of each list describes one (summarization type, technique) combination.
# => Note - Package parameter can be skipped!

# Single source of truth: every technique is declared next to its summarization type,
# so the two lists below cannot drift apart when a technique is added, removed or reordered.
TECHNIQUE_TYPE_PAIRS = [
    ("Textrank", "Extractive"),
    ("BERT", "Extractive"),
    ("Lexrank", "Extractive"),
    ("Luhn", "Extractive"),
    ("LSA", "Extractive"),
    ("BART", "Abstractive"),
    ("T5", "Abstractive"),
]

# Summarization Type:
summarization_type_list = [kind for _, kind in TECHNIQUE_TYPE_PAIRS]
print(summarization_type_list)

# Technique:
technique_list = [name for name, _ in TECHNIQUE_TYPE_PAIRS]
print(technique_list)

['Extractive', 'Extractive', 'Extractive', 'Extractive', 'Extractive', 'Abstractive', 'Abstractive']
['Textrank', 'BERT', 'Lexrank', 'Luhn', 'LSA', 'BART', 'T5']


# Define Functions

In [3]:
# Function used to process text in preparation for summarization
def process_extracted_text(text, include_newline):
    """Remove known boilerplate lines from extracted text and re-join the remaining lines.

    Parameters
    ----------
    text : str or None
        Raw extracted text whose lines are separated by newline characters.
        ``None`` or a float NaN (e.g. a missing cell value) is treated as empty text.
    include_newline : bool
        When truthy, kept lines are joined with a newline; otherwise with a single space.

    Returns
    -------
    str
        The kept lines joined by the chosen separator, with no trailing separator.
    """
    # Missing values have no lines to process; without this guard `.split` would fail.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Lines dropped from the text. Matching is exact, so the trailing spaces
    # present in some entries are significant.
    redundant_lines = {
        "QUALITY, SECURITY",
        "HEALTH & SAFETY ",
        "SUSTAINABILITY",
        "QUALITY, SECURITY, HEALTH & SAFETY, ",
        "SUSTAINABILITY IN BOUTIQUE",
        "17/03/2022",
    }

    separator = "\n" if include_newline else " "
    kept_lines = [line for line in text.split("\n") if line not in redundant_lines]

    # join() yields exactly "line<sep>line<sep>...line" without the trailing
    # separator that the concatenate-then-trim approach had to strip off.
    return separator.join(kept_lines)

In [4]:
# Function used to perform Textrank, BERT, Lexrank, Luhn, and LSA Extractive Summarization
def extractive_summarization(text, technique, num_sentence, max_length):
    """Summarize ``text`` with one of the extractive techniques.

    The branch is selected by the length arguments, not by ``technique`` alone:

    * ``num_sentence`` and ``max_length`` both ``None``: "Textrank" (summa) or "BERT".
      Any other technique name yields an empty string.
    * otherwise: "Lexrank", "Luhn" or "LSA" (sumy), called with ``num_sentence``.

    Parameters
    ----------
    text : str
        Text to summarize.
    technique : str
        One of "Textrank", "BERT", "Lexrank", "Luhn", "LSA".
    num_sentence : int or None
        Passed as the second argument to the sumy summarizer (the number of
        sentences to keep, per the parameter name).
    max_length : int or None
        Only takes part in selecting the branch; it is not passed to any summarizer.

    Returns
    -------
    str
        The summary text; sumy sentences are joined with single spaces.

    Raises
    ------
    ValueError
        If the sumy branch is selected with a technique other than
        "Lexrank", "Luhn" or "LSA".
    """
    if num_sentence is None and max_length is None:
        if technique == "Textrank":
            return summarize(text)

        if technique == "BERT":
            inputs = tokenizer_BERT(text, return_tensors="pt", add_special_tokens=True, max_length=512, truncation=True)
            # Inference only: skip autograd bookkeeping for the forward pass.
            with torch.no_grad():
                outputs = model_BERT(**inputs)
            # NOTE(review): this decodes the most likely token at every input position of a
            # masked-LM head, which looks like a re-encoding of the (truncated) input rather
            # than a sentence selection - confirm this is the intended "BERT" summary.
            token_ids = torch.argmax(outputs.logits, dim=-1)
            return tokenizer_BERT.decode(token_ids[0], skip_special_tokens=True)

        # No matching technique in this branch: keep returning an empty summary.
        return ""

    # Validate the technique before any parsing work so a bad name fails fast with a
    # clear message instead of "'NoneType' object is not callable".
    if technique == "Lexrank":
        summarizer = LexRankSummarizer()
    elif technique == "Luhn":
        summarizer = LuhnSummarizer()
    elif technique == "LSA":
        summarizer = LsaSummarizer()
    else:
        raise ValueError(
            f"Unsupported technique {technique!r} when num_sentence/max_length are given; "
            "expected 'Lexrank', 'Luhn' or 'LSA'."
        )

    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    selected_sentences = summarizer(parser.document, num_sentence)
    return " ".join(str(sentence) for sentence in selected_sentences)

In [5]:
# Function used to perform BART & T5 Abstractive Summarization
def abstractive_summarization(text, technique, max_length):
    """Generate an abstractive summary of ``text`` with BART or T5.

    Parameters
    ----------
    text : str
        Text to summarize. The encoded input is truncated to 1024 tokens.
    technique : str
        "BART" or "T5". Any other value yields an empty string.
    max_length : int or None
        Forwarded to ``generate`` as ``max_length``.

    Returns
    -------
    str
        The decoded summary with special tokens removed, or "" for an
        unrecognized technique.
    """
    if technique == "BART":
        # BART takes the plain document: the "summarize: " task prefix is a T5
        # convention and would otherwise be summarized as part of the text.
        tokenizer, model, model_input = tokenizer_BART, model_BART, text
    elif technique == "T5":
        # T5 selects its task through a text prefix.
        tokenizer, model, model_input = tokenizer_T5, model_T5, "summarize: " + text
    else:
        return ""

    # Tokenize and summarize the input text
    input_ids = tokenizer.encode(model_input, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    # Decode and output the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [6]:
# Function used to both pre-process and summarize extracted text based on pre-specified parameters
def summarize_everything(dataframe, summarization_type_list, technique_list, num_sentence=3, max_length=300):
    """Pre-process every row's text and summarize it with every configured technique.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns "Section" and "Extracted Text".
    summarization_type_list : list of str
        "Extractive" or "Abstractive" for each technique, in the same order
        as ``technique_list``.
    technique_list : list of str
        Technique names, parallel to ``summarization_type_list``.
    num_sentence : int, default 3
        Sentence count passed on for every technique except "Textrank" and "BERT".
    max_length : int, default 300
        Maximum length passed on for every technique except "Textrank" and "BERT".

    Returns
    -------
    pandas.DataFrame
        One row per (row of ``dataframe``, technique) pair with the columns
        "Section", "Summarization", "Technique" and "Summarized Text".

    Raises
    ------
    ValueError
        If the two lists differ in length, or a summarization type is neither
        "Extractive" nor "Abstractive".
    """
    if len(summarization_type_list) != len(technique_list):
        raise ValueError(
            "summarization_type_list and technique_list must have the same length "
            f"(got {len(summarization_type_list)} and {len(technique_list)})."
        )

    include_new_line = True
    col_list = ["Section", "Summarization", "Technique", "Summarized Text"]
    data = []

    # Iterate the two columns directly: unlike .loc[label, column], this stays
    # scalar-valued even when the dataframe index contains duplicate labels.
    for section, raw_text in zip(dataframe["Section"], dataframe["Extracted Text"]):
        # Pre-processing depends only on the row, so do it once per row rather
        # than once per technique.
        text = process_extracted_text(raw_text, include_new_line)

        for summarization_type, technique in zip(summarization_type_list, technique_list):
            # Textrank and BERT are called without length settings.
            if technique in ("Textrank", "BERT"):
                technique_num_sentence = None
                technique_max_length = None
            else:
                technique_num_sentence = num_sentence
                technique_max_length = max_length

            if summarization_type == "Extractive":
                summarized_text = extractive_summarization(text, technique, technique_num_sentence, technique_max_length)
            elif summarization_type == "Abstractive":
                summarized_text = abstractive_summarization(text, technique, technique_max_length)
            else:
                # Without this, the previous iteration's summary would be silently reused.
                raise ValueError(
                    f"Unknown summarization type {summarization_type!r}; "
                    "expected 'Extractive' or 'Abstractive'."
                )

            # Progress trace, printed once the summary for this combination is ready.
            print([section, summarization_type, technique])
            data.append([section, summarization_type, technique, summarized_text])

    return pd.DataFrame(data, columns=col_list)

# Pre-Process and Summarize Extracted Text

In [7]:
# Run every configured technique over the raw dataframe and collect the
# summaries in a new dataframe.
df = summarize_everything(
    dataframe=df_raw,
    summarization_type_list=summarization_type_list,
    technique_list=technique_list,
)
# df.head()

['1.Nestlé Nespresso Quality and SHE policies –QSHE Role & Responsibilities', 'Extractive', 'Textrank']


['1.Nestlé Nespresso Quality and SHE policies –QSHE Role & Responsibilities', 'Extractive', 'BERT']
['1.Nestlé Nespresso Quality and SHE policies –QSHE Role & Responsibilities', 'Extractive', 'Lexrank']
['1.Nestlé Nespresso Quality and SHE policies –QSHE Role & Responsibilities', 'Extractive', 'Luhn']
['1.Nestlé Nespresso Quality and SHE policies –QSHE Role & Responsibilities', 'Extractive', 'LSA']
['1.Nestlé Nespresso Quality and SHE policies –QSHE Role & Responsibilities', 'Abstractive', 'BART']
['1.Nestlé Nespresso Quality and SHE policies –QSHE Role & Responsibilities', 'Abstractive', 'T5']
['2.Emergency Preparedness and Response', 'Extractive', 'Textrank']
['2.Emergency Preparedness and Response', 'Extractive', 'BERT']
['2.Emergency Preparedness and Response', 'Extractive', 'Lexrank']
['2.Emergency Preparedness and Response', 'Extractive', 'Luhn']
['2.Emergency Preparedness and Response', 'Extractive', 'LSA']
['2.Emergency Preparedness and Response', 'Abstractive', 'BART']
['2.Eme

In [8]:
# Dataframe info: print the summary dataframe's columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Section          49 non-null     object
 1   Summarization    49 non-null     object
 2   Technique        49 non-null     object
 3   Summarized Text  49 non-null     object
dtypes: object(4)
memory usage: 1.7+ KB


In [9]:
# Write the summary dataframe to the output Excel workbook, omitting the row index
df.to_excel(excel_writer=summarydata_filename, index=False)