<a href="https://colab.research.google.com/github/kabilan942/Natural-Language-Processing/blob/main/Legal%20Document%20Summarization/BART_chunking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Modules Required

In [None]:
# Install the notebook's dependencies into the kernel's own environment.
# Each package is listed once; the transformers extras are merged into one spec.
%pip install "transformers[sentencepiece,torch]" sentencepiece rouge rouge_score datasets evaluate nltk
# sentence-transformers is explicitly upgraded to the latest release.
%pip install -U sentence-transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
nltk.download('punkt')
from rouge import Rouge
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from nltk.tokenize import sent_tokenize
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from nltk import tokenize
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
import torch.nn.functional as F
import torch
import json
import random
import os
import sys
sys.path.insert(0, '../')
from IPython.display import display, HTML
#from transformers import LEDTokenizer, LEDForConditionalGeneration
#from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
rouge_score = evaluate.load('rouge')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Getting the data

In [None]:
def get_root_path():
    """
    Return the root folder of the dataset.

    Edit the literal below so that it points at your copy of the dataset
    (e.g. "/home/pahelibhattacharya/Abhayv2/camera_ready/summarization/dataset").
    """
    return "/content/drive/MyDrive/AbstractiveDataset/"

def get_summary_data():
    '''
    Load every judgement and every summary text file of the dataset.

    Reads all ``*.txt`` files found in ``<root>/judgement`` and
    ``<root>/summary``, where ``<root>`` is the value of get_root_path().

    Both directory listings are sorted before reading: glob.glob() returns
    files in arbitrary order, and the callers pair documents and summaries
    purely by list position. Sorting keeps position i of every returned list
    on the same file name, assuming both folders hold the same file names.

    returns a tuple (names, data_source, data_summary):
        names        - base file names of the judgement files
        data_source  - full text of each judgement, same order as names
        data_summary - full text of each summary file, in sorted order
    '''
    root = get_root_path()

    names = []
    data_source = []
    for filename in sorted(glob.glob(root + '/judgement' + "/*.txt")):
        # basename handles whichever path separator glob produced
        names.append(os.path.basename(filename))
        with open(filename, 'r') as f:
            data_source.append(f.read())

    data_summary = []
    for filename in sorted(glob.glob(root + '/summary' + "/*.txt")):
        with open(filename, 'r') as f:
            data_summary.append(f.read())

    return names, data_source, data_summary

def split_to_sentences(para):
    """Split `para` into a list of sentences using NLTK's sentence tokenizer."""
    return nltk.sent_tokenize(para)

In [None]:
# Load the corpus and report how many names, documents and summaries were read
# (one count per line, in that order).
names, data_source, data_summary = get_summary_data()
for loaded in (names, data_source, data_summary):
    print(len(loaded))

3
3
3


## Converting Abstractive summary to Extractive

In [None]:
from rouge_score import rouge_scorer

In [None]:
def saliency_score(doc, summary, alpha=0.5):
  """
  Blend the ROUGE-1 and ROUGE-2 F-measures of two texts into one score.

  doc, summary - the two texts handed to RougeScorer.score (in that order)
  alpha        - weight of ROUGE-1; ROUGE-2 is weighted by (1 - alpha)
  returns alpha * F(rouge1) + (1 - alpha) * F(rouge2)
  """
  rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True).score(doc, summary)
  unigram_f = rouge['rouge1'].fmeasure
  bigram_f = rouge['rouge2'].fmeasure

  return alpha * unigram_f + (1 - alpha) * bigram_f

In [None]:
# Number of best-matching document sentences kept for every summary sentence.
TOP_K_SENTENCES = 3

docs = []
extractive_summary = []
abstractive_summary = []
doc_id = []
abs_len = []
extr_len = []
doc_len = []

for i in tqdm(range(len(data_source))):
  doc = data_source[i]
  summary = data_summary[i]
  doc_sentences = split_to_sentences(doc)
  sum_sentences = split_to_sentences(summary)

  # indices of the document sentences selected for this document
  sal_index = set()
  for s_sent in sum_sentences:
    # saliency of every document sentence against the current summary sentence
    rouge_score_list = [saliency_score(d_sent, s_sent) for d_sent in doc_sentences]
    # Rank the sentence *indices* by score (ties broken by earliest position).
    # Looking a score value up with list.index() would return the same first
    # index for every tied score and silently drop the other sentences.
    ranked = sorted(range(len(rouge_score_list)),
                    key=lambda k: (-rouge_score_list[k], k))
    sal_index.update(ranked[:TOP_K_SENTENCES])

  # duplicates are removed by the set; restore document order
  sal_index_sorted = sorted(sal_index)

  # pred_summary - the selected document sentences, in document order
  pred_summary = [doc_sentences[ind] for ind in sal_index_sorted]
  # the sentences in pred_summary are joined into the extractive summary
  final_pred_summary = ' '.join(pred_summary)

  extractive_summary.append(final_pred_summary)
  abstractive_summary.append(data_summary[i])
  docs.append(data_source[i])
  doc_id.append(names[i])
  # lengths are counted in space-separated tokens
  abs_len.append(len(data_summary[i].split(' ')))
  extr_len.append(len(final_pred_summary.split(' ')))
  doc_len.append(len(data_source[i].split(' ')))

100%|██████████| 3/3 [00:09<00:00,  3.15s/it]


In [None]:
# One row per document. The insertion order of the keys fixes the column order
# of the exported sheet.
extractive_columns = {
    'doc_id': doc_id,
    'Doc': docs,
    'Abstractive summary (Original)': abstractive_summary,
    'Extractive Summary': extractive_summary,
    'Length of Doc': doc_len,
    'Length of Abstractive (Original) Summary': abs_len,
    'Length of Extractive Summary': extr_len,
}
df = pd.DataFrame(extractive_columns)
df.to_excel('extr_abstr_legal.xlsx')

In [None]:
# Preview the first rows of the per-document frame.
df.head()

Unnamed: 0,doc_id,Doc,Abstractive summary (Original),Extractive Summary,Length of Doc,Length of Abstractive (Original) Summary,Length of Extractive Summary
0,1.txt,Appeal No. LXVI of 1949.\nAppeal from the High...,The charge created in respect of municipal pro...,Appeal No. This is an appeal against a judgmen...,3278,150,374
1,2.txt,XXIX of 1950.\nApplication under article 32 of...,Section 7 (1) (c) of the East Punjab Public Sa...,XXIX of 1950. The facts are stated in the judg...,4683,418,1184
2,3.txt,XXXVII of 1950.\nApplication under article 32 ...,"Section 4, sub section\n(1) (c), of the East P...",The following judgments were delivered: KANIA ...,5128,932,1410


## Chunking - using similarity measures


In [None]:
from rouge_score import rouge_scorer
from evaluate import load

def saliency_score_rougef(doc, summary, alpha=0.5):
  """
  Saliency score based on ROUGE F-measure.

  doc, summary - the two texts handed to RougeScorer.score (in that order)
  alpha        - weight of ROUGE-1; ROUGE-2 is weighted by (1 - alpha)
  returns alpha * F(rouge1) + (1 - alpha) * F(rouge2)
  """
  rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True).score(doc, summary)
  unigram_f = rouge['rouge1'].fmeasure
  bigram_f = rouge['rouge2'].fmeasure

  return alpha * unigram_f + (1 - alpha) * bigram_f

def saliency_score_rouger(doc, summary, alpha=0.5):
  """
  Saliency score based on ROUGE recall.

  RougeScorer.score(target, prediction) returns, per ROUGE type, a
  (precision, recall, fmeasure) tuple. Recall is read by name here: positional
  index 0 of that tuple is precision, not recall.

  doc     - text passed as the scorer's target (recall is measured against it)
  summary - text passed as the scorer's prediction
  alpha   - weight of ROUGE-1; ROUGE-2 is weighted by (1 - alpha)
  returns alpha * recall(rouge1) + (1 - alpha) * recall(rouge2)
  """
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
  scores = scorer.score(doc, summary)
  R1 = scores['rouge1'].recall
  R2 = scores['rouge2'].recall

  return alpha * R1 + (1 - alpha) * R2

def calculate_bleu_score(reference, candidate):
    """
    Sentence-level BLEU of `candidate` against the single reference `reference`.

    Both strings are word-tokenized with NLTK, and NLTK's `method1` smoothing
    function is passed to sentence_bleu.
    """
    bleu = nltk.translate.bleu_score
    return bleu.sentence_bleu(
        [nltk.word_tokenize(reference)],      # list of references (only one here)
        nltk.word_tokenize(candidate),
        smoothing_function=bleu.SmoothingFunction().method1,
    )

def saliency_score_bleu(doc, summary):
  """
  BLEU-based saliency: `doc` is the reference sentence and `summary` the
  candidate sentence. Thin wrapper around calculate_bleu_score.
  """
  bleu = calculate_bleu_score(reference=doc, candidate=summary)
  return bleu

In [None]:
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_similarity

def similarity_l_l(l1, l2, metric=1):
    '''
    Function to find the most similar sentence in the document for each sentence in the summary
    input:  l1     - Summary sentences (list of str)
            l2     - Document sentences (list of str)
            metric - 1: cosine similarity of sbert_model embeddings
                     2: Manhattan distance of sbert_model embeddings
                     3: Euclidean distance of sbert_model embeddings
                     4: ROUGE f1        (saliency_score_rougef)
                     5: ROUGE Recall    (saliency_score_rouger)
                     6: BLEU            (saliency_score_bleu)
                     7: cosine similarity of sbert_model embeddings (no message printed)
    returns a list with, for each sentence of l1, the index into l2 of its best match
    raises ValueError for an unknown metric or when l2 is empty while l1 is not

    Metrics 2 and 3 are distances, so the best match is the *smallest* value;
    every other metric is a similarity and the best match is the largest.
    The embedding metrics use the global `sbert_model`, which must be set by
    the caller before this function is called.
    '''
    if metric not in (1, 2, 3, 4, 5, 6, 7):
        raise ValueError('Unknown metric %r: expected an integer from 1 to 7' % (metric,))
    if len(l1) == 0:
        return []
    if len(l2) == 0:
        raise ValueError('l2 (document sentences) must not be empty')

    n_summ = len(l1)
    lower_is_better = False

    if metric in (1, 2, 3, 7):
        # one encode call for all sentences; the first n_summ rows are the summary
        document_embeddings = sbert_model.encode(l1+l2)
        summ_emb = document_embeddings[:n_summ]
        doc_emb = document_embeddings[n_summ:]
        if metric == 3:
            similarities = euclidean_distances(summ_emb, doc_emb)
            lower_is_better = True
            print('Euclidean similarity')
        elif metric == 2:
            similarities = manhattan_distances(summ_emb, doc_emb)
            lower_is_better = True
            print('Manhattan similarity')
        else:
            similarities = cosine_similarity(summ_emb, doc_emb)
            if metric == 1:
                print('Cosine similarity')
    else:
        if metric == 6:
            pair_score, label = saliency_score_bleu, 'BLEU Score'
        elif metric == 5:
            pair_score, label = saliency_score_rouger, 'ROUGE Recall'
        else:
            pair_score, label = saliency_score_rougef, 'ROUGE f1'
        # only summary-vs-document pairs are needed, not the full square matrix
        similarities = [[pair_score(s_sent, d_sent) for d_sent in l2] for s_sent in l1]
        print(label)

    pick_best = np.argmin if lower_is_better else np.argmax
    # row i holds the scores of summary sentence i against every document sentence
    return [pick_best(row) for row in similarities]

In [None]:
def nest_sentencesV2(document,chunk_length):
    '''
    function to chunk a document
    input:  document           - Input document
            chunk_length        - chunk length, in space-separated words
    output: list of chunks. Each chunk is a non-empty list of sentences.

    Sentences are added to the current chunk while its running word count stays
    below chunk_length. The sentence that would reach the limit starts the next
    chunk. A single sentence that is itself chunk_length words or more still
    gets its own chunk and is never split.
    '''
    nested = []  # to store list of chunks of a doc
    sent = []    # sentences of the chunk currently being filled
    length = 0   # word count of `sent`
    for sentence in nltk.sent_tokenize(document):
        n_words = len(sentence.split(" "))
        if length + n_words < chunk_length:
            sent.append(sentence)
            length += n_words
        else:
            # Close the current chunk, unless it is empty: that happens when the
            # very first sentence already reaches chunk_length.
            if sent:
                nested.append(sent)
            sent = [sentence]       # the overflowing sentence opens the next chunk
            length = n_words
    if sent:                        # flush the last, partially filled chunk
        nested.append(sent)
    return nested

In [None]:
def get_chunks_data_from_docV2(doc, summ, m, chunk_length=750, chunk_summ_word_threshold=100):
    '''
    Function to generate chunks along with their summaries
    input:  doc  - legal Document
            summ - Gold standard summary
            m    - metric id forwarded to similarity_l_l
                   (1,2,3,4,5,6,7) = (cosine, manhattan, euclidean, Rouge f1,
                   Rouge Recall, BLEU, cosine with roberta sentence embeddings)
            chunk_length              - word limit handed to nest_sentencesV2
            chunk_summ_word_threshold - minimum number of space-separated words
                                        a chunk summary needs for the pair to be kept
    returns (final_chunks, final_summ): two parallel lists of chunk texts and
    their summaries

    Every summary sentence is attached to the document sentence it matches
    best. A chunk's summary is the space-joined summary sentences attached to
    the sentences of that chunk.
    '''
    doc_sents = split_to_sentences(doc)
    summ_sents = split_to_sentences(summ)

    # result[i]: index in doc_sents of the best match for summ_sents[i]
    result = similarity_l_l(summ_sents, doc_sents, m)

    # document sentence -> all summary sentences matched to it. A list is used
    # so that several summary sentences matching the same document sentence
    # are all kept instead of overwriting each other.
    sentence_mapping = {}
    for i in range(len(summ_sents)):
        sentence_mapping.setdefault(doc_sents[result[i]], []).append(summ_sents[i])

    final_chunks = []
    final_summ = []
    for chunk in nest_sentencesV2(doc, chunk_length):
        chunk_summ_sents = []
        for chunk_sent in chunk:
            # collect the summary sentences attached to this document sentence
            chunk_summ_sents.extend(sentence_mapping.get(chunk_sent, []))
        # join with a space so neighbouring sentences do not run together
        chunk_summ = " ".join(chunk_summ_sents)
        # keep the pair only if its summary is long enough
        if len(chunk_summ.split(" ")) >= chunk_summ_word_threshold:
            final_chunks.append(" ".join(chunk))
            final_summ.append(chunk_summ)
    return final_chunks, final_summ

In [None]:
# Reload the exported sheet, keeping only spreadsheet columns B, C and E:
# Doc id, Doc, Extractive summary (converted from abstractive).
filepath = 'extr_abstr_legal.xlsx'
sheet_columns = 'B,C,E'
df = pd.read_excel(filepath, usecols=sheet_columns)

In [None]:
# Preview the reloaded doc_id / Doc / Extractive Summary columns.
df.head()

Unnamed: 0,doc_id,Doc,Extractive Summary
0,1.txt,Appeal No. LXVI of 1949.\nAppeal from the High...,Appeal No. This is an appeal against a judgmen...
1,2.txt,XXIX of 1950.\nApplication under article 32 of...,XXIX of 1950. The facts are stated in the judg...
2,3.txt,XXXVII of 1950.\nApplication under article 32 ...,The following judgments were delivered: KANIA ...


In [None]:
# Column views used by the chunking loop: file names, documents, extractive summaries.
names, data_source, data_summary = df['doc_id'], df['Doc'], df['Extractive Summary']

In [None]:
from sentence_transformers import SentenceTransformer

# Accumulators, one entry per kept (chunk, summary) pair.
training_chunks, training_summs = [], []
doc_id, doc_chunk_length, summ_chunk_length = [], [], []

# metric = (1,2,3,4,5,6,7) = (cosine, manhattan, euclidean, Rouge f1, Rouge Recall, BLEU, cosine with roberta sentence embeddings)
metr = 7
# metric 7 uses the RoBERTa sentence encoder; every other metric loads the BERT one
if metr != 7:
  sbert_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
else:
  sbert_model = SentenceTransformer('sentence-transformers/roberta-base-nli-stsb-mean-tokens')

for i in tqdm(range(len(data_source))):
    cks, summs = get_chunks_data_from_docV2(data_source[i], data_summary[i], metr)
    training_chunks.extend(cks)
    training_summs.extend(summs)
    # lengths in space-separated tokens
    doc_chunk_length.extend(len(chunk_text.split(' ')) for chunk_text in cks)
    summ_chunk_length.extend(len(chunk_summ.split(' ')) for chunk_summ in summs)
    # every chunk remembers the document it came from
    doc_id.extend([names[i]] * len(cks))

chunk_columns = {
    'doc_id': doc_id,
    'data': training_chunks,
    'summary': training_summs,
    'Doc Chunk Length': doc_chunk_length,
    'Summary Chunk Length': summ_chunk_length,
}
df = pd.DataFrame(chunk_columns)
df.to_excel('chunked_mcs.xlsx')


 33%|███▎      | 1/3 [00:00<00:01,  1.68it/s]

Cosine similarity Roberta Sentence Embeddings


 67%|██████▋   | 2/3 [00:01<00:00,  1.82it/s]

Cosine similarity Roberta Sentence Embeddings


100%|██████████| 3/3 [00:01<00:00,  1.68it/s]

Cosine similarity Roberta Sentence Embeddings





In [None]:
# (rows, columns) of the chunk-level frame: one row per kept chunk/summary pair.
df.shape

(10, 5)

In [None]:
# Preview the first chunk/summary pairs and their word counts.
df.head()

Unnamed: 0,doc_id,data,summary,Doc Chunk Length,Summary Chunk Length
0,1.txt,Appeal No. LXVI of 1949. Appeal from the High ...,Appeal No.This is an appeal against a judgment...,838,222
1,2.txt,XXIX of 1950. Application under article 32 of ...,XXIX of 1950.The facts are stated in the judgm...,812,377
2,2.txt,The question raised in this case relates to th...,The question raised in this case relates to th...,827,294
3,2.txt,"This, as I have already indicated, is a somewh...",We therefore cannot ignore the fact that prese...,834,180
4,2.txt,It appears that the framers of the Constitutio...,If the Act is to be viewed as I have suggested...,650,132
