In [39]:
from pathlib import Path
from langchain.document_loaders import Docx2txtLoader
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer
from langchain import PromptTemplate, HuggingFaceHub, LLMChain

# Attempting summarisation with Legal-Bert

The most downloaded/liked transformer on Huggingface with the "legal" tag. 

Models tried - 
* google/flan-t5-large - Internal server error
* kolpadkar/legal-flan-t5-base - Very poor poor performance
* nlpaueb/legal-bert-base-uncased - Model not supported by HuggingfaceHub (dont have resources to run locally)
* facebook/bart-large-cnn - Shows potential but fails on map-reduce

In [29]:
SUMMARY_SAVE_DIR = Path("data", "summaries")
DATA_DIR = Path("data", "raw")
toc_2015_fname = Path(DATA_DIR, "Jan 2015.docx")
toc_2023_fname = Path(DATA_DIR, "Mar 2023.docx")
loader_2015 = Docx2txtLoader(str(toc_2015_fname))  # str reqd for loader
data_2015 = loader_2015.load()
loader_2023 = Docx2txtLoader(str(toc_2023_fname))
data_2023 = loader_2023.load()

## Map Reduce Template

In [30]:
# Map
map_template = """
Write a concise summary of the following:
"{docs}"
CONCISE SUMMARY:
"""

# Reduce
reduce_template = """Write a concise summary of the following text delimited by triple backquotes.
Return your response in bullet points which covers the key points of the text.
```{doc_summaries}```
BULLET POINT SUMMARY:"""

## facebook/bart-large-cnn

In [38]:
MODEL = "facebook/bart-large-cnn"

In [44]:
llm = HuggingFaceHub(repo_id=MODEL, model_kwargs={"temperature": 0, "max_length": 1000})

In [45]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer=tokenizer, chunk_size=4000, chunk_overlap=0
)
split_2015 = text_splitter.split_documents(data_2015)
split_2023 = text_splitter.split_documents(data_2023)

In [46]:
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Run chain
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Combines and iteravely reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

In [47]:
output_map_reduce_15 = map_reduce_chain.run(split_2015)
output_map_reduce_23 = map_reduce_chain.run(split_2023)

In [48]:
output_map_reduce_15

'You agree that you will pay for all products you purchase through the Stores. You agree to receipt of all invoices in an electronic format, which may include email. You are responsible for the timely payment of all fees and for providing iTunes with a valid payment method. Your total price will include the price of the product plus any applicable VAT.'

In [49]:
output_map_reduce_23

'Apple Media Services Terms and Conditions create a contract between you and Apple (the “Agreement”). Please read the Agreement carefully. Write a concise summary of the following text delimited by triple backquotes. Return your response in bullet points which covers the key points of the text.'