In [2]:
from pathlib import Path
from langchain.document_loaders import Docx2txtLoader
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer
from langchain import PromptTemplate, HuggingFaceHub, LLMChain
from create_summaries import load_template
from typing import Union, Optional
from langchain.docstore.document import Document

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


# Attempting summarisation with huggingface models

The most downloaded/liked transformer on Huggingface with the "legal" tag. 

Models tried - 
* google/flan-t5-large - Internal server error
* kolpadkar/legal-flan-t5-base - Very poor performance
* nlpaueb/legal-bert-base-uncased - Model not supported by HuggingFaceHub (don't have the resources to run it locally)
* facebook/bart-large-cnn - Shows potential but fails on map-reduce

In [3]:
# Project-relative folders: where summaries are written and where the raw .docx inputs live.
SUMMARY_SAVE_DIR = Path("data") / "summaries"
DATA_DIR = Path("data") / "raw"

# The two source documents that are summarised in this notebook.
toc_2015_fname = DATA_DIR / "Jan 2015.docx"
toc_2023_fname = DATA_DIR / "Mar 2023.docx"

# The loader is handed a plain string rather than a Path object (str reqd for loader).
loader_2015 = Docx2txtLoader(str(toc_2015_fname))
data_2015 = loader_2015.load()

loader_2023 = Docx2txtLoader(str(toc_2023_fname))
data_2023 = loader_2023.load()

## facebook/bart-large-cnn

In [4]:
# Hugging Face Hub repo id passed as `model_name` to map_reduce_huggingface in the cells below.
MODEL = "facebook/bart-large-cnn"

In [23]:
def map_reduce_huggingface(model_name:str, 
                           input_document:list[Document],
                           save_dir:Optional[Union[Path,str]] = None, 
                           model_kwargs:Optional[dict] = None, 
                           chunk_size:Optional[int] = 4000,
                           token_max:int = 750) -> str:
    """Summarise documents with a Hugging Face Hub model using a map-reduce chain.

    The input documents are split into token-bounded chunks, each chunk is run
    through the "map" prompt, and the per-chunk results are then combined
    (collapsing iteratively if needed) with the "reduce" prompt.

    Args:
        model_name: Hugging Face Hub repo id. Used both to load the tokenizer
            that measures chunk length and as the ``repo_id`` of the hosted LLM.
        input_document: Documents to summarise (e.g. the list returned by a
            LangChain loader's ``load()``).
        save_dir: Despite the name, this is the path of the output *file*.
            When given, the summary is written there as UTF-8 text and any
            missing parent directories are created. ``None`` skips saving.
        model_kwargs: Keyword arguments forwarded to ``HuggingFaceHub``.
            Defaults to ``{"temperature": 0, "max_length": 1000}``.
        chunk_size: Target chunk size, in tokenizer tokens, for the splitter.
            ``None`` falls back to 4000.
        token_max: Maximum number of tokens grouped together before the reduce
            step collapses documents. Defaults to 750.

    Returns:
        The final summary text produced by the map-reduce chain.

    Raises:
        Whatever the underlying tokenizer download, template loading, or hosted
        inference call raises; nothing is caught here.
    """
    if model_kwargs is None:
        model_kwargs = {"temperature": 0, "max_length": 1000}
    if chunk_size is None:
        # The signature advertises Optional[int]; honour None instead of
        # passing it on to the splitter.
        chunk_size = 4000

    # Split using the model's own tokenizer so chunk sizes are counted in tokens.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=0
    )
    doc_split = text_splitter.split_documents(input_document)

    # Prompt templates are read from disk. The map template is expected to
    # expose a `docs` variable and the reduce template a `doc_summaries`
    # variable (see the document_variable_name arguments below).
    map_template = load_template(Path("prompts_templates", "map_template.txt"))
    reduce_template = load_template(Path("prompts_templates", "reduce_template.txt"))

    llm = HuggingFaceHub(repo_id=model_name, model_kwargs=model_kwargs)

    # Map step: applied independently to every chunk.
    map_prompt = PromptTemplate.from_template(map_template)
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Reduce step: applied to the combined map outputs.
    reduce_prompt = PromptTemplate.from_template(reduce_template)
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

    # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="doc_summaries"
    )

    # Combines and iteratively reduces the mapped documents
    reduce_documents_chain = ReduceDocumentsChain(
        # This is final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # If documents exceed context for `StuffDocumentsChain`
        collapse_documents_chain=combine_documents_chain,
        # The maximum number of tokens to group documents into.
        token_max=token_max,
    )

    # Combining documents by mapping a chain over them, then combining results
    map_reduce_chain = MapReduceDocumentsChain(
        # Map chain
        llm_chain=map_chain,
        # Reduce chain
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in
        document_variable_name="docs",
        # Return the results of the map steps in the output
        return_intermediate_steps=False,
    )

    # Run chain
    output = map_reduce_chain.run(doc_split)

    if save_dir is not None:
        out_path = Path(save_dir)
        # Create the target folder on first use so a fresh checkout can save.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: model output may contain characters the platform
        # default encoding cannot represent.
        out_path.write_text(output, encoding="utf-8")
    return output


In [10]:
# Summarise the January 2015 document with the model currently bound to MODEL
# and save the result under SUMMARY_SAVE_DIR (same location as before: data/summaries).
bart_large_2015_fname = SUMMARY_SAVE_DIR / "bart_large_2015_summary.txt"
bart_large_2015_output = map_reduce_huggingface(
    model_name=MODEL,
    input_document=data_2015,
    save_dir=bart_large_2015_fname,
)



In [11]:
# Summarise the March 2023 document with the model currently bound to MODEL
# and save the result under SUMMARY_SAVE_DIR (same location as before: data/summaries).
bart_large_2023_fname = SUMMARY_SAVE_DIR / "bart_large_2023_summary.txt"
bart_large_2023_output = map_reduce_huggingface(
    model_name=MODEL,
    input_document=data_2023,
    save_dir=bart_large_2023_fname,
)

## bigscience/bloom

In [12]:
# NOTE(review): this rebinds MODEL (previously "facebook/bart-large-cnn"); cells that
# read MODEL will use whichever assignment ran most recently, so run order matters.
MODEL = "bigscience/bloom"

In [26]:
# BLOOM run on the January 2015 document: non-default generation settings and much
# smaller chunks than the function default of 4000 tokens.
# The captured output below shows this call hitting the hosted API's hourly rate limit.
model_kwargs = {"temperature": 1, "max_length": 500}
bloom_large_2015_fname = SUMMARY_SAVE_DIR / "bloom_2015_summary.txt"
bloom_large_2015_output = map_reduce_huggingface(
    model_name=MODEL,
    input_document=data_2015,
    save_dir=bloom_large_2015_fname,
    model_kwargs=model_kwargs,
    chunk_size=250,
)

Created a chunk of size 412, which is longer than the specified 250
Created a chunk of size 333, which is longer than the specified 250
Created a chunk of size 384, which is longer than the specified 250
Created a chunk of size 274, which is longer than the specified 250
Created a chunk of size 369, which is longer than the specified 250
Created a chunk of size 297, which is longer than the specified 250


ValueError: Error raised by inference API: Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate