In [34]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.chat_models import init_chat_model
from dotenv import load_dotenv
import os
import pprint

In [28]:
def map_reduce(model_name: str = "llama3-8b-8192", token_max: int = 1000) -> "MapReduceDocumentsChain":
    """Build the map-reduce summarisation chain backed by a Groq chat model.

    The chain summarises each input document on its own (map step) and then
    merges those summaries into one consolidated summary (reduce step).

    Args:
        model_name: Groq model identifier passed to ``init_chat_model``.
        token_max: The maximum number of tokens to group documents into
            during the reduce step.

    Returns:
        The assembled ``MapReduceDocumentsChain``. The same object is also
        bound to the module-level name ``map_reduce_chain``, because existing
        notebook cells call ``map_reduce()`` and then read that name.

    Raises:
        RuntimeError: If ``GROQ_API_KEY`` is not available after loading
            the ``.env`` file.
    """
    global map_reduce_chain

    # Init LLM -> groq
    load_dotenv()
    # Fail with a clear message instead of the opaque TypeError that
    # assigning None into os.environ would raise when the key is missing.
    if not os.environ.get("GROQ_API_KEY"):
        raise RuntimeError(
            "GROQ_API_KEY is not set; export it or add it to a .env file."
        )
    llm = init_chat_model(model_name, model_provider="groq")

    # Map step: summarise each document chunk individually.
    map_template = "Write a concise summary of the following: {docs}."
    map_prompt = ChatPromptTemplate([("human", map_template)])
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Reduce step: merge the per-chunk summaries into a single summary.
    reduce_template = """
    The following is a set of summaries:
    {docs}
    Take these and distill it into a final, consolidated summary
    of the main themes.
    """
    reduce_prompt = ChatPromptTemplate([("human", reduce_template)])
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

    # Stuffs the summaries into the `docs` variable of the reduce prompt.
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="docs"
    )

    reduce_documents_chain = ReduceDocumentsChain(
        # Produces the final output.
        combine_documents_chain=combine_documents_chain,
        # Used if documents exceed context for `StuffDocumentsChain`.
        collapse_documents_chain=combine_documents_chain,
        # The maximum number of tokens to group documents into.
        token_max=token_max,
    )

    map_reduce_chain = MapReduceDocumentsChain(
        # Map chain
        llm_chain=map_chain,
        # Reduce chain
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in
        document_variable_name="docs",
        # Return the results of the map steps in the output
        return_intermediate_steps=False,
    )
    return map_reduce_chain

In [38]:
%%time
if __name__ == "__main__":

    # Directory holding the PDF articles to summarise
    articles_dir = "article/"

    # Split docs into token-bounded chunks
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000, chunk_overlap=0
    )

    # Build the chain once; it does not depend on the file being processed.
    # Use the value returned by map_reduce() when it provides one, otherwise
    # keep relying on the module-level `map_reduce_chain` name.
    built_chain = map_reduce()
    if built_chain is not None:
        map_reduce_chain = built_chain

    # Sorted so the processing order is deterministic across runs.
    for filename in sorted(os.listdir(articles_dir)):
        # Skip anything that is not a PDF. Previously such entries fell
        # through and re-summarised the previous file's `docs` (or raised
        # NameError when no PDF had been loaded yet).
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(articles_dir, filename)
        docs = PyPDFLoader(file_path).load()

        split_docs = text_splitter.split_documents(docs)
        print(f"Generated {len(split_docs)} documents of {filename}.")

        # Nothing extractable from this PDF: do not call the LLM with no input.
        if not split_docs:
            print(f"Skipping {filename}: no text to summarise.")
            continue

        # Run map-reduce and show the final summary text
        result = map_reduce_chain.invoke(split_docs)
        pprint.pp(f"Summary of {filename}:")
        pprint.pp(result["output_text"])

Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 156 0 (offset 0)
Ignoring wrong pointing object 158 0 (offset 0)
Ignoring wrong pointing object 164 0 (offset 0)
Ignoring wrong pointing object 170 0 (offset 0)
Ignoring wrong pointing object 172 0 (offset 0)
Ignoring wrong pointing object 184 0 (offset 0)


Generated 46 documents of LLMs in medicine_accepted.pdf.
'Summary of LLMs in medicine_accepted.pdf:'
('Here is a final, consolidated summary of the main themes:\n'
 '\n'
 'Large Language Models (LLMs) have significant potential to transform '
 'industries, including healthcare, education, and research, but their '
 'development and deployment require careful consideration of challenges and '
 'concerns. Key issues include limitations in answering complex questions, '
 'lack of evaluation and validation, ethical concerns, and potential '
 'inaccuracies. To mitigate these issues, LLMs should be used with expert '
 'oversight, updated training data, and transparency measures. Researchers and '
 'developers must address data quality and bias issues, develop '
 'interpretability measures, implement safeguards for uncertainty and safety, '
 'and establish clear regulations and guidelines for LLM use in healthcare '
 'settings. Ultimately, the successful development and deployment of LLMs '
 