In [1]:
# Install (or upgrade to the latest release of) LangChain, its OpenAI and
# community integrations, and pypdf into the kernel's environment.
# NOTE(review): versions are not pinned, so a later run may pull different
# releases than the ones that produced the recorded outputs.
%pip install langchain langchain_openai pypdf langchain-community --quiet --upgrade

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.5/294.5 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.8/401.8 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.7/383.7 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
import requests

In [3]:
# Download the PDF from blob storage and save it next to the notebook:
url = (
    "https://storage.googleapis.com/strapi_cms_assets/principles_of_marketing_book.pdf"
)
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on a 4xx/5xx response instead of writing an error page to disk
# under a ".pdf" name.
response.raise_for_status()
with open("principles_of_marketing_book.pdf", "wb") as f:
    f.write(response.content)

In [4]:
# Load the downloaded PDF and split it into a list of documents (`pages`).
# NOTE(review): `text_splitter` is created here but never passed to
# `load_and_split()`, so its chunk_size=200 has no effect on `pages`.
# Confirm whether it should be wired in (`text_splitter=text_splitter`) or
# removed; wiring it in would change how many chunks the later cells process.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200)
loader = PyPDFLoader("principles_of_marketing_book.pdf")
pages = loader.load_and_split()

In [5]:
import getpass
import os

# Ask for the key only when it is not already in the environment, so that
# re-running this cell (or running the notebook with the variable exported)
# does not prompt again. getpass keeps the key out of the notebook source and
# out of the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

··········


In [6]:
# Set up a chat model in LangChain with its default settings.
# The OPENAI_API_KEY environment variable is set in the cell above.
llm = ChatOpenAI()  # type: ignore

```python
def load_summarize_chain(
    llm: BaseLanguageModel,
    chain_type: str = "stuff",
    verbose: bool | None = None,
    **kwargs: Any
) -> BaseCombineDocumentsChain
```

The chain type can be either:

- stuff: Combine all of the documents into a single prompt and summarize them with one LLM call.
- map_reduce: Summarize each document individually and then combine the summaries.
- refine: Summarize each document sequentially, refining the previous summary with each new document.


In [7]:
# Build a "map_reduce" summarization chain on top of the chat model: each
# document is summarized individually and the summaries are then combined.
summarization_chain = load_summarize_chain(llm=llm, chain_type="map_reduce")

In [8]:
# Summarize the whole document using the summarization chain.
# Every entry in `pages` is sent through the chain, so this is presumably the
# slowest and most expensive cell of the notebook.
result = summarization_chain.invoke(pages)



In [None]:
# Show only the generated summary. `invoke` returns a dict that also echoes the
# input documents, so displaying `result` itself would print the whole book.
result["output_text"]

'This passage covers a range of topics related to marketing, including defining marketing, strategic planning, consumer behavior, and business buying behavior. It emphasizes the importance of creating value for customers, building relationships, and understanding the internal and external environments. The passage also discusses market segmentation, targeting, and positioning, as well as branding, packaging, and the development of new products. It covers aspects of supply chain management, marketing research, and integrated marketing communications. The passage also touches on public relations, sales, social media, and ethical considerations in marketing. Overall, it provides an overview of key marketing concepts and strategies.'

In [None]:
# Run the same chain on a smaller input: only the first 10 entries of `pages`.
smaller_amount_of_pages = pages[:10]
second_result = summarization_chain.invoke(smaller_amount_of_pages)

In [None]:
# Show only the generated summary. `invoke` returns a dict that also echoes the
# input documents, so displaying `second_result` itself would print them too.
second_result["output_text"]

'"Principles of Marketing" is a comprehensive book that covers various marketing concepts and strategies such as market segmentation, target marketing, advertising, and branding. It is designed to help individuals effectively promote products and services. The book is an adapted version of a work released under a Creative Commons license in 2010, with minor changes made by the University of Minnesota Libraries. It includes chapters on topics like target marketing, product development, marketing channels, and supply chains. Additionally, there is a chapter on pricing and another on the marketing plan. The book defines marketing and discusses its importance, and also provides discussion questions and activities for further engagement. However, it notes that the traditional four Ps of marketing (product, promotion, place, price) do not fully capture all the activities of marketing.'

# Building a custom `MapReduceChain` to generate a summary in Spanish:


In [None]:
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter


# Map prompt: applied to each chunk of text to produce a Spanish summary.
map_template_string = """Given the following pages of a marketing book, generate a summary in Spanish:
Pages:
{pages}

Summary:
"""

# Reduce prompt: turns the per-chunk Spanish summaries into one description.
reduce_template_string = """Given the following Spanish summaries of pages of a marketing book, generate a high level description of the book in Spanish:
Summaries:
{summaries}
"""

# Prompts to use in the map and reduce stages
MAP_PROMPT = PromptTemplate(input_variables=["pages"], template=map_template_string)
REDUCE_PROMPT = PromptTemplate(
    input_variables=["summaries"], template=reduce_template_string
)

# LLM chains to use in the map and reduce stages (both share the same model)
map_llm_chain = LLMChain(llm=llm, prompt=MAP_PROMPT)
reduce_llm_chain = LLMChain(llm=llm, prompt=REDUCE_PROMPT)

# Takes a list of documents and combines them into a single string, which is
# inserted into the reduce prompt's {summaries} placeholder
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_llm_chain,
    document_variable_name="summaries",
)

# Combining documents by mapping a chain over them, then combining results with reduce chain
combine_documents = MapReduceDocumentsChain(
    # Map chain: run on every chunk
    llm_chain=map_llm_chain,
    # The variable name in the map prompt to put each chunk in:
    document_variable_name="pages",
    # Reduce chain: combines the mapped results into the final output
    combine_document_chain=combine_documents_chain,
)

# End-to-end chain: splits the raw input text with the given splitter, then
# runs the map-reduce documents chain over the resulting chunks
map_reduce = MapReduceChain(
    combine_documents_chain=combine_documents,
    text_splitter=CharacterTextSplitter(
        separator="\n##\n", chunk_size=100, chunk_overlap=0
    ),
)

In [None]:
# Run the custom chain with `.invoke`, as the earlier cells do, instead of
# calling the chain object directly.
# NOTE: the [0:100] slice applies to the joined string, so only the first 100
# characters of the book's text are passed in, not the first 100 pages.
map_reduce_result = map_reduce.invoke(
    {"input_text": "\n".join(doc.page_content for doc in pages)[0:100]}
)
# Display the result; the assignment alone produces no cell output.
map_reduce_result

{'output_text': 'El libro de marketing proporciona una introducción a los principios fundamentales del marketing y su aplicación en el mundo empresarial. El autor destaca la importancia de conocer al cliente, segmentar el mercado y dirigirse a diferentes segmentos de clientes de manera efectiva. También explora conceptos como el posicionamiento y la diferenciación, así como el papel del marketing en la creación de valor y la construcción de relaciones sólidas con los clientes. En resumen, el libro establece las bases para comprender y aplicar con éxito las estrategias de marketing en el mundo empresarial actual.'}