In [None]:
%pip install langchain pinecone-client pypdf openai tiktoken

In [None]:
# Print the installed LangChain package metadata to confirm the version in use.
%pip show langchain #Version: 0.1.4

# Imports and Environment

In [1]:
import os
from dotenv import load_dotenv # type: ignore

In [6]:
# Load settings from a local .env file; the Azure endpoint is later read from
# os.environ["OPENAI_API_BASE"]. The recorded output `True` is the call's return value.
load_dotenv()

True

In [7]:
from langchain.document_loaders import PyPDFDirectoryLoader # type: ignore
from langchain.text_splitter import RecursiveCharacterTextSplitter # type: ignore
from langchain.embeddings import OpenAIEmbeddings # type: ignore
from langchain.llms import OpenAI # type: ignore
from langchain.vectorstores import Pinecone# type: ignore
from langchain.chains import RetrievalQA# type: ignore
from langchain.prompts import PromptTemplate# type: ignore

# Load and Chunk Documents

In [11]:
# Loader for the PDFs in the relative "pdfs" directory.
loader = PyPDFDirectoryLoader("pdfs")

In [12]:
# Read the PDFs into a list of Document objects (the recorded run yields 767 pages).
data = loader.load()

In [13]:
# Peek at pages 1-4 to inspect the front matter before it is dropped.
data[1:5]

[Document(page_content='Crime and Punishment \x18Translator’s Preface\nA few words about Dostoevsky himself may help the Eng -\nlish reader to understand his work.\nDostoevsky was the son of a doctor. His parents were \nvery hard- working and deeply religious people, but so poor \nthat they lived with their five children in only two rooms. \nThe father and mother spent their evenings in reading aloud \nto their children, generally from books of a serious charac -\nter.\nThough always sickly and delicate Dostoevsky came out \nthird in the final examination of the Petersburg school of \nEngineering. There he had already begun his first work, \n‘Poor Folk.’\nThis story was published by the poet Nekrassov in his \nreview and was received with acclamations. The shy, un -\nknown youth found himself instantly something of a \ncelebrity. A brilliant and successful career seemed to open \nbefore him, but those hopes were soon dashed. In 1849 he \nwas arrested.\nThough neither by temperament nor

In [18]:
# Sanity check: number of loaded pages.
print(f"You have {len(data)} pages") 

You have 767 pages


In [19]:
# Skip the first five pages (indices 0-4), which hold the translator's preface;
# the first chunk produced later ("Part I") carries page index 5.
data_remove_authorsnote = data[5:]

In [20]:
# Keep a copy of the second remaining page's text as it was before cleaning.
# NOTE(review): `temp` is not referenced anywhere else in the visible notebook.
temp = data_remove_authorsnote[1].page_content

In [21]:
# Remove the Planet eBook footer text from every page.
# The Document objects are modified in place.
for page in data_remove_authorsnote:
    cleaned_text = page.page_content.replace("\x18 Free eBooks at Planet eBook.com", "")
    page.page_content = cleaned_text


In [22]:
# Remove the running "Crime and Punishment" page header from every page.
# The Document objects are modified in place.
for page in data_remove_authorsnote:
    cleaned_text = page.page_content.replace("Crime and Punishment \x18", "")
    page.page_content = cleaned_text

In [23]:
# Show the second remaining page after cleaning to confirm header/footer removal.
# NOTE(review): `pprint` is imported here but not used in this cell.
from pprint import pprint
print(data_remove_authorsnote[1].page_content)

Chapter I
On an exceptionally hot evening early in July a young 
man came out of the garret in which he lodged in S. 
Place and walked slowly, as though in hesitation, towards 
K. bridge.
He had successfully avoided meeting his landlady on the 
staircase. His garret was under the roof of a high, five-sto -
ried house and was more like a cupboard than a room. The 
landlady who provided him with garret, dinners, and at -
tendance, lived on the floor below, and every time he went 
out he was obliged to pass her kitchen, the door of which 
invariably stood open. And each time he passed, the young 
man had a sick, frightened feeling, which made him scowl 
and feel ashamed. He was hopelessly in debt to his landlady, 
and was afraid of meeting her.
This was not because he was cowardly and abject, quite 
the contrary; but for some time past he had been in an over -
strained irritable condition, verging on hypochondria. He 
had become so completely absorbed in himself, and iso -
lated from his 

In [24]:
# Splitter configured with custom separators, listed in the order given:
# "." then "," and finally "" (the empty-string fallback).
# Target chunk size is 2000 with an overlap of 20 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=20,
    separators=[".", ",", ""],
)

# default separator list of ["\n\n", "\n", " ", ""] 
#can cause words to be split between chunks

    #separators=[
    #     "\n\n",
    #     "\n",
    #     " ",
    #     ".",
    #     ",",
    #     "\u200B",  # Zero-width space
    #     "\uff0c",  # Fullwidth comma
    #     "\u3001",  # Ideographic comma
    #     "\uff0e",  # Fullwidth full stop
    #     "\u3002",  # Ideographic full stop
    #     "",
    # ]

In [25]:
# Split the cleaned pages into chunks (the recorded run yields 762 chunks).
text_chunks = text_splitter.split_documents(data_remove_authorsnote)

In [26]:
# Inspect the first five chunks, including their source/page metadata.
text_chunks[:5]

[Document(page_content='Part I', metadata={'source': 'pdfs\\crime-and-punishment.pdf', 'page': 5}),
 Document(page_content='Chapter I\nOn an exceptionally hot evening early in July a young \nman came out of the garret in which he lodged in S. \nPlace and walked slowly, as though in hesitation, towards \nK. bridge.\nHe had successfully avoided meeting his landlady on the \nstaircase. His garret was under the roof of a high, five-sto -\nried house and was more like a cupboard than a room. The \nlandlady who provided him with garret, dinners, and at -\ntendance, lived on the floor below, and every time he went \nout he was obliged to pass her kitchen, the door of which \ninvariably stood open. And each time he passed, the young \nman had a sick, frightened feeling, which made him scowl \nand feel ashamed. He was hopelessly in debt to his landlady, \nand was afraid of meeting her.\nThis was not because he was cowardly and abject, quite \nthe contrary; but for some time past he had been in 

In [27]:
# Sanity check: number of chunks that will be sent to the summarisation chains.
print(f"You have {len(text_chunks)} chunks") 

You have 762 chunks


tree summarisation
stuff
map reduce

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain_community.chat_models import AzureChatOpenAI

# Chat model served through Azure OpenAI; it is passed to both summarisation
# chains built below.
# NOTE(review): no API key is passed explicitly - presumably it is picked up
# from the environment; confirm.
llm = AzureChatOpenAI(
    deployment_name="chat",
    model="gpt-35-turbo-16k",
    # Endpoint comes from the environment; a missing OPENAI_API_BASE raises KeyError here.
    openai_api_base=os.environ["OPENAI_API_BASE"],
    openai_api_type="azure",
    api_version = "2023-03-15-preview"
)

In [30]:
# Map step: prompt handed to the chain as `map_prompt`; the chunk text fills {text}.
map_prompt_template = """
                      Write a summary of this chunk of text that includes the main points and any important details of the mentioned story.
                      {text}
                      """

map_prompt = PromptTemplate(
    input_variables=["text"],
    template=map_prompt_template,
)

# Combine step: prompt handed to the chain as `combine_prompt`; it asks for a
# bullet-point summary of whatever fills {text}.
combine_prompt_template = """
                      Write a concise summary of the following text delimited by triple backquotes.
                      Return your response in bullet points which covers the key points of the text.
                      ```{text}```
                      BULLET POINT SUMMARY:
                      """

combine_prompt = PromptTemplate(
    input_variables=["text"],
    template=combine_prompt_template,
)

In [31]:
# Build the map-reduce summarisation chain from the Azure chat model and the
# two prompts defined above.
map_reduce_chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    map_prompt=map_prompt,
    combine_prompt=combine_prompt,
    # Ask the chain to return its intermediate results alongside the final summary.
    return_intermediate_steps=True,
)

The Refine method for text summarization with LLMs can pull in more relevant context and may be less lossy than Map Reduce. However, it requires many more calls to the LLM than Stuffing, and these calls are not independent, meaning they cannot be parallelized. Additionally, the result can depend on the ordering of the documents: later documents may carry more weight, because this method suffers from recency bias.

In [32]:
# Run the map-reduce chain over all chunks. `invoke` replaces the direct call
# `map_reduce_chain({...})`, which is deprecated and emitted the `warn_deprecated`
# message in the recorded output; the inputs dict is unchanged.
# NOTE: the recorded run stopped with an Azure content-filter ValueError, so this
# cell may still fail on this text until the filter configuration is changed.
map_reduce_outputs = map_reduce_chain.invoke({"input_documents": text_chunks})

  warn_deprecated(


ValueError: Azure has not provided the response due to a content filter being triggered

In [33]:
# Prompt passed to the refine chain as `question_prompt`; the chunk text fills {text}.
question_prompt_template = """
                  Please provide a summary of the following text.
                  TEXT: {text}
                  SUMMARY:
                  """

question_prompt = PromptTemplate(
    template=question_prompt_template, input_variables=["text"]
)

# Prompt passed to the refine chain as `refine_prompt`. The refine chain supplies
# the summary produced so far as `existing_answer` and the next chunk as `text`.
# The template must reference {existing_answer}: without it the running summary is
# silently dropped and each step only summarises the latest chunk.
refine_prompt_template = """
              Here is the existing summary of the text so far:
              {existing_answer}
              Refine the existing summary using the additional text delimited by triple backquotes.
              If the additional text adds nothing useful, return the existing summary unchanged.
              Write a concise summary and return your response in bullet points which covers the key points of the text.
              ```{text}```
              BULLET POINT SUMMARY:
              """

refine_prompt = PromptTemplate(
    template=refine_prompt_template, input_variables=["existing_answer", "text"]
)

In [34]:
# Build the refine summarisation chain from the Azure chat model and the
# question/refine prompts defined above.
refine_chain = load_summarize_chain(
    llm,
    chain_type="refine",
    question_prompt=question_prompt,
    refine_prompt=refine_prompt,
    # Ask the chain to return its intermediate results alongside the final summary.
    return_intermediate_steps=True,
)

In [35]:
# Run the refine chain over all chunks. `invoke` replaces the deprecated direct
# call `refine_chain({...})`; the inputs dict is unchanged.
# NOTE: the recorded run failed with an Azure content-filter 400 error, so this
# cell may still fail on this text until the filter configuration is changed.
refine_outputs = refine_chain.invoke({"input_documents": text_chunks})

BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400}}

https://atlassc.net/2023/11/18/turning-off-azure-openai-service-s-content-filters