# Step 1. Load PDF

In [1]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("../docs/Unilever-Annual-Report-and-Accounts-2018.pdf")
doc = loader.load_and_split()

In [2]:
doc

[Document(page_content='UNILEVER ANNUAL REPORT\n \nAND ACCOUNTS 2018\n \nThis document is made up of the Strategic Report, the Governance \nReport, the Financial Statements and Notes, and Additional Information for US Listing Purposes. \nThe Unilever Group consists of Unilever N.V. (NV) and Unilever PLC \n(PLC) together with the companies they control. The terms “Unilever”, the “Group”, “we”, “our” and “us” refer to the Unilever Group. \nOur Strategic Report, pages 1 to 35, contains information about \nus, how we create value and how we run our business. It includes \nour strategy, business model, market outlook and key performance \nindicators, as well as our approach to sustainability and risk. The \nStrategic Report is only part of the Annual Report and Accounts 2018. \nThe Strategic Report has been approved by the Boards and signed \non their behalf by Ritva Sotamaa – Group Secretary. \nOur Governance Report, pages 36 to 65 contains detailed corporate \ngovernance information, our 

# Step 2. Text Splitting (Spliting the PDF into small chunk)

In [3]:
# Modify the size of chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 200,
    length_function = len,
)

In [4]:
chunks = text_splitter.split_documents(doc)

In [5]:
chunks[0]

Document(page_content='UNILEVER ANNUAL REPORT\n \nAND ACCOUNTS 2018\n \nThis document is made up of the Strategic Report, the Governance \nReport, the Financial Statements and Notes, and Additional Information for US Listing Purposes. \nThe Unilever Group consists of Unilever N.V. (NV) and Unilever PLC \n(PLC) together with the companies they control. The terms “Unilever”, the “Group”, “we”, “our” and “us” refer to the Unilever Group. \nOur Strategic Report, pages 1 to 35, contains information about \nus, how we create value and how we run our business. It includes \nour strategy, business model, market outlook and key performance \nindicators, as well as our approach to sustainability and risk. The \nStrategic Report is only part of the Annual Report and Accounts 2018. \nThe Strategic Report has been approved by the Boards and signed \non their behalf by Ritva Sotamaa – Group Secretary. \nOur Governance Report, pages 36 to 65 contains detailed corporate \ngovernance information, our C

In [6]:
print(len(doc), len(chunks))

192 739


<font size =3 color=red> A 192 pages document is divided into 739 chunks. 

In [7]:
# Add index
for idx, chunk in enumerate(chunks):
    chunk.metadata["idx"]=idx
    chunk.metadata["context"]=chunk.page_content

# Step 3. Emedding the text into vector

<font size =3 color=red> Embedding is used for translating the text to a numeric vector.

In [8]:
from langchain.embeddings.openai import OpenAIEmbeddings
import os 
from dotenv import load_dotenv
from tqdm import tqdm
import concurrent.futures
from joblib import Parallel, delayed

load_dotenv()
openai_key = os.getenv('OPENAI_KEY')
os.environ['OPENAI_API_KEY'] = openai_key
embeddings = OpenAIEmbeddings()



pinecone_vectors = []
def process_chunk(chunk):
    idx = chunk.metadata["idx"]
    text = chunk.page_content
    embedded_vector = embeddings.embed_documents([text])
#     source = chunk.metadata["source"]
    return str(idx), embedded_vector, chunk.metadata

results = Parallel(n_jobs=-1)(delayed(process_chunk)(chunk) for chunk in tqdm(chunks))

# Extract the results and populate 'pinecone_vectors'
pinecone_vectors = results


100%|█████████████████████████████████████████| 739/739 [00:27<00:00, 26.53it/s]


In [9]:
pinecone_vectors[:2]

[('0',
  [[-0.006968143117539449,
    -0.011570835954851807,
    -0.009224616553699229,
    -0.035180470431239,
    -0.011384932800518836,
    0.009801555964450117,
    -0.012090080865734085,
    0.006942501479778831,
    -0.007538671769604231,
    -0.01459015102477291,
    0.00907717608883782,
    0.009250257725798579,
    -0.047667996915093304,
    0.001801332270433097,
    -0.00041948286303817054,
    0.016179939085189003,
    0.01346191430669302,
    -0.02109033157562878,
    0.024244266524361617,
    -0.016115834292295555,
    -0.004817442310257098,
    0.00074040525892844,
    -0.03594972049538008,
    -0.004759748276049755,
    -0.014051674303493583,
    -0.0003223247056971112,
    0.017718442938761323,
    -0.027462304403342828,
    0.009634884620514195,
    0.01787229369664757,
    -0.027872572470157796,
    0.0006462519846504316,
    0.006596337740196044,
    0.02151342022849342,
    -0.012775997120552282,
    0.023564760562568256,
    -0.0006222128619383641,
    0.0034456088

# Step 4. Upload Embedding Vectors to Pinecone

In [10]:
# Setting of pinecone
import pinecone

pinecone_api_key = os.getenv('PINECONE_KEY')

env_name = "us-west4-gcp-free"
index_name = "test-chatbot-ran"
pinecone.init(api_key=pinecone_api_key,environment=env_name)
index = pinecone.Index(index_name)

# upload vectors 
my_namespace = 'Unilever-2018'
index.delete(deleteAll='true', namespace=my_namespace)
upsert_response = index.upsert(vectors=pinecone_vectors[:100], namespace=my_namespace)

  from tqdm.autonotebook import tqdm


# Step 5. Use the Vectorstore with Langchain

In [11]:
# Load the setting of vectorstore
from langchain.vectorstores import Pinecone
pinecone.init(api_key=pinecone_api_key,environment=env_name)
index = pinecone.Index(index_name)
vectorstore = Pinecone(index, embeddings.embed_query, "context", namespace=my_namespace)

In [12]:
# Get relevant documents/Chunks and use Langchain (QA with source)
from langchain.llms import OpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")
query = "In what ways could a slowed technological change risk affect our data management enhancement programmes?"
ref_docs = vectorstore.similarity_search(query, k=10)
chain({"input_documents": ref_docs, "question": query}, return_only_outputs=True)

{'output_text': ' A slowed technological change could risk the effectiveness of data management enhancement programmes by reducing the speed of innovation and the ability to keep up with changing trends. It could also reduce the ability to use digital technology to target and drive marketing, and reduce the ability to use digital channels to reach consumers.\nSOURCES: Unilever-Annual-Report-and-Accounts-2018.pdf'}

In [13]:
ref_docs

[Document(page_content='Climate change also threatens our food system which must produce \n50% more food to feed over 9 billion people by 2050. However, \nchanging weather patterns and growing seasons threaten suitable \ncultivation areas around the world. Business can spur positive change \nand achieving food security could create 80 million jobs and business \nopportunities worth $2.3 trillion annually by 2030. Linked to climate change is water scarcity, a threat to 3.2 billion people. If current usage \ncontinues the world will have only 60% of its required water by 2030. \nSee pages 30 and 33 to 35 for more on climate change risks. \nOther environmental concerns are growing in significance, such as \nplastic packaging. The Ellen MacArthur Foundation found that 95% \nof the value of plastic packaging is lost to the economy after one short \nuse, equivalent of $80-120 billion lost to the global economy each year. See pages 14 to 15 and 30 for more on plastic packaging risks and oppor

Options: https://towardsdatascience.com/4-ways-of-question-answering-in-langchain-188c6707cc5a

1. Embeddings: In the example, we used OpenAI Embeddings. But there are many other embedding options such as Cohere Embeddings, and HuggingFaceEmbeddings from specific models. <br>

2. TextSplitter: We used Character Text Splitter in the example where the text is split by a single character. You can also different text splitters and different tokens mentioned in this doc.
 
4. Retrievers: We used a VectoreStoreRetriver, which is backed by a VectorStore. To retrieve text, there are two search types you can choose: search_type: “similarity” or “mmr”. search_type="similarity" uses similarity search in the retriever object where it selects text chunk vectors that are most similar to the question vector. search_type="mmr" uses the maximum marginal relevance search where it optimizes for similarity to query AND diversity among selected documents.
    
5. Chain Type: You can also define the chain type as one of the four options: “stuff”, “map reduce”, “refine”, “map_rerank”.

# Test with different text splitters or chunk size
# Test with chain type, retrivers