# Installs

In [1]:
! pip install langchain

# Load PDFs.
! pip install pypdf

# Access to the Google Palm API
! pip install google-generativeai

# Database for embeddings
! pip install chromadb

# Colored output
! pip install colorama

Collecting langchain
  Downloading langchain-0.0.321-py3-none-any.whl (1.9 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.9 MB[0m [31m16.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m28.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.1-py3-none-any.whl (27 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langsmith<0.1.0,>=0.0.43 (from langchain)
  Downloading langsmith-0.0.49-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m4.1

Collecting chromadb
  Downloading chromadb-0.4.14-py3-none-any.whl (448 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/448.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m440.3/448.1 kB[0m [31m16.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m448.1/448.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.104.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)

# Imports and setup

In [2]:
import os
from colorama import Fore

In [3]:
# Connect to drive.
from google.colab import drive

# Directory inside the Colab runtime where Google Drive is attached; the API
# key file read in the PaLM setup cell lives underneath it.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


# PaLM Setup

In [4]:
# Generate yours @ https://developers.generativeai.google/products/palm
# The key is read from a file on the mounted Drive so that it never appears in
# the notebook itself. Only the first line of the file is used.
with open('/content/drive/My Drive/key.txt', 'r') as f:
  # strip() drops the trailing newline as well as any stray spaces or '\r'
  # (e.g. from a file saved with Windows line endings) that would otherwise
  # become part of the key.
  palm_api_key = f.readline().strip()

# Fail here with a clear message instead of exporting an empty key and
# getting an authentication error later on.
if not palm_api_key:
  raise ValueError("The first line of key.txt is empty; expected the PaLM API key.")

os.environ["GOOGLE_API_KEY"] = palm_api_key

In [5]:
import langchain
from langchain.prompts import PromptTemplate
from langchain.llms.google_palm import GooglePalm
from langchain.embeddings.google_palm import GooglePalmEmbeddings

# Text-generation model, configured with temperature 0.0 and a cap of 256
# output tokens.
llm = GooglePalm(
    temperature=0.0,
    max_output_tokens=256,
)

# Embedding model; it is handed to the Chroma vector store in the ingestion
# phase.
embeddings = GooglePalmEmbeddings()

# Ingestion Phase

In [6]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [7]:
# Ingestion: download the 10-Q PDF, split it into chunks, embed the chunks and
# store them in a Chroma collection persisted in the current directory.
doc_path = "https://www.abc.xyz/assets/86/99/68122c444c4a93d2228e21ecc16b/20230426-alphabet-10q.pdf"
loader = PyPDFLoader(doc_path)
# Split on newlines into chunks of at most 512 characters, without overlap.
text_splitter = CharacterTextSplitter(
    chunk_size=512, chunk_overlap=0, separator="\n"
)

# load
documents = loader.load()
# split
texts = text_splitter.split_documents(documents)
# embed and store
# The collection is persisted on disk, and Chroma.from_documents adds to
# whatever the collection already contains. Re-running this cell would
# therefore store every chunk a second time and retrieval would start
# returning duplicates. Re-use the collection when it is already populated.
# To force a fresh ingestion (e.g. after changing doc_path or the splitter
# settings), delete the persisted data or use a different collection_name.
existing_store = Chroma(
    collection_name="google_2023",
    embedding_function=embeddings,
    persist_directory="./",
)
if existing_store._collection.count() > 0:
  vector_store = existing_store
else:
  vector_store = Chroma.from_documents(
      texts, embeddings, collection_name="google_2023", persist_directory="./"
  )

In [8]:
#@title Exploring Stored Data
# Print the first five records of the underlying Chroma collection: id, a
# truncated view of the embedding, the metadata and the chunk text.
# NOTE: `_collection` is a private attribute of the vector store wrapper.
data = vector_store._collection.peek(5)
# The id is bound to `doc_id` rather than `id`: at notebook top level the loop
# variable stays in the global namespace and would shadow the builtin id().
for idx, (doc_id, embedding, metadata, document) in enumerate(zip(
    data["ids"], data["embeddings"], data["metadatas"], data["documents"]
)):
  print(Fore.YELLOW + "# Document " + str(idx + 1) + "\n\n" + Fore.RESET)
  print(Fore.BLUE + "Id: " + Fore.RESET + doc_id + "\n")
  # Only the first 40 characters of the embedding's string form are shown.
  print(Fore.BLUE + "Embedding: " + Fore.RESET + str(embedding)[:40] + " ...\n")
  print(Fore.BLUE + "Metadata: " + Fore.RESET + str(metadata) + "\n")
  print(
      Fore.BLUE + "Text: \n\n" + Fore.RESET
      + document
      + "\n\n------------------------------------------------------------ \n"
  )

[33m# Document 1

[39m
[34mId: [39m6adfd2ca-71d9-11ee-82fb-0242ac1c000c

[34mEmbedding: [39m[-0.007248384412378073, 0.01239798404276 ...

[34mMetadata: [39m{'page': 0, 'source': '/tmp/tmp2cvv8we5/tmp.pdf'}

[34mText: 

[39mUNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
________________________________________________________________________________________
FORM 10-Q  
________________________________________________________________________________________
(Mark One)
☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the quarterly period ended March 31, 2023
OR
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

------------------------------------------------------------ 

[33m# Document 2

[39m
[34mId: [39m6adfd4fa-71d9-11ee-82fb-0242ac1c000c

[34mEmbedding: [39m[-0.004968447610735893, -0.0063596065156 ...

[34mMetadata: [39m{'page': 0, 'source': '/tmp/tmp2cvv8

# Generation Phase

In [9]:
#@title prompt
# Layout of the prompt: an instruction preamble, the retrieved text under
# "Information:", then the user's question under "Question:".
# The {context} and {question} placeholders are filled in through
# prompt_template.format(context=..., question=...).
# NOTE(review): the preamble contains typos ("finidng", missing article before
# "finance expert"). They are left as-is because changing the prompt text may
# change the model's answers.
prompt_text = (
    "You are finance expert who is tasked with finidng insights about a  \n"
    "company from text. Please answer the question at the end using the \n"
    "information given below. \n"
    "\n------------------------------------\n"
    "Information: \n"
    "{context}"
    "\n------------------------------------\n"
    "Question: \n"
    "{question}"
)
prompt_template = PromptTemplate.from_template(prompt_text)

In [34]:
# Step 1: the user's question.
query = "What was Google's total income from operations in 2022?"

# Step 2: fetch context. The retriever is configured with k=3, and the text of
# the returned chunks is joined into a single context string.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
relevant_docs = retriever.get_relevant_documents(query=query)
context_separator = "\n ------------------------------------------------\n"
context = context_separator.join(doc.page_content for doc in relevant_docs)

# Step 3: fill the prompt template and call the model.
prompt = prompt_template.format(context=context, question=query)
answer = llm(prompt)

# Step 4: show the model's answer.
print(answer)

Google's total income from operations in 2022 was $20,094.


In [35]:
#@title Exploring Context

# Print every retrieved chunk, in the order the retriever returned them, under
# a numbered yellow heading and followed by a dashed separator.
for idx, document in enumerate(relevant_docs):
  # sep="" makes print() concatenate the pieces without inserting spaces.
  print(Fore.YELLOW, "# Document ", idx + 1, "\n\n", Fore.RESET, sep="")
  print(
      document.page_content,
      "\n\n------------------------------------------------------------ \n",
      sep="",
  )

[33m# Document 1

[39m
Total revenues $ 68,011 $ 69,787 
 Three Months Ended
March 31,
 2022 2023
Operating income (loss):
Google Services $ 21,973 $ 21,737 
Google Cloud  (706)  191 
Other Bets  (835)  (1,225) 
Corporate costs, unallocated  (338)  (3,288) 
Total income from operations $ 20,094 $ 17,415 
For revenues by geography, see Note 2 .
The following table presents long-lived assets by geographic area, which includes property and equipment, net 
and operating lease assets (in millions) :
As of 
December 31, 2022As of

------------------------------------------------------------ 

[33m# Document 2

[39m
2022 2023
Net cash provided by operating activities $ 25,106 $ 23,509 
Net cash used in investing activities $ (9,051) $ (2,946) 
Net cash used in financing activities $ (16,214) $ (16,568) 
Cash Provided by Operating Activities
Our largest source of cash provided by operations are advertising revenues generated by Google Search & 
other properties, Google Network properties, 

# Improving RAG

In [36]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Build a retriever that post-processes the vector-store results with the LLM:
# the base retriever (k=3) fetches chunks and the LLMChainExtractor, driven by
# the same `llm`, acts as the compressor applied to them.
compressor = LLMChainExtractor.from_llm(llm)
base_retriever = vector_store.as_retriever(search_kwargs={"k": 3})
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=base_retriever,
)

# Run the compressed retrieval for the same query as before.
compressed_docs = compression_retriever.get_relevant_documents(query)



In [37]:
#@title Compressed contexts
# Print the metadata and the (compressed) text of every document returned by
# the compression retriever.
# The `data = vector_store._collection.peek(5)` line that used to open this
# cell was a copy-paste leftover: its result was never used here, so the extra
# collection read has been dropped.
for idx, document in enumerate(compressed_docs):
  print(Fore.YELLOW + "# Document " + str(idx + 1) + "\n\n" + Fore.RESET)
  print(Fore.BLUE + "Metadata: " + Fore.RESET + str(document.metadata) + "\n")
  print(
      Fore.BLUE + "Text: \n\n" + Fore.RESET
      + document.page_content
      + "\n\n------------------------------------------------------------ \n"
  )

[33m# Document 1

[39m
[34mMetadata: [39m{'page': 28, 'source': '/tmp/tmp2cvv8we5/tmp.pdf'}

[34mText: 

[39mTotal income from operations $ 20,094

------------------------------------------------------------ 

[33m# Document 2

[39m
[34mMetadata: [39m{'page': 39, 'source': '/tmp/tmp2cvv8we5/tmp.pdf'}

[34mText: 

[39m2022 2023
Net cash provided by operating activities $ 25,106 $ 23,509

------------------------------------------------------------ 

[33m# Document 3

[39m
[34mMetadata: [39m{'page': 34, 'source': '/tmp/tmp2cvv8we5/tmp.pdf'}

[34mText: 

[39mTotal revenues $ 68,011 $ 69,787

------------------------------------------------------------ 



In [31]:
#@title Generating Answer
# Same generation step as in the first question-answering cell, except that
# the context is assembled from the compressed documents.
context_separator = "\n ------------------------------------------------\n"
context = context_separator.join(doc.page_content for doc in compressed_docs)

# Fill the prompt template and call the model.
prompt = prompt_template.format(context=context, question=query)
answer = llm(prompt)

# Show the model's answer.
print(answer)

20,094
