In [1]:
## Objective
## Ingest documents, which involves downloading them.
##Extract text from the PDF by using LangChain PyPDFLoader.
##Select context for identifying the relevant parts of the document that are needed to answer the question.
##Design prompt for question-answering
##Leverage chains for handling large contexts (with embeddings)

In [1]:
# Base system dependencies
!sudo apt -y -qq install tesseract-ocr libtesseract-dev

# required by PyPDF2 for page count and other pdf utilities
!sudo apt-get -y -qq install poppler-utils python-dev libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig

libtesseract-dev is already the newest version (4.1.1-2.1).
tesseract-ocr is already the newest version (4.1.1-2.1).
0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.
E: Package 'pstotext' has no installation candidate


In [2]:
# Install the packages
import os

if not os.getenv("IS_TESTING"):
    USER = "--user"
else:
    USER = ""
# Install Vertex AI LLM SDK, langchain and dependencies
! pip install google-cloud-aiplatform langchain==0.0.323 chromadb==0.3.26 pydantic==1.10.8 typing-inspect==0.8.0 typing_extensions==4.5.0 pandas datasets google-api-python-client transformers==4.33.1 pypdf faiss-cpu config --user



In [3]:
import urllib
import urllib.request
import warnings
from pathlib import Path as p
from pprint import pprint

import pandas as pd
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

warnings.filterwarnings("ignore")
# Restart the Python kernel if there are issues with the langchain import.

In [4]:
import vertexai

# Google Cloud project and region that every Vertex AI call below is bound to.
PROJECT_ID = "genai-387907"  # @param {type:"string"}
REGION = "us-central1"

vertexai.init(
    project=PROJECT_ID,
    location=REGION,
)

2023-11-11 18:36:30.836567: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-11 18:36:32.011671: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-11-11 18:36:32.011811: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

In [5]:
# Text model handle, used later to build the question-answering chain.
vertex_llm_text = VertexAI(
    model_name="text-bison@001",
)

# Embedding model handle, used later to build the vector index.
vertex_embeddings = VertexAIEmbeddings(
    model_name="textembedding-gecko@001",
)

In [6]:
# Download the source PDF into a local ./data folder next to the notebook.
data_folder = p.cwd() / "data"
data_folder.mkdir(parents=True, exist_ok=True)  # already a Path; no re-wrapping needed

pdf_url = "https://abc.xyz/assets/9a/bd/838c917c4b4ab21f94e84c3c2c65/goog-10-k-q4-2022.pdf"
# The local file is named after the last path segment of the URL.
pdf_file = str(data_folder / pdf_url.split("/")[-1])

# `urlretrieve` lives in the `urllib.request` submodule; a bare `import urllib`
# only exposes it when some other package happens to have imported it already.
urllib.request.urlretrieve(pdf_url, pdf_file)

('/home/jupyter/data/goog-10-k-q4-2022.pdf',
 <http.client.HTTPMessage at 0x7fdae6288280>)

In [7]:
# Read the downloaded PDF and split it into LangChain documents.
pdf_loader = PyPDFLoader(file_path=pdf_file)
pages = pdf_loader.load_and_split()
# Uncomment to eyeball the extracted text of a single document:
# print(pages[3].page_content)

In [8]:
# Sample question for the prompt below.
question = "What is Experimentation?"

# Prompt that restricts the model to the supplied context and gives it an explicit
# fallback phrase when the context does not contain the answer.
# The stray "?" that used to follow {context} has been removed: it was appended
# verbatim to the end of the context text in every rendered prompt.
prompt_template = """Answer the question as precise as possible using the provided context. If the answer is
                    not contained in the context, say "answer not available in context" \n\n
                    Context: \n {context}\n
                    Question: \n {question} \n
                    Answer:
                  """

# Both placeholders in the template must be declared as input variables.
prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [9]:
# Stitch the text of every loaded document into one string, separated by blank lines.
# The loop variable is named `page` so it does not shadow the `Path as p` alias.
context = "\n\n".join(str(page.page_content) for page in pages)

# Cut the combined text into chunks (chunk_size=5000, chunk_overlap=500).
text_splitter = CharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=500,
)
texts = text_splitter.split_text(context)

In [10]:
# Build a Chroma store from the text chunks using the Vertex AI embeddings,
# and keep only its retriever interface.
vector_index = Chroma.from_texts(
    texts=texts,
    embedding=vertex_embeddings,
).as_retriever()

In [11]:
# Query string passed to the retriever in the next cell to fetch relevant chunks.
question = "What is Alphabet?"

In [13]:
# Fetch the documents the retriever considers relevant to the current question.
docs = vector_index.get_relevant_documents(query=question)

In [None]:
# Map-Reduce: Split documents into smaller chunks and process them in parallel. This is more efficient than stuffing, but it can be more complex to implement.

In [14]:
# Map step: prompt applied to each retrieved document individually.
question_prompt_template = """
                    Answer the question as precise as possible using the provided context. \n\n
                    Context: \n {context} \n
                    Question: \n {question} \n
                    Answer:
                    """
question_prompt = PromptTemplate(
    template=question_prompt_template, input_variables=["context", "question"]
)

# Reduce step: prompt that merges the per-document answers into one final answer.
# The placeholder has to be called `summaries` (not `context`) for the combine
# prompt. Two typos in the template text are fixed here: the quoted fallback
# phrase was missing its closing quote, and a stray "?" followed {summaries}.
combine_prompt_template = """Given the extracted content and the question, create a final answer.
If the answer is not contained in the context, say "answer not available in context". \n\n
Summaries: \n {summaries}\n
Question: \n {question} \n
Answer:
"""
combine_prompt = PromptTemplate(
    template=combine_prompt_template, input_variables=["summaries", "question"]
)

In [15]:
# Assemble the map-reduce QA chain: `question_prompt` and `combine_prompt` are the
# two custom prompts, and intermediate steps are requested alongside the answer.
map_reduce_chain = load_qa_chain(
    llm=vertex_llm_text,
    chain_type="map_reduce",
    question_prompt=question_prompt,
    combine_prompt=combine_prompt,
    return_intermediate_steps=True,
)

In [16]:
# Section prompts for the report; each entry is sent to the QA chain as one question.
questions = [
    "1. Executive Summary - A brief overview of the company, its industry, and the main conclusions of the report.",
    "2. Industry Analysis - A review of the industry in which the company operates, including trends, competition, and growth prospects.",
    "3. Financial Analysis - An in-depth analysis of the company's financial statements, including ratio analysis, cash flows, and profitability.",
    "3.1. Income Statement Analysis - **Profitability Trends**: Discuss the trends in net income and operating income. Identify any irregularities or notable events..",
    "3.2Balance Sheet Analysis - **Liquidity and Solvency**: Comment on the company's short-term and long-term financial health, referencing relevant items from the balance sheet.",
    "3.3 Cash Flow Statement Analysis - **Free Cash Flow Trends**: Comment on the trend in free cash flow and discuss any factors affecting it.",
]

In [None]:
# Answer every report section in turn.
for question in questions:
    # Retrieve documents for *this* question. Reusing the `docs` fetched earlier
    # for a different query would feed every section the same, possibly
    # irrelevant, context.
    relevant_docs = vector_index.get_relevant_documents(question)
    map_reduce_embeddings_outputs = map_reduce_chain(
        {"input_documents": relevant_docs, "question": question}
    )
    print(question)
    print(map_reduce_embeddings_outputs["output_text"])

1. Executive Summary - A brief overview of the company, its industry, and the main conclusions of the report.
Alphabet is a holding company that provides internet-related products and services. The company's products include Google Search, YouTube, Android, Google Maps, Gmail, Google Drive, and Google Cloud Platform. Alphabet's services are used by billions of people around the world. The company has a market capitalization of over $1.5 trillion and is one of the most valuable companies in the world.
2. Industry Analysis - A review of the industry in which the company operates, including trends, competition, and growth prospects.
The internet industry is highly competitive and rapidly evolving. The internet industry is characterized by rapid technological change, intense competition, and the need to constantly innovate in order to maintain and grow market share. The internet industry is also subject to changing consumer preferences and regulatory requirements.
3. Financial Analysis - A