In [None]:
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq langchain==0.0.173 --progress-bar off
!pip install -qqq chromadb==0.3.23 --progress-bar off
!pip install -qqq pypdf==3.8.1 --progress-bar off
!pip install -qqq pygpt4all==1.1.0 --progress-bar off
!pip install -qqq pdf2image==1.16.3 --progress-bar off

In [None]:
!gdown 1DpFisoGXsQbpQJvijuvxkLW_pg-FUUMF

Downloading...
From: https://drive.google.com/uc?id=1DpFisoGXsQbpQJvijuvxkLW_pg-FUUMF
To: /content/ms-financial-statement.pdf
  0% 0.00/29.1k [00:00<?, ?B/s]100% 29.1k/29.1k [00:00<00:00, 38.8MB/s]


In [None]:
!wget https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin

--2024-03-05 18:48:50--  https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin
Resolving gpt4all.io (gpt4all.io)... 104.26.0.159, 104.26.1.159, 172.67.71.169, ...
Connecting to gpt4all.io (gpt4all.io)|104.26.0.159|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3785248281 (3.5G)
Saving to: ‘ggml-gpt4all-j-v1.3-groovy.bin.1’


2024-03-05 18:50:46 (31.7 MB/s) - ‘ggml-gpt4all-j-v1.3-groovy.bin.1’ saved [3785248281/3785248281]



In [None]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import GPT4All
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from pdf2image import convert_from_path

## Load Data

In [None]:
# Point a PDF loader at the statement downloaded earlier in the notebook.
loader = PyPDFLoader(file_path="ms-financial-statement.pdf")

In [None]:
# Read the PDF through the loader and keep the resulting documents
# (load_and_split is called with its default settings).
documents = loader.load_and_split()

In [None]:
# Sanity check: how many documents load_and_split() returned.
len(documents)

2

In [None]:
# Preview the extracted text of the first document to check the PDF parsed sensibly.
print(documents[0].page_content)

9   
Dividends  
Our Board of Directors declared the following dividends:  
  
Declaration Date  Record Date  Payment  Date  Dividend  
Per Share  Amount    
          
Fiscal Year 2022        (In millions)  
          
September  14, 2021   November  18, 2021    December  9, 2021   $   0.62  $ 4,652   
December  7, 2021   February  17, 2022    March  10, 2022    0.62   4,645   
March  14, 2022   May 19, 2022    June  9, 2022    0.62   4,632   
June  14, 2022   August  18, 2022    September  8, 2022    0.62   4,627     
Total      $   2.48  $   18,556             
          
Fiscal Year 2021          
          
September  15, 2020   November  19, 2020    December  10, 2020   $ 0.56  $ 4,230   
December  2, 2020   February  18, 2021    March  11, 2021    0.56   4,221   
March  16, 2021   May 20, 2021    June  10, 2021    0.56   4,214   
June  16, 2021   August  19, 2021    September  9, 2021    0.56   4,206     
          
Total      $ 2.24  $ 16,871             
The dividend declared 

In [None]:
# Fine-grained split: small chunks (chunk_size=400) with a large overlap (chunk_overlap=180).
# Only the chunk count of this split is inspected later; the vector store is built from `texts`.
txts = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=180,
)
all_token_text = txts.split_documents(documents)

In [None]:
# Coarse split: chunk_size=1024 with chunk_overlap=64.
# These chunks are the ones embedded into the vector store below.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
)
texts = text_splitter.split_documents(documents)

In [None]:
# Number of chunks produced by the chunk_size=400 splitter (`txts`).
len(all_token_text)

6

In [None]:
# Preview the first chunk produced by the chunk_size=1024 splitter (`text_splitter`).
print(texts[0].page_content)

9   
Dividends  
Our Board of Directors declared the following dividends:  
  
Declaration Date  Record Date  Payment  Date  Dividend  
Per Share  Amount    
          
Fiscal Year 2022        (In millions)  
          
September  14, 2021   November  18, 2021    December  9, 2021   $   0.62  $ 4,652   
December  7, 2021   February  17, 2022    March  10, 2022    0.62   4,645   
March  14, 2022   May 19, 2022    June  9, 2022    0.62   4,632   
June  14, 2022   August  18, 2022    September  8, 2022    0.62   4,627     
Total      $   2.48  $   18,556             
          
Fiscal Year 2021          
          
September  15, 2020   November  19, 2020    December  10, 2020   $ 0.56  $ 4,230   
December  2, 2020   February  18, 2021    March  11, 2021    0.56   4,221   
March  16, 2021   May 20, 2021    June  10, 2021    0.56   4,214   
June  16, 2021   August  19, 2021    September  9, 2021    0.56   4,206     
          
Total      $ 2.24  $ 16,871


## Create Embeddings

In [None]:
# Embedding model used to vectorise the chunks for the vector store.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [None]:
# Build the Chroma vector store from the chunk_size=1024 chunks, using the "db" directory
# as its persistence location.
db = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="db",
)



## Create Chain

In [None]:
# Context size passed to the model as n_ctx. Defined once here so that changing it
# actually affects the model; it was previously ignored in favour of a hard-coded 1000.
model_n_ctx = 1000
# Local path of the model weights downloaded earlier in the notebook.
model_path = "./ggml-gpt4all-j-v1.3-groovy.bin"
llm = GPT4All(model=model_path, n_ctx=model_n_ctx, backend="gptj", verbose=False)

In [None]:
# Retrieval-augmented QA chain: the retriever fetches the top k=3 chunks from the vector
# store, and the "stuff" chain type hands them to the LLM together with the question.
qa = RetrievalQA.from_chain_type(
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    chain_type="stuff",
    llm=llm,
    verbose=False,
    # Ask the chain to return the retrieved source documents alongside the answer.
    return_source_documents=True,
)

## Ask Questions

In [None]:
%%time
prompt = f"""How much is the investment amount in Microsoft on 6/22? Extract the answer from the text."""
res = qa(prompt.strip())

In [None]:
# Print the model's answer text.
# NOTE(review): the chain was built with return_source_documents=True, so the retrieved
# chunks are presumably also available in `res` for checking the answer — confirm the key.
print(res["result"])

 The investment amount in Microsoft on 6/22 is $309.69.


## References

- [GPT4All](https://gpt4all.io/)
- [MICROSOFT 2022 ANNUAL REPORT](https://www.microsoft.com/investor/reports/ar22/download-center/)