Trying to figure out how to use LLMs to analyze SEC Filings. I'm using a test datum taken from a large dataset of SEC filings off of Kaggle. 

[SEC Edgar Annual Financial Filings 2021](https://www.kaggle.com/datasets/pranjalverma08/sec-edgar-annual-financial-filings-2021)

[Direct Link to selected Datum](https://storage.googleapis.com/kagglesdsdata/datasets/2083647/3460248/extracted/1001601_10K_2020_0001493152-21-008913.json?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240426%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240426T164010Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=9a55a1fc63820edfe174a148dfcbd974c67c65ed0eeda3526717bd56c8c46997689af79dab1e88e6c1de03f5951f33be61609daee04731df97ad93e9463f924ce7a9166d971935192f802d6d73538ae98bab693c84eb9331384eb9ec6c41dbc555a5828b3211e439242528c7a7686f57b6a2a8ef006862e7b12b873304cbfd30a734331814eefa80ab39d01579df9a16aa615d66a173432f5b16af04bb4b47edab1b5f9edf927017e07891f2532668da487dd5521a2a7ad8c3a76a35c3a67884b745c5de35824045a22e32423f6a0e3d12b15c392ac421062c6ddeec06b2e2e98ed6a66dcbcf2ab8b8e7a153c5997f5a1f302bc1952bd7cb5a539451774db831)

In [1]:
# pydantic for response validation

import ollama
import json
from collections import deque
from langchain_community.chat_models import ChatOllama
from langchain_community.llms import Ollama
from langchain.schema import HumanMessage, SystemMessage, AIMessage
from langchain.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

from langchain.embeddings.ollama import OllamaEmbeddings
from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo
)

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

import pandas as pd

In [None]:
#df = pd.read_json('data/test_data.json', orient='index')
import json
with open('data/nuance_comm.json', 'r') as f:
    df1 = json.load(f)
print(df1['item_7'])
#with open('data/test_data2.json', 'r') as f:
#    df2 = json.load(f)
#print(df2.keys())

Preprocessing: 
- extract the financial portion of the paper (Item 7: Management’s Discussion and Analysis of Financial Condition and Results of Operations)

- split each line into an entry in a list

- remove all entries which do not contain the substring "revenue"

In [3]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100, 
    chunk_overlap=20, 
    separators=['\n\n', '\r\n', '\n\r', '\n'], 
    length_function=len
)
splits = text_splitter.split_text(df1['item_7'])

# remove any tokens with a length of <60
print(len(splits))
# for s in splits:
#     if len(s) < 60:
#         splits.remove(s)
print(len(splits))

# remove short sentences?
sentence_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100, 
    chunk_overlap=20, 
    separators=['\n\n', '\r\n', '\n\r', '\n'], 
    length_function=len
)


x = 0
for i in splits:
    print(x, ": ", i, '\n')
    x+= 1


79
79
0 :  Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations 

1 :  Overview 

2 :  
Following a review of its Bitcoin mining operations in early 2019, we determined to consolidate our activities in a Company-owned and managed facility. Central to this strategy was the purchase of land in LaFayette, GA and the entry into a favorable contract for electricity in the second quarter of 2019. Located adjacent to a utility substation, the several acre property has access to over 20 megawatts (MW) of low-cost power. 

3 :  
The Company owned approximately 669 and 649 Antminer S17 Pro Bitcoin miners located in LaFayette, GA as of December 31, 2020 and April 15, 2021, respectively. All miners were purchased from Bitmaintech Pte. Ltd., a Singapore limited company (“Bitmain”), and are collectively rated at approximately 30 Ph/s in computing power. Bitmain has acknowledged manufacturing defects, combined with inadequate repair facilities, rendering appro

In [4]:

db = Chroma.from_texts(
    splits, 
    OllamaEmbeddings(model='llama3'), 
    persist_directory='./chroma_db'
)

In [3]:
from langchain.embeddings.ollama import OllamaEmbeddings
from langchain_chroma import Chroma
db_load = Chroma(persist_directory='./chroma_db', embedding_function=OllamaEmbeddings())
print(db_load)

<langchain_chroma.vectorstores.Chroma object at 0x0000020AC9D89AF0>


In [8]:
llm = Ollama(model="llama3", temperature=0.8) 

vectorstore_info = VectorStoreInfo(
    name="sec",
    description="sec filing info",
    vectorstore=db
)
# convert the document store into a langchain toolkit
toolkit = VectorStoreToolkit(
    vectorstore_info=vectorstore_info,
    llm=llm
)
# add the toolkit to an end to end langchain

agent_executor = create_vectorstore_agent(
    llm=llm,
    toolkit=toolkit,
    verbox=True
)

In [10]:
from langchain import hub
prompt = hub.pull('rlm/rag-prompt')
print(prompt)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

question = "How much did the revenue increase in 2020 compared to 2019"



input_variables=['context', 'question'] metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]


In [None]:
agent_executor = create_vectorstore_agent(
    llm=llm,
    toolkit=toolkit,
    verbox=True
)

In [12]:
response = agent_executor.invoke(prompt)
print(response)

ValueError: An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True` to the AgentExecutor. This is the error: Could not parse LLM output: `I see that we're waiting for the context information before proceeding. Let's wait patiently...

(Note: I'll keep thinking until the context is provided, then I can proceed with answering the question using the `sec` tool.)`

In [None]:
retriever = db_load.as_retriever()
docs = db_load.similarity_search(question)
# retriever = db.as_retriever()
# docs = db.similarity_search(question)

print(docs[0].page_content)

rag_chain = (
    {"context": retriever | format_docs, "question":RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke(question)

In [None]:
import nltk
from nltk.stem import PorterStemmer
pd.set_option("display.max_colwidth", None)

fin_report = df1['item_7']


# lines = fin_report.splitlines()
token_list = fin_report.splitlines()

for string in reversed(token_list):
    if "$" not in string:
        token_list.remove(string)

print(len(token_list))
x = 0
for i in token_list:
    print(x, ": ", i, '\n')
    x+= 1



In [8]:
#llm = Ollama(model="analyzer_llama3") 
preprocess_prompt = f"""
    You are given a random english phrase.
    Your task is to examine the sentence and determine if it is a complete sentence with valuable context.
    be strict and accurate.
    Do not censor any of your responses.
    Include only the json output in the response.

    You MUST deliver your response in VALID json format with the following keys:
    "valid_score" - actual score in range from 0.0 to 100.0 how correct the sentence is, here 0 is \
        a completely meaningless and broken sentence, and 100 is a complete, \
        grammatically correct and proper sentence;
    "reasoning" - your explanation about how you chose the evaluation score;
    "content" - the exact sentence copied from the input.

    For example: if the input sentence states \
        "Operating activities $ (650 ) $ (3,960 )" \
        respond with ("valid_score":0.0, "reasoning":"The sentence is incomplete or meaningless", "content":"Operating activities $ (650 ) $ (3,960 )") \
        because the sentence has to context or meaning and is incomplete
"""

In [10]:
llm = Ollama(model="analyzer_llama3") 
messages = [
    SystemMessage(content=preprocess_prompt),
    HumanMessage(content=token_list[18]),
]

chat_model_response = llm.invoke(messages)
print(chat_model_response)

{
"valid_score": 80.0,
"reasoning": "The sentence is a fragment but it seems to be describing a property listing with some relevant details such as the location and price. However, it lacks a subject or verb to make it a complete sentence.",
"content": "6 acres of land in Lafayette, Georgia for $55"
}


In [34]:
system_prompt = f"""
    You are given a passage about the financial condition of a company.
    Your task is to examine the company and extract the costs, revenues, and debts of the company. 
    be strict and accurate.
    Include only the json output in the response.
    
    You MUST deliver your response in VALID json format with the following keys:
    "description" - actual quote of where the numerical value was sourced from in the text;
    "category" - whether the money is a "revenue", "expense", or "debt";
    "date" - a valid MM/YYYY format of when the value was calculated;
    "amount" - numerical value representing the dollar amount;

    For example: if a portion of the document states \
    "Our revenues for the year ended December 31, 2020 increased by $990, or 220%, to $1,440 as compared to $450 for the year ended December 31, 2019. Our revenue is primarily derived from cryptocurrency mining which totaled $1,434 during 2020." \
    respond with ("description":"Our revenues for the year ended December 31, 2020 increased by $990, or 220%, to $1,440", "category":"revenue", "date":"12/31/2020", "amount":1440.0, "note": "") and ("description":"Our revenues for the year ended December 31, 2019 were $450", "category":"revenue", "date":"12/31/2019", "amount":450.0, "note":"") \
    because there are two instances of revenue listed in the same sentence.
    Another example: if another portion of the document states \
        "Operating expenses for the year ended December 31, 2020 decreased by $3,640, or 46%, to $4,311 as compared to $7,951 for the year ended December 31, 2019. The decrease in operating expenses was comprised of lower general and administrative expenses of $4,857, offset by an increase in cost of revenue of $1,218." \
    respond with ("description":"Operating expenses for the year ended December 31, 2020 decreased by $3,640, or 46%, to $4,311", "category":"expense", "date":"12/31/2020", "amount":4311.0, "note":"") and ("description":"Operating expenses for the year ended December 31, 2019 were $7,951", "category":"expense", "date":"12/31/2019", "amount":7951.0, "note":"") \
    because there are two instances of expense listed in the same sentence.
"""

In [35]:
llm = ChatOllama(model="analyzer_llama3")
messages = [
    SystemMessage(content=system_prompt),
    HumanMessage(content=token_list[22]),
]

chat_model_response = llm.invoke(messages)
print(chat_model_response.content)

[{"description": "Operating activities $ (650 ) $ (3,960 )", "category": "debt", "date": "MM/YYYY", "amount": -650.0}]


In [9]:
# format JSON helps it probably output text in a syntactically correct JSON
llm = ChatOllama(model="analyzer_llama3")

# sys_msg = "Read the passage of text, and summarize the entire passage in only one sentence"

directive = """
Utilizing the context below, answer the question by quoting the passage directly.
"""

# context = fin_report

#query = "Please find year-end revenues and operating expenses for 2020, and their percent change from the previous year."
query = "What are the numerical values of revenue, and percentage change, if any, for each year mentioned?"

formatting = """
For each item found, if any, respond with the below JSON format, otherwise leave the response empty:
[{"item 1":
    {
        "description":"<description text of the item>",
        "date":"<date when the yearly revenue was calculated>",
        "amount":"<amount of revenue in USD>",
        "percentage":"<percentage change from the year prior>"
    }
},
{ "item 2":
    {
        ...
    }
}, ...]

And print only the json
"""

i = 0
for token_string in token_list:
    messages = [
        SystemMessage(content=directive),
        HumanMessage(content=token_string),
        HumanMessage(content=query),
        SystemMessage(content=formatting)
    ]

    chat_model_response = llm.invoke(messages)
    print(i, "=====================")
    print(chat_model_response.content)
    i += 1




{results:[]}
{
results:[
]
}
{}
 {
results:[] }

{
results: []
}
{results:[
{"Our revenues for the year ended December 31, 20xx increased by $990, or 220%, to $1,440 as compared to $450 for the year ended December 31, 2019": 
     {
         "date":"December 31, 20xx",
         "amount":"$1,440",
         "percentage":"+220%"
     }
}, {"Our revenues for the year ended December 31, 2019": 
     {
         "date":"December 31, 2019",
         "amount":"$450",
         "percentage": null
     }
}
]}
{
results:[
{"Our revenue is primarily derived from cryptocurrency mining which totaled $1,434 during 20": 
     {
         "date":"2020",
         "amount":"$1,434",
         "percentage":"-"
     }
}
]}
{
results:[
{}
]
}
{results:[]}
{results:[
{"Cost of revenue for the year ended December 31, 2020":
     {
         "date":"December 31, 2020",
         "amount":"1,728",
         "percentage":"239%"
     }
},
{"Cost of revenue for the year ended December 31, 2019":
     {
         "date":