Load API key from file

In [1]:
# Read the sec-api key from a local file so it never appears in the notebook.
# The context manager closes the handle once the key is read, and strip()
# removes the trailing newline that readline() keeps, so the key can be
# passed to the API client unchanged.
with open("sec_api.key", "r") as f:
    api_key = f.readline().strip()

Read the metadata saved from the sec-api query

In [3]:
import pandas as pd
# Load the saved 10-K filing metadata. The extraction cell further down reads
# the 'linkToFilingDetails', 'companyName' and 'filedAt' columns from it.
df_10k = pd.read_csv('./TSLA/csv/10-K.csv')
# Debug aid (disabled): print the filing URLs that will be fetched
# print(df_10k['linkToFilingDetails'])

Pull sections 1a, 7, and 7a from the 10-K files

In [5]:
from sec_api import ExtractorApi
extractor_api = ExtractorApi(api_key=api_key)

# Build one record per filing row: the identifying metadata plus the text of
# each requested 10-K section, fetched from the filing's detail URL.
texts = []
for idx in df_10k.index:
    url = df_10k.loc[idx, 'linkToFilingDetails']

    record = {
        'companyName': df_10k.loc[idx, 'companyName'],
        'filedAt': df_10k.loc[idx, 'filedAt'],
    }
    # (record key, section code) pairs, requested in the same order as before
    for record_key, section_code in (('item_1a', "1A"), ('item_7', "7"), ('item_7a', "7A")):
        record[record_key] = extractor_api.get_section(url, section_code, "text")

    texts.append(record)



Dump the pulled data to file so we can just reload it in the future

In [18]:
import json
# Serialize the pulled sections to a JSON string and write it out in one call
with open('./TSLA/json/10k.json', 'w') as file:
    file.write(json.dumps(texts))

In [19]:
import json
# Reload the cached sections: read the whole file, then parse the JSON text
with open('./TSLA/json/10k.json', 'r') as file:
    tsla_10k = json.loads(file.read())

<class 'list'>


Clean HTML special characters out of the text by changing encoding

In [26]:
# We could save the data with the encoding already fixed,
# but in case we need the raw text later, we fix it on load instead
from bs4 import BeautifulSoup
def fix_encoding(_txt):
    """Decode HTML entities in a section's text and return a plain ``str``.

    Args:
        _txt: Section text as stored in the JSON dump; may contain HTML
            character references such as ``&amp;`` or ``&#8217;``.

    Returns:
        The text with character references decoded. An empty input yields
        an empty string.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed (and warns), and some parsers wrap the input in
    # <html>/<body> elements, so the result would differ between machines.
    # get_text() returns all of the text rather than only the first child
    # node, so a stray tag-like fragment does not truncate the section and
    # empty input no longer raises IndexError.
    return BeautifulSoup(_txt, "html.parser").get_text()

Remove tabular data that might choke up the LLM

In [47]:
def extract_table(in_string: str, table_index: int):
    """Cut the first ``##TABLE_START … ##TABLE_END`` span out of a text.

    Args:
        in_string: Text that may contain marker-delimited tables.
        table_index: Number written into the placeholder note that replaces
            the removed table.

    Returns:
        ``(amended_text, extracted_table)`` where ``amended_text`` is
        ``in_string`` with the first table (markers included) replaced by a
        ``'NOTE: Table removed, refer to table index N. '`` placeholder and
        ``extracted_table`` is the table content without the markers.
        ``None`` when there is no start marker, or no end marker after it.
    """
    start_marker = '##TABLE_START'
    end_marker = '##TABLE_END'

    start_idx = in_string.find(start_marker)
    if start_idx == -1:
        return None

    # Look for the end marker only *after* the start marker, so a stray
    # '##TABLE_END' earlier in the text cannot be paired with this table.
    body_start = start_idx + len(start_marker)
    body_end = in_string.find(end_marker, body_start)
    if body_end == -1:
        return None

    extracted_table = in_string[body_start:body_end]
    note = 'NOTE: Table removed, refer to table index ' + str(table_index) + '. '

    # Splice by position instead of str.replace(): replace() would also wipe
    # every later table with identical content, all under this one index.
    amended_text = in_string[:start_idx] + note + in_string[body_end + len(end_marker):]

    return amended_text, extracted_table

def find_and_remove_tables(in_string: str):
    """Strip every marker-delimited table out of a text.

    Calls ``extract_table`` repeatedly until no ``##TABLE_START`` marker is
    left or the next table cannot be extracted.

    Args:
        in_string: Text that may contain ``##TABLE_START … ##TABLE_END`` spans.

    Returns:
        ``(text, tables)``: the text with each removed table replaced by a
        placeholder note, and the list of removed table contents. The number
        in each note is the table's position in ``tables``.
    """
    tables = []
    txt_string = in_string
    while '##TABLE_START' in txt_string:
        # The next table's index is simply how many were collected so far.
        result = extract_table(txt_string, len(tables))
        if result is None:
            # extract_table could not pair the markers (e.g. a table that is
            # never closed). Stop here and keep the remaining text as-is
            # instead of failing on unpacking None.
            break
        txt_string, table_string = result
        tables.append(table_string)
    return txt_string, tables


In [50]:
# Work on the first filing only for now. Each of its three sections goes
# through fix_encoding and then has its tables pulled out of the text.
doc = tsla_10k[0]

cleaned_sections = [
    find_and_remove_tables(fix_encoding(doc[section_key]))
    for section_key in ('item_1a', 'item_7', 'item_7a')
]
(
    (sec_1a, removed_tables_1a),
    (sec_7, removed_tables_7),
    (sec_7a, removed_tables_7a),
) = cleaned_sections

Work the sections through the vector database,

first by initializing the LLM and the embedding model

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama


# Global embedding model for llama_index: "nomic-embed-text" through the
# OllamaEmbedding wrapper.
Settings.embed_model = OllamaEmbedding(model_name="nomic-embed-text")
# Global LLM: "llama3" through the Ollama wrapper, with a fixed seed.
# NOTE(review): request_timeout is presumably in seconds — confirm.
Settings.llm = Ollama(model='llama3', request_timeout=360.0, seed=42)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser

from llama_index.core import Document

# One splitter (chunks of up to 1500 characters, no overlap) shared by the
# global Settings and by the index build below, so the chunking parameters
# are defined in a single place instead of being written out twice.
node_parser = LangchainNodeParser(RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=0
))
Settings.text_splitter = node_parser

# from_documents() takes a sequence of Document objects, not a raw string,
# so wrap the cleaned section 1A text in a Document before indexing it.
# str() makes sure a plain string is passed even if sec_1a is a str subclass.
tsla_vector_store = VectorStoreIndex.from_documents(
    [Document(text=str(sec_1a))],
    transformations=[node_parser]
)