In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [7]:
import nltk
nltk.download('averaged_perceptron_tagger')

%pip install libmagic unstructured unstructured[md]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Note: you may need to restart the kernel to use updated packages.


In [8]:
import warnings
warnings.filterwarnings("ignore", message=".*libmagic is unavailable.*")

In [None]:
import os
import glob
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

PROCESSED_DATA_DIRS = [
    "data/processed_data/TSLA",
    # "data/processed_data/company_news"
]
GLOB_PATTERN = "*.md"
VECTOR_STORE_DIR = "vectorstores/business_intel_faiss"

all_documents = []

# ðŸ”¹ Load markdown files from each folder
total_files = 0
for folder in PROCESSED_DATA_DIRS:
    pattern = os.path.join(folder, GLOB_PATTERN)
    count = len(glob.glob(pattern, recursive=True))
    total_files += count
    print(f"Found {count} markdown files in {folder}")

    loader = DirectoryLoader(folder, glob=GLOB_PATTERN, recursive=True, show_progress=True)
    docs = loader.load()
    all_documents.extend(docs)

print(f"\nðŸ“„ Total documents loaded: {len(all_documents)}")

# ðŸ”¹ Split into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
doc_splits = text_splitter.split_documents(all_documents)
print(f"ðŸ§© Total chunks created: {len(doc_splits)}")

# ðŸ”¹ Create embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", chunk_size=32)

# ðŸ”¹ Build FAISS vector store
vector_store = FAISS.from_documents(doc_splits, embeddings)

# ðŸ”¹ Save vector store
os.makedirs(VECTOR_STORE_DIR, exist_ok=True)
vector_store.save_local(VECTOR_STORE_DIR)

print(f"âœ… FAISS index built and saved successfully at '{VECTOR_STORE_DIR}'")

In [12]:
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small", chunk_size=32)
vector_store = FAISS.load_local(VECTOR_STORE_DIR, embeddings, allow_dangerous_deserialization=True)

In [13]:
retreiver = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 5}
)

In [14]:
query = "What are the losses incured in this year"
docs = retreiver.invoke(query)
print(docs)

[Document(id='ed30901a-ed74-4ee4-9afc-e9a204de0d09', metadata={'source': 'data\\processed_data\\TSLA\\10-K_2024-01-29.md'}, page_content='Note 19 - Restructuring and Other During the years ended December 31, 2022 and 2021, we recorded $ 204 million and $ 101 million, respectively, of impairment losses on digital assets. During the years ended December 31, 2022 and 2021 we also realized gains of $ 64 million and $ 128 million, respectively, in connection with converting our holdings of digital assets into fiat currency. We also recorded other expenses of $ 36 million during the second quarter of the year ended December 31, 2022, related to employee terminations. 92'), Document(id='1ac3ecce-fd70-47f8-be7d-114cc93893d1', metadata={'source': 'data\\processed_data\\TSLA\\10-K_2024-01-29.md'}, page_content='During the year ended December 31, 2022, we recorded an impairment loss of $204 million as well as realized gains of $64 million in connection with converting our holdings of digital asse

In [15]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA

prompt = ChatPromptTemplate(messages=[
        ("system", "You are a precise financial research assistant. Answer concisely and cite sources by filename"),
        ("human", "Quesion: \n\n {question} \n\n Use the following context: {context} \n\n Answer:")
    ])
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type = "stuff", # stuffs retrieved docs
    retriever = retreiver,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

result = qa({"query": query})
print("\n=== ANSWER ===\n")
print(result["result"])


print("\n=== SOURCES ===")
for i, d in enumerate(result["source_documents"], 1):
    src = d.metadata.get("source") or d.metadata.get("file_path") or "unknown"
    print(f"[{i}] {src}")

  result = qa({"query": query})



=== ANSWER ===

For the year ended December 31, 2022, the total losses incurred include:

1. Impairment losses on digital assets: **$204 million**
2. Other expenses related to employee terminations: **$36 million**

Total losses for 2022: **$204 million + $36 million = $240 million**.

Realized gains of $64 million from converting digital assets into fiat currency do not offset the losses but are noted separately. 

(Source: Note 19 - Restructuring and Other)

=== SOURCES ===
[1] data\processed_data\TSLA\10-K_2024-01-29.md
[2] data\processed_data\TSLA\10-K_2024-01-29.md
[3] data\processed_data\TSLA\10-Q_2023-10-23.md
[4] data\processed_data\TSLA\10-K_2023-01-31.md
[5] data\processed_data\TSLA\10-K_2024-01-29.md
