In [2]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before any
# client that reads credentials from the environment is constructed.
# NOTE(review): presumably this supplies the API key used by ChatGroq in the
# final cell — confirm which variables the .env file is expected to define.
load_dotenv()

True

In [3]:
import nltk
# Download the NLTK part-of-speech tagger data; the recorded output shows this
# is skipped when the package is already up to date.
nltk.download('averaged_perceptron_tagger')

# NOTE(review): this install is unpinned and runs after the import above; the
# kernel may need a restart before newly installed packages are usable.
# NOTE(review): no cell visible in this notebook imports `unstructured` or
# `libmagic` (documents are read with TextLoader) — confirm they are still needed.
%pip install libmagic unstructured unstructured[md]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import glob
from typing import Set
import time

from langchain_community.document_loaders import TextLoader
from langchain_classic.text_splitter import CharacterTextSplitter
from langchain_classic.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Root folders containing the processed markdown data; each one is walked
# recursively by main().
PROCESSED_DATA_DIRS = [
    "data/processed_data",
]

# NOTE(review): not referenced by any code visible in this notebook — main()
# filters on the ".md" suffix directly. Confirm before removing.
GLOB_PATTERN = "*.md"

# Root directory under which the FAISS store and the processed-files log live.
VECTOR_STORE_ROOT = "vectorstores"

# Text file (one path per line) tracking which markdown files have already
# been indexed, so that re-runs can skip them.
PROCESSED_FILES_LOG = os.path.join(VECTOR_STORE_ROOT, "processed_files.txt")


def add_documents_in_batches(vector_store, docs, batch_size: int = 64, pause: float = 0.1):
    """
    Add documents to a vector store in fixed-size batches.

    Batching keeps the amount of text embedded at once bounded; progress is
    printed once per batch.

    Args:
        vector_store: Any object exposing ``add_documents(list_of_docs)``.
        docs: Sliceable sequence of documents to add. May be empty, in which
            case nothing is added.
        batch_size: Maximum number of documents per ``add_documents`` call.
            Must be at least 1.
        pause: Seconds to sleep after each batch. The pause exists only to
            keep the log readable; pass ``0`` to disable it.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value
            would otherwise produce an empty range and silently skip every
            document.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    for batch_number, start in enumerate(range(0, len(docs), batch_size), start=1):
        batch = docs[start : start + batch_size]
        print(
            f"   - Adding batch {batch_number} "
            f"({len(batch)} docs)..."
        )
        vector_store.add_documents(batch)
        # Purely cosmetic delay; skipped entirely when disabled.
        if pause > 0:
            time.sleep(pause)


def load_processed_files(log_path: str) -> Set[str]:
    """Return the set of file paths recorded in the log at ``log_path``.

    Every non-blank line is one entry, with surrounding whitespace removed.
    A log file that does not exist yields an empty set.
    """
    if os.path.exists(log_path):
        with open(log_path, "r", encoding="utf-8") as log_file:
            stripped_lines = (raw_line.strip() for raw_line in log_file)
            return {entry for entry in stripped_lines if entry}
    return set()


def append_processed_files(log_path: str, file_paths):
    """Append newly processed file paths to the log file, one per line.

    Args:
        log_path: Path of the log file. Missing parent directories are
            created; a bare filename (no directory part) is written to the
            current working directory.
        file_paths: Iterable of paths to record. Each entry is formatted
            with ``str()``, so ``pathlib.Path`` objects are accepted too.
    """
    parent_dir = os.path.dirname(log_path)
    # dirname is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(log_path, "a", encoding="utf-8") as f:
        f.writelines(f"{p}\n" for p in file_paths)


def make_vector_dir_for_folder(folder_path: str) -> str:
    """
    Resolve the FAISS directory to use for a data folder.

    ``folder_path`` is accepted but ignored: every folder maps to the one
    shared "index" directory under ``VECTOR_STORE_ROOT``.
    """
    shared_index_dir = os.path.join(VECTOR_STORE_ROOT, "index")
    return shared_index_dir


def main():
    """
    Incrementally index markdown files into the single shared FAISS store.

    For every folder under each root in ``PROCESSED_DATA_DIRS`` the function
    loads the ``.md`` files that are not yet listed in ``PROCESSED_FILES_LOG``,
    splits them into chunks, embeds the chunks into the shared index, saves
    the index, and only then records the files as processed. Saving before
    logging means an interrupted run re-processes the current folder instead
    of silently dropping it.

    Side effects: creates ``VECTOR_STORE_ROOT``, writes the FAISS index
    directory, appends to the processed-files log, and prints progress.
    """
    os.makedirs(VECTOR_STORE_ROOT, exist_ok=True)

    # Load global processed file list
    processed_files = load_processed_files(PROCESSED_FILES_LOG)
    print(f"Loaded {len(processed_files)} previously processed files from log.")

    # Shared splitter & embeddings (reused for all folders)
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        # change to "cuda" if you have a GPU and want speed
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    total_md_files = 0
    total_new_files = 0
    total_chunks = 0
    batch_size = 64

    # We'll maintain a single global FAISS index
    folder_vs_dir = make_vector_dir_for_folder("unused")
    vector_store = None
    # Check for the index file itself, not just the directory: a run that was
    # interrupted between makedirs() and save_local() leaves an empty
    # directory behind, and load_local() would crash on it.
    # save_local() writes "<index_name>.faiss" with index_name defaulting to "index".
    if os.path.exists(os.path.join(folder_vs_dir, "index.faiss")):
        print(f"\nüì¶ Loading existing FAISS index from: {folder_vs_dir}")
        # SECURITY: load_local unpickles the docstore. This is only safe
        # because the index directory is produced by this notebook; never
        # point it at files obtained from an untrusted source.
        vector_store = FAISS.load_local(
            folder_vs_dir,
            embeddings,
            allow_dangerous_deserialization=True,
        )
    elif processed_files:
        # Log and index are out of sync: files in the log will be skipped
        # below even though their chunks are not in any index.
        print(
            "\nWARNING: processed-files log is not empty but no FAISS index was found at: "
            f"{folder_vs_dir}; already-logged files will NOT be re-indexed."
        )

    for root_dir in PROCESSED_DATA_DIRS:
        print(f"\nüîç Walking root: {root_dir}")

        # Walk every subfolder under this root (including the root itself)
        for dirpath, _dirnames, filenames in os.walk(root_dir):
            # All markdown files in this specific folder
            md_files = [
                os.path.join(dirpath, f)
                for f in filenames
                if f.lower().endswith(".md")
            ]

            if not md_files:
                continue

            total_md_files += len(md_files)

            # Filter to only files we haven't processed yet
            new_files = [f for f in md_files if f not in processed_files]
            if not new_files:
                print(f"‚úÖ Folder already fully processed, skipping: {dirpath}")
                continue

            print(
                f"\nüìÅ Folder: {dirpath}\n"
                f"   - Total .md files: {len(md_files)}\n"
                f"   - New files to process: {len(new_files)}"
            )

            # Load only the new files
            docs = []
            for fpath in new_files:
                loader = TextLoader(fpath, encoding="utf-8")
                docs.extend(loader.load())

            if not docs:
                print("   - No docs loaded (unexpected), skipping folder.")
                continue

            # Chunk
            doc_splits = text_splitter.split_documents(docs)
            total_chunks += len(doc_splits)
            print(f"   - Chunks created: {len(doc_splits)}")

            # Build or update FAISS global index
            if not doc_splits:
                # Nothing to embed (e.g. the new files are empty).
                # FAISS.from_documents() cannot build an index from an empty
                # list, so leave the index untouched; the files are still
                # logged below so they are not re-read on every run.
                print("   - No chunks produced, index left unchanged.")
            elif vector_store is None:
                # First time: create index from first batch
                print(f"   - Creating new FAISS index at: {folder_vs_dir}")

                first_batch = doc_splits[:batch_size]
                rest = doc_splits[batch_size:]

                vector_store = FAISS.from_documents(first_batch, embeddings)

                if rest:
                    add_documents_in_batches(vector_store, rest, batch_size=batch_size)

            else:
                print(f"   - Updating existing FAISS index at: {folder_vs_dir}")
                add_documents_in_batches(vector_store, doc_splits, batch_size=batch_size)

            # Save index **after this folder** (there is nothing to save if
            # no index has been created yet).
            if vector_store is not None:
                os.makedirs(folder_vs_dir, exist_ok=True)
                vector_store.save_local(folder_vs_dir)
                print(f"   üíæ Saved updated FAISS index to: {folder_vs_dir}")

            # Mark these files as processed so reruns skip them
            append_processed_files(PROCESSED_FILES_LOG, new_files)
            processed_files.update(new_files)
            total_new_files += len(new_files)
            print(f"   üìù Logged {len(new_files)} processed files.")

    print("\nüìä Run summary:")
    print(f"   - Total .md files seen: {total_md_files}")
    print(f"   - Newly processed files this run: {total_new_files}")
    print(f"   - Total chunks created this run: {total_chunks}")
    print(f"   - Processed files log: {PROCESSED_FILES_LOG}")
    print(f"   - Vector stores root: {VECTOR_STORE_ROOT}")
    print("‚úÖ Done.")


# Entry point. NOTE(review): the recorded output below shows that this guard
# passes when the cell is executed in the notebook, so main() runs on every
# execution of this cell; the guard only matters if the code is exported to an
# importable module.
if __name__ == "__main__":
    main()


Loaded 12427 previously processed files from log.

üì¶ Loading existing FAISS index from: vectorstores\index

üîç Walking root: data/processed_data
‚úÖ Folder already fully processed, skipping: data/processed_data\AAPL
‚úÖ Folder already fully processed, skipping: data/processed_data\ABBV
‚úÖ Folder already fully processed, skipping: data/processed_data\ABT
‚úÖ Folder already fully processed, skipping: data/processed_data\ACN
‚úÖ Folder already fully processed, skipping: data/processed_data\ADBE
‚úÖ Folder already fully processed, skipping: data/processed_data\AIG
‚úÖ Folder already fully processed, skipping: data/processed_data\AMD
‚úÖ Folder already fully processed, skipping: data/processed_data\AMGN
‚úÖ Folder already fully processed, skipping: data/processed_data\AMT
‚úÖ Folder already fully processed, skipping: data/processed_data\AMZN
‚úÖ Folder already fully processed, skipping: data/processed_data\AVGO
‚úÖ Folder already fully processed, skipping: data/processed_data\AXP
‚úÖ 

In [6]:
from langchain_classic.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Directory holding the persisted FAISS index.
VECTOR_STORE_DIR = "vectorstores/index"

# Embedding model used to encode queries; switch "device" to "cuda" to run on
# a GPU.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

# Loading unpickles the stored docstore, hence the explicit opt-in flag.
vector_store = FAISS.load_local(
    VECTOR_STORE_DIR,
    embeddings,
    allow_dangerous_deserialization=True,
)

  embeddings = HuggingFaceEmbeddings(


In [7]:
# Similarity-search retriever returning the 5 closest chunks per query.
# NOTE: the name is misspelled ("retreiver") but later cells reference it
# under exactly this spelling, so it is kept.
retreiver = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [14]:
# Smoke-test the retriever: show each retrieved chunk followed by its metadata.
query = "Profit and losses of Apple in 2025"
docs = retreiver.invoke(query)
for doc in docs:
    print(f"{doc.page_content} {doc.metadata}")

Apple Inc. | 2024 Form 10-K | 38

Note 5 - Property, Plant and Equipment

The following table shows the Company's gross property, plant and equipment by major asset class and accumulated depreciation as of September 28, 2024 and September 30, 2023 (in millions): {'source': 'data/processed_data\\AAPL\\10-K_2024-11-01.md'}
Apple Inc. | 2023 Form 10-K | 38

Note 5 - Property, Plant and Equipment

The following table shows the Company's gross property, plant and equipment by major asset class and accumulated depreciation as of September 30, 2023 and September 24, 2022 (in millions): {'source': 'data/processed_data\\AAPL\\10-K_2023-11-03.md'}
Apple Inc. | Q1 2024 Form 10-Q | 14

Products and Services Performance The following table shows net sales by category for the three months ended December 30, 2023 and December 31, 2022 (dollars in millions): {'source': 'data/processed_data\\AAPL\\10-Q_2024-02-02.md'}
Apple Inc. | 2025 Form 10-K | 47

The following tables show net sales for 2025, 2024 

In [None]:
from langchain_openai import ChatOpenAI
from langchain_groq import ChatGroq
from langchain_classic.prompts import ChatPromptTemplate, PromptTemplate
from langchain_classic.chains import RetrievalQA

# Answer prompt: {question} receives the user query and {context} receives the
# retrieved chunks after they have been rendered with `document_prompt`.
prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a precise financial research assistant. Answer concisely and cite sources by filename"),
        ("human", "Question: \n\n {question} \n\n Use the following context: {context} \n\n Answer:")
    ])

# The system prompt asks the model to cite filenames, but the default
# per-document template of the "stuff" chain only passes page_content, so the
# model never sees them. Render each chunk together with its source path.
# Assumes every indexed document carries a "source" metadata key; the
# retrieval output printed earlier in this notebook shows that key on each doc.
document_prompt = PromptTemplate.from_template("Source: {source}\n{page_content}")

llm = ChatGroq(model="llama-3.1-8b-instant")

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # stuffs all retrieved docs into a single prompt
    retriever=retreiver,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt, "document_prompt": document_prompt},
)

# Use .invoke() (as done for the retriever above) rather than calling the
# chain object directly, which is deprecated.
result = qa.invoke({"query": query})
print("\n=== ANSWER ===\n")
print(result["result"])


print("\n=== SOURCES ===")
for i, d in enumerate(result["source_documents"], 1):
    # Fall back through the metadata keys a loader may have set.
    src = d.metadata.get("source") or d.metadata.get("file_path") or "unknown"
    print(f"[{i}] {src}")


=== ANSWER ===

Based on the provided information, the profit and losses of Apple in 2025 cannot be directly determined. However, we can find some relevant data.

From the table in Apple Inc. | 2025 Form 10-K | 47, Apple's net sales for the year ended 2025 is $434.7 billion.

Depreciation expense on property, plant and equipment was $8.0 billion in 2025 as per Apple Inc. | 2025 Form 10-K | 39.

To calculate the profit of Apple in 2025, we would need the operating income or net income, which is not directly provided in the context.

=== SOURCES ===
[1] data/processed_data\AAPL\10-K_2024-11-01.md
[2] data/processed_data\AAPL\10-K_2023-11-03.md
[3] data/processed_data\AAPL\10-Q_2024-02-02.md
[4] data/processed_data\AAPL\10-K_2025-10-31.md
[5] data/processed_data\AAPL\10-K_2025-10-31.md


: 