In [1]:
# Cell 2: Imports
import os
import re
from typing import List, Tuple
from docx import Document
from dotenv import load_dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document as LC_Document
from pinecone import Pinecone, ServerlessSpec

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cell 3: Load API keys from .env
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
index_name = os.getenv("PINECONE_INDEX_NAME", "financetutor")

if not openai_api_key or not pinecone_api_key:
    raise ValueError("OPENAI_API_KEY or PINECONE_API_KEY is missing in .env")

In [3]:
# Cell 5: Extract sectioned chunks from a Word doc
def extract_sectioned_chunks(doc_path: str) -> List[Tuple[str, str]]:
    doc = Document(doc_path)
    chunks = []
    current_heading = ""
    current_text = ""

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue

        if re.match(r'^\d+(\.\d+)*\s', text):
            if current_text:
                chunks.append((current_heading, current_text.strip()))
                current_text = ""
            current_heading = text
        else:
            current_text += ' ' + text

    if current_heading and current_text:
        chunks.append((current_heading, current_text.strip()))

    return chunks

In [4]:
# Cell 6: Convert section heading to OpenStax URL
def format_url(heading: str) -> str:
    clean = re.sub(r'\s+', ' ', heading).strip()
    clean = re.sub(r'[()]', '', clean)
    slug = clean.lower().replace('.', '-').replace(' ', '-')
    return f"https://openstax.org/books/principles-finance/pages/{slug}"

In [5]:
# Cell 7: Prepare chunks with metadata
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

def prepare_documents(chunks: List[Tuple[str, str]]) -> List[LC_Document]:
    docs = []
    for heading, content in chunks:
        url = format_url(heading)
        section_chunks = text_splitter.split_text(content)
        for i, chunk in enumerate(section_chunks):
            doc = LC_Document(
                page_content=chunk,
                metadata={
                    "heading": heading,
                    "url": url,
                    "chunk_index": i
                }
            )
            docs.append(doc)
    return docs

In [6]:
doc_path = "/Users/jcz/Projects/FinanceTutor/principles-finance-docx/1-1-what-is-finance.docx"
doc_dir = "/Users/jcz/Projects/FinanceTutor/principles-finance-docx"
chunks = extract_sectioned_chunks(doc_path)

for heading, content in chunks[:3]:
    print("Heading:", heading)
    print("Text:", content[:200], "...\n")

Heading: 1.1   What Is Finance?
Text: Learning Outcomes By the end of this section, you will be able to: Describe the main areas in finance. Explain the importance of studying finance. Discuss the concepts of risk and return. Definition o ...



In [7]:
# Cell 8: Process all documents in the directory
import glob

doc_dir = "/Users/jcz/Projects/FinanceTutor/principles-finance-docx"
all_documents = []

for path in glob.glob(os.path.join(doc_dir, "*.docx")):
    chunks = extract_sectioned_chunks(path)
    docs = prepare_documents(chunks)
    all_documents.extend(docs)
    print(f"Processed {os.path.basename(path)} → {len(docs)} chunks")

print(f"\n✅ Total: {len(all_documents)} chunks from {len(glob.glob(os.path.join(doc_dir, '*.docx')))} files.")
print("Sample metadata:", all_documents[0].metadata)

Processed 20-multiple-choice.docx → 0 chunks
Processed 18-problems.docx → 0 chunks
Processed 16-6-using-excel-to-make-company-investment-decisions.docx → 21 chunks
Processed 18-multiple-choice.docx → 7 chunks
Processed 4-why-it-matters.docx → 5 chunks
Processed 1-4-careers-in-finance.docx → 17 chunks
Processed 18-review-questions.docx → 0 chunks
Processed 12-why-it-matters.docx → 5 chunks
Processed 18-6-using-excel-to-create-the-long-term-forecast.docx → 19 chunks
Processed 18-3-pro-forma-financials.docx → 23 chunks
Processed 3-review-questions.docx → 0 chunks
Processed 3-key-terms.docx → 0 chunks
Processed 1-6-microeconomic-and-macroeconomic-matters.docx → 22 chunks
Processed 8-4-stated-versus-effective-rates.docx → 16 chunks
Processed 11-1-multiple-approaches-to-stock-valuation.docx → 41 chunks
Processed 8-5-equal-payments-with-a-financial-calculator-and-excel.docx → 39 chunks
Processed 13-why-it-matters.docx → 6 chunks
Processed 18-summary.docx → 7 chunks
Processed 5-1-the-income-st

In [8]:
import json

# Save to file
with open("all_documents.json", "w") as f:
    json.dump([
        {
            "page_content": doc.page_content,
            "metadata": doc.metadata
        } for doc in all_documents
    ], f, indent=2)

print("✅ Saved all_documents to all_documents.json")

✅ Saved all_documents to all_documents.json


In [9]:
# Cell 9: Upload to Pinecone via LangChain
embeddings = OpenAIEmbeddings(api_key=openai_api_key)
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

# Delete all vectors in the index
vectorstore.delete(delete_all=True)

# Upload in batches of 100 documents
batch_size = 100
for i in range(0, len(all_documents), batch_size):
    batch = all_documents[i:i + batch_size]
    vectorstore.add_documents(batch)
    print(f"Uploaded batch {i // batch_size + 1} of {((len(all_documents) - 1) // batch_size) + 1}")
print("✅ Uploaded to Pinecone.")

Uploaded batch 1 of 33
Uploaded batch 2 of 33
Uploaded batch 3 of 33
Uploaded batch 4 of 33
Uploaded batch 5 of 33
Uploaded batch 6 of 33
Uploaded batch 7 of 33
Uploaded batch 8 of 33
Uploaded batch 9 of 33
Uploaded batch 10 of 33
Uploaded batch 11 of 33
Uploaded batch 12 of 33
Uploaded batch 13 of 33
Uploaded batch 14 of 33
Uploaded batch 15 of 33
Uploaded batch 16 of 33
Uploaded batch 17 of 33
Uploaded batch 18 of 33
Uploaded batch 19 of 33
Uploaded batch 20 of 33
Uploaded batch 21 of 33
Uploaded batch 22 of 33
Uploaded batch 23 of 33
Uploaded batch 24 of 33
Uploaded batch 25 of 33
Uploaded batch 26 of 33
Uploaded batch 27 of 33
Uploaded batch 28 of 33
Uploaded batch 29 of 33
Uploaded batch 30 of 33
Uploaded batch 31 of 33
Uploaded batch 32 of 33
Uploaded batch 33 of 33
✅ Uploaded to Pinecone.


In [10]:
# Cell 10: Search and verify
query = "What is finance?"
results = vectorstore.similarity_search(query, k=3)

for i, doc in enumerate(results):
    print(f"\nResult {i+1}")
    print(doc.page_content[:200] + "...")
    print("Heading:", doc.metadata.get("heading"))
    print("URL:", doc.metadata.get("url"))


Result 1
Finance is the study of the trade-off between risk and expected return. There are three broad areas of finance: business finance, investments, and financial markets and institutions....
Heading: 1.1   What Is Finance?
URL: https://openstax.org/books/principles-finance/pages/1-1-what-is-finance?

Result 2
Learning Outcomes By the end of this section, you will be able to: Describe the main areas in finance. Explain the importance of studying finance. Discuss the concepts of risk and return. Definition o...
Heading: 1.1   What Is Finance?
URL: https://openstax.org/books/principles-finance/pages/1-1-what-is-finance?

Result 3
loan. It can also be used as a noun referring to an entire industry. At its essence, the study of finance is about understanding the uses and sources of cash, as well as the concept of risk-reward tra...
Heading: 1.1   What Is Finance?
URL: https://openstax.org/books/principles-finance/pages/1-1-what-is-finance?
