In [6]:
import os
import re
import pandas as pd
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

# Load the environment variables from .env file where your API key is stored
load_dotenv('.env')  # make sure to provide the path to your .env file if it's not in the same directory

# Now you can use the environment variable
openai_api_key = os.getenv('OPENAI_API_KEY')
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

# Base directory
file_dir = "/Users/jcz/Projects/FinanceTutor/principles-finance-docx"


Metadata for organizing
* course_id: general, crnxxx
* material: textbook, slides, problems, terminology, testbank, syllabus
* shareable: yes or no

Load textbook
* course_id: general
* source: 
* material: textbook
* shareable: yes

In [2]:
data_path = "/Users/jcz/Projects/FinanceTutor/principles-finance-docx"

from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader(data_path, show_progress=True)
text_documents = loader.load()
print("The number of text documents once split is: " + str(len(text_documents)))


100%|██████████| 150/150 [00:20<00:00,  7.26it/s]

The number of text documents once split is: 150





In [65]:
text_documents

[Document(page_content='1.1   What Is Finance?\n\nLearning Outcomes\n\nBy the end of this section, you will be able to:\n\nDescribe the main areas in finance.\n\nExplain the importance of studying finance.\n\nDiscuss the concepts of risk and return.\n\nDefinition of Finance\n\nFinance is the study of the management, movement, and raising of money. The word finance can be used as a verb, such as when the First National Bank agrees to finance your home mortgage loan. It can also be used as a noun referring to an entire industry. At its essence, the study of finance is about understanding the uses and sources of cash, as well as the concept of risk-reward trade-off. Finance is also a tool that can help us be better decision makers.\n\nBasic Areas in Finance\n\nFinance is divided into three primary areas in the domestic market: business finance, investments, and financial markets and institutions (see Figure 1.2). We look at each here in turn.\n\nFigure  1.2   The Three Basic Areas of Stud

In [15]:
# """
from langchain.text_splitter import RecursiveCharacterTextSplitter

# split documents to smaller size so that I can create embeddings
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
documents = text_splitter.split_documents(text_documents)
print("The number of documents once split is: " + str(len(documents)))
# """

The number of documents once split is: 4881


In [16]:
documents

[Document(page_content='1.1   What Is Finance?\n\nLearning Outcomes\n\nBy the end of this section, you will be able to:\n\nDescribe the main areas in finance.\n\nExplain the importance of studying finance.\n\nDiscuss the concepts of risk and return.\n\nDefinition of Finance', metadata={'source': 'C:\\Users\\kalodimj\\Dropbox\\Academia\\Teaching\\AI\\Principles_Finance_Data\\Principles_Finance_Chapters\\Chapter-1-1-what-is-finance.docx'}),
 Document(page_content='Finance is the study of the management, movement, and raising of money. The word finance can be used as a verb, such as when the First National Bank agrees to finance your home mortgage loan. It can also be used as a noun referring to an entire industry. At its essence, the study of finance is about understanding the uses and sources of cash, as well as the concept of risk-reward trade-off.', metadata={'source': 'C:\\Users\\kalodimj\\Dropbox\\Academia\\Teaching\\AI\\Principles_Finance_Data\\Principles_Finance_Chapters\\Chapter-

In [17]:
# populate metadata for textbook
for document in documents:
    pattern = r".*\\([^\\]+)$"
    document.metadata["filename"] = re.sub(pattern, r"\1", document.metadata["source"]) # this line of code in't doing what I want; it's not updating based on the document
    document.metadata["course_id"] = "general"
    document.metadata["material"] = "textbook"
    document.metadata["shareable"] = "yes"

In [18]:
# Determine number of tokens in document to appropriately embed via OpenAI 
# eventually use this to automatically find out batch size to send to get embedded

# count the number of tokens
# https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken

import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")
#encoding.encode("tiktoken is great!")

def num_tokens_from_string(string: str, encoding_name: str) -> int:
#   Returns the number of tokens in a text string.
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

print("The length of the document is: " + str(len(str(documents[0]))))
print("The number of tokens in the document are: " + str(num_tokens_from_string(str(documents[50]), "cl100k_base")))

# Function to count tokens
def num_tokens_from_string(text, encoding):
    tokens = encoding.encode(text)
    return len(tokens)

# Calculate the total number of tokens
total_num_tokens = 0
for document in documents:
    total_num_tokens += num_tokens_from_string(str(document), encoding)

print(f"Total number of tokens in all documents: {total_num_tokens}")

avg_tokens_per_document = total_num_tokens / len(documents)
print("The average document has " + str(round(avg_tokens_per_document)) + " tokens.")

The length of the document is: 545
The number of tokens in the document are: 114
Total number of tokens in all documents: 777275
The average document has 159 tokens.


In [19]:
from langchain_openai.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai_api_key)

In [20]:
# Send documents to pinecone to create index
# This step takes a while and costs money because of the embeddings; reuse the vectorstore on pinecone unless intentionally updating
# The batch size is a dumb way to overcome overwhelming the openAI embedding API limit; in an earlier version of the code this was binding
# """
import time 

index_name = "financetutor"

batch_size = len(documents)
n_batches = len(documents) // batch_size + (1 if len(documents) % batch_size != 0 else 0)

for i in range(n_batches):
    start_idx = i * batch_size
    end_idx = start_idx + batch_size
    batch_documents = documents[start_idx:end_idx]
    
    # Initialize the vector store and insert the batch of documents
    vectorstore_pinecone = PineconeVectorStore.from_documents(batch_documents, embeddings, index_name=index_name)

    # Here you might need to add the batch to the index or perform other operations
    # For example: vectorstore_pinecone.upsert(batch_documents)
    # Make sure to replace the method `upsert` with the actual method you need to use

    print(f"Batch {i + 1}/{n_batches} submitted.")
    
    # Sleep for a certain number of seconds between iterations
    if i < n_batches - 1:  # To prevent sleeping after the last batch
        time.sleep(60)  # Sleep for 60 seconds; adjust this value as necessary # account has 
# """

Batch 1/1 submitted.
