In [11]:
!pip install chromadb pypdf openai langchain



In [5]:
import numpy as np
import chromadb
import pandas as pd
from pypdf import PdfReader
import numpy as np
from dotenv import load_dotenv
import os 

In [7]:
# Load variables from a local .env file (if one exists) into the process
# environment first; without this call only variables already exported in
# the shell are visible to os.getenv.
load_dotenv()
secret = os.getenv("OPENAI_SECRET_KEY")

In [9]:
#provided by instructor 
def project_embeddings(embeddings, umap_transform):
    """
    Run embeddings through an already-trained UMAP transformer.

    Args:
    embeddings (numpy.ndarray): The embeddings to project.
    umap_transform (umap.UMAP): The trained transformer; only its
        ``transform`` method is used here.

    Returns:
    numpy.ndarray: The result of ``umap_transform.transform(embeddings)``.
    """
    return umap_transform.transform(embeddings)


def word_wrap(text, width=87):
    """
    Hard-wraps text by inserting a newline after every ``width`` characters.

    The split is purely positional: lines are cut at fixed character offsets,
    so a word that straddles a boundary is broken across two lines, and any
    newlines already present in ``text`` are left in place.

    Args:
    text (str): The text to wrap.
    width (int): Maximum number of characters per line; must be at least 1.

    Returns:
    str: The wrapped text ("" when ``text`` is empty).

    Raises:
    ValueError: If ``width`` is smaller than 1.
    """
    if width < 1:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step produces no slices at all, which would silently
        # drop the whole text.
        raise ValueError(f"width must be a positive integer, got {width!r}")
    return "\n".join(text[start : start + width] for start in range(0, len(text), width))


def extract_text_from_pdf(file_path):
    """
    Extracts text from a PDF file.

    Args:
    file_path (str): The path to the PDF file.

    Returns:
    str: The extracted text of every page, in page order, joined with a
        single newline between pages.
    """
    with open(file_path, "rb") as f:
        pdf = PdfReader(f)
        # Iterate the reader's page sequence directly instead of indexing
        # page by page. Extraction happens inside the ``with`` block, so
        # the file is still open while the pages are read.
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(page_texts)


def load_chroma(filename, collection_name, embedding_function):
    """
    Loads a document from a PDF, extracts text, generates embeddings, and stores it in a Chroma collection.

    The text is split on blank lines ("\\n\\n"); chunks that are empty or
    whitespace-only are discarded before embedding. Each remaining chunk is
    stored under a string id equal to its position ("0", "1", ...).

    NOTE(review): ids restart at "0" on every call, so loading a second
    document into an already-populated collection reuses ids — confirm this
    is acceptable for the intended usage.

    Args:
    filename (str): The path to the PDF file.
    collection_name (str): The name of the Chroma collection.
    embedding_function (callable): Called once per chunk with the chunk text;
        its return value is stored as that chunk's embedding.

    Returns:
    chroma.Collection: The Chroma collection with the document embeddings.
    """
    # Extract text from the PDF
    text = extract_text_from_pdf(filename)

    # Split text into paragraphs and skip blank ones, so no embedding is
    # requested for a chunk with no content.
    paragraphs = [chunk for chunk in text.split("\n\n") if chunk.strip()]

    # Generate embeddings for each chunk
    embeddings = [embedding_function(paragraph) for paragraph in paragraphs]

    # Create the collection, or reuse it if one with this name already exists
    # (e.g. when the notebook cell is run a second time).
    collection = chromadb.Client().get_or_create_collection(collection_name)

    # Add everything in a single call. Ids are converted to strings because
    # Chroma does not accept integer ids. The call is skipped when there is
    # nothing to add.
    if paragraphs:
        collection.add(
            ids=[str(position) for position in range(len(paragraphs))],
            documents=paragraphs,
            embeddings=embeddings,
        )

    return collection


In [None]:
# Open the annual report and extract the text of every page, trimming
# leading/trailing whitespace from each page's text.
reader = PdfReader("data/microsoft-annual-report.pdf")
pdf_texts = [p.extract_text().strip() for p in reader.pages]

In [None]:
#splitting the text into chunks using langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
#splitting on natural boundaries keeps the chunks readable/structured
character_splitter = RecursiveCharacterTextSplitter(
    # The separators say where to split and are tried in order of preference,
    # coarsest first: paragraph break, line break, sentence end, space, and
    # finally between any two characters. "\n\n" has to come before "\n":
    # any text containing "\n\n" also contains "\n", so listing "\n" first
    # would mean the paragraph separator is never chosen.
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=0,
)
# Join the pages into one string, then split that string into chunks.
combo = "\n\n".join(pdf_texts)
character_split_texts = character_splitter.split_text(combo)

In [None]:
#now we want to split the text into chunks of about 256 tokens 
#making the object that will split the text by tokens 
#(the overlap keyword of this splitter is chunk_overlap)
token_splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk = 260, chunk_overlap = 0)

#actually splitting by tokens 
token_split_texts = []
for text in character_split_texts:
    