In [1]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment so the
# key below never has to be hardcoded in the notebook.
load_dotenv()

# None when OPENAI_API_KEY is not set anywhere.
api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter, CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [3]:
# --- Load -------------------------------------------------------------------
# Read the course document; only the first returned Document is used below.
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

# --- Split on markdown headers ----------------------------------------------
# "#" and "##" lines mark the course and lecture titles. They are removed from
# the chunk text (strip_headers=True) and kept under the given metadata names.
md_splitter = MarkdownHeaderTextSplitter(
    strip_headers=True,
    headers_to_split_on=[("#", "Course Title"), ("##", "Lecture Title")],
)
pages_md_split = md_splitter.split_text(pages[0].page_content)

# --- Normalise whitespace ---------------------------------------------------
# Collapse every run of whitespace (newlines, tabs, repeated spaces) into a
# single space, updating each section in place.
for section_doc in pages_md_split:
    section_doc.page_content = " ".join(section_doc.page_content.split())

# --- Split into sentence-bounded chunks -------------------------------------
# Break on "." and pack the pieces into chunks of at most ~500 characters with
# a 50-character overlap between neighbouring chunks.
char_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=500,
    chunk_overlap=50,
)
page_char_split = char_splitter.split_documents(pages_md_split)

# --- Embedding model --------------------------------------------------------
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=api_key)

In [4]:
# Sanity check: how many chunks the header split + character split produced.
len(page_char_split)

20

In [5]:
# Embed every chunk and store it in a Chroma collection persisted on disk.
#
# Explicit, position-based ids are supplied so that re-running this cell writes
# to the same ids again instead of adding a second copy of every chunk under
# freshly generated ids (which would make retrieval return duplicates).
# NOTE(review): this relies on the LangChain Chroma wrapper upserting by id --
# confirm against the installed version.
# NOTE: if a later run produces fewer chunks, entries with higher indices from
# the earlier run stay in the collection; delete the persist directory to
# rebuild from scratch.
chunk_ids = [
    f"data_science_course-{position}" for position in range(len(page_char_split))
]

vectorstore = Chroma.from_documents(
    documents=page_char_split,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_data_science_course",
    collection_name="data_science_course",
)

In [6]:
# Re-open the collection from its on-disk directory rather than rebuilding it.
# The same embedding object is passed so queries are embedded with the model
# that was used when the chunks were stored.
vectorstore_from_directory = Chroma(
    collection_name="data_science_course",
    embedding_function=embeddings,
    persist_directory="./chroma_data_science_course",
)

  vectorstore_from_directory = Chroma(
