In [None]:
# install requirements, use Python 3.9
!pip3 install -r requirements.txt

### Load data

In [None]:
from langchain.document_loaders import DirectoryLoader

# Folder that holds the document archive to index
data_folder = './...'

# Build a loader over that folder, then load the documents it finds
loader = DirectoryLoader(data_folder)
raw_documents = loader.load()

In [None]:
# Report how many documents the loader returned
raw_document_count = len(raw_documents)
print("Size raw documents: ", raw_document_count)

### Slice the documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitting parameters, handed to the splitter as chunk_size / chunk_overlap
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Cut the raw documents into chunks and report how many were produced
documents = splitter.split_documents(raw_documents)
print("Size documents: ", len(documents))

### Embed the documents and store embeddings in the vector database


In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
import os
from getpass import getpass

# Never store a real key in the notebook: reuse the key already exported in the
# environment and only prompt (without echoing the input) when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [None]:
# Vector store wrapper, OpenAI embedding wrapper, and the chromadb package
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
import chromadb

# Create the embedding function
# NOTE(review): chunk_size=1 presumably limits each embedding request to a single
# text (a per-request limit of the deployment?) — confirm before raising it.
embedding_settings = {"deployment": "text-embedding-ada-002", "chunk_size": 1}
embedding_function = OpenAIEmbeddings(**embedding_settings)

In [None]:
import os

# Folder in which the vector database is persisted
PERSIST_DIRECTORY = "my-embeddings"

# Building the store sends every chunk to the embedding API, and
# Chroma.from_documents adds the chunks to a collection that already exists in
# the persist directory, so re-running an unguarded build duplicates every entry.
# Build only when nothing has been persisted yet; delete the folder to force a
# rebuild after the source documents change.
if not (os.path.isdir(PERSIST_DIRECTORY) and os.listdir(PERSIST_DIRECTORY)):
    # Create a database from the documents and embedding function
    db = Chroma.from_documents(documents=documents, embedding=embedding_function,
                               persist_directory=PERSIST_DIRECTORY)

    # Persist the data to disk
    db.persist()

# Load the vector database from disk, reusing the embedding function defined
# earlier instead of constructing a second identical one
db = Chroma(persist_directory=PERSIST_DIRECTORY,
            embedding_function=embedding_function)

### Query the database

In [None]:
# Preview a few stored chunks instead of dumping the whole collection into the
# cell output; the count still shows how much is stored.
stored_chunks = db.get()['documents']
print("Stored chunks:", len(stored_chunks))
stored_chunks[:5]

In [None]:
# Specify query
query = "..."

# Ask `db` for up to 5 matches via `similarity_search_with_relevance_scores`,
# which yields (document, score) pairs
results = db.similarity_search_with_relevance_scores(query, k=5)

# Show each hit: its score, then the chunk text, then a divider line
for doc, score in results:
    print('score', score)
    print(doc.page_content)
    print('-----------------')