In [1]:
import os
import warnings

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import CohereEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [2]:

# The Cohere API key is read from the environment so that no credential is
# stored in the notebook. SECURITY: a literal key was previously committed on
# this line; treat that key as leaked and revoke/rotate it.
# Set it before starting the kernel, e.g. `export COHERE_API_KEY=...`.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", "")
if not COHERE_API_KEY:
    # Surface the misconfiguration here rather than as an opaque
    # authentication failure in a later cell.
    warnings.warn(
        "COHERE_API_KEY is not set; cells that call the Cohere API will fail."
    )

In [3]:
# Load the PDFs found under ./pdfs/ (path is relative to the kernel's working directory).
loader = PyPDFDirectoryLoader(path="./pdfs/")

# `documents` holds the loaded Document objects; the splitter cell consumes it.
documents = loader.load()

In [4]:
# Split the loaded documents into overlapping chunks (chunk_size=1000, chunk_overlap=200).
# NOTE(review): units are presumably characters (the splitter's default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# `texts` is the list of chunk Documents used by every later cell.
texts = text_splitter.split_documents(documents)

In [5]:
# How many chunks the splitter produced.
len(texts)

2524

In [6]:
# Inspect a single chunk (index 3) to sanity-check its page_content and metadata.
texts[3]

Document(page_content='individuals\nPHN strategies for nutrition: intervention at the \necological level\nFood and nutrition guidelinesFetal programmingCardiovascular diseaseCancerOsteoporosisDiabetesVitamin A deficiencyIodine deficiencyIron deficiencyMaternal and child healthBreast feedingAdverse outcomes in pregnancyNutrition and Metabolism\nCore concepts of nutritionMolecular aspects of nutritionIntegration of metabolism 1: EnergyIntegration of metabolism 2: MacronutrientsIntegration of metabolism 3: Protein and amino acidsPregnancy and lactationGrowth and agingNutrition and the brainThe sensory systems and food palatabilityThe gastrointestinal tractThe cardiovascular systemThe skeletal systemThe immune and inflammatory systemsPhytochemicalsThe control of food intakeOvernutritionUndernutritionExercise performance\nClinical Nutrition', metadata={'source': 'pdfs/Sports-And-Exercise-Nutrition.pdf', 'page': 3})

In [7]:
# Cohere embedding client, authenticated with COHERE_API_KEY; reused by the
# vector-store cell and by the manual embedding cells further down.
embedding_function = CohereEmbeddings(cohere_api_key=COHERE_API_KEY) # type: ignore

In [8]:
# Build the Chroma vector store from the chunks and persist it under ./chroma_db.
# `Chroma.afrom_documents` is a coroutine function: calling it without `await`
# only creates a coroutine object, so nothing was embedded or stored and `db`
# was not a vector store. The synchronous `from_documents` performs the work
# directly and returns the store.
db = Chroma.from_documents(texts, embedding_function, persist_directory="./chroma_db")

In [None]:
# Example question for a later similarity search; not referenced by the cells visible here.
query = "How many times a day should I eat?"

In [9]:
# Keep only the raw text of each chunk. This rebinds `documents`, which until
# now held the loader's Document objects, to a plain list of strings.
documents = [chunk.page_content for chunk in texts]

# Preview the first three strings.
documents[:3]

['Lanham_bindex.indd   388Lanham_bindex.indd   388 7/28/2011   4:40:16 PM7/28/2011   4:40:16 PM',
 'Sport and Exercise Nutrition\nLanham_ffirs.indd   iLanham_ffirs.indd   i 7/28/2011   5:49:59 PM7/28/2011   5:49:59 PM',
 'Introduction to Human Nutrition\nIntroduction to human nutrition: a global \nperspective on food and nutrition\nBody compositionEnergy metabolismNutrition and metabolism of proteins and amino acidsDigestion and metabolism of carbohydratesNutrition and metabolism of lipidsDietary reference standardsThe vitaminsMinerals and trace elementsMeasuring food intakeFood compositionFood and nutrition: policy and regulatory issuesNutrition research methodologyFood safety: a public health issue of growing importanceFood and nutrition-related diseases: the global challenge\nPublic Health Nutrition\nAn overview of public health nutritionNutrition epidemiologyFood choiceAssessment of nutritional status at individual and \n population level\nAssessment of physical activityOvernutriti

In [10]:
# NOTE: only the page_content strings are embedded for now; chunk metadata is
# not part of the embedded text. `documents` is the list of strings built from
# `texts` in the previous cell.
my_embeddings = embedding_function.embed_documents(documents)

In [None]:
# Spot-check one component (index 10) of the first embedding vector.
my_embeddings[0][10]

In [None]:
# NOTE: one embedding per chunk, not per PDF page — the inputs were the
# page_content strings of the splitter's chunks, so this should equal len(documents).
len(my_embeddings)

In [11]:
import chromadb
from chromadb.config import Settings
# NOTE: docker container should be running before this line is executed
# No host or port is passed, so HttpClient's defaults are used.
# NOTE(review): allow_reset=True presumably permits client.reset() — confirm before relying on it.
client = chromadb.HttpClient(settings=Settings(allow_reset=True)) # type: ignore

In [12]:
# Create the first collection on the Chroma server and keep a handle to it.
# NOTE(review): presumably fails if "my_collection" already exists from an
# earlier run — delete it first (see the cleanup cell) — confirm.
collection = client.create_collection(name="my_collection") # type: ignore

In [None]:
# client.delete_collection("my_collection")

In [14]:
# Show the collection handle (name, id, metadata).
print(collection)

name='my_collection' id=UUID('4a0df89b-d6f9-45f4-a390-e00af179429e') metadata=None


In [15]:
# Peek at the first three chunks, metadata included.
texts[:3]

[Document(page_content='Lanham_bindex.indd   388Lanham_bindex.indd   388 7/28/2011   4:40:16 PM7/28/2011   4:40:16 PM', metadata={'source': 'pdfs/Sports-And-Exercise-Nutrition.pdf', 'page': 1}),
 Document(page_content='Sport and Exercise Nutrition\nLanham_ffirs.indd   iLanham_ffirs.indd   i 7/28/2011   5:49:59 PM7/28/2011   5:49:59 PM', metadata={'source': 'pdfs/Sports-And-Exercise-Nutrition.pdf', 'page': 2}),
 Document(page_content='Introduction to Human Nutrition\nIntroduction to human nutrition: a global \nperspective on food and nutrition\nBody compositionEnergy metabolismNutrition and metabolism of proteins and amino acidsDigestion and metabolism of carbohydratesNutrition and metabolism of lipidsDietary reference standardsThe vitaminsMinerals and trace elementsMeasuring food intakeFood compositionFood and nutrition: policy and regulatory issuesNutrition research methodologyFood safety: a public health issue of growing importanceFood and nutrition-related diseases: the global chall

In [13]:
# Number of records currently stored in the collection.
collection.count()

0

# asynchronously embed chunks


In [None]:
# NOTE: initialize the collection with the embeddings
#
# Chunks are embedded and uploaded in batches instead of one at a time: the
# previous loop issued one embedding request and one `collection.add` request
# per chunk. It also shadowed the builtin `id` and left unused/clobbered
# accumulator lists behind.
ADD_BATCH_SIZE = 100  # chunks per embed + add round trip; tune as needed

for batch_start in range(0, len(texts), ADD_BATCH_SIZE):
    batch = texts[batch_start:batch_start + ADD_BATCH_SIZE]
    batch_docs = [chunk.page_content for chunk in batch]
    collection.add(
        documents=batch_docs,
        metadatas=[chunk.metadata for chunk in batch],
        # IDs remain the 1-based position of the chunk in `texts`, as strings.
        ids=[str(batch_start + offset + 1) for offset in range(len(batch))],
        # Precomputed vectors for `documents`, in the same order. Passing them
        # explicitly means the Cohere embeddings are what gets stored
        # (see the Chroma `Collection.add` docs for the `embeddings` argument).
        embeddings=embedding_function.embed_documents(batch_docs),
    )

In [None]:
# Create a second collection and rebind `collection` to it: every later use of
# `collection` in this notebook targets "my_collection_2", not "my_collection".
collection = client.create_collection(name="my_collection_2")

In [None]:
# Parallel lists for a bulk `collection.add`: chunk text, chunk metadata and a
# string ID per chunk ("1", "2", ... in the order of `texts`).
docs = [chunk.page_content for chunk in texts]
metadata = [chunk.metadata for chunk in texts]
ids = [str(position) for position in range(1, len(texts) + 1)]

In [None]:
# Preview the first three chunk strings that will be uploaded.
docs[:3]

In [None]:
# Bulk upload: all chunks in a single `add` call. `docs`, `metadata` and `ids`
# were built from `texts` in order, and `my_embeddings` was computed from the
# page_content of the same `texts`, so the four lists are index-aligned
# (assuming `texts` was not changed in between).
collection.add(
    documents=docs,
    metadatas=metadata,
    ids=ids,
    embeddings=my_embeddings
)

In [None]:
# Display the full embedding vector of the first chunk.
my_embeddings[0]

In [None]:
# Record count after the bulk add; expected to match len(docs).
collection.count()

In [None]:
# Fetch the collection again by name from the server, independently of the `collection` handle.
get_coll = client.get_collection("my_collection_2")

In [None]:
# Record count as seen through the freshly fetched handle.
get_coll.count()

In [None]:
# Clean up: remove both collections created by this notebook, in the same
# order as before ("my_collection_2" first, then "my_collection").
for collection_name in ("my_collection_2", "my_collection"):
    client.delete_collection(collection_name)