In [12]:
import os
from langchain.document_loaders import PyPDFDirectoryLoader
from healthfirstai_prototype.data_models import User
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS, Pinecone
import pinecone


In [13]:
COHERE_API_KEY = ""
SERPER_API_KEY = ""
PINECONE_API_KEY = ""
PINECONE_ENV_NAME = ""

In [14]:
loader = PyPDFDirectoryLoader(path="./pdfs/")

documents = loader.load()

In [15]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [16]:
embedding_function = CohereEmbeddings(cohere_api_key=COHERE_API_KEY) # type: ignore

In [17]:
embedding0 = embedding_function.embed_documents([texts[0].page_content])

In [19]:
# testing vector dimension
len(embedding0[0])

4096

In [20]:
indexname = "pinecone-knowledge-base"

In [21]:
# We initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV_NAME
)

In [24]:
pinecone.list_indexes()

['pinecone-knowledge-base']

In [23]:
if indexname not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(
        name=indexname,
        metric='cosine',
        dimension=len(embedding0[0]) # dimension of the vector
    )

In [27]:
# Then we connect to the new index:
index = pinecone.Index(indexname)
# And request the index stats:
index.describe_index_stats()

{'dimension': 4096,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [35]:
docsearch = Pinecone.from_documents(texts, embedding_function, index_name=indexname)

In [36]:
index.describe_index_stats()

{'dimension': 4096,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 498}},
 'total_vector_count': 498}

In [37]:
query = "How many hours a day should I sleep?"
docs = docsearch.similarity_search(query)

In [39]:
i = 1
docs = []
metadatas = []
ids = []
for text in texts:
    docs.append(text.page_content)
    metadatas.append(text.metadata)
    id = str(i)
    i += 1

In [40]:
vectorstore = Pinecone(index, embedding_function.embed_query, "text")

vectorstore.add_texts(docs, metadatas, ids)

Upserted vectors: 100%|██████████| 398/398 [00:09<00:00, 40.69it/s]


['ca187d5c-9d76-432a-9bc4-557278d04882',
 'b1d5ce1e-75a6-490a-a857-2ce17bc176e0',
 '6118d535-e8f1-4a9d-9141-16e6a7678ab9',
 'a66858eb-9277-4ec9-9fbb-3369ed389152',
 '86f9b336-2f3b-4548-857e-3e27f4fa3437',
 '9dc4231c-7aba-4130-a0da-71220fcee5c1',
 '785123b0-67d2-4fd4-8554-73c415a64128',
 'd33bcaa4-12da-4efe-afb6-760cb18d1eaf',
 '49dfffb3-0bce-4def-afd8-0ad5da6a6a43',
 '3d99ef9d-0ec6-47ac-9341-4bb53ab629d8',
 '2f02b429-ace1-4d3c-b390-9e750f609473',
 'a8f47f30-fef2-46ec-a933-8d8af8cd52a7',
 '79a74597-68a1-4762-b5eb-5e8e445ae005',
 '8e518f41-c617-4b41-97f7-8934e847a990',
 'e6f42575-0414-4009-a11a-372881e15384',
 '4019a82a-fa93-4c7d-8f07-9d5ba3754de8',
 '2f4aa207-8319-4286-b6a9-d0ed2d0a3218',
 '599b9b42-2f3e-4a8e-b3e1-8ddbc78716d8',
 '310c4c52-3ed2-49bb-8bf0-332fc4bd5225',
 '24ae1c20-e0ac-4f18-b6cb-881c2395c7a3',
 '9f78804c-aa7a-4a07-be83-7f835e583ba5',
 'a85fbe9f-ce8a-4363-bbb1-9dc95033b1cb',
 'd3bf63a7-af80-46cd-8eb2-c254da685ee3',
 '72e0489f-2130-442d-bb45-b16133089a28',
 '5dd65e66-c9d5-

In [38]:
docs[0].page_content

'Travel 307\nmorning, generally coinciding with the peak in body temperature. Hence, when travelling across time zones to compete, it is important to assess what time of day the competition is to be staged when deciding how many days to leave prior to competition, as well as the impact of the travel duration and mode itself.\nCaffeine can be considered for use in order to pro-\nvide temporary relief of fatigue, but should be avoided in the late afternoon/evening where onset of sleep early in the evening is important. Some research has been undertaken regarding the manipulation of macronutrient content of meals (e.g. CHO, protein) to improve sleep onset and maintenance, although this has not been undertaken with specific reference to jet lag. There is some evidence to suggest that the CHO content of a meal can impact on the onset of sleep since the presence of insulin allows the amino acid tryptophan a competitive advantage in crossing the blood–brain barrier for its conversion to serot

In [41]:
docsearch = Pinecone.from_existing_index(indexname, embedding_function)

In [42]:
query = "Tell me more about nutrition for gaining muscle mass."
docs = docsearch.similarity_search(query)

In [43]:
print(docs[0].page_content)

Losing, Gaining and Making Weight for Athletes 227
nutrient alone, especially when the intake of either is less than optimal. There is also preliminary evidence suggesting that concurrent fat ingestion enhances amino acid uptake into peripheral tissues. The mech-anism by which this occurs remains to be elucidated, although a slowing of protein digestion or associated increase in total energy intake appears plausible.Energy intake and lean mass gainWhile dietary protein is important, total daily energy intake also plays a critical role. At any given protein intake, increasing energy intake creates a more  positive nitrogen balance, presumably because the additional energy allows more of the ingested protein to be directed to protein synthesis. A positive energy balance also creates an anabolic hormonal profile, promoting lean body mass gain, even when under-taken independent of resistance exercise. In fact, upwards of half the weight gain may come from lean body mass, although individua