# Llama Indexing for Chroma

In [13]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

from llama_index import ServiceContext, SimpleDirectoryReader, VectorStoreIndex
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding

from IPython.display import Markdown, display


In [8]:

# Create the Chroma client.
# This notebook uses a local, persistent DuckDB+Parquet store under ./chroma_db.
# Alternatives that were tried: an in-memory `chromadb.Client()`, or a REST
# client built with Settings(chroma_api_impl="rest", chroma_server_host=<host>,
# chroma_server_http_port=8000).
db = chromadb.Client(
    Settings(chroma_db_impl="duckdb+parquet", persist_directory="./chroma_db")
)

# heartbeat() returns a nanosecond timestamp; useful for making sure the client
# is alive. Use `db` here: it is the only client this cell actually creates
# (`chroma_client` is never assigned, so referencing it fails on a fresh kernel).
print("Nanosecond heartbeat on server", db.heartbeat())

# Check existing collections on the same client.
print(db.list_collections())

# Kept for later use; it is not passed to the collection below.
default_ef = embedding_functions.DefaultEmbeddingFunction()
collection = db.get_or_create_collection("fulltext")

Nanosecond heartbeat on server 1688939199295354373000
[Collection(name=fulltext)]


In [None]:
# NOTE(review): `doc` is only assigned further down (pd.read_csv), so this cell
# fails on a fresh kernel when run in order. Also, unpacking a DataFrame with `*`
# yields its column labels, not document payloads — presumably the parsed row
# dictionaries were meant to be passed to collection.add(); confirm the intended
# arguments before running this cell.
collection.add(*doc)

In [9]:
# set up OpenAI
import os
import getpass

# Prompt only when the key is not already present in the environment, so that
# re-running this cell (or a non-interactive "Run All") does not ask again.
# The key is read with getpass so it is never echoed or stored in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")


In [15]:
# Embedding model: load the sentence-transformers checkpoint through LangChain,
# then wrap it in the llama_index adapter.
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
embed_model = LangchainEmbedding(hf_embeddings)

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [66]:
import pandas as pd

# Load one of the cleaned full-text CSV files into a DataFrame.
fulltext_csv_path = "/home/ubuntu/work/data/fulltext_docs_csvs_cleaned/fulltext_doc_1.csv"
doc = pd.read_csv(fulltext_csv_path)

In [67]:
doc

Unnamed: 0,0
0,{'documents': ['the implication of metabolical...
1,"{'documents': ['HAPLOMETRA CYLINDRACEA (ZEDER,..."
2,{'documents': ['Harnessing Gene Expression Net...
3,{'documents': ['Phasic Contractions in Urinary...
4,"{'documents': [""Altered Mental Status as a Nov..."
...,...
382,{'documents': ['Utility of the Physical Examin...
383,{'documents': ['Department of Internal Medicin...
384,{'documents': ['Liverpool and Macarthur Cancer...
385,{'documents': ['High Prevalence and Partner Co...


In [68]:
# The cells hold dictionaries serialised as strings; parse the first one back
# into a Python object and show it as the cell output.
import ast

first_cell_text = doc.iloc[0, 0]
ast.literal_eval(first_cell_text)

{'documents': ['the implication of metabolically active Vibrio spp. in the digestive tract of Litopenaeus vannamei for its post-larval development\n0123456789\n\nEstefanía Garibay-Valdez \nCentro de Investigación en Alimentos y Desarrollo A.C (CIAD)\nCarretera a La Victoria S/NCP. 83304Hermosillo, SonoraMexico\n\nLuis Rafael Martínez-Córdova \nDepartamento de Investigaciones Científicas y Tecnológicas (DICTUS)\nUniversidad de Sonora\nBlvd. Luis Donaldo Colosio S/N\nCP. 83000Hermosillo, SonoraMexico\n\nMarco A López-Torres \nDepartamento de Investigaciones Científicas y Tecnológicas (DICTUS)\nUniversidad de Sonora\nBlvd. Luis Donaldo Colosio S/N\nCP. 83000Hermosillo, SonoraMexico\n\nF Javier Almendariz-Tapia \nDepartamento de Ingeniería Química y Metalurgia\nUniversidad de Sonora\nBlvd. Luis Donaldo Colosio S/N\nCP. 83000Hermosillo, SonoraMexico\n\nMarcel Martínez-Porchas \nCentro de Investigación en Alimentos y Desarrollo A.C (CIAD)\nCarretera a La Victoria S/NCP. 83304Hermosillo, Sono

In [52]:
import langchain

In [72]:
# Parse the first cell and keep only its 'documents' entry.
first_record = ast.literal_eval(doc.iloc[0, 0])
test = first_record['documents']

In [18]:
# Read documents from the fulltext_docs_csvs directory, capped at five files.
reader = SimpleDirectoryReader(
    '/home/ubuntu/work/data/fulltext_docs_csvs/',
    num_files_limit=5,
)
documents = reader.load_data()


In [None]:
# Set up the ChromaVectorStore on top of the Chroma `collection` created earlier.
# No data is loaded in this cell; documents are inserted when the index is built.
vector_store = ChromaVectorStore(chroma_collection=collection)


In [None]:
# Build a storage context with default settings, using the Chroma-backed
# `vector_store` as its vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)


In [None]:
# Build the vector index over `documents`, storing vectors in Chroma.
# In the legacy llama_index API used by this notebook the embedding model is
# supplied through a ServiceContext; a bare `embed_model=` keyword is not
# consumed by from_documents, which would leave the default embedding model in
# use instead of the HuggingFace one defined above.
# NOTE(review): confirm against the installed llama_index version.
service_context = ServiceContext.from_defaults(embed_model=embed_model)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context,
)


In [None]:

# Query Data
# The index is already bound to the Chroma collection through its storage
# context, so the query engine needs no collection argument. (`chroma_collection`
# is not defined anywhere in this notebook and would raise NameError.)
query_engine = index.as_query_engine()
# NOTE(review): this question looks unrelated to the loaded full-text corpus —
# confirm it is the intended query.
response = query_engine.query("What did the author do growing up?")
# Render the answer in bold as rich Markdown output.
display(Markdown(f"<b>{response}</b>"))

Check Work

In [None]:
# Sanity check: show how many items the Chroma collection currently holds.
collection.count()