# Storing documents in vector databases

Examples of using the APIs of the following vector databases to store and query documents:



*   Chroma
*   Faiss
*   Weaviate
*   Pinecone



In [None]:
from datetime import datetime

In [None]:
def load_document(name, src, type_document):
    """Stub: not implemented yet.

    Accepts ``name``, ``src`` and ``type_document`` but ignores all three
    and always returns ``None``.
    """
    return None

In [None]:
# Sample corpus stored in each vector database below. It is declared as
# (id, text) pairs so the two parallel lists derived from it stay aligned.
_samples = [
    ("id1", "This is a document about pineapple"),
    ("id2", "This is a document about oranges"),
]
ids = [sample_id for sample_id, _ in _samples]
docs = [text for _, text in _samples]

In [None]:
# Sentence-transformers model used to embed the documents and the query in the
# FAISS, Weaviate and Pinecone sections (each of them calls
# model_embedding.encode).
from sentence_transformers import SentenceTransformer
model_embedding = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Search text reused by the query step of every backend below.
query = "fruits with tropical flavor"

In [None]:
! pip install python-dotenv

In [None]:
import os
from dotenv import load_dotenv
# Load the variables defined in the local ".env" file; the Weaviate and
# Pinecone cells below fetch their credentials with os.getenv.
load_dotenv(".env")

## Chroma

In [None]:
!pip install chromadb

In [None]:
import chromadb

collection_name = "vbd_sample_collection"

# Default client. For storage in a local directory, recent chromadb releases
# provide chromadb.PersistentClient(path="./vector_db").
chroma_client = chromadb.Client()

# Free-form metadata attached to the collection, including a creation timestamp.
collection_metadata = {
    "description": "my first Chroma collection",
    "created": str(datetime.now()),
}
collection = chroma_client.get_or_create_collection(
    name=collection_name,
    metadata=collection_metadata,
)

# Only raw texts and ids are passed; no precomputed embeddings are supplied.
collection.upsert(documents=docs, ids=ids)

In [None]:
# Ask for the two closest documents; Chroma embeds the query text itself.
results = collection.query(query_texts=[query], n_results=2)
print(results)

# Show the collections known to this client.
collections = chroma_client.list_collections()
print(collections)

## Faiss

In [None]:
import faiss
import numpy as np
import pickle



In [None]:
# Output files of the FAISS example: the serialized index, plus the pickled
# document texts and ids.
src_index, src_texts, src_ids = (
    "faiss_index.idx",
    "faiss_docs.pkl",
    "faiss_ids.pkl",
)

In [None]:
# Embed the documents. FAISS only accepts float32 matrices, so the dtype is
# set explicitly instead of relying on the encoder's default output (the
# search cell casts its query vector to float32 as well).
embeddings = np.array(model_embedding.encode(docs), dtype="float32")

# The second axis of the embedding matrix is the vector dimensionality.
dim = embeddings.shape[1]

# Flat index comparing vectors by L2 distance.
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

# Write the index to disk so it can be read back with faiss.read_index.
faiss.write_index(index, src_index)

In [None]:
# Pickle the raw texts and their ids next to the index file; the search cell
# needs both lists to turn the positions returned by FAISS back into documents.
for target_path, payload in ((src_texts, docs), (src_ids, ids)):
    with open(target_path, "wb") as f:
        pickle.dump(payload, f)

print(f"Saved: {src_index}, {src_texts}, {src_ids}")

In [None]:
# Reload the index and the id/text lists, reusing the path variables defined
# earlier instead of repeating the file names.
index = faiss.read_index(src_index)
# NOTE: pickle.load must only be used on trusted files; these were written by
# this notebook itself.
with open(src_texts, "rb") as f:
    docs = pickle.load(f)
with open(src_ids, "rb") as f:
    ids = pickle.load(f)

# Search: the query is embedded with the same model and cast to float32,
# the dtype FAISS expects.
query_emb = model_embedding.encode([query]).astype("float32")
D, I = index.search(query_emb, k=2)

# Show results
for idx in I[0]:
    # FAISS pads the result with -1 when the index holds fewer than k vectors;
    # without this guard ids[-1] would silently print the last document.
    if idx < 0:
        continue
    print(f"ID: {ids[idx]} - Texto: {docs[idx]}")


## Weaviate

In [None]:
! pip install weaviate-client

In [None]:
import weaviate
from weaviate.classes.init import Auth
import uuid

In [None]:
# Best practice: store your credentials in environment variables
weaviate_url = os.getenv("WEAVIATE_URL")
weaviate_key = os.getenv("WEAVIATE_API_KEY")

# Fail fast with a readable message instead of passing None to the client
# when either variable is missing or empty.
if not weaviate_url or not weaviate_key:
    raise RuntimeError(
        "WEAVIATE_URL and WEAVIATE_API_KEY must be set (e.g. in the .env file)"
    )

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,                     # Weaviate URL: "REST Endpoint" in Weaviate Cloud console
    auth_credentials=Auth.api_key(weaviate_key),  # Weaviate API key: "ADMIN" API key in Weaviate Cloud console
)

print(client.is_ready())  # Should print: `True`

# The connection is intentionally left open here: the following cells use
# `client`, and it is closed at the end of the Weaviate section.

In [None]:
# Embed the documents and convert the result to plain Python lists; each entry
# is passed as the `vector` of one inserted object below.
embeddings = model_embedding.encode(docs).tolist()

In [None]:
from weaviate.classes.config import Configure

name_collection = "DemoCollection"

# Creating a collection that already exists fails, so only create it when it
# is missing. This keeps the cell re-runnable, in line with the Chroma
# (get_or_create_collection) and Pinecone (has_index) sections.
if not client.collections.exists(name_collection):
    client.collections.create(
        name=name_collection,
        # No vectorizer is configured: vectors are supplied explicitly on insert.
        vectorizer_config=None,
    )

collection = client.collections.get(name_collection)

print(collection)


In [None]:
# Save documents: one object per text, with its precomputed vector and a
# freshly generated random UUID.
for position, text in enumerate(docs):
    collection.data.insert(
        properties={"text": text},
        vector=embeddings[position],
        uuid=str(uuid.uuid4()),
    )

print("documents registered")


In [None]:
from weaviate.classes.query import MetadataQuery

# Embed the query with the same model used for the stored documents.
vector = model_embedding.encode([query])[0].tolist()

# Vector search; the distance of each hit is requested as metadata.
result = collection.query.near_vector(
    near_vector=vector,
    limit=3,
    return_metadata=MetadataQuery(distance=True),
)

# Print each hit's properties followed by its distance.
for hit in result.objects:
    print(hit.properties)
    print(hit.metadata.distance)


In [None]:
# Close the connection opened by connect_to_weaviate_cloud above.
client.close()

## Pinecone

In [None]:
! pip install pinecone

In [None]:
# Pinecone SDK entry point.
from pinecone import Pinecone

# The API key is read from the environment rather than hard-coded.
pinecone_key = os.environ.get("PINECONE_API_KEY")

# Client used for every Pinecone operation in this section.
pc = Pinecone(api_key=pinecone_key)


In [None]:
from pinecone import ServerlessSpec

index_name = "vbd-sample-collection"

# Vectors are computed locally with model_embedding, so the index dimension is
# taken from the length of one embedding.
embeddings = model_embedding.encode(docs).tolist()

# Create the serverless dense index only when it does not exist yet.
if not pc.has_index(index_name):
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=len(embeddings[0]),
        metric="cosine",
        spec=serverless_spec,
    )

In [None]:
# Handle to the index created above.
index = pc.Index(index_name)

# Insert docs as (id, vector, metadata) tuples. The shared `ids` list is
# reused so the records carry the same identifiers as in the Chroma and FAISS
# examples, instead of regenerating zero-based "id0", "id1", ... here.
items = [
    (doc_id, vector_values, {"text": text})
    for doc_id, vector_values, text in zip(ids, embeddings, docs)
]
index.upsert(vectors=items)

In [None]:
# Embed the query text with the same model used for the documents.
query_vector = model_embedding.encode([query])[0].tolist()

# Similarity search; metadata is requested so the stored text is returned.
results = index.query(vector=query_vector, top_k=3, include_metadata=True)

# Print the stored text and similarity score of each match.
for match in results["matches"]:
    score, text = match["score"], match["metadata"]["text"]
    print(f"Tex: {text}\n Score: {score:.3f}\n---")

