### A Notebook to handle Vector DB Creation in Pinecone


In [1]:
import os
from dotenv import load_dotenv

from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [2]:
load_dotenv()

# Pinecone Setup

# Read the API key from the environment (load_dotenv() above has just run, so
# values from a local .env file are included). Both an unset variable and an
# empty value (e.g. a blank `PINECONE_API_KEY=` line) are rejected here, so a
# missing key fails with a clear message instead of being handed to the client.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise ValueError("Please set the PINECONE_API_KEY environment variable")

# Client handle used by the following cells to list, delete and create indexes.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [3]:
# Index Creation
import time

# Cloud provider and region for the serverless index. Each can be overridden
# through the environment; an unset or empty variable falls back to the default.
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

index_name = "professors-index"

# Upper bound, in seconds, on how long to poll for the new index to report
# ready. Without a bound, a stuck index would hang "Run All" forever.
INDEX_READY_TIMEOUT_S = 300

# Delete the index if it already exists
# WARNING: this drops the existing index named `index_name`; every run of this
# cell starts from an empty index.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

print(f"Creating index {index_name}")

# we create a new index
pc.create_index(
    index_name,
    dimension=1536,  # dimensionality of text-embedding-3-small
    metric='cosine',
    spec=spec
)

# Poll once per second until the index reports ready, giving up with a clear
# error once the deadline passes. time.monotonic() is used so the deadline is
# unaffected by wall-clock adjustments.
ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Index {index_name} was not ready after {INDEX_READY_TIMEOUT_S} seconds"
        )
    time.sleep(1)

# Handle to the ready index, used by the cells below.
index = pc.Index(index_name)

Creating index professors-index


In [4]:
# Sanity check: display the index statistics. Right after creation the index
# is expected to be empty (no namespaces, total_vector_count of 0).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [5]:
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

# Embedding model used for both indexing and querying. The index was created
# with dimension 1536 to match text-embedding-3-small.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

# LangChain wrapper around the Pinecone index, paired with the embedding model.
vector_store = PineconeVectorStore(
    embedding=embeddings,
    index=index,
)

In [6]:
import json
from pathlib import Path
from pprint import pprint

# Location of the sample records, relative to the notebook's working directory.
data_path = "./data/sample.json"

# Decode explicitly as UTF-8: without an encoding argument read_text() uses the
# platform's locale encoding, which can mis-decode or reject non-ASCII
# characters in the JSON on systems whose default is not UTF-8.
data = json.loads(Path(data_path).read_text(encoding="utf-8"))
# pprint(data)

In [7]:
from langchain.text_splitter import RecursiveJsonSplitter

# Split the loaded JSON into LangChain documents, using a chunk-size limit of
# 300. convert_lists=True is passed so that list content can be split as well;
# the search results at the end of the notebook show the resulting chunks keyed
# by position ("0", "9", ...).
splitter = RecursiveJsonSplitter(max_chunk_size=300)
docs = splitter.create_documents(
    texts=[data],
    convert_lists=True,
)

In [8]:
from uuid import uuid4

# One freshly generated random UUID string per document, used as its vector id.
uuids = [str(uuid4()) for _ in docs]

# Embed and upload the documents; the call is the cell's last expression, so
# its return value is displayed as the cell output.
vector_store.add_documents(
    documents=docs,
    ids=uuids,
)

['c66683fb-3ce6-4ef6-8138-6e67d34caaaf',
 'c7e82d66-c636-4e58-870a-62aefb1e43c0',
 '48dcd0d9-0fda-4f62-b403-5f84f5b64d76',
 '94980dd8-90a9-4ba8-8c6b-bfa1c3712fd9',
 'ed699394-6c36-47fa-8239-39ee837b3e7f',
 '512cc0b3-0927-4104-871a-ec9488649fd5',
 '1ce677a6-e988-4421-aef9-83f928461b81',
 '505eeaa0-0460-40a1-9d5f-23081dd4e169',
 '7b692edc-034d-49b9-b898-8b70be6d44be',
 '5559fb55-835b-4b6e-8c76-2277c82e1168']

In [9]:
# Smoke test: fetch the two stored chunks most similar to a sample query and
# print each one's content followed by its metadata.
results = vector_store.similarity_search("Biology Professor", k=2)

for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* {"0": {"professor_name": "Dr. Jane Smith", "course": "Intro to Biology", "overall_rating": 4.5, "clarity": 4.7, "helpfulness": 4.6, "easiness": 3.8, "comment": "Very knowledgeable and explains concepts clearly."}} [{}]
* {"9": {"professor_name": "Dr. David Lee", "course": "Environmental Sci", "overall_rating": 3.7, "clarity": 3.6, "helpfulness": 3.8, "easiness": 3.4, "comment": "Lectures can be dry, but he's knowledgeable."}} [{}]
