In [7]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [4]:
# Connect to Pinecone using the API key loaded from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the Pinecone index only if it does not exist yet, so this cell can be
# re-run (e.g. Restart Kernel -> Run All) without create_index raising because
# the index is already there.
index_name = "rag"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # must equal the length of the vectors upserted into this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )



In [27]:
import json

# Load the review data. The context manager closes the file handle as soon as
# parsing finishes instead of leaving it open until garbage collection.
with open("reviews.json") as reviews_file:
    data = json.load(reviews_file)

In [28]:
# Build one Pinecone record ({"values", "id", "metadata"}) per review.
#
# The embeddings endpoint accepts a list of inputs, so reviews are embedded in
# chunks rather than with one HTTP round trip per review.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 100  # reviews per embeddings request

processed_data = []
client = OpenAI()
reviews = data["reviews"]

for start in range(0, len(reviews), EMBEDDING_BATCH_SIZE):
    batch = reviews[start : start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model=EMBEDDING_MODEL,
    )
    # Each returned item carries the position of its input; sort on it so the
    # embeddings line up with `batch` regardless of response ordering.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered_items):
        embedding = item.embedding
        processed_data.append(
            {
                "values": embedding,
                # NOTE(review): the restaurant name doubles as the vector id, so
                # two reviews of the same restaurant would presumably collide on
                # upsert -- confirm names are unique in reviews.json.
                "id": review["restaurant"],
                "metadata": {
                    "review": review["review"],
                    "cuisine": review["cuisine"],
                    "stars": review["stars"],
                },
            }
        )

In [29]:
# Display the first record to sanity-check its structure (values / id / metadata).
first_record = processed_data[0]
first_record

{'values': [-0.08117508,
  -0.004885985,
  -0.046682928,
  -0.024478301,
  0.0111930175,
  -0.035314545,
  -0.032194287,
  0.015794793,
  -0.0063675027,
  -0.036790017,
  -0.009935239,
  -0.01602458,
  -0.012166587,
  0.0013794744,
  -0.008211841,
  0.059792846,
  -0.016726034,
  0.0032170098,
  0.03756403,
  0.061002247,
  0.021612018,
  -0.008768166,
  -0.060179856,
  0.009602654,
  -0.011162783,
  -0.011235347,
  -0.025470011,
  0.06163114,
  0.023075394,
  -0.0006678168,
  0.012063787,
  -0.01849176,
  -0.008979811,
  -0.038386427,
  -0.051472157,
  -0.0089919055,
  -0.00045768317,
  0.041676,
  0.014186288,
  -0.05345558,
  0.023317276,
  0.03911207,
  0.0665655,
  0.022615822,
  0.004396177,
  -0.012650347,
  -0.018769922,
  0.023825224,
  0.024175951,
  -0.024345268,
  -0.009046328,
  0.0010559594,
  -0.040369846,
  -0.038773436,
  -0.026292404,
  -0.0007982055,
  0.030089928,
  0.034226082,
  0.061050624,
  -0.0011323028,
  0.028735397,
  0.0038972986,
  0.017270263,
  -0.04624

In [30]:
# Insert the embeddings into the Pinecone index.
#
# Records are sent in fixed-size chunks so the request stays within Pinecone's
# per-request limits as the dataset grows. With UPSERT_BATCH_SIZE records or
# fewer this is a single upsert call, exactly as before.
UPSERT_BATCH_SIZE = 100  # vectors per upsert request

index = pc.Index("rag")
upsert_response = None  # response of the last batch sent; stays None if there was nothing to send
for start in range(0, len(processed_data), UPSERT_BATCH_SIZE):
    upsert_response = index.upsert(
        vectors=processed_data[start : start + UPSERT_BATCH_SIZE],
        namespace="ns1",
    )

In [31]:
# Fetch and display the index statistics (dimension, per-namespace vector counts).
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}