In [5]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [None]:

api_key = os.getenv("PINECONE_API_KEY")

if not api_key:
    raise ValueError("Please set the PINECONE_API_KEY environment variable")

pc = Pinecone(api_key=api_key)
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec= ServerlessSpec(cloud="aws", region="us-east-1")
)

In [4]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Alice Johnson',
  'subject': 'Calculus',
  'stars': 4,
  'review': 'Dr. Johnson is very knowledgeable and explains concepts clearly. However, the homework assignments are quite challenging.'},
 {'professor': 'Dr. Bob Smith',
  'subject': 'Chemistry',
  'stars': 3,
  'review': 'Dr. Smith is passionate about chemistry, but his lectures can be disorganized. The lab work is where the class really shines.'},
 {'professor': 'Dr. Carol Lee',
  'subject': 'History',
  'stars': 5,
  'review': 'Dr. Lee is an exceptional historian with engaging lectures and a deep understanding of the material. Highly recommend her class.'},
 {'professor': 'Dr. David Brown',
  'subject': 'Biology',
  'stars': 2,
  'review': 'Dr. Brown is knowledgeable but the course is very lecture-heavy and lacks interactive elements.'},
 {'professor': 'Dr. Emily Davis',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Dr. Davis makes complex topics accessible and is always willing to help students

In [9]:
processed_data = []
client = OpenAI()

for review in data["reviews"]:
    response = client.embeddings.create(
        input = review["review"],
        model ="text-embedding-3-small"
    )

    embedding = response.data[0].embedding
    processed_data.append({
        "values":  embedding,
        "id": review["professor"],
        "metadata":{
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"],
        }
    })

In [10]:
processed_data[0]

{'values': [-0.023927193,
  -0.010666182,
  0.019728525,
  -0.006438175,
  0.026887124,
  -0.010874812,
  0.018854888,
  0.022336394,
  -0.0023079636,
  0.019728525,
  0.010783536,
  0.026196038,
  -0.055964857,
  0.0054993425,
  -0.0006234436,
  0.05056657,
  -0.018163804,
  -0.023888076,
  0.0265481,
  0.084338464,
  0.018307237,
  -0.006265404,
  0.022206,
  -0.017994292,
  -0.060554706,
  -0.033432875,
  0.008273463,
  0.012739438,
  0.03948313,
  -0.026730651,
  0.06530102,
  -0.00022329873,
  -0.013965135,
  -0.014903968,
  -0.026561141,
  0.04996676,
  0.0030626156,
  0.036197215,
  0.01804645,
  0.02993833,
  -0.013730427,
  -0.009349208,
  -0.059928816,
  -0.035206225,
  0.007275953,
  0.002441617,
  -0.00057210115,
  -0.024879064,
  0.025896134,
  0.037605464,
  -0.01605143,
  0.033015616,
  0.033380717,
  -0.033224244,
  -0.020263137,
  -0.013378365,
  0.003950921,
  0.022636298,
  -0.037370756,
  -0.0039607002,
  0.057998996,
  -0.027226146,
  -0.033615425,
  -0.020784711,


In [11]:
index = pc.Index("rag")
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 22}

In [12]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 22}},
 'total_vector_count': 22}