In [20]:
from dotenv import load_dotenv 
load_dotenv() 
import os 
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [13]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.delete_index(name="rag") 
pc.create_index( 
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1") 
)

In [14]:
import json 
data = json.load(open("reviews.json"))
data["reviews"]

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Dr. Johnson is an amazing professor. Her lectures are engaging and she is always available to help students during office hours. Highly recommend this class!'},
 {'professor': 'Dr. Michael Lee',
  'subject': 'Mechanical Engineering',
  'stars': 4,
  'review': "Dr. Lee's lectures are dry and sometimes difficult to follow, but he is very knowledgeable about the subject matter. The exams are challenging but fair."},
 {'professor': 'Dr. Olivia Garcia',
  'subject': 'Psychology',
  'stars': 5,
  'review': 'Dr. Garcia is a fantastic professor. Her lectures are engaging and she is very approachable. I learned a lot in her class and would recommend it to anyone interested in psychology.'},
 {'professor': 'Dr. William Chen',
  'subject': 'Economics',
  'stars': 4,
  'review': 'Dr. Chen is a brilliant lecturer, but his grading is quite tough. The workload is heavy, but I feel like I gained a lot of va

In [25]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model = "text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata":{
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })



In [26]:
processed_data[0]

{'values': [-0.0014760508,
  0.014133168,
  0.0062243626,
  0.03788843,
  0.004216876,
  -0.0054571335,
  0.013487081,
  0.03170445,
  -0.015771462,
  0.0059416993,
  0.026951088,
  0.0019988338,
  -0.019232646,
  0.0131525,
  -0.008226081,
  0.028820127,
  -0.026028104,
  -0.01056238,
  0.03394268,
  0.05620963,
  -0.0030977598,
  -0.011381527,
  0.022255413,
  -0.030481495,
  -0.043449398,
  -0.043587845,
  0.0027357775,
  0.01611758,
  0.049656454,
  -0.00801841,
  0.08279152,
  0.008151089,
  -0.022959188,
  -0.032742802,
  -0.03202749,
  0.01739822,
  0.014998465,
  0.025774285,
  0.016082969,
  -0.005462902,
  -0.008024178,
  -0.0177328,
  -0.004606259,
  0.00421976,
  0.016302178,
  0.0030025772,
  0.022555383,
  -0.02625885,
  0.050717887,
  0.044649277,
  -0.041903403,
  -0.0067435405,
  0.020720955,
  -0.029166246,
  -0.033965755,
  -0.0060974527,
  0.008958698,
  0.04118809,
  0.013060202,
  -0.020894015,
  0.038372993,
  -0.0049783364,
  -0.02655882,
  -0.008768333,
  -0.01

In [27]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [28]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}