In [10]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [7]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [8]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Alice Johnson',
  'subject': 'Computer Science',
  'stars': 4,
  'review': "Dr. Johnson explains complex concepts in a way that's easy to understand. Her lectures are well-structured, but the exams are quite challenging."},
 {'professor': 'Dr. Robert Smith',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Dr. Smith is an excellent professor! He makes math fun and engaging. The homework assignments are tough, but they really help with understanding the material.'},
 {'professor': 'Dr. Emily Davis',
  'subject': 'Psychology',
  'stars': 3,
  'review': 'Dr. Davis is knowledgeable but tends to go off-topic during lectures. The grading is fair, but participation is key to doing well in her class.'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Physics',
  'stars': 2,
  'review': "Dr. Brown's lectures are hard to follow, and he doesn't seem very approachable. The lab sessions, however, are well-organized and helpful."},
 {'professor': 'Dr. Linda White',
  'sub

In [12]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata" : {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

In [13]:
processed_data[0]

{'values': [-0.0031909198,
  -0.016566314,
  0.032487016,
  0.012402139,
  0.034268897,
  0.01648884,
  0.0028535894,
  0.021317992,
  0.0047710463,
  0.020181721,
  0.027657866,
  -0.0029294484,
  -0.033855706,
  0.01583032,
  -0.004354629,
  0.014694049,
  -0.04284258,
  -0.021395465,
  0.03587001,
  0.05219099,
  0.04410797,
  -0.016940767,
  0.055160787,
  -0.015869057,
  -0.004509575,
  -0.029000733,
  -0.00039220715,
  0.0060945437,
  0.026599068,
  0.0128024155,
  0.084497236,
  0.00883838,
  0.008057194,
  -0.03197053,
  -0.033597466,
  0.029129853,
  -0.0068886424,
  0.024029547,
  0.039330468,
  0.01571411,
  -0.008412278,
  0.024326527,
  -0.051028892,
  -0.032822736,
  0.04273928,
  0.026986433,
  -0.0072889198,
  -0.008993326,
  0.033649113,
  0.06208171,
  -0.041938726,
  0.02773534,
  0.05386957,
  -0.004980869,
  -0.039898604,
  -0.010342647,
  0.0059202295,
  0.026147142,
  -0.0030843944,
  -0.018257808,
  0.055212434,
  -0.033649113,
  -0.020504525,
  0.0044062776,
  

In [14]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 7}

In [15]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 7}},
 'total_vector_count': 7}