In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec


  from tqdm.autonotebook import tqdm


In [None]:
# Connect to Pinecone using the API key loaded from the environment / .env file.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# create_index raises if an index with this name already exists, which made this
# cell fail on every run after the first. Only create the index when it is missing
# so the notebook survives "Restart Kernel -> Run All".
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # must equal the length of the embedding vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [3]:
import json

# Load the review dataset. The context manager closes the file handle as soon as
# parsing finishes (the bare open() call left it open until garbage collection),
# and the explicit encoding keeps the result independent of the platform default.
with open("review.json", encoding="utf-8") as review_file:
    data = json.load(review_file)

# Last expression of the cell: display the list of review records.
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Great professor, explains concepts clearly. Sometimes assignments can be a bit tough.'},
 {'professor': 'Dr. Alice Johnson',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Amazing lectures, very helpful during office hours. Exams are fair.'},
 {'professor': 'Dr. Robert Brown',
  'subject': 'Physics',
  'stars': 3,
  'review': 'Content is interesting but lectures can be dry. Make sure to study the textbook.'},
 {'professor': 'Dr. Emily Davis',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Engaging lectures and provides plenty of resources to help students succeed.'},
 {'professor': 'Dr. Michael Miller',
  'subject': 'Biology',
  'stars': 2,
  'review': 'Lectures are disorganized and the exams are way too difficult.'},
 {'professor': 'Dr. Sarah Wilson',
  'subject': 'English Literature',
  'stars': 4,
  'review': 'Very passionate about the subject, but expects a lot from students.'

In [6]:
# Build the Pinecone vector records for every review.
#
# The embeddings endpoint accepts a list of inputs, so all review texts are sent
# in ONE request instead of one network round-trip per review. Each returned item
# carries an `index` pointing back to its position in the input list; sorting on
# it guarantees embeddings are paired with the right review.
processed_data = []
client = OpenAI()

reviews = data['reviews']

# The API rejects an empty input list, so skip the call when there is nothing to embed.
if reviews:
    response = client.embeddings.create(
        input=[review["review"] for review in reviews],
        model="text-embedding-3-small",
    )
    embedding_items = sorted(response.data, key=lambda item: item.index)

    for review, item in zip(reviews, embedding_items):
        processed_data.append({
            "values": item.embedding,
            # The professor name is used as the vector id, so two reviews for the
            # same professor would overwrite each other on upsert.
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        })


In [8]:
# Get a handle on the "rag" index; later cells reuse `index`.
index = pc.Index("rag")

# Write every prepared record into the "ns1" namespace. The call is the cell's
# last expression, so its return value is displayed as the cell output.
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [9]:
# Sanity check after the upsert: display the index statistics (dimension and
# per-namespace vector counts) so the number of stored vectors can be confirmed.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}