In [7]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [4]:
# Connect to Pinecone using the API key read from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it is missing. An unconditional
# create_index call errors out once the index exists, which would make this
# cell fail on every re-run of the notebook.
# The dimension must equal the length of the vectors upserted into the index.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [5]:
import json

# Load the review records from disk. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open for the life
# of the kernel, and the explicit encoding keeps the read independent of the
# platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Echo the parsed list of reviews as the cell output.
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'An amazing professor who explains concepts clearly.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Great lectures, but sometimes difficult to follow.'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Physics',
  'stars': 3,
  'review': 'The course was okay, but the exams were tough.'},
 {'professor': 'Dr. Sarah Davis',
  'subject': 'Chemistry',
  'stars': 2,
  'review': 'The course material was overwhelming.'},
 {'professor': 'Dr. James Wilson',
  'subject': 'Biology',
  'stars': 5,
  'review': 'Incredible teacher with engaging lectures.'},
 {'professor': 'Dr. Jessica Miller',
  'subject': 'History',
  'stars': 4,
  'review': 'Very knowledgeable, but the course load is heavy.'},
 {'professor': 'Dr. William Moore',
  'subject': 'Economics',
  'stars': 3,
  'review': 'Average teaching, but helpful office hours.'},
 {'professor': 'Dr. Linda Taylor'

In [11]:
client = OpenAI()
reviews = data["reviews"]

# Build one upsert record per review: the embedding of the review text as the
# vector, the professor's name as the vector id, and the remaining fields as
# metadata.
# NOTE(review): ids are professor names, so two reviews for the same professor
# would collide on upsert — confirm names are unique in reviews.json.
processed_data = []
if reviews:
    # Embed every review in a single batched request rather than one network
    # round-trip per review.
    # NOTE(review): very large review sets may need to be chunked to stay
    # within the API's per-request limits — confirm before scaling up.
    response = client.embeddings.create(
        input=[review["review"] for review in reviews],
        model="text-embedding-3-small",
    )

    # Each returned item reports the position of its input in `.index`;
    # sorting on it keeps the review/embedding pairing explicit.
    embeddings = [
        item.embedding
        for item in sorted(response.data, key=lambda item: item.index)
    ]
    if len(embeddings) != len(reviews):
        raise RuntimeError(
            f"expected {len(reviews)} embeddings, got {len(embeddings)}"
        )

    processed_data = [
        {
            "values": embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            },
        }
        for review, embedding in zip(reviews, embeddings)
    ]


In [12]:
# Show the first prepared record to sanity-check its structure
# ("values", "id", "metadata") before uploading.
# NOTE(review): this echoes the entire embedding list; consider previewing
# only a slice of "values" to keep the saved output small.
processed_data[0]

{'values': [-0.026386721,
  -0.02721801,
  -0.04617676,
  0.038292926,
  0.00062179076,
  -0.0147889,
  -0.0094056325,
  0.039258294,
  -0.008373225,
  -0.033975586,
  0.023209052,
  0.017953161,
  -0.027888404,
  -0.028022483,
  0.033653796,
  0.020581106,
  -0.026239235,
  0.018489476,
  0.04137674,
  0.047303025,
  0.022498434,
  -0.011638045,
  0.0026497336,
  0.055455018,
  -0.021224685,
  -0.03140127,
  0.01132296,
  0.030301824,
  0.0086816065,
  0.01018329,
  0.08715127,
  -0.0058793584,
  -0.0032748764,
  -0.053631548,
  -0.048751075,
  0.04566726,
  -0.027097339,
  -0.0013893923,
  -0.0055173454,
  0.029926402,
  0.001655874,
  0.022109605,
  -0.008312889,
  0.0050648293,
  0.06178354,
  0.023597881,
  -0.060067333,
  -0.0025692864,
  0.029014667,
  0.054006968,
  -0.041698527,
  -0.008145291,
  0.03260798,
  0.04928739,
  -0.0390974,
  0.06779028,
  0.03032864,
  0.01739003,
  0.0062078517,
  -0.03368061,
  0.06821933,
  -0.0021754296,
  -0.01110173,
  0.008554231,
  -0.0111

In [13]:
# Open a handle to the "rag" index, then upload all prepared records into the
# "ns1" namespace with one call; the response is shown as the cell output.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)


{'upserted_count': 20}

In [14]:
# Fetch the index statistics to confirm the upload landed: the "ns1"
# namespace should report the vectors that were just upserted.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}