In [1]:

from dotenv import load_dotenv
# Runs before any API client is created; python-dotenv loads variables from a
# local .env file into the process environment.
# NOTE(review): presumably this is where PINECONE_API_KEY (read via os.getenv
# when the Pinecone client is built) comes from — confirm.
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [2]:
# Connect to Pinecone with the API key taken from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index whose name is already taken fails, which made this cell
# break on every re-run. Only create "rag" when it does not exist yet so the
# notebook survives Restart & Run All.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # length of the embedding vectors stored in this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [5]:
import json

# Read the reviews through a context manager so the file handle is closed as
# soon as parsing finishes (the bare open() call left it to the garbage
# collector), and pin the encoding so the result does not depend on the
# platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Last expression of the cell: display the list stored under the 'reviews' key.
data['reviews']

[{'professor': 'Dr. Emily Ross',
  'subject': 'Introduction to Algorithms',
  'stars': 5,
  'review': 'Dr. Ross is an amazing professor! Her lectures are clear, and she makes complex topics easy to understand.'},
 {'professor': 'Dr. John Smith',
  'subject': 'Data Structures',
  'stars': 4,
  'review': 'Dr. Smith is very knowledgeable, but sometimes the pace of the class is a bit fast.'},
 {'professor': 'Prof. Linda Nguyen',
  'subject': 'Operating Systems',
  'stars': 3,
  'review': 'The course content is solid, but Prof. Nguyen could be more engaging in her delivery.'},
 {'professor': 'Dr. Mark Johnson',
  'subject': 'Database Systems',
  'stars': 5,
  'review': "Dr. Johnson's classes are both informative and fun. He provides real-world examples that make the material relevant."},
 {'professor': 'Prof. Sarah Lee',
  'subject': 'Computer Networks',
  'stars': 4,
  'review': 'Prof. Lee knows her stuff, but sometimes the assignments can be a bit too challenging.'},
 {'professor': 'Dr. A

In [6]:
# OpenAI client, constructed with no explicit arguments.
client = OpenAI()
processed_data = []

# Build one record per review: the embedding values, an id, and metadata.
for review in data['reviews']:
    # One request per review text.
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=review['review'],
    )
    # A single input was sent, so the embedding is read from the first result.
    embedding = response.data[0].embedding

    # The professor's name doubles as the record id.
    # NOTE(review): reviews that share a professor name would share an id —
    # confirm names are unique in reviews.json.
    record = {"values": embedding, "id": review['professor']}
    record["metadata"] = {
        field: review[field] for field in ("review", "subject", "stars")
    }
    processed_data.append(record)

In [8]:
# Sanity check: display the first prepared record ('values', 'id', 'metadata').
# Note that this shows the record's entire embedding list.
processed_data[0]

{'values': [-0.022767281,
  -0.013887924,
  -0.04316517,
  0.023377223,
  0.01860325,
  -0.009407192,
  -0.0029690717,
  0.021477018,
  -0.0058824276,
  -0.011934935,
  0.0039910185,
  0.011946664,
  -0.011107993,
  0.0034543865,
  0.031130532,
  0.0099291615,
  -0.013747168,
  0.015318943,
  0.021301072,
  0.0540503,
  0.015236836,
  -0.021160318,
  0.060102805,
  -0.006246047,
  -0.042578686,
  -0.05161053,
  0.028620385,
  0.018532872,
  -0.001570309,
  0.026837476,
  0.068642005,
  -0.008040686,
  0.013852735,
  -0.040866155,
  -0.03643234,
  0.033335708,
  -0.03124783,
  0.000475418,
  0.008850032,
  0.0051434585,
  0.04316517,
  0.04004508,
  -0.018051956,
  0.011266344,
  0.046214882,
  0.0032725767,
  -0.045722235,
  -0.02137145,
  0.06441932,
  0.028995734,
  -0.075726725,
  0.011149047,
  0.027588174,
  0.004686001,
  -0.06770363,
  0.064700834,
  0.02984027,
  0.027658552,
  0.015694292,
  -0.027588174,
  0.0618388,
  0.0062871007,
  -0.0102927815,
  -0.032843065,
  -0.01617

In [9]:
# Get a handle to the "rag" index and write every prepared record into the
# "ns1" namespace. The upsert call stays last so the cell displays its response.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [10]:
# Display the index statistics; the displayed result reports the dimension,
# index fullness, and vector counts per namespace and in total.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}