In [9]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [5]:
# Connect to Pinecone using the API key from the environment and make sure
# the "rag" index exists.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists is rejected by the service, which would
# break this cell on every re-run; only create it when it is actually missing.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # size of the text-embedding-3-small vectors stored in this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [6]:
import json

# Load the review dataset. The context manager closes the file handle as soon
# as parsing finishes (the bare open() left it open for the kernel's lifetime),
# and the explicit encoding keeps the read independent of the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records.
data['reviews']

[{'professor': 'Dr. Emily Thompson',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Dr. Thompson explains complex concepts clearly and is always willing to help during office hours.'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Brilliant professor! His lectures are engaging and he provides practical coding examples.'},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Biology',
  'stars': 3,
  'review': 'Knowledgeable but sometimes goes too fast. Could improve on explaining difficult topics.'},
 {'professor': 'Prof. David Martinez',
  'subject': 'History',
  'stars': 4,
  'review': 'Passionate about the subject and brings historical events to life. Assignments can be challenging.'},
 {'professor': 'Dr. Rachel Lee',
  'subject': 'Psychology',
  'stars': 5,
  'review': 'Exceptional teacher! Her classes are thought-provoking and she encourages critical thinking.'},
 {'professor': 'Prof. Robert Wilson',
  'subject': 'Mathematics',
 

In [10]:
# Embed every review text and shape the results into Pinecone upsert records:
#   {"values": <embedding>, "id": <professor name>, "metadata": {review, subject, stars}}
processed_data = []
client = OpenAI()  # no key passed explicitly; the client uses its default credential lookup

# The embeddings endpoint accepts a list of inputs, so send the reviews in
# batches instead of paying one network round trip per review. The batch size
# is kept modest to stay well inside the per-request input limits.
EMBED_BATCH_SIZE = 100
reviews = data['reviews']

for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[entry['review'] for entry in batch],
        model="text-embedding-3-small",
    )
    # Each returned item records the position of the input it belongs to;
    # sort on it so the review/embedding pairing below is explicit.
    ordered = sorted(response.data, key=lambda item: item.index)
    if len(ordered) != len(batch):
        raise RuntimeError(
            f"expected {len(batch)} embeddings, received {len(ordered)}"
        )
    for review, item in zip(batch, ordered):
        processed_data.append({
            "values": item.embedding,
            # The professor name is the vector id, so two reviews for the same
            # professor would overwrite each other on upsert.
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        })

In [11]:
# Spot-check the first upsert record (keys: "values", "id", "metadata").
# "values" holds the full embedding vector, so the rendered output is long.
processed_data[0]

{'values': [-0.016620973,
  0.011211381,
  -0.003372543,
  0.041518625,
  0.017107837,
  0.005811931,
  0.032619845,
  0.028265122,
  -0.017581176,
  -0.0031544687,
  0.011840247,
  0.035081208,
  0.019420438,
  -0.047144603,
  0.011677958,
  0.02937409,
  0.011150523,
  0.0014774951,
  0.02132732,
  0.055989284,
  0.019920826,
  0.0041045286,
  0.034864824,
  -0.0032389937,
  -0.056205668,
  -0.0345132,
  0.004033528,
  0.024464883,
  -0.022463335,
  -0.019095862,
  0.072867215,
  -0.03161907,
  0.02319363,
  -0.016282875,
  -0.057017107,
  0.026885677,
  -0.0046522496,
  0.015498484,
  0.01480876,
  0.004537296,
  0.006829611,
  0.042059585,
  -0.02170599,
  0.0055211657,
  0.012813973,
  -0.028643794,
  -0.041085858,
  -0.01568782,
  0.031402685,
  0.017527081,
  -0.017824609,
  0.017567653,
  0.018460235,
  -0.010055081,
  -0.07189349,
  0.04408818,
  0.014565329,
  0.00011907444,
  0.020191304,
  -0.03240346,
  0.05444755,
  -0.007850671,
  -0.0014031131,
  -0.0009821792,
  -0.024

In [12]:
# Get a handle on the "rag" index and write all prepared records into the
# "ns1" namespace; the upsert response is the cell's displayed result.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [13]:
# Display the index statistics (dimension, per-namespace vector counts) as a
# check after the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}