In [7]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [4]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [5]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Physics',
  'stars': 5,
  'review': "Dr. Johnson's lectures are engaging and her explanations of complex concepts are crystal clear. Highly recommended!"},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Prof. Chen is knowledgeable and approachable. His coding assignments are challenging but rewarding.'},
 {'professor': 'Dr. Sarah Williams',
  'subject': 'Biology',
  'stars': 3,
  'review': 'Dr. Williams knows her stuff, but her lectures can be a bit dry. Office hours are helpful though.'},
 {'professor': 'Prof. David Martinez',
  'subject': 'History',
  'stars': 5,
  'review': 'Prof. Martinez brings history to life! His passion for the subject is contagious.'},
 {'professor': 'Dr. Lisa Thompson',
  'subject': 'Psychology',
  'stars': 4,
  'review': "Dr. Thompson's research-based approach is refreshing. Assignments can be tough but fair."},
 {'professor': 'Prof. Robert Lee',
  'subject': 

In [8]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

In [9]:
processed_data[0]

{'values': [-0.0057467073,
  0.008641995,
  -0.0019771932,
  0.013762017,
  0.015015388,
  -0.0010786826,
  -0.00046570576,
  0.060512763,
  -0.0030394252,
  0.005423964,
  0.038027283,
  -0.009864031,
  -0.026195459,
  0.00484428,
  0.003374702,
  0.037901945,
  -0.035294935,
  -0.01333587,
  0.051839434,
  0.08673329,
  0.038202755,
  -0.015203393,
  0.027473899,
  -0.00958829,
  -0.04534697,
  -0.05031032,
  0.010891796,
  0.023525778,
  0.033966362,
  -0.02396446,
  0.06758178,
  0.005480366,
  -0.030582258,
  -0.035971753,
  -0.030381719,
  0.02609519,
  0.014451371,
  0.017559731,
  0.0028451527,
  0.008184514,
  0.007670632,
  -0.013574011,
  -0.037977148,
  -0.008103045,
  0.04712676,
  0.024528475,
  0.00019534965,
  -0.015341264,
  0.03494399,
  0.036748845,
  -0.044920824,
  0.0026978815,
  -0.0057122395,
  -0.020906232,
  -0.04098524,
  -0.013373471,
  -0.005057353,
  0.041737262,
  -0.023550846,
  -0.016832776,
  0.03271299,
  0.0008593427,
  -0.023713784,
  0.009889099,
 

In [10]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 21}

In [11]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 21}},
 'total_vector_count': 21}