In [5]:
#pip install -r requirements.txt

from dotenv import load_dotenv
# Runs before the remaining imports and before any client is constructed.
# Presumably this is what makes PINECONE_API_KEY (read below via os.getenv)
# available from a local .env file -- TODO confirm the .env contents.
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec


In [3]:
# Pinecone client; the API key is read from the environment, not hard-coded.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# create_index fails when an index with this name already exists, so only
# create it when it is missing. This keeps the cell safe to re-run
# (e.g. Restart Kernel -> Run All) against an already-provisioned project.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # must equal the length of the vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [4]:
import json

# Load the review fixtures. The context manager closes the file handle
# deterministically (the bare open() it replaces never closed it), and the
# explicit encoding avoids depending on the platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Last expression of the cell: display the list of review records.
data["reviews"]

[{'professor': 'Dr. John Smith',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Great professor, explains concepts clearly and is always willing to help.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Amazing lectures, very engaging and thorough.'},
 {'professor': 'Dr. Robert Brown',
  'subject': 'Physics',
  'stars': 3,
  'review': 'Good teacher but sometimes hard to follow during lectures.'},
 {'professor': 'Dr. Laura Wilson',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Very knowledgeable and approachable, makes difficult topics easy.'},
 {'professor': 'Dr. Michael Davis',
  'subject': 'History',
  'stars': 2,
  'review': 'Lectures are a bit dry and assignments are tough.'},
 {'professor': 'Dr. Jessica Taylor',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Interesting lectures, but exams are challenging.'},
 {'professor': 'Dr. William Anderson',
  'subject': 'Philosophy',
  'stars': 3,
  'review': 'Good discuss

In [7]:
# Build the records to upsert: one {"values", "id", "metadata"} dict per review.
client = OpenAI()

reviews = data['reviews']
processed_data = []

# The embeddings endpoint accepts a list of inputs, so embed every review in
# one request instead of one network round trip per review. The request is
# skipped entirely when there are no reviews, since there is nothing to embed.
if reviews:
    response = client.embeddings.create(
        input=[review['review'] for review in reviews],
        model="text-embedding-3-small",
    )
    # Each returned item carries the position of its input; order by it so
    # every embedding is paired with the review it was computed from.
    embeddings = sorted(response.data, key=lambda item: item.index)
    processed_data = [
        {
            "values": item.embedding,
            # NOTE(review): the professor name is the vector id, so two reviews
            # of the same professor would share an id -- confirm names are unique.
            "id": review["professor"],
            "metadata": {
                "review": review['review'],
                "subject": review['subject'],
                "stars": review['stars']
            }
        }
        for review, item in zip(reviews, embeddings)
    ]

In [8]:
# Spot-check the first entry of processed_data (its values, id and metadata).
processed_data[0]

{'values': [-0.05619859,
  -0.02505714,
  -0.017730422,
  0.04484276,
  -0.02232849,
  -0.02106286,
  -0.022746496,
  0.03186135,
  -0.047838468,
  -0.01786976,
  0.013445861,
  -0.00015693369,
  -0.020819023,
  -0.03276703,
  0.01697569,
  0.033904936,
  0.003027641,
  0.028656637,
  0.021608591,
  0.045725215,
  -0.003962349,
  -0.0027707415,
  0.022386545,
  0.038897786,
  -0.037736658,
  -0.034253273,
  -0.0050625177,
  0.023303837,
  0.003950738,
  0.008011783,
  0.094933815,
  -0.0007264306,
  -0.026636275,
  -0.037550878,
  -0.04221861,
  0.029840987,
  -0.009468998,
  -0.012772407,
  0.012737573,
  0.028285075,
  -0.013701309,
  0.03769021,
  -0.02206143,
  0.010496596,
  0.062329344,
  -0.0004535656,
  -0.06790276,
  -0.01742853,
  0.011233913,
  0.049278267,
  -0.01198284,
  -0.0035385373,
  0.039199676,
  0.029074643,
  -0.03875845,
  0.028679859,
  0.04310107,
  -0.014606989,
  0.012296344,
  0.0011763677,
  0.02649694,
  0.012609849,
  -0.005117671,
  -0.0094341645,
  -0.0

In [11]:
# Get a handle on the 'rag' index, then write every prepared record into
# the "ns1" namespace in a single upsert call.
index = pc.Index('rag')
upsert_result = index.upsert(namespace="ns1", vectors=processed_data)

# Last expression of the cell: display the upsert response.
upsert_result

{'upserted_count': 20}

In [12]:
# Display the index statistics as a check on what the index now holds.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}