In [11]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [None]:
# Connect to Pinecone using the API key read from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Only create the index when it is missing: create_index is not idempotent,
# so an unconditional call breaks "Restart Kernel -> Run All" on every run
# after the first.
if "rag" not in pc.list_indexes().names():
  pc.create_index(
    name="rag",
    dimension=1536,  # must equal the length of the embeddings upserted later
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
  )

In [10]:
import json

# Load the professor reviews. The context manager closes the file
# deterministically instead of leaving that to garbage collection.
with open("reviews.json") as reviews_file:
  data = json.load(reviews_file)

# Display the loaded review records.
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Biology',
  'stars': 4,
  'review': "Dr. Johnson's lectures are engaging and well-structured. She explains complex concepts clearly."},
 {'professor': 'Prof. Michael Lee',
  'subject': 'Computer Science',
  'stars': 5,
  'review': "Fantastic professor! Prof. Lee's passion for coding is contagious. His projects are challenging but rewarding."},
 {'professor': 'Dr. Sarah Martinez',
  'subject': 'Psychology',
  'stars': 3,
  'review': 'Dr. Martinez knows her subject well, but sometimes struggles to keep the class engaged.'},
 {'professor': 'Prof. David Wilson',
  'subject': 'History',
  'stars': 4,
  'review': 'Prof. Wilson brings history to life with his storytelling. His exams are tough but fair.'},
 {'professor': 'Dr. Lisa Chen',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Dr. Chen is an amazing educator. Her lab sessions are well-organized and informative.'},
 {'professor': 'Prof. Robert Brown',
  'subject': 'Mathematics',
  '

In [12]:
# Embed each review with OpenAI and shape the results into upsert records
# (id / values / metadata) collected in `processed_data`.
EMBED_BATCH_SIZE = 100  # number of reviews sent per embeddings request

processed_data = []
client = OpenAI()
reviews = data['reviews']
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
  batch = reviews[start:start + EMBED_BATCH_SIZE]
  # One request per batch instead of one per review: the embeddings endpoint
  # accepts a list of inputs.
  response = client.embeddings.create(
    input=[entry['review'] for entry in batch],
    model="text-embedding-3-small",
  )
  # Each result carries the `index` of the input it belongs to; sort on it so
  # the pairing with `batch` does not depend on the response's ordering.
  results = sorted(response.data, key=lambda item: item.index)
  for entry, result in zip(batch, results):
    processed_data.append({
      "values": result.embedding,
      # NOTE(review): the professor name doubles as the vector id, so a second
      # review for the same professor would presumably overwrite the first on
      # upsert -- confirm names are unique in reviews.json.
      "id": entry["professor"],
      "metadata": {
        "review": entry["review"],
        "subject": entry["subject"],
        "stars": entry["stars"]
      }
    })

In [13]:
# Spot-check the first record built for upsert. Its "values" entry holds the
# full embedding vector, so the displayed output is long.
processed_data[0]

{'values': [0.0038522093,
  0.0036571608,
  0.03151985,
  0.021091253,
  0.009661406,
  0.003096396,
  0.0017651896,
  0.053989444,
  0.0051427805,
  0.01871166,
  0.042468574,
  -0.009414344,
  -0.01859463,
  0.01786645,
  0.01903674,
  0.027254786,
  -0.050634608,
  -0.020285051,
  0.05409347,
  0.06870911,
  0.044757146,
  -0.018776676,
  0.03245608,
  -0.0008159532,
  -0.031337805,
  -0.041220263,
  0.015200785,
  0.011091762,
  0.025759414,
  0.004213049,
  0.06163535,
  0.0025453838,
  -0.023054741,
  -0.042832665,
  -0.025382321,
  0.027280793,
  -0.0029663637,
  -0.0011686661,
  0.02092221,
  0.018321563,
  0.024719156,
  0.0025697649,
  -0.048996203,
  -0.01667015,
  0.05255909,
  0.021039238,
  0.00975893,
  -0.010201041,
  0.04972438,
  0.044991203,
  -0.033756405,
  0.0066836644,
  0.005123276,
  -0.024966218,
  -0.03128579,
  -0.017047245,
  0.006235053,
  0.046733636,
  -0.015577879,
  -0.018503608,
  0.044028964,
  -0.0048697125,
  -0.01598098,
  0.011481859,
  -0.027540

In [15]:
# Open a handle to the "rag" index and write every prepared record into the
# "ns1" namespace; the upsert response is the cell's displayed result.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [16]:
# Confirm the upsert landed: display the index's dimension and the vector
# count held in each namespace.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}