In [24]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [12]:
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
pc.create_index(
    name ="rag", dimension= 1536, metric ="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [15]:
import json 
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Dr. Johnson is an amazing professor. Her lectures are clear, and she always makes time for her students.'},
 {'professor': 'Prof. Michael Lee',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Prof. Lee is very knowledgeable, but sometimes his explanations can be a bit fast-paced.'},
 {'professor': 'Dr. Sarah Miller',
  'subject': 'Biology',
  'stars': 3,
  'review': 'Dr. Miller knows her stuff, but her exams are tough. Be prepared to study hard.'},
 {'professor': 'Prof. John Smith',
  'subject': 'History',
  'stars': 5,
  'review': 'Prof. Smith makes history come alive! His passion for the subject is contagious.'},
 {'professor': 'Dr. Lisa Adams',
  'subject': 'Chemistry',
  'stars': 2,
  'review': 'Dr. Adams is knowledgeable, but her lectures are hard to follow. The material is challenging.'},
 {'professor': 'Prof. David Brown',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Prof

In [28]:
processed_data =[]
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model= "text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review['professor'],
        "metadata":{
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"],
            }
    })


In [29]:
processed_data[0]

{'values': [0.00078708393,
  -0.005364716,
  0.01044865,
  0.046860617,
  0.017169481,
  -0.010735407,
  0.019355992,
  0.039548352,
  -0.011942169,
  -0.0040653558,
  0.016655711,
  -0.008608636,
  -0.030993482,
  0.029774772,
  -0.0050600385,
  0.032283884,
  -0.03550988,
  0.0044148387,
  0.0405281,
  0.059501752,
  0.03314415,
  -0.018615207,
  0.019547163,
  -0.026357602,
  -0.021470813,
  -0.023585634,
  0.019523265,
  0.013596986,
  0.033120252,
  -0.011422425,
  0.08392376,
  0.007993308,
  -0.021291591,
  -0.011966065,
  -0.024111353,
  0.028675543,
  0.020311844,
  -0.0074138227,
  0.0040504206,
  0.002217874,
  -0.017145585,
  -0.00034444255,
  0.0020207297,
  0.0015577391,
  0.05056454,
  0.003945874,
  -0.008064996,
  0.0006201833,
  0.04729075,
  0.052954167,
  -0.021530554,
  -0.0030183995,
  0.02309576,
  -0.01762351,
  -0.03082621,
  -0.0030497634,
  0.02368122,
  0.03823406,
  0.018125333,
  -0.028102033,
  0.044925015,
  0.0022537184,
  -0.013919586,
  -0.005436405,


In [35]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [37]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}