In [15]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [13]:
# Connect to Pinecone and make sure the serverless index used by this notebook exists.
INDEX_NAME = "rag"
EMBEDDING_DIMENSION = 1536  # must equal the length of the vectors upserted into the index

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# create_index is not idempotent: calling it for an index that already exists
# raises, which breaks "Restart Kernel -> Run All". Only create it when missing.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [14]:
import json

# Load the review corpus. The context manager closes the file handle as soon as
# parsing finishes (the bare open() call left it to the garbage collector), and
# the explicit encoding keeps the read independent of the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the parsed list of review dicts as the cell output.
data["reviews"]

[{'professor': 'Dr. Emily Carter',
  'subject': 'Introduction to Psychology',
  'stars': 4,
  'review': 'Dr. Carter is incredibly knowledgeable and passionate about psychology. Her lectures are engaging and she encourages student participation.'},
 {'professor': 'Prof. John Smith',
  'subject': 'Calculus I',
  'stars': 3,
  'review': 'Prof. Smith is a good instructor but tends to move through material quickly. Office hours are helpful for clarification.'},
 {'professor': 'Dr. Maria Gonzalez',
  'subject': 'Modern American History',
  'stars': 4,
  'review': 'Dr. Gonzalez brings history to life with her vivid storytelling. She is approachable and provides insightful feedback on assignments.'},
 {'professor': 'Prof. Alex Johnson',
  'subject': 'Organic Chemistry',
  'stars': 2,
  'review': 'Prof. Johnson is knowledgeable but the lectures are very fast-paced and hard to follow. The course is challenging without sufficient support.'},
 {'professor': 'Dr. Linda Moore',
  'subject': 'Microec

In [16]:
# Embed each review's text with OpenAI and collect Pinecone upsert records
# (vector values + id + metadata) in `processed_data`.
EMBEDDING_MODEL = 'text-embedding-3-small'
EMBEDDING_BATCH_SIZE = 100  # number of reviews sent per embeddings request

client = OpenAI()
reviews = data['reviews']
processed_data = []
for start in range(0, len(reviews), EMBEDDING_BATCH_SIZE):
    batch = reviews[start:start + EMBEDDING_BATCH_SIZE]
    # One request per batch instead of one network round-trip per review:
    # the embeddings endpoint accepts a list of input strings.
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model=EMBEDDING_MODEL,
    )
    # Every returned item carries the position of the input it belongs to;
    # sort on it so the zip below pairs each review with its own embedding.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered_items):
        embedding = item.embedding
        processed_data.append({
            'values': embedding,
            # NOTE(review): the professor name doubles as the record id, so two
            # reviews for the same professor would share one id — confirm that
            # names are unique in reviews.json.
            'id': review['professor'],
            'metadata': {
                'review': review['review'],
                'subject': review['subject'],
                'stars': review['stars']
            }
        })

In [17]:
# Spot-check the first prepared record: its 'values' vector, 'id' and 'metadata'.
processed_data[0]

{'values': [0.006944114,
  -0.018871866,
  -0.022121003,
  0.068598315,
  0.03534963,
  0.04507261,
  0.019922338,
  0.06185575,
  -0.007683109,
  -0.0022444695,
  0.02692142,
  0.0054905526,
  0.0044309185,
  0.0068830396,
  0.022719529,
  0.0012726295,
  0.0019207773,
  -0.0061409906,
  0.033224255,
  0.0017268673,
  0.013277488,
  -0.0231104,
  0.022536306,
  -0.0068830396,
  -0.02806961,
  -0.020398716,
  0.054771163,
  0.058191307,
  -0.0069990805,
  0.020911738,
  0.081350565,
  -0.020899523,
  -0.029535387,
  -0.0118666785,
  -0.051644173,
  0.06473844,
  -0.007756398,
  -0.0026338163,
  0.0037988028,
  -0.00087793876,
  0.028607061,
  0.020423146,
  -0.014816552,
  0.043851133,
  0.04214106,
  -0.029535387,
  -0.021510262,
  -0.025870947,
  0.0027666523,
  0.026139671,
  -0.05374512,
  0.006644851,
  0.05447801,
  -0.0037285676,
  -0.067621134,
  0.053403106,
  0.013912657,
  0.008159487,
  0.00094817387,
  -0.012605674,
  0.04890806,
  0.017809179,
  -0.011097146,
  -0.0463185

In [18]:
# Get a handle on the 'rag' index, then write every prepared record into the
# 'ns1' namespace in a single upsert call.
index = pc.Index('rag')
upsert_result = index.upsert(namespace='ns1', vectors=processed_data)

# Show the service's response (it reports how many records were upserted).
upsert_result

{'upserted_count': 21}

In [20]:
# Fetch the index statistics; the output reports the index dimension and the
# vector count per namespace, which lets us confirm the upsert landed in 'ns1'.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 21}},
 'total_vector_count': 21}