In [16]:
from dotenv import load_dotenv
# Called before the remaining imports and before any later cell reads keys via os.getenv(...).
# Presumably this loads variables from a local .env file into the environment — confirm a .env is present.
load_dotenv()
import os
from openai import OpenAI  # NOTE(review): OpenAI is not referenced in the visible cells — confirm it is still needed
from pinecone import Pinecone, ServerlessSpec

In [28]:
# Build the Pinecone client from the API key stored in the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

INDEX_NAME = "rag"

# Creating an index that already exists raises an error, so only create it when
# it is missing. This keeps the cell safe under "Restart Kernel -> Run All".
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=768,  # must equal the length of the embedding vectors upserted into this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [2]:
import json

# Parse the review file inside a context manager so the file handle is
# closed deterministically as soon as the JSON has been read.
with open("reviews.json") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the 'reviews' key.
data['reviews']

[{'professor': 'Dr. Emily Carter',
  'subject': 'Physics',
  'stars': 5,
  'review': 'Dr. Carter explains concepts very clearly and makes difficult topics easy to understand. Highly recommend!'},
 {'professor': 'Prof. James Anderson',
  'subject': 'Calculus',
  'stars': 4,
  'review': 'Good professor, but sometimes the lectures can be a bit fast-paced. Make sure to review the material after class.'},
 {'professor': 'Dr. Olivia Martinez',
  'subject': 'Biology',
  'stars': 3,
  'review': "The course was interesting, but Dr. Martinez's teaching style is a bit monotonous. Could use more engagement in class."},
 {'professor': 'Prof. Robert Johnson',
  'subject': 'Chemistry',
  'stars': 2,
  'review': 'Not the best at explaining concepts, and the lectures are often unorganized. I struggled to follow along.'},
 {'professor': 'Dr. Sophia Brown',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Amazing professor! Very knowledgeable and approachable. The assignments were challenging 

In [32]:
import google.generativeai as genai

# Configure the google.generativeai client with the key read from the environment.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Build one record per review: the embedding of the review text, the professor
# name as the record id, and a small metadata dict describing the review.
processed_data = []
for entry in data['reviews']:
    vector = genai.embed_content(
        model="models/text-embedding-004",
        content=entry['review'],
    )['embedding']

    record_metadata = {
        field: entry[field] for field in ("professor", "subject", "stars")
    }
    processed_data.append(
        {"values": vector, "id": entry['professor'], "metadata": record_metadata}
    )

In [33]:
# Inspect the first prepared record; its 'values' list is long, so the rendered output is large.
processed_data[0]

{'values': [0.05535064,
  -0.014492971,
  -0.02739771,
  0.0016114948,
  0.015719684,
  -0.0021753167,
  0.0017174451,
  0.012813277,
  -0.004820767,
  -0.006343168,
  0.061466422,
  -0.003154431,
  0.009519014,
  0.03232383,
  0.035805278,
  -0.057339314,
  -0.0036284975,
  0.05178525,
  -0.09782185,
  0.008005897,
  0.018363997,
  -0.06068407,
  0.034126338,
  -0.025511583,
  0.02398043,
  0.017021695,
  -0.011707726,
  -0.07384188,
  0.034357373,
  -0.032534186,
  0.029990178,
  0.00033326488,
  -0.004512802,
  -0.01380699,
  -0.021627413,
  -0.017126705,
  -0.0027594995,
  -0.024951687,
  0.0318252,
  -0.010064699,
  -0.0072810226,
  -0.04451687,
  -0.03696632,
  0.011984984,
  0.020820946,
  0.007940691,
  -0.033535015,
  0.10772531,
  -0.0045657125,
  0.080542944,
  -0.0007114644,
  0.062006198,
  -0.06735469,
  0.064724706,
  0.018472591,
  -0.031596452,
  -0.016218528,
  -0.06766923,
  0.083521575,
  9.798685e-05,
  -0.04998142,
  0.0011374553,
  0.020797292,
  -0.05553639,
  0

In [34]:
# Open a handle to the "rag" index and write every prepared record into
# namespace "ns1"; the upsert response is the cell's displayed result.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [30]:
# Show the index statistics to verify the upsert; the output reports per-namespace vector counts.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}