In [12]:
from dotenv import load_dotenv
load_dotenv()
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import os
import json



In [13]:
# Initialize the Pinecone client; the API key is read from the environment
# (populated by load_dotenv() in the imports cell).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only if it does not exist yet, so this cell can be
# re-run (e.g. Restart Kernel -> Run All) without failing on an
# already-existing index.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # must equal the length of the embedding vectors upserted into this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [16]:
import json

# Load the reviews file. The context manager closes the file handle as soon as
# parsing finishes, and the explicit encoding avoids depending on the
# platform's default text encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list stored under the "reviews" key.
data['reviews']

[{'professor': 'Dr. Jane Smith',
  'subject': 'Introduction to Computer Science',
  'stars': 5,
  'review': 'Dr. Smith provides clear explanations and is very approachable. Her passion for teaching is evident.'},
 {'professor': 'Dr. John Doe',
  'subject': 'Data Structures and Algorithms',
  'stars': 4,
  'review': "Challenging course, but Dr. Doe's lectures are thorough. Be prepared for a heavy workload."},
 {'professor': 'Dr. Emily Davis',
  'subject': 'Discrete Mathematics',
  'stars': 4,
  'review': 'Dr. Davis breaks down complex topics well. The course is tough but manageable.'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Operating Systems',
  'stars': 3,
  'review': 'Dr. Brown is knowledgeable, but his grading is harsh. Expect to put in a lot of work.'},
 {'professor': 'Dr. Laura Wilson',
  'subject': 'Artificial Intelligence',
  'stars': 5,
  'review': 'Fantastic professor! Dr. Wilson makes AI exciting and accessible. Highly recommended.'},
 {'professor': 'Dr. Samuel Lee'

In [17]:
# Build one upsert-ready record per review: embed the review text with OpenAI
# and pair the resulting vector with an id and the review's metadata.
processed_data = []
client = OpenAI()

for review in data["reviews"]:
    # One embeddings request per review; only the first item of the response
    # is used as this review's vector.
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=review["review"],
    )
    record = {
        "values": response.data[0].embedding,
        "id": review["professor"],  # the professor field is used as the vector id
        "metadata": {
            field: review[field] for field in ("review", "subject", "stars")
        },
    }
    processed_data.append(record)

In [18]:
# Inspect the first processed record to sanity-check its structure
# ("values", "id", "metadata"). Note: this displays the record's entire
# embedding list.
processed_data[0]

{'values': [0.0003970911,
  0.00777528,
  -0.025493374,
  0.028767873,
  0.012952171,
  -0.00057916885,
  0.0052763196,
  0.04274084,
  -0.014874449,
  -0.019289058,
  0.039585654,
  -0.027256565,
  -0.0018377634,
  -0.012223031,
  0.021887446,
  -0.012600859,
  0.0049216924,
  -0.00014883149,
  0.0009785385,
  0.035767615,
  0.047354307,
  -0.038657658,
  0.016107358,
  -0.009074475,
  -0.039585654,
  -0.049767096,
  0.029563298,
  0.028449703,
  -0.02837016,
  -0.003325871,
  0.08198181,
  -0.016345985,
  0.0040666107,
  -0.010181441,
  -0.031207176,
  0.032983627,
  -0.055361584,
  0.0027326166,
  -0.0005464404,
  -0.015524046,
  0.028449703,
  0.016200157,
  -0.03651001,
  -0.015988044,
  0.030438265,
  0.0076824804,
  -0.019315572,
  -0.013449312,
  0.0043483237,
  0.031392775,
  -0.036032755,
  0.029245129,
  0.02709748,
  -0.05737666,
  -0.06761113,
  0.019673513,
  0.00895516,
  0.05737666,
  0.016266443,
  -0.024949832,
  0.057641804,
  -0.0067544845,
  -0.023663895,
  -0.0004

In [19]:
# Get a handle to the "rag" index and write every processed record into the
# "ns1" namespace in a single upsert call; the call's return value is the
# cell output.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [20]:
# Fetch the index statistics to check the per-namespace vector counts after the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}