In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings

  from tqdm.autonotebook import tqdm


In [2]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
"""pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)"""
pc.create_index(
    name="rag", dimension=384, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

In [3]:
import json
data=json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Emily Roberts',
  'subject': 'Psychology 101',
  'stars': 4,
  'review': 'Engaging lectures and very knowledgeable. Sometimes a bit too fast-paced, but overall a great professor.'},
 {'professor': 'Dr. Michael Thompson',
  'subject': 'Computer Science 201',
  'stars': 5,
  'review': 'Excellent professor. Clear explanations, helpful in office hours, and really cares about students.'},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Biology 110',
  'stars': 3,
  'review': 'Material is interesting, but lectures can be dry. Attendance is important for understanding.'},
 {'professor': 'Dr. Richard Davis',
  'subject': 'Mathematics 203',
  'stars': 2,
  'review': 'Very difficult class. Exams are tough, and lectures can be confusing. Be prepared to work hard.'},
 {'professor': 'Dr. Linda Martinez',
  'subject': 'History 102',
  'stars': 4,
  'review': 'Great storyteller, makes history come alive. Sometimes hard to follow due to the amount of content covered.'},
 {'profes

In [4]:
import requests

model_id = "sentence-transformers/all-MiniLM-L6-v2"
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}

def query(texts):
    response = requests.post(api_url, headers=headers, json={"inputs": texts, "options":{"wait_for_model":True}})
    return response.json()

In [5]:
#processed_data = []
#client = OpenAI()
"""for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'],
        model="text-embedding-3-small",
        #model="meta-llama/llama-3.1-8b-instruct:free",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })"""
processed_data = []

for review in data["reviews"]:
    embedding = query(review['review'])
    processed_data.append(
        {
            "values": embedding,
            "id": review["professor"],
            "metadata":{
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            }
        }
    )

In [6]:
processed_data[0]

{'values': [0.011148389428853989,
  -0.025768155232071877,
  -0.002526493975892663,
  0.021327875554561615,
  -0.10347292572259903,
  -0.0333888940513134,
  0.007917361333966255,
  0.01518397219479084,
  -0.018366694450378418,
  0.028554029762744904,
  -0.04598432034254074,
  0.08683972805738449,
  -0.06976887583732605,
  0.041904158890247345,
  -0.004164779093116522,
  -0.03647339344024658,
  -0.001746841357089579,
  -0.07700420916080475,
  0.029838532209396362,
  -0.09745167195796967,
  0.019327066838741302,
  0.07731848955154419,
  0.08002536743879318,
  -0.05810851231217384,
  -0.01171359233558178,
  -0.0025633194018155336,
  -0.038952071219682693,
  -0.05061405152082443,
  0.06890176981687546,
  -0.045980315655469894,
  -0.058197323232889175,
  0.06845695525407791,
  -0.009538993239402771,
  0.037228040397167206,
  -0.03327731788158417,
  0.020621242001652718,
  0.05191655829548836,
  0.1061321496963501,
  0.018789945170283318,
  -0.015545140020549297,
  0.009540385566651821,
  0.

In [7]:
index = pc.Index("rag")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)
print(f"Upserted count: {upsert_response['upserted_count']}")
"""index.upsert(
    vectors=processed_data,
    namespace="ns1"
)"""

Upserted count: 20


'index.upsert(\n    vectors=processed_data,\n    namespace="ns1"\n)'

In [8]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}