In [1]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from pinecone import Pinecone, ServerlessSpec
import os

  from tqdm.autonotebook import tqdm


In [2]:
#Load environment variables
from dotenv import load_dotenv
load_dotenv

<function dotenv.main.load_dotenv(dotenv_path: Union[str, ForwardRef('os.PathLike[str]'), NoneType] = None, stream: Optional[IO[str]] = None, verbose: bool = False, override: bool = False, interpolate: bool = True, encoding: Optional[str] = 'utf-8') -> bool>

In [3]:
#Load the reviews data from the JSON file
with open("reviews.json", "r") as f:
    data = json.load(f)

#Extract the reviews
reviews = [review["review"] for review in data["reviews"]]

In [5]:
#Create a simple embedding using TF-IDF
vectorizer = TfidfVectorizer(max_features=93)
embeddings = vectorizer.fit_transform(reviews).toarray()

print(f"Generated {len(embeddings)} embeddings.")
print(f"Each embedding has {embeddings.shape[1]} dimensions.")


Generated 10 embeddings.
Each embedding has 93 dimensions.


In [6]:
#Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [7]:
#Create or connect to an index
index_name = "rag"
if index_name not in pc.list_indexes():
    pc.create_index(
        name=index_name,
        dimension=embeddings.shape[1],
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [8]:
#Connect to the index
index = pc.Index(index_name)

In [9]:
#Prepare data for upsert
vectors_to_upsert = []
for i, (review, embedding) in enumerate(zip(data["reviews"], embeddings)):
    vectors_to_upsert.append({
        "id": str(i),
        "values": embedding.tolist(),
        "metadata": {
            "professor": review["professor"],
            "subject": review["subject"],
            "stars": review["stars"],
            "review": review["review"],
        }
    })

In [10]:
#Upsert data in batches
batch_size = 100
for i in range(0, len(vectors_to_upsert), batch_size):
    batch = vectors_to_upsert[i:i+batch_size]
    index.upsert(vectors=batch)

print(f"Upserted {len(vectors_to_upsert)} vectors to pinecone index '{index_name}'")

Upserted 10 vectors to pinecone index 'rag'


In [11]:
#Example query
query = "Engaging professor"
query_embedding = vectorizer.transform([query]).toarray()[0]

In [12]:
#Search in Pinecone
search_results = index.query(
    vector=query_embedding.tolist(),
    top_k=3,
    include_metadata=True
)

print("\nSearch Results:")
for result in search_results['matches']:
    print(f"Score: {result['score']:.4f}")
    print(f"Professor: {result['metadata']['professor']}")
    print(f"Subject: {result['metadata']['subject']}")
    print(f"Stars: {result['metadata']['stars']}")
    print(f"Review: {result['metadata']['review']}")
    print()


Search Results:
Score: 0.2093
Professor: Dr. Emily Johnson
Subject: Biology
Stars: 4.0
Review: Dr. Johnson's lectures are engaging and well-structured. She's always willing to help during office hours.

Score: 0.1689
Professor: Dr. Maria Garcia
Subject: Sociology
Stars: 5.0
Review: Inspiring professor! Her real-world examples make the course material relevant and interesting.

Score: 0.1574
Professor: Prof. Michael Lee
Subject: Computer Science
Stars: 5.0
Review: Brilliant professor! Makes complex topics easy to understand. His coding examples are particularly helpful.

