<a href="https://colab.research.google.com/github/NUMLDS/stitching-project-Eleanorhhhyxz/blob/main/Embedding_Upserting_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pinecone
import pandas as pd
import tqdm
import kagglehub
import numpy as np
from datetime import datetime
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from pydantic import BaseModel, Field
from typing import List

# Extract Data

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download the Sephora reviews dataset through kagglehub and print where the files were stored.
path = kagglehub.dataset_download("nadyinky/sephora-products-and-skincare-reviews")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/nadyinky/sephora-products-and-skincare-reviews?dataset_version_number=2...


100%|██████████| 147M/147M [00:06<00:00, 24.5MB/s] 

Extracting files...





Path to dataset files: C:\Users\Eleanor Huang\.cache\kagglehub\datasets\nadyinky\sephora-products-and-skincare-reviews\versions\2


In [None]:
# Assemble the full review table from the CSV shards in the Kaggle download
dataset_dir = path
review_files = [
    "reviews_0-250.csv",
    "reviews_250-500.csv",
    "reviews_500-750.csv",
    "reviews_750-1250.csv",
    "reviews_1250-end.csv"
]

# Read every shard, then stack them under one fresh 0..n-1 index
review_paths = [os.path.join(dataset_dir, file_name) for file_name in review_files]
review_dfs = list(map(pd.read_csv, review_paths))
reviews = pd.concat(review_dfs, ignore_index=True)

# Parse the submission dates so they can be compared with the cutoff
cutoff_date = datetime(2021, 9, 30)
reviews['submission_time'] = pd.to_datetime(reviews['submission_time'])

# Keep only the reviews submitted strictly after the cutoff date
is_after_cutoff = reviews['submission_time'] > cutoff_date
filtered_reviews = reviews[is_after_cutoff]
print("Filtered reviews:", len(filtered_reviews))
print(filtered_reviews.head())

In [None]:
# Remove the leftover CSV index column and keep only rows with usable review text.
# A new frame is built instead of using inplace=True: filtered_reviews comes from a
# boolean filter on `reviews`, and in-place edits on it raise SettingWithCopyWarning.
# errors="ignore" keeps the cell re-runnable once 'Unnamed: 0' has already been dropped.
filtered_reviews = (
    filtered_reviews
    .drop(columns=['Unnamed: 0'], errors="ignore")
    .dropna(subset=["review_text"])
)
# Guard against non-null values that are still not strings (e.g. numbers)
filtered_reviews = filtered_reviews[filtered_reviews["review_text"].apply(lambda x: isinstance(x, str))]
# Persist the cleaned reviews to Drive so later sessions can skip the download
filtered_reviews.to_csv("/content/drive/MyDrive/Datasets/filtered_reviews.csv", index=False)

### Read in Saved Dataset

In [None]:
# Mount Google Drive again so this section can be run on its own in a fresh session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Reload the cleaned reviews that were saved to Drive.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk. NOTE(review): the stored output shows a warning raised from this
# read_csv line, presumably a mixed-type DtypeWarning -- confirm on the next run.
filtered_reviews = pd.read_csv(
    "/content/drive/MyDrive/Datasets/filtered_reviews.csv",
    low_memory=False,
)
filtered_reviews

  filtered_reviews = pd.read_csv("/content/drive/MyDrive/Datasets/filtered_reviews.csv")


Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
0,1741593524,5,1.0,1.0,2,0,2,2023-02-01,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0
1,31423088263,1,0.0,,0,0,0,2023-03-21,I bought this lip mask after reading the revie...,Disappointed,,,,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
2,5061282401,5,1.0,,0,0,0,2023-03-21,My review title says it all! I get so excited ...,New Favorite Routine,light,brown,dry,blonde,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
3,6083038851,5,1.0,,0,0,0,2023-03-20,I’ve always loved this formula for a long time...,Can't go wrong with any of them,,brown,combination,black,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
4,47056667835,5,1.0,,0,0,0,2023-03-20,"If you have dry cracked lips, this is a must h...",A must have !!!,light,hazel,combination,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280134,2276253200,5,1.0,,0,0,0,2023-03-13,Consider salicylic acid your secret weapon for...,,fair,brown,combination,,P505392,Multi Action Clear Acne Clearing Treatment Lot...,StriVectin,49.0
280135,28013163278,5,1.0,,0,0,0,2023-03-13,I’ve been using this as my only moisturizer fo...,,,blue,combination,blonde,P505392,Multi Action Clear Acne Clearing Treatment Lot...,StriVectin,49.0
280136,1539813076,5,1.0,,0,0,0,2023-03-13,I got breakouts whenever it’s my time of month...,,light,blue,combination,blonde,P505392,Multi Action Clear Acne Clearing Treatment Lot...,StriVectin,49.0
280137,5595682861,5,1.0,,0,0,0,2023-03-13,I love this!!! I don’t get actual acne just an...,,fair,hazel,oily,,P505392,Multi Action Clear Acne Clearing Treatment Lot...,StriVectin,49.0


In [None]:
# Rank the reviews from most to least helpful and keep the first 280 rows
# (about 0.1% of the data) as the set to upsert
filtered_reviews = filtered_reviews.sort_values("helpfulness", ascending=False)
data = filtered_reviews.iloc[:280]

In [None]:
# Display the reviews selected for upserting
data

Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
0,1741593524,5,1.0,1.0,2,0,2,2023-02-01,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0
155579,28113452634,3,0.0,1.0,2,0,2,2022-07-10,This is just okay. I seem to break out more wi...,Better toners out there,lightMedium,green,dry,brown,P475630,Mini Mandelic Acid + Superfood Unity Exfoliant,Youth To The People,16.0
155595,7814134331,5,1.0,1.0,2,0,2,2022-06-01,"i love this, it did slightly burn the first co...",yesyesyehysyesyes,mediumTan,brown,combination,black,P475630,Mini Mandelic Acid + Superfood Unity Exfoliant,Youth To The People,16.0
155593,10150716568,3,0.0,1.0,7,0,7,2022-06-02,I have bought this a week ago and did not noti...,,fair,blue,combination,brown,P475630,Mini Mandelic Acid + Superfood Unity Exfoliant,Youth To The People,16.0
155590,25983950393,5,1.0,1.0,4,0,4,2022-06-05,I absolutely love this exfoliant. It has mande...,A must have!,light,green,combination,brown,P475630,Mini Mandelic Acid + Superfood Unity Exfoliant,Youth To The People,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155358,12803545459,5,1.0,1.0,1,0,1,2022-07-30,Better than Laneige in my opinion! Smells good...,Favorite lip mask,fair,blue,dry,brown,P474832,Sugar Recovery Lip Mask Advanced Therapy,fresh,28.0
155357,2790422977,2,0.0,1.0,8,0,8,2022-08-07,"Pros: not as sticky as LaneigeSweet, soft inof...",,fair,blue,normal,brown,P474832,Sugar Recovery Lip Mask Advanced Therapy,fresh,28.0
155356,21662643936,3,1.0,1.0,1,0,1,2022-08-15,"I LOVE their Advanced Therapy Lip Treatment, I...","Just Okay, Their other products are better",lightMedium,brown,combination,brown,P474832,Sugar Recovery Lip Mask Advanced Therapy,fresh,28.0
155355,7737064238,3,1.0,1.0,5,0,5,2022-08-16,"This is my second time buying this lip mask, I...",Good but new formula?,light,hazel,combination,brown,P474832,Sugar Recovery Lip Mask Advanced Therapy,fresh,28.0


In [None]:
from dotenv import load_dotenv
from pinecone import Pinecone

# Read the .env file on Drive so its API keys become environment variables
load_dotenv("/content/drive/MyDrive/code_envs/.env")

# Build the Pinecone client from the key found in the environment
pinecone_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_key)

# Data Preprocessing

In [None]:
# Preview how the review texts break into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter shared by the chunking cells: 300-character chunks with a 50-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

# One list of chunks per review; non-string reviews and empty results are left out
splits = [
    chunks
    for chunks in (
        text_splitter.split_text(text) if isinstance(text, str) else []
        for text in data['review_text']
    )
    if chunks
]
splits

[['I use this with the Nudestix “Citrus Clean Balm & Make-Up Melt“ to double cleanse and it has completely changed my skin (for the better). The make-up melt is oil based and removes all of your makeup super easily. I follow-up with this water based cleanser, and I also use this just by itself when I’m',
  'and I also use this just by itself when I’m not wearing make-up. It leaves the skin gently cleansed, but without stripping the skin. 10/10 recommend combining with the make-up melt. It’s perfection!'],
 ['This is just okay. I seem to break out more with this one vs the REN toner. This leaves my skin pretty dry which might be better for oily skin but I live in a desert climate so this is a bit too stripping. If you have skin on the dryer side I’d recommend the REN toner over this, plus it’s cheaper!'],
 ['i love this, it did slightly burn the first couple uses when but then stopped with more use, it really played a part in getting rid of small acne issues, my pores are non existent, 

In [None]:
# Chunk each review text and keep the row's metadata alongside every chunk.
# Each chunk gets its own ID ("<row index>-<chunk number>"): Pinecone upserts are keyed
# by ID, so reusing the bare row index for every chunk of a review would make later
# chunks overwrite earlier ones and leave only one vector per review in the index.
splits = []
for idx, row in data.iterrows():
    review_text = row["review_text"]
    if not isinstance(review_text, str):
        continue  # nothing to chunk for missing / non-text reviews
    for chunk_no, chunk_text in enumerate(text_splitter.split_text(review_text)):
        # Collect chunk + row metadata
        splits.append({
            "chunk_text": chunk_text,
            "product_name": row.get("product_name", ""),
            "brand_name": row.get("brand_name", ""),
            "price_usd": row.get("price_usd", 0.0),
            "rating": row.get("rating", 0.0),
            # Unique per chunk, and still traceable to the source row
            "id": f"{idx}-{chunk_no}"
        })

# Every chunk must map to a distinct vector ID, otherwise upserts would collide
assert len({item["id"] for item in splits}) == len(splits), "duplicate chunk IDs"

print(f"Total chunks created: {len(splits)}")

Total chunks created: 491


# Text Embedding

In [None]:
from fastembed import TextEmbedding

class embeddingModel:
    """Small adapter exposing ``embed_documents`` / ``embed_query`` on top of a FastEmbed model."""

    def __init__(self, model):
        """Create the underlying FastEmbed ``TextEmbedding`` for the model name ``model``."""
        # Initialize FastEmbed model
        self.model = TextEmbedding(model_name=model)

    def embed_documents(self, splits):
        """Embed a list of text chunks.

        Args:
            splits: iterable of strings to embed.

        Returns:
            A list with one embedding (a plain list of floats) per input chunk,
            in the same order as the input. An empty input gives an empty list.
        """
        texts = list(splits)
        if not texts:
            return []
        # Hand the whole list to the model in one call instead of one call per chunk,
        # so the model can batch the work internally.
        return [vector.tolist() for vector in self.model.embed(texts)]

    def embed_query(self, query):
        """Embed a single query string and return its embedding as a list of floats."""
        # embed() yields one vector per document; a lone query produces exactly one
        return list(self.model.embed(query))[0].tolist()

# Build the embedder that the rest of the notebook uses for chunks and queries
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
embeddings = embeddingModel(EMBEDDING_MODEL_NAME)

  self.model = TextEmbedding(model_name=model)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/751 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [None]:
# Embed the chunks in batches
from tqdm import tqdm
batch_size = 100
embedding_list = []
metadata_list = []
ids_list = []

for i in tqdm(range(0, len(splits), batch_size), desc="Embedding in batches"):
    batch = splits[i : i + batch_size]
    # Send the whole batch through embed_documents (one vector per text, in order)
    # rather than issuing a separate embed_query call for every chunk.
    vectors = embeddings.embed_documents([item["chunk_text"] for item in batch])

    for item, vector in zip(batch, vectors):
        # Build metadata dict
        meta = {
            "review_text": item["chunk_text"],
            "product_name": item["product_name"],
            "brand_name": item["brand_name"],
            "price_usd": item["price_usd"],
            "rating": item["rating"]
        }

        embedding_list.append(vector)
        metadata_list.append(meta)
        ids_list.append(item["id"])

# The three lists are combined by position when upserting, so they must stay aligned
assert len(embedding_list) == len(metadata_list) == len(ids_list) == len(splits)

Embedding in batches: 100%|██████████| 5/5 [01:01<00:00, 12.28s/it]


In [None]:
# Sanity check: number of metadata records built by the embedding loop
len(metadata_list)

491

In [None]:
# Sanity check: number of embeddings built by the embedding loop
len(embedding_list)

491

# Vector DB Index Creation and Upsert

In [None]:
# (Re)create the Pinecone index that will hold the review embeddings
from pinecone import ServerlessSpec

# Cloud and region come from the environment when set, otherwise fall back to defaults
cloud = os.getenv('PINECONE_CLOUD') or 'aws'
region = os.getenv('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

embed_dim = 768
index_name = "sephora-review-index-test"

# An index with the same name is removed first so the new one starts empty
existing_index_names = pc.list_indexes().names()
if index_name in existing_index_names:
    pc.delete_index(index_name)

# Create a new Pinecone index
pc.create_index(index_name, dimension=embed_dim, metric='cosine', spec=spec)

In [None]:
# Get a handle to the Pinecone index named by index_name and display its current stats.
index = pc.Index(index_name)  # handle used by the later upsert and query cells
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
# Display the reviews selected for upserting once more before writing to the index
data

Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
0,1741593524,5,1.0,1.0,2,0,2,2023-02-01,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0
155579,28113452634,3,0.0,1.0,2,0,2,2022-07-10,This is just okay. I seem to break out more wi...,Better toners out there,lightMedium,green,dry,brown,P475630,Mini Mandelic Acid + Superfood Unity Exfoliant,Youth To The People,16.0
155595,7814134331,5,1.0,1.0,2,0,2,2022-06-01,"i love this, it did slightly burn the first co...",yesyesyehysyesyes,mediumTan,brown,combination,black,P475630,Mini Mandelic Acid + Superfood Unity Exfoliant,Youth To The People,16.0
155593,10150716568,3,0.0,1.0,7,0,7,2022-06-02,I have bought this a week ago and did not noti...,,fair,blue,combination,brown,P475630,Mini Mandelic Acid + Superfood Unity Exfoliant,Youth To The People,16.0
155590,25983950393,5,1.0,1.0,4,0,4,2022-06-05,I absolutely love this exfoliant. It has mande...,A must have!,light,green,combination,brown,P475630,Mini Mandelic Acid + Superfood Unity Exfoliant,Youth To The People,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155358,12803545459,5,1.0,1.0,1,0,1,2022-07-30,Better than Laneige in my opinion! Smells good...,Favorite lip mask,fair,blue,dry,brown,P474832,Sugar Recovery Lip Mask Advanced Therapy,fresh,28.0
155357,2790422977,2,0.0,1.0,8,0,8,2022-08-07,"Pros: not as sticky as LaneigeSweet, soft inof...",,fair,blue,normal,brown,P474832,Sugar Recovery Lip Mask Advanced Therapy,fresh,28.0
155356,21662643936,3,1.0,1.0,1,0,1,2022-08-15,"I LOVE their Advanced Therapy Lip Treatment, I...","Just Okay, Their other products are better",lightMedium,brown,combination,brown,P474832,Sugar Recovery Lip Mask Advanced Therapy,fresh,28.0
155355,7737064238,3,1.0,1.0,5,0,5,2022-08-16,"This is my second time buying this lip mask, I...",Good but new formula?,light,hazel,combination,brown,P474832,Sugar Recovery Lip Mask Advanced Therapy,fresh,28.0


In [None]:
# Upsert the vectors to Pinecone vector DB (all are training vectors)
import pinecone

# Pinecone expects (id, vector, metadata) tuples; the three lists are aligned by position
vectors_to_upsert = list(zip(ids_list, embedding_list, metadata_list))

# Send the tuples to the index in slices of batch_size
batch_size = 200
for start in range(0, len(vectors_to_upsert), batch_size):
    index.upsert(vectors=vectors_to_upsert[start : start + batch_size])
print("Upserted all chunked data to Pinecone!")

Upserted all chunked data to Pinecone!


# Vector Query and Similarity Verification

### Create a separate test dataset and embed the test review data

In [None]:
# Take the 20 reviews ranked right after the 280 that were upserted.
# kind="stable" keeps tied helpfulness scores in their existing order; the default sort
# is not stable, so re-sorting could shuffle tied rows and pull already-upserted
# reviews into the test slice.
test_df = filtered_reviews.sort_values(by="helpfulness", ascending=False, kind="stable").iloc[280:300]

# The test reviews must not be among the reviews stored in the index
assert test_df.index.intersection(data.index).empty, "test reviews overlap the upserted reviews"
test_df

Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
222329,11034513066,5,1.0,1.0,8,0,8,2022-04-26,Got this during spring sale and wow!!! I have ...,OMG THIS!!!!!!,medium,brown,combination,brown,P475194,Brightening Treatment Drops Triple Vitamin C S...,TULA Skincare,56.0
222328,26468684015,4,1.0,1.0,8,0,8,2022-05-04,I got the opportunity to try this product and ...,,light,brown,normal,brown,P475194,Brightening Treatment Drops Triple Vitamin C S...,TULA Skincare,56.0
222217,8509768424,5,1.0,1.0,3,0,3,2021-11-14,Amazingly moisturizing and a must-have for whe...,,mediumTan,,oily,,P397623,Dermask Water Jet Vital Hydra Solution,Dr. Jart+,7.0
222216,5984133905,5,1.0,1.0,2,0,2,2021-12-17,Love this as a once in a while item to add int...,My fav hydrating mask,light,hazel,dry,brown,P397623,Dermask Water Jet Vital Hydra Solution,Dr. Jart+,7.0
222215,1441152211,5,1.0,1.0,3,0,3,2021-12-19,My go to mask for extra hydration! After using...,Super Boost of Hydration,lightMedium,brown,combination,brown,P397623,Dermask Water Jet Vital Hydra Solution,Dr. Jart+,7.0
222155,825169105,2,1.0,1.0,1,0,1,2023-03-06,Product is ok. I have to use is throughout the...,container broke,fair,blue,combination,blonde,P404794,Hydrating Oil Stick,MILK MAKEUP,32.0
222223,5280923172,5,1.0,1.0,3,0,3,2022-11-03,So glad I decided to buy this! This has improv...,Holy grail!,medium,hazel,combination,black,P402014,Blue Moon Clean-Rinse Cleansing Balm,Sunday Riley,50.0
222228,12354217483,3,0.0,1.0,1,0,1,2023-02-22,Probably more a general rule with stronger vit...,DO NOT USE AFTER EXFOLIATION,fair,blue,normal,blonde,P475194,Brightening Treatment Drops Triple Vitamin C S...,TULA Skincare,56.0
222168,5129094264,2,0.0,1.0,4,0,4,2022-03-09,The bottom of the stick was broken and I can’t...,Need to return for faulty bottom that was broken,fair,hazel,dry,brown,P404794,Hydrating Oil Stick,MILK MAKEUP,32.0
222120,8476149396,5,1.0,1.0,1,0,1,2022-10-20,I received this cleansing foam from Influenste...,A must try cleansing foam!,fair,hazel,oily,red,P476731,Mini Gentle Cleansing Foam Hydrating Makeup Re...,Sulwhasoo,12.0


In [None]:
# Test reviews are embedded whole (no chunking), using the same embedding model
# as the vectors stored in the index
test_embeddings = [
    {
        "id": str(row_id),
        "text": review,
        "vector": embeddings.embed_query(review)
    }
    for row_id, review in test_df["review_text"].items()
]

In [None]:
# Sanity check: number of embedded test reviews
len(test_embeddings)

20

In [None]:
# Demo: query the index with the first 5 test embeddings and show the 3 nearest chunks
for test_item in test_embeddings[:5]:
    results = index.query(vector=test_item["vector"], top_k=3, include_metadata=True)

    print(f"\n=== Test doc ID: {test_item['id']} ===")
    print("Original text:", test_item["text"])
    for hit in results.matches:
        print("Match ID:", hit.id)
        print("Score:", hit.score)
        # Metadata carries the stored fields such as review_text and product_name
        print("Metadata:", hit.metadata)


=== Test doc ID: 222329 ===
Original text: Got this during spring sale and wow!!! I have used every other vitamin c in Sephora and nothing compares to this!!! Give this a shot and you will not regret incredibly bright even skin tone with no dark spots!!!
Match ID: 155054
Score: 0.708327234
Metadata: {'brand_name': 'Lancôme', 'price_usd': 140.0, 'product_name': 'Rénergie H.C.F. Triple Serum: For Plumping & Anti-Aging', 'rating': 5.0, 'review_text': 'Love, love this product!! The cream/jel combination goes on smoothly, the scent light and refreshing. Immediately my skin had a beautiful “glow“ that lasted for hours, looked well hydrated and fresh!!  My dark spots look lighter!  I  will continue with this awesome serum, am beyond pleased.'}
Match ID: 154964
Score: 0.700173855
Metadata: {'brand_name': 'Lancôme', 'price_usd': 140.0, 'product_name': 'Rénergie H.C.F. Triple Serum: For Plumping & Anti-Aging', 'rating': 4.0, 'review_text': 'Great Serum. My face looks clear and hydrated. I would

In [None]:
# Similarity search for a free-text question instead of a held-out review
user_query = "What skincare products are good for oily skin?"
query_vector = embeddings.embed_query(user_query)

# Ask the index for the 3 closest chunks together with their stored metadata
results = index.query(vector=query_vector, top_k=3, include_metadata=True)

print("\n=== User Query ===")
print(user_query)
for hit in results.matches:
    print("Match ID:", hit.id)
    print("Score:", hit.score)
    print("Metadata:", hit.metadata)


=== User Query ===
What skincare products are good for oily skin?
Match ID: 155590
Score: 0.811426342
Metadata: {'brand_name': 'Youth To The People', 'price_usd': 16.0, 'product_name': 'Mini Mandelic Acid + Superfood Unity Exfoliant', 'rating': 5.0, 'review_text': 'a moisturizer. A great product to have in your skincare routine.'}
Match ID: 155598
Score: 0.786289632
Metadata: {'brand_name': 'Youth To The People', 'price_usd': 16.0, 'product_name': 'Mini Mandelic Acid + Superfood Unity Exfoliant', 'rating': 5.0, 'review_text': 'Love the ingredients and what this has done for my skin! Has helped reduce texture and breakouts on my skin. Use with a small piece of gauze after washing my face before my serum and moisturizer.'}
Match ID: 155521
Score: 0.784392834
Metadata: {'brand_name': 'Sunday Riley', 'price_usd': 72.0, 'product_name': 'Juno Antioxidant + Superfood Face Oil', 'rating': 5.0, 'review_text': 'away. ( this oil works really good for me) I have dry skin and I don’t need to wear 