In [3]:
!pip install -q sentence-transformers faiss-cpu pandas numpy

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import pandas as pd
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
import faiss

# Read the product catalogue from disk; the cells below work on `df`.
df = pd.read_csv("product_info_skincare.csv")

# Sanity report: (rows, columns) followed by the list of column names.
print("✅ Dataset Loaded:", df.shape)
print("Columns:", list(df.columns))

# Preview the first five rows as the cell's rich output.
df.head(5)

✅ Dataset Loaded: (1813, 28)
Columns: ['Unnamed: 0', 'product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count', 'rating', 'reviews', 'size', 'variation_type', 'variation_value', 'variation_desc', 'ingredients', 'price_usd', 'value_price_usd', 'sale_price_usd', 'limited_edition', 'new', 'online_only', 'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category', 'secondary_category', 'tertiary_category', 'child_count', 'child_max_price', 'child_min_price']


Unnamed: 0.1,Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,...,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count,child_max_price,child_min_price
0,18,P483068,ABBOTT Sampler Set,6485,ABBOTT,4493,4.8163,49.0,,,...,0,1,0,"['Vegan', 'Woody & Earthy Scent', 'Clean + Pla...",Fragrance,Value & Gift Sets,Perfume Gift Sets,0,,
1,47,P474806,Blue Tansy Reparative Mask,6321,adwoa beauty,14660,4.7581,492.0,16 oz/ 453 mg,Size,...,0,0,1,"['Good for: Damage', 'Good for: Color Care', '...",Hair,Hair Styling & Treatments,Hair Masks,0,,
2,48,P457233,Baomint Leave In Conditioning Styler,6321,adwoa beauty,13333,4.3472,144.0,14 oz/ 414 mL,Size,...,0,0,1,"['Clean at Sephora', 'All Hair Types', 'Curl-E...",Hair,Hair Styling & Treatments,Leave-In Conditioner,1,13.0,13.0
3,49,P474808,Blue Tansy Leave in Conditioning Styler,6321,adwoa beauty,11674,4.5762,210.0,14 oz/ 414 mL,Size,...,0,0,0,"['Good for: Damage', 'Vegan', 'Clean at Sephor...",Hair,Hair Styling & Treatments,Leave-In Conditioner,0,,
4,50,P457234,Baomint Moisturizing Shampoo,6321,adwoa beauty,11122,4.1324,136.0,14 oz/ 414 mL,Size,...,0,0,1,"['Unisex/ Genderless Scent', 'Clean at Sephora...",Hair,Shampoo & Conditioner,Shampoo,1,12.0,12.0


In [7]:
# Columns that are joined, in this order, into the text that gets embedded.
description_fields = ['product_name', 'ingredients', 'primary_category', 'highlights']

# Missing values are replaced by empty strings before the pieces are joined.
field_text = [df[field].fillna('') for field in description_fields]

# Concatenate left to right with " | " between consecutive fields.
description = field_text[0]
for extra in field_text[1:]:
    description = description + " | " + extra
df['description'] = description

print("✅ Created description column for embeddings")

✅ Created description column for embeddings


In [8]:
# Load the sentence-embedding model used for both products and queries.
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Model Loaded")

# Encode every product description into one NumPy matrix (one row per product).
descriptions = df['description'].astype(str).tolist()
product_embeddings = model.encode(descriptions, show_progress_bar=True, convert_to_numpy=True)
print("✅ Embeddings Shape:", product_embeddings.shape)

# Build a flat L2 index whose width matches the embedding width, then add
# all product vectors so that index position i corresponds to row i of `df`.
dimension = product_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(product_embeddings)
print("✅ FAISS index is ready with", index.ntotal, "items")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model Loaded


Batches:   0%|          | 0/57 [00:00<?, ?it/s]

✅ Embeddings Shape: (1813, 384)
✅ FAISS index is ready with 1813 items


In [9]:
# Persist the three artifacts listed in the summary below: the raw embedding
# matrix, the FAISS index, and the catalogue including its description column.
with open("product_embeddings.pkl", "wb") as embeddings_file:
    pickle.dump(product_embeddings, embeddings_file)

faiss.write_index(index, "product_faiss.index")
df.to_csv("product_info_processed.csv", index=False)

# Emit the summary in a single call; each entry is printed on its own line.
print(
    "✅ Files saved for Streamlit:",
    "   - product_embeddings.pkl",
    "   - product_faiss.index",
    "   - product_info_processed.csv",
    sep="\n",
)

✅ Files saved for Streamlit:
   - product_embeddings.pkl
   - product_faiss.index
   - product_info_processed.csv


In [10]:
def recommend_products(query, top_n=5):
    """Return the catalogue rows whose embeddings are closest to ``query``.

    Relies on three notebook-level objects: ``model`` (encodes the query text),
    ``index`` (searched for nearest neighbours) and ``df`` (position ``i`` of
    the index is looked up as row ``i`` via ``iloc``).

    Args:
        query: Free-text search string; it is encoded with ``model.encode``.
        top_n: Maximum number of products to return. Values larger than the
            number of indexed items are clamped to that number; values <= 0
            produce an empty result instead of reaching the index.

    Returns:
        A DataFrame with at most ``top_n`` rows, in the order returned by
        ``index.search``. It contains whichever of ``product_name``,
        ``price_usd`` and ``rating`` exist in ``df``, plus a ``distance``
        column holding the value the index reported for each hit.
    """
    # Never ask the index for more neighbours than it holds.
    k = min(int(top_n), int(index.ntotal))

    if k > 0:
        query_embedding = model.encode([query], convert_to_numpy=True)
        distances, indices = index.search(query_embedding, k)
        # FAISS fills unused result slots with id -1; `iloc[-1]` would turn
        # those into the last catalogue row, so drop them explicitly.
        found = indices[0] >= 0
        positions = indices[0][found]
        hit_distances = distances[0][found]
    else:
        positions, hit_distances = [], []

    results = df.iloc[positions].copy()
    results['distance'] = hit_distances

    display_cols = [c for c in ['product_name', 'price_usd', 'rating', 'distance'] if c in results.columns]
    return results[display_cols]

# Ask for a free-text search, echo it, then show the five closest products
# as the cell's rich output.
test_query = input("search: ")
print("🔹 Top 5 Recommendations for:", test_query)
top_matches = recommend_products(query=test_query, top_n=5)
top_matches

search: hair oil
🔹 Top 5 Recommendations for: hair oil


Unnamed: 0,product_name,price_usd,rating,distance
1350,Argan Hair & Scalp Oil Blend,25.0,4.3333,0.640322
1432,Omega 9 Hair Mask,58.0,3.1975,0.711461
201,Scalp Revival Charcoal + Coconut Oil Micro-exf...,15.0,3.6965,0.749986
194,"Don't Despair, Repair! Strengthening Treatment...",30.0,4.1202,0.75377
277,Hairdresser's Invisible Oil Soft Texture Finis...,34.0,4.4762,0.761017
