## Content-based Filtering Recommender

### Download Dataset from Kaggle

In [9]:
import os
import zipfile
import pandas as pd

# Download using Kaggle API
os.system('kaggle datasets download -d nadyinky/sephora-products-and-skincare-reviews')

# Unzip the downloaded dataset
with zipfile.ZipFile('sephora-products-and-skincare-reviews.zip', 'r') as zip_ref:
    zip_ref.extractall('sephora_dataset')

# Load product information
products_df = pd.read_csv("sephora_dataset/product_info.csv")

print(products_df.head()) # for checking purpose 

  product_id               product_name  brand_id brand_name  loves_count  \
0    P473671    Fragrance Discovery Set      6342      19-69         6320   
1    P473668    La Habana Eau de Parfum      6342      19-69         3827   
2    P473662  Rainbow Bar Eau de Parfum      6342      19-69         3253   
3    P473660       Kasbah Eau de Parfum      6342      19-69         3018   
4    P473658  Purple Haze Eau de Parfum      6342      19-69         2691   

   rating  reviews            size                      variation_type  \
0  3.6364     11.0             NaN                                 NaN   
1  4.1538     13.0  3.4 oz/ 100 mL  Size + Concentration + Formulation   
2  4.2500     16.0  3.4 oz/ 100 mL  Size + Concentration + Formulation   
3  4.4762     21.0  3.4 oz/ 100 mL  Size + Concentration + Formulation   
4  3.2308     13.0  3.4 oz/ 100 mL  Size + Concentration + Formulation   

  variation_value  ... online_only out_of_stock  sephora_exclusive  \
0             NaN  ...

### Filter Skincare Product only in primary_category, secondary_category, and tertiary_category columns
(remove makeup, fragrance, bath&body, hair, teeth, supplement, etc products)

In [10]:
# Filter only 'Skincare' at primary_category 
skincare_products_df = products_df[products_df['primary_category'].str.lower() == 'skincare'].copy()

# Keep only specific secondary categories
allowed_secondary_categories = [
    'Moisturizers', 'Treatments', 'Eye Care', 'Lip Balms & Treatments',
    'Sunscreen', 'Cleansers', 'Masks'
]

skincare_products_df = skincare_products_df[skincare_products_df['secondary_category'].isin(allowed_secondary_categories)].copy()

# Keep only selected tertiary categories
allowed_tertiary_categories = [
    'Moisturizers', 'Face Serums', 'Eye Creams & Treatments', 'Face Sunscreen',
    'Face Wash & Cleansers', 'Face Oils', 'Toners', 'Face Masks', 'Facial Peels',
    'Exfoliators', 'Eye Masks', 'Face Wipes', 'Blemish & Acne Treatments',
    'Night Creams', 'Mists & Essences', 'Sheet Masks', 'Makeup Removers'
]

skincare_products_df = skincare_products_df[skincare_products_df['tertiary_category'].isin(allowed_tertiary_categories)].copy()

# Save to new CSV file
skincare_products_df.to_csv("filtered_skincare_products.csv", index=False)
print("Filtered products dataset saved as 'filtered_skincare_products.csv'")

Filtered products dataset saved as 'filtered_skincare_products.csv'


In [11]:
# Load filtered skincare products IDs
filtered_products = pd.read_csv("filtered_skincare_products.csv")
skincare_product_ids = set(filtered_products['product_id'].unique())

import glob

# List of file paths
review_files = [
    "sephora_dataset/reviews_0-250.csv",
    "sephora_dataset/reviews_250-500.csv",
    "sephora_dataset/reviews_500-750.csv",
    "sephora_dataset/reviews_750-1250.csv",
    "sephora_dataset/reviews_1250-end.csv"
]

# Initialize empty list to store filtered reviews
all_filtered_reviews = []

for file in review_files:
    reviews_df = pd.read_csv(file)
    
    skincare_reviews = reviews_df[reviews_df['product_id'].isin(skincare_product_ids)].copy()
    all_filtered_reviews.append(skincare_reviews)

# Merge all and save as one new file
merged_reviews_df = pd.concat(all_filtered_reviews, ignore_index=True)
merged_reviews_df.to_csv("filtered_skincare_reviews.csv", index=False)

print(f"\nFinal merged reviews file saved as: filtered_skincare_reviews.csv")
print(f"Total reviews: {len(merged_reviews_df)}")

  reviews_df = pd.read_csv(file)
  reviews_df = pd.read_csv(file)
  reviews_df = pd.read_csv(file)



Final merged reviews file saved as: filtered_skincare_reviews.csv
Total reviews: 895583


### Preprocessing

In [12]:
import re
import ast
import unicodedata
import pandas as pd

# Input and output file paths
IN_PATH  = "filtered_skincare_products.csv"   # raw filtered file
OUT_PATH = "products_preprocessed.csv"        # final preprocessed file
SHOW_DEBUG = True

# ------------------------------
# Basic helpers
# ------------------------------
def norm_text(s):
    # Normalize text: lowercase, strip, collapse whitespace
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    return re.sub(r"\s+", " ", s.lower().strip())

def map_with_table(text, table):
    # Match text against regex patterns in a table, return first label found
    if not text:
        return None
    for patt, label in table:
        if re.search(patt, text, flags=re.IGNORECASE):
            return label
    return None

# ------------------------------
# Product-type rules 
# ------------------------------
# Map regex patterns from categories/names to standardized product types
TERTIARY_MAP = [
    (r"\beye (creams?(\s*&\s*treatments?)?|masks?|care)\b", "eye treatment"),
    (r"\bface serums?\b", "serum"),
    (r"\bserums?\b", "serum"),
    (r"\bface oils?\b|\bfacial oils?\b", "face oil"),
    (r"\bface sunscreen\b|\bsunscreens?\b|\bsun care\b|\bspf\b", "sunscreen"),
    (r"\bmoisturizers?\b|\bnight creams?\b|\bgel[- ]?creams?\b|\bface creams?\b|\blotions?\b|\bemulsions?\b", "moisturizer"),
    (r"\bface masks?\b|\bsheet masks?\b|\bsleeping masks?\b|\bovernight masks?\b|\bclay masks?\b|\bmud masks?\b", "face mask"),
    (r"\bface wipes\b|\bmakeup removers?\b", "makeup remover"),
    (r"\bcleansers?\b|\bface wash\b|\bmicellar\b", "cleanser"),
    (r"\btoners?\b|\bmists? & essences?\b|\bessences?\b", "toner"),
    (r"\bblemish\s*&\s*acne treatments?\b|\bspot (treatments?|correctors?)\b|\bacne spot\b", "spot treatment"),
    (r"\bexfoliators?\b|\bexfoliants?\b|\bpeels?\b|\bscrubs?\b|\b(aha|bha|pha)\b", "exfoliator"),
]
SECONDARY_MAP = TERTIARY_MAP  # Reuse same mapping for secondary

# Fallback rules if product type not found in categories
NAME_RULES = [
    (r"\beye (cream|serum|gel|balm|mask|treatment)\b", "eye treatment"),
    (r"\bface oil|facial oil|night oil|firming oil\b|\boil\b(?!\s*cleanser)", "face oil"),
    (r"\bface wipes\b|\bmakeup remover\b", "makeup remover"),
    (r"\b(cleanser|cleansing|face wash|micellar|balm cleanser|oil cleanser)\b", "cleanser"),
    (r"\b(toner|tonique)\b|\bmists?\b|\bessence\b(?!\s*serum)", "toner"),
    (r"\b(serum|ampoule|booster)\b(?!\s*mask)|\bessence serum\b", "serum"),
    (r"\b(sunscreen|sun screen|spf)\b", "sunscreen"),
    (r"\b(mask|sheet mask|sleeping mask|wash[- ]?off mask|overnight mask|mud mask|clay mask)\b", "face mask"),
    (r"\b(spot (treatment|corrector)|acne spot|blemish treatment)\b", "spot treatment"),
    (r"\b(exfoliator|exfoliant|peel|aha|bha|pha|scrub)\b", "exfoliator"),
    (r"\b(moisturizer|moisturiser|cream|lotion|emulsion|gel[- ]?cream|face cream)\b", "moisturizer"),
]

def detect_product_type(row):
    # Infer product type from tertiary > secondary > name (priority order)
    name = norm_text(row.get("product_name", ""))
    sec  = norm_text(row.get("secondary_category", ""))
    ter  = norm_text(row.get("tertiary_category", ""))

    # Tertiary category first
    label = map_with_table(ter, TERTIARY_MAP)
    if label: return label

    # Secondary (skip if too generic like 'treatment')
    is_generic = bool(re.fullmatch(r"(treatment|treatments)", sec.strip()))
    if not is_generic:
        label = map_with_table(sec, SECONDARY_MAP)
        if label: return label

    # Fallback: check product name
    label = map_with_table(name, NAME_RULES)
    if label: return label

    return "other"

# ------------------------------
# Skin type + concern rules 
# ------------------------------
# Regex patterns to detect skin types from text
SKIN_TYPE_PATTERNS = [
    (r"\b(?:good|best)\s*for:\s*oily\b", "oily"),
    (r"\b(?:good|best)\s*for:\s*dry\b", "dry"),
    (r"\b(?:good|best)\s*for:\s*combination\b", "combination"),
    (r"\b(?:good|best)\s*for:\s*sensitive\b", "sensitive"),
    (r"\b(?:good|best)\s*for:\s*normal\b", "normal"),
    (r"\b(oily skin|oily)\b", "oily"),
    (r"\b(dry skin|dry)\b", "dry"),
    (r"\b(combination skin|combination|combo)\b", "combination"),
    (r"\b(sensitive skin|sensitive)\b", "sensitive"),
    (r"\b(normal skin|normal)\b", "normal"),
    (r"\bfor\s+sensitive\s+skin\b", "sensitive"),
    (r"\bsuitable\s+for\s+sensitive\b", "sensitive"),
    (r"\bfor\s+sensitive\b", "sensitive"),
    (r"\bhypoallergenic\b", "sensitive"),
    (r"\bgentle\b", "sensitive"),
]

# Regex patterns to detect skin concerns from claims or product name
SKIN_CONCERN_PATTERNS = [
    (r"\b(acne|blemish|breakout|pimple)\b", "acne"),
    (r"\bpores?\b", "pores"),
    (r"\b(dark spot|hyperpigment|discoloration|melasma)\b", "hyperpigmentation"),
    (r"\b(wrinkle|fine line|anti[- ]?aging|firming|loss of firmness|elasticity)\b", "aging"),
    (r"\b(redness|rosacea|irritation|calming|soothing)\b", "redness"),
    (r"\b(dryness|dehydration|hydrating|moisturizing|moisturising|barrier)\b", "dehydration"),
    (r"\b(dull(ness)?|brighten(ing)?|glow|radiance)\b", "dullness"),
    (r"\boil(y| control|iness)\b", "oil-control"),
    (r"\b(blackhead|whitehead|congestion)\b", "blackheads"),
    (r"\b(uneven (tone|texture)|texture|resurfacing)\b", "texture"),
    (r"\b(dark circle|dark circles)\b", "dark-circles"),
]

# Ingredient-based concern mapping
INGREDIENT_CONCERN_PATTERNS = [
    (r"\b(salicylic acid|beta hydroxy|bha|willow bark|benzoyl peroxide|sulfur|zinc pca|zinc)\b", {"acne","pores","oil-control"}),
    (r"\b(kaolin|bentonite|clay|charcoal)\b", {"pores","oil-control"}),
    (r"\b(tea tree|melaleuca)\b", {"acne"}),
    (r"\b(hyaluronic acid|sodium hyaluronate|glycerin|panthenol|urea|betaine|trehalose|aloe)\b", {"dehydration"}),
    (r"\b(ceramide|ceramides|cholesterol|squalane|squalene|shea|shea butter)\b", {"dehydration"}),
    (r"\b(retinol|retinal|retinoate|bakuchiol|peptide|matrixyl|collagen|coenzyme ?q10|ubiquinone)\b", {"aging"}),
    (r"\b(vitamin ?c|ascorbic|ascorbyl|ethyl ascorbic|magnesium ascorbyl|sodium ascorbyl|alpha arbutin|tranexamic|azelaic|kojic|licorice|glycyrrhiza)\b", {"hyperpigmentation","dullness"}),
    (r"\b(centella|cica|madecassoside|asiaticoside|allantoin|bisabolol|beta glucan|green tea|oat|colloidal oatmeal)\b", {"redness"}),
    (r"\b(aha|glycolic|lactic|mandelic|tartaric|citric|pha|gluconolactone|lactobionic)\b", {"texture","dullness"}),
]

def to_list_from_highlights(val):
    # Parse highlights column into list (safe eval or regex split)
    if pd.isna(val):
        return []
    s = str(val).strip()
    try:
        parsed = ast.literal_eval(s)
        if isinstance(parsed, list):
            return [str(x) for x in parsed]
    except Exception:
        pass
    return [t.strip() for t in re.split(r"[,\|;/]+", s) if t.strip()]

def extract_skin_types_from_highlights_and_name(highlights_list, product_name):
    # Detect skin types from highlights + product name
    stypes = set()
    for raw in highlights_list:
        t = norm_text(raw)
        for patt, lab in SKIN_TYPE_PATTERNS:
            if re.search(patt, t):
                stypes.add(lab)
    pname = norm_text(product_name)
    if re.search(r"\bsensitive\b", pname):
        stypes.add("sensitive")
    return ",".join(sorted(stypes))

def extract_concerns_from_highlights_and_name(highlights_list, product_name):
    # Detect skin concerns from highlights + product name
    concerns = set()
    for raw in highlights_list:
        t = norm_text(raw)
        for patt, lab in SKIN_CONCERN_PATTERNS:
            if re.search(patt, t):
                concerns.add(lab)
    pname = norm_text(product_name)
    for patt, lab in SKIN_CONCERN_PATTERNS:
        if re.search(patt, pname):
            concerns.add(lab)
    return concerns

def extract_concerns_from_ingredients(ingredients_text):
    # Detect skin concerns directly from ingredient list
    concerns = set()
    ing = norm_text(ingredients_text)
    for patt, labs in INGREDIENT_CONCERN_PATTERNS:
        if re.search(patt, ing):
            concerns |= labs
    return concerns

# ------------------------------
# Text cleaning for vectorization
# ------------------------------

STOPWORDS = {
    "a","an","the","and","or","but","to","of","for","on","in","into","with","by","from","as",
    "it","its","this","that","these","those","is","are","was","were","be","been","being",
    "at","up","down","over","under","about","than","then","so","such","very","more","most",
    "less","least","you","your","yours","we","our","ours","they","their","theirs","he","she","his","her","them","i","me","my","mine"
}

PHRASES = [
    (r"\bhyaluronic acid\b", "hyaluronic_acid"),
    (r"\bsodium hyaluronate\b", "sodium_hyaluronate"),
    (r"\bsalicylic acid\b", "salicylic_acid"),
    (r"\bbenzoyl peroxide\b", "benzoyl_peroxide"),
    (r"\balpha hydroxy\b", "alpha_hydroxy"),
    (r"\bbeta hydroxy\b", "beta_hydroxy"),
    (r"\bpolyhydroxy\b", "polyhydroxy"),
    (r"\bvitamin c\b", "vitamin_c"),
    (r"\bvitamin e\b", "vitamin_e"),
    (r"\bcoenzyme q10\b|\bubiquinone\b", "coenzyme_q10"),
    (r"\btea tree\b", "tea_tree"),
    (r"\bcentella asiatica\b|\bcica\b", "centella_asiatica"),
    (r"\bgreen tea\b", "green_tea"),
    (r"\bniacinamide\b", "niacinamide"),
    (r"\bretino(l|id|ate|nal)\b", "retinoid"),
    (r"\baha\b", "aha"),
    (r"\bbha\b", "bha"),
    (r"\bpha\b", "pha"),
    (r"\bspf\s*\d+\b", lambda m: "spf_" + re.sub(r"\D", "", m.group(0))),
]

def deaccent(s):
    # Remove accents/diacritics from text 
    return unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")

def strip_html(s):
    # Remove HTML tags
    return re.sub(r"<[^>]+>", " ", s)

def basic_normalize(s):
    # General normalization: lowercase, strip HTML, remove symbols/numbers
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    s = s.lower()
    s = strip_html(s)
    s = deaccent(s)
    s = re.sub(r"[\/\+]", " ", s)                # split slashes/plus
    s = re.sub(r"[^a-z0-9\-\_\s]", " ", s)       # keep hyphen/underscore
    s = re.sub(r"\b\d+\b", " ", s)               # drop standalone numbers
    s = re.sub(r"\s+", " ", s).strip()
    return s

def apply_phrases(s):
    # Replace key phrases with normalized tokens
    for patt, repl in PHRASES:
        s = re.sub(patt, repl if isinstance(repl, str) else repl, s, flags=re.IGNORECASE)
    return s

def clean_text(s):
    # Full text cleaning pipeline for vectorization
    s = "" if s is None else str(s)
    s = apply_phrases(s)
    s = basic_normalize(s)
    toks = s.split()
    toks = [t for t in toks if (t not in STOPWORDS and len(t) > 1)]
    # Remove duplicates while keeping word order
    seen, out = set(), []
    for t in toks:
        if t not in seen:
            out.append(t); seen.add(t)
    return " ".join(out)

# ------------------------------
# Build combined content
# ------------------------------
def combine_categories(row):
    # Combine primary + secondary + tertiary categories into one string
    parts = [
        str(row.get("primary_category", "") or ""),
        str(row.get("secondary_category", "") or ""),
        str(row.get("tertiary_category", "") or "")
    ]
    return " ".join([p for p in parts if p]).strip()

def parse_highlights_text(val):
    # Convert highlights list into plain text string
    lst = to_list_from_highlights(val)
    return " ".join(lst)

def build_product_content(row):
    # Build final product_content field with weighted chunks
    chunks = [
        row.get("clean_product_name", ""),
        row.get("clean_brand_name", ""),
        row.get("clean_categories", ""),
        row.get("clean_highlights", ""),
        row.get("clean_ingredients", ""),
        row.get("clean_product_type", ""),
        row.get("clean_skin_type", ""),
        row.get("clean_skin_concern", ""),
    ]

    # Weight ingredients higher (add twice more)
    ingredients = row.get("clean_ingredients", "")
    chunks += [ingredients] * 2

    return " ".join([c for c in chunks if c]).strip()

# ------------------------------
# Main pipeline
# ------------------------------
def main():
    # Load dataset
    df = pd.read_csv(IN_PATH, encoding="utf-8")

    # Defensive defaults: ensure required columns exist
    for c in ["product_id","product_name","brand_name","ingredients","highlights",
              "primary_category","secondary_category","tertiary_category","price_usd", "rating", "reviews"]:
        if c not in df.columns:
            df[c] = ""

    # (A) RULE-BASED ENRICHMENT 
    df["product_type"] = df.apply(detect_product_type, axis=1)  # infer product type
    df["_highlights_list"] = df["highlights"].apply(to_list_from_highlights)  # parse highlights
    df["skin_type"] = df.apply(  # infer skin type
        lambda r: extract_skin_types_from_highlights_and_name(r["_highlights_list"], r["product_name"]),
        axis=1
    )
    df["ingredients"] = df["ingredients"].fillna("").astype(str)  # ensure string ingredients

    # Skin concern = union of claims/name + ingredients
    concerns_from_claims = df.apply(
        lambda r: extract_concerns_from_highlights_and_name(r["_highlights_list"], r["product_name"]),
        axis=1
    )
    concerns_from_ings = df["ingredients"].apply(extract_concerns_from_ingredients)
    df["skin_concern"] = [
        ",".join(sorted(set(a) | set(b)))
        for a, b in zip(concerns_from_claims, concerns_from_ings)
    ]

    # Convert price to numeric
    df["price_usd"] = pd.to_numeric(df["price_usd"], errors="coerce")

    df["rating"] = pd.to_numeric(df.get("rating"), errors="coerce")
    df["reviews"] = pd.to_numeric(df.get("reviews"), errors="coerce")

    # (B) TEXT CLEANING + CONTENT 
    df["_categories_raw"] = df.apply(combine_categories, axis=1)
    df["_highlights_text"] = df["highlights"].apply(parse_highlights_text)

    # Clean text fields
    df["clean_product_name"] = df["product_name"].apply(clean_text)
    df["clean_brand_name"]   = df["brand_name"].apply(clean_text)
    df["clean_categories"]   = df["_categories_raw"].apply(clean_text)
    df["clean_highlights"]   = df["_highlights_text"].apply(clean_text)
    df["clean_ingredients"]  = df["ingredients"].apply(clean_text)
    df["clean_product_type"] = df["product_type"].apply(clean_text)
    df["clean_skin_type"]    = df["skin_type"].apply(clean_text)
    df["clean_skin_concern"] = df["skin_concern"].apply(clean_text)

    # Build final product_content field (for TF-IDF)
    df["product_content"] = df.apply(build_product_content, axis=1)

    # Save final
    keep_cols = [
        "product_id","product_name","brand_name","ingredients","highlights",
        "primary_category","secondary_category","tertiary_category","price_usd", "rating", "reviews", 
        "product_type","skin_type","skin_concern","product_content"
    ]
    for c in keep_cols:
        if c not in df.columns:
            df[c] = ""
    df[keep_cols].to_csv(OUT_PATH, index=False)

    # Debug sample output
    if SHOW_DEBUG:
        print(f"Final products preprocessed file saved as: {OUT_PATH}")
        print("Sample product_content:")
        print(df["product_content"].head(5).to_string(index=False))

if __name__ == "__main__":
    main()

Final products preprocessed file saved as: products_preprocessed.csv
Sample product_content:
genius sleeping collagen moisturizer algenist s...
genius liquid collagen serum algenist skincare ...
triple algae eye renewal balm cream algenist sk...
sublime defense ultra lightweight uv fluid spf_...
genius ultimate anti-aging cream algenist skinc...


### TF-IDF Vectorizer

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load preprocessed data
df = pd.read_csv("products_preprocessed.csv")

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['product_content'])

# for checking purpose
print(df[['product_name', 'product_content']].head())
print("TF-IDF shape: " + str(tfidf_matrix.shape))
print("Feature names: " + str(vectorizer.get_feature_names_out()[:50]))


                                        product_name  \
0               GENIUS Sleeping Collagen Moisturizer   
1                       GENIUS Liquid Collagen Serum   
2            Triple Algae Eye Renewal Balm Eye Cream   
3  SUBLIME DEFENSE Ultra Lightweight UV Defense F...   
4                   GENIUS Ultimate Anti-Aging Cream   

                                     product_content  
0  genius sleeping collagen moisturizer algenist ...  
1  genius liquid collagen serum algenist skincare...  
2  triple algae eye renewal balm cream algenist s...  
3  sublime defense ultra lightweight uv fluid spf...  
4  genius ultimate anti-aging cream algenist skin...  
TF-IDF shape: (1803, 4872)
Feature names: ['100h' '10eicosanedioate' '14m' '15ml' '1st' '20k' '24k' '25th' '360o'
 '3d' '40b' '5x' '72h' '7m' 'aa' 'abeille' 'abeitate' 'abelmo'
 'abelmoschus' 'abies' 'abietata' 'abrial' 'absinthium' 'absolue'
 'absolute' 'absorbing' 'absorption' 'abyssinica' 'ac' 'acacia' 'acacial'
 'acai' 'acanthi

### Recommending Skincare Products

In [14]:
import re
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def contentbased_recommender(
    product_type=None,
    skin_type=None,
    skin_concern=None,
    concern_match="all",
    max_price=None,
    n=10
):
    # ---- helpers ----
    def _to_set(x):
        if x is None or (isinstance(x, float) and pd.isna(x)):
            return set()
        if isinstance(x, (list, tuple, set)):
            return {str(t).strip().lower() for t in x if str(t).strip()}
        return {t.strip().lower() for t in re.split(r"[;,/|]", str(x)) if t.strip()}

    req_type    = str(product_type).strip().lower() if product_type else None
    req_skin    = str(skin_type).strip().lower()    if skin_type    else None
    req_concern = _to_set(skin_concern)

    # ---- build profile text for TF-IDF ----
    tokens = []
    if req_type:    tokens.append(req_type)
    if req_skin:    tokens.append(req_skin)
    if req_concern: tokens.extend(req_concern)
    profile_text = " ".join(tokens).strip() or "skincare"

    # vectorize profile
    qv = vectorizer.transform([profile_text])
    sims = cosine_similarity(qv, tfidf_matrix).ravel()

    # numeric columns
    price_col   = pd.to_numeric(df.get("price_usd", np.nan), errors="coerce")
    rating_col  = pd.to_numeric(df.get("rating", np.nan), errors="coerce").fillna(0.0)
    reviews_col = pd.to_numeric(df.get("reviews", 0), errors="coerce").fillna(0).astype(int)

    # filter candidates
    rows = []
    for i, sim in enumerate(sims):
        row = df.iloc[i]

        # product type filter
        if req_type and str(row.get("product_type","")).strip().lower() != req_type:
            continue

        # skin type filter
        row_skin = str(row.get("skin_type","")).strip().lower()
        if req_skin and row_skin != req_skin:
            continue

        # skin concern filter
        row_concern = _to_set(row.get("skin_concern",""))
        if req_concern:
            if concern_match == "all":
                if not req_concern.issubset(row_concern):
                    continue
            else:  # "any"
                if row_concern.isdisjoint(req_concern):
                    continue

        # price filter
        p = price_col.iat[i]
        if max_price is not None and (pd.isna(p) or p > float(max_price)):
            continue

        rows.append({
            "product_name": row.get("product_name",""),
            "brand_name": row.get("brand_name",""),
            "product_type": row.get("product_type",""),
            "skin_type": row.get("skin_type",""),
            "skin_concern": row.get("skin_concern",""),
            "price_usd": row.get("price_usd",""),
            "rating": rating_col.iat[i],
            "reviews": reviews_col.iat[i],
            "similarity": float(sim)
        })

    out = pd.DataFrame(rows)
    if out.empty:
        return out

    # final ranking
    out = out.sort_values(
        by=["similarity","rating","reviews"],
        ascending=[False, False, False]
    ).head(n)

    out["similarity"] = out["similarity"].round(4)
    return out


In [15]:
#test
display(contentbased_recommender(
    product_type="moisturizer",
    skin_type="dry",
    skin_concern={"dehydration"},
    concern_match="all",
    max_price=100,
    n=20
))

display(contentbased_recommender(
    product_type="eye treatment",
    skin_concern={"dark-circles","texture"},
    concern_match="all",
    max_price=120,
    n=10
))


Unnamed: 0,product_name,brand_name,product_type,skin_type,skin_concern,price_usd,rating,reviews,similarity
25,SuperSolutions 10% Urea Moisturizer Textured S...,The INKEY List,moisturizer,dry,"dehydration,dullness,redness,texture",19.99,4.1935,62,0.1331
21,Hydra Prep Soothing & Hydrating Gel,Shani Darden Skin Care,moisturizer,dry,"dehydration,redness",38.0,0.0,0,0.1112
9,Hydra Life Intense Sorbet Crème Moisturizer,Dior,moisturizer,dry,dehydration,85.0,5.0,3,0.1054
8,Intensive Moisture Balance Moisturizer,Dermalogica,moisturizer,dry,"dehydration,dullness,hyperpigmentation,redness...",47.0,4.4667,150,0.0918
15,Priming Moisturizer Rich Face Cream with Ceram...,Glossier,moisturizer,dry,"dehydration,dullness,texture",35.0,4.05,20,0.0902
3,Vinosource-Hydra SOS Intense Hydration Moistur...,Caudalie,moisturizer,dry,"dehydration,dullness,texture",45.0,4.0697,502,0.0886
12,Ultra Repair Cream Intense Hydration,First Aid Beauty,moisturizer,dry,"dehydration,dullness,hyperpigmentation,redness",38.0,4.52,7539,0.0868
23,Superscreen Daily Moisturizer Sunscreen SPF 40...,Supergoop!,moisturizer,dry,dehydration,44.0,3.6997,646,0.0865
19,Overnight Glow Dark Spot Sleeping Cream,REN Clean Skincare,moisturizer,dry,"aging,dehydration,dullness,hyperpigmentation,r...",55.0,4.1408,476,0.0821
13,Rich Bitch Cactus + Vitamin C Moisturizer,Freck Beauty,moisturizer,dry,"dehydration,dullness,hyperpigmentation,redness",34.0,4.3784,222,0.0799


Unnamed: 0,product_name,brand_name,product_type,skin_type,skin_concern,price_usd,rating,reviews,similarity
53,Lift & Contour 1% Bakuchiol & Peptide Eye Serum,Wishful,eye treatment,,"aging,dark-circles,dehydration,dullness,rednes...",45.0,4.0,26,0.1833
38,White Lucent Anti-Dark Circles Eye Cream,Shiseido,eye treatment,,"aging,dark-circles,dehydration,dullness,hyperp...",65.0,3.5878,393,0.1813
1,Bright Eyes Collagen-Infused Brightening Collo...,BeautyBio,eye treatment,,"aging,dark-circles,dehydration,dullness,texture",40.0,4.0594,202,0.1806
37,Benefiance WrinkleResist24 Pure Retinol Expres...,Shiseido,eye treatment,,"aging,dark-circles,dehydration,dullness,hyperp...",70.0,4.422,500,0.167
35,Brightening Dark Circle Eye Cream,REN Clean Skincare,eye treatment,,"dark-circles,dehydration,dullness,redness,texture",51.0,4.3027,370,0.1654
48,Brighten-i Eye Cream,The INKEY List,eye treatment,,"dark-circles,dehydration,dullness,texture",12.99,3.7617,1137,0.1647
25,Powerful-Strength Dark Circle Reducing Vitamin...,Kiehl's Since 1851,eye treatment,,"aging,dark-circles,dehydration,dullness,hyperp...",55.0,3.8646,1041,0.1616
32,C5 Super Boost Vitamin C Eye Cream,Paula's Choice,eye treatment,,"aging,dark-circles,dehydration,dullness,hyperp...",39.0,4.5,340,0.1519
29,Banana Bright+ Vitamin CC Eye Sticks,OLEHENRIKSEN,eye treatment,,"dark-circles,dehydration,dullness,hyperpigment...",34.0,4.604,298,0.1481
33,Instant FIRMx Eye Temporary Eye Tightener,Peter Thomas Roth,eye treatment,,"aging,dark-circles,dehydration,dullness,hyperp...",38.0,3.3254,627,0.148


### Evaluation metrics

In [None]:
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.metrics import mean_squared_error

df = pd.read_csv("products_preprocessed.csv")

def evaluate_mse_rmse_skin(
    reviews,
    top_n=5,
    sample_size=500,
    liked_threshold=3.5
):

    # 1) Basic cleaning
    for col in ["author_id", "product_id", "rating"]:
        if col not in reviews.columns:
            raise ValueError(f"Missing column in reviews: {col}")

    reviews = reviews.copy()
    reviews["rating"] = pd.to_numeric(reviews["rating"], errors="coerce")
    reviews = reviews.dropna(subset=["author_id", "product_id", "rating"])

    # 2) Keep only products that exist in df
    if "product_id" not in df.columns:
        df["product_id"] = np.arange(len(df))
    valid_ids = set(df["product_id"])
    reviews = reviews[reviews["product_id"].isin(valid_ids)]

    # 3) Build user -> liked items (users need ≥2 likes)
    likes = (reviews[reviews["rating"] >= liked_threshold]
             .groupby("author_id")["product_id"].apply(list).to_dict())
    eligible = [(u, lst) for u, lst in likes.items() if len(set(lst)) >= 2]
    if not eligible:
        print("No eligible users for evaluation.")
        return None

    # sample users (fully random each run)
    if sample_size and sample_size < len(eligible):
        idx = np.random.choice(len(eligible), size=sample_size, replace=False)
        eligible = [eligible[i] for i in idx]

    # 4) Evaluate: leave one liked item out
    y_true, y_pred, skipped = [], [], 0

    for user, liked in eligible:
        liked = [pid for pid in liked if pid in valid_ids]
        if len(liked) < 2:
            skipped += 1
            continue

        # hold out one liked product
        q = np.random.choice(liked)
        qrow = df.loc[df["product_id"] == q]
        if qrow.empty:
            skipped += 1
            continue
        qrow = qrow.iloc[0]

        # get recommendations from profile
        recs = contentbased_recommender(
            product_type=qrow.get("product_type"),
            skin_type=qrow.get("skin_type"),
            skin_concern=qrow.get("skin_concern").split(",") if isinstance(qrow.get("skin_concern"), str) else None,
            concern_match="all",
            max_price=qrow.get("price_usd"),
            n=top_n
        )
        if recs is None or recs.empty:
            skipped += 1
            continue

        # predict = similarity-weighted mean of rec ratings
        r_ratings = pd.to_numeric(recs.get("rating"), errors="coerce").fillna(0).to_numpy()
        r_sims    = pd.to_numeric(recs.get("similarity"), errors="coerce").fillna(0).to_numpy()
        if r_ratings.size == 0 or r_sims.sum() == 0:
            skipped += 1
            continue
        pred_rating = float(np.average(r_ratings, weights=np.clip(r_sims, 1e-6, None)))

        # true = this user's rating of held-out product
        held = reviews.loc[(reviews["author_id"] == user) & (reviews["product_id"] == q), "rating"]
        if held.empty:
            skipped += 1
            continue

        y_true.append(float(held.iloc[0]))
        y_pred.append(pred_rating)

    if not y_true:
        print("No valid predictions made.")
        return None

    mse_val  = mean_squared_error(y_true, y_pred)
    rmse_val = sqrt(mse_val)

    print(f"\n=== MSE/RMSE Evaluation (n={len(y_true)}, skipped={skipped}) ===")
    print(f"MSE:  {mse_val:.4f}")
    print(f"RMSE: {rmse_val:.4f}")

    return {"mse": mse_val, "rmse": rmse_val, "n_evaluated": len(y_true), "skipped": skipped}


In [None]:
reviews = pd.read_csv("filtered_skincare_reviews.csv")

evaluate_mse_rmse_skin(reviews, top_n=5, sample_size=100, liked_threshold=4)


  reviews = pd.read_csv("filtered_skincare_reviews.csv")



=== MSE/RMSE Evaluation (n=100, skipped=0) ===
MSE:  0.4853
RMSE: 0.6966


{'mse': 0.48530767418723575,
 'rmse': 0.6966402760300583,
 'n_evaluated': 100,
 'skipped': 0}