# CONTENT BASED

In [None]:
import pandas as pd
import re
from textblob import TextBlob
from sklearn.model_selection import train_test_split

# ----------------------------
# 1. Load & filter skincare products
# ----------------------------
product_info = pd.read_csv("data/OriDataSet/product_info.csv", low_memory=False)

# Normalise the join key: force string dtype and trim surrounding whitespace
product_info['product_id'] = product_info['product_id'].astype(str).str.strip()

# Whitelists for the second and third category levels
allowed_secondary_categories = [
    'Moisturizers', 'Treatments', 'Eye Care', 'Lip Balms & Treatments',
    'Sunscreen', 'Cleansers', 'Masks'
]
allowed_tertiary_categories = [
    'Moisturizers', 'Face Serums', 'Eye Creams & Treatments', 'Face Sunscreen',
    'Face Wash & Cleansers', 'Face Oils', 'Toners', 'Face Masks', 'Facial Peels',
    'Exfoliators', 'Eye Masks', 'Face Wipes', 'Blemish & Acne Treatments',
    'Night Creams', 'Mists & Essences', 'Sheet Masks', 'Makeup Removers'
]

# One boolean mask per category level; a product is kept only if it passes all three
is_skincare = product_info['primary_category'].str.lower() == 'skincare'
has_allowed_secondary = product_info['secondary_category'].isin(allowed_secondary_categories)
has_allowed_tertiary = product_info['tertiary_category'].isin(allowed_tertiary_categories)

skincare_products_df = product_info[
    is_skincare & has_allowed_secondary & has_allowed_tertiary
].copy()

# Optional: persist the filtered product table
skincare_products_df.to_csv("data/CleanedDataSet/filtered_skincare_products.csv", index=False)

# ----------------------------
# 2. Load & filter reviews
# ----------------------------
review_files = [
    "data/OriDataSet/reviews_0-250.csv",
    "data/OriDataSet/reviews_250-500.csv",
    "data/OriDataSet/reviews_500-750.csv",
    "data/OriDataSet/reviews_750-1250.csv",
    "data/OriDataSet/reviews_1250-end.csv"
]

skincare_product_ids = set(skincare_products_df['product_id'].unique())
all_reviews = []

for file in review_files:
    # low_memory=False matches the other read_csv calls in this file and makes
    # pandas infer each column's dtype from the whole file, not chunk by chunk
    df = pd.read_csv(file, low_memory=False)

    # Drop the exported row-number column if it is present
    if "Unnamed: 0" in df.columns:
        df = df.drop(columns=["Unnamed: 0"])

    # Same key normalisation as the product table, then keep skincare reviews only
    df['product_id'] = df['product_id'].astype(str).str.strip()
    skincare_reviews = df[df['product_id'].isin(skincare_product_ids)].copy()
    all_reviews.append(skincare_reviews)

# Merge all reviews
merged_reviews_df = pd.concat(all_reviews, ignore_index=True)

# Add sequential review_id (1-based position in the merged frame)
merged_reviews_df['review_id'] = merged_reviews_df.index + 1

# Drop rows missing any essential field
merged_reviews_df = merged_reviews_df.dropna(
    subset=["author_id", "product_id", "rating", "review_text", "skin_type"]
)

# Keep valid ratings (1 to 5, both ends included)
merged_reviews_df = merged_reviews_df[merged_reviews_df["rating"].between(1, 5)]

# Remove duplicates. review_id is unique per row by construction, so it has to
# be left out of the comparison -- with it included no two rows can ever be
# equal and nothing would be removed.
dedupe_columns = [col for col in merged_reviews_df.columns if col != "review_id"]
merged_reviews_df = merged_reviews_df.drop_duplicates(subset=dedupe_columns).copy()

# ----------------------------
# 3. Clean review text
# ----------------------------
def clean_text(text):
    """Normalise a review to lowercase ASCII letters separated by single spaces.

    The input is passed through ``str()`` first, so non-string values are
    cleaned as their string form. Every character other than ``a``-``z`` and
    whitespace is deleted (not replaced by a space), then whitespace runs are
    collapsed and the ends trimmed.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r"[^a-z\s]", "", lowered)
    single_spaced = re.sub(r"\s+", " ", letters_only)
    return single_spaced.strip()

merged_reviews_df["cleaned_review_text"] = merged_reviews_df["review_text"].apply(clean_text)

# ----------------------------
# 4. Add combined features to products
# ----------------------------
def _optional_text_column(frame, column):
    """Return frame[column] with NaN replaced by "", or an all-"" Series if the column is absent."""
    if column in frame.columns:
        return frame[column].fillna("")
    # The fallback must share the frame's index: string concatenation aligns on
    # index labels, and a one-element default Series would turn nearly every
    # combined value into NaN.
    return pd.Series("", index=frame.index)


# Space-separated concatenation; the trailing space after the last field is
# kept so the stored text is unchanged when all columns are present.
skincare_products_df["combined_features"] = (
    skincare_products_df["brand_name"].fillna("") + " " +
    skincare_products_df["tertiary_category"].fillna("") + " " +
    skincare_products_df["product_name"].fillna("") + " " +
    _optional_text_column(skincare_products_df, "ingredients") + " " +
    _optional_text_column(skincare_products_df, "highlights") + " "
)


# ----------------------------
# 5. Merge reviews with product info
# ----------------------------
# Drop the review-side name columns (if present) so the merge below does not
# produce suffixed duplicates of brand_name / product_name
review_side_duplicates = ["brand_name", "product_name"]
merged_reviews_df = merged_reviews_df.drop(columns=review_side_duplicates, errors="ignore")

# Product attributes attached to every review row
product_columns = ["product_id", "brand_name", "product_name", "combined_features"]
product_lookup = skincare_products_df[product_columns]

# Inner join: reviews without a matching product row are discarded
combined_df = merged_reviews_df.merge(product_lookup, on="product_id", how="inner")

# ----------------------------
# 6. Sentiment analysis
# ----------------------------
def analyze_sentiment(text):
    """Return the TextBlob polarity of ``text``.

    Missing values and anything that is not a ``str`` score a neutral 0.0.
    """
    is_missing = pd.isna(text)
    if is_missing or not isinstance(text, str):
        return 0.0

    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score every cleaned review
cleaned_reviews = combined_df["cleaned_review_text"]
combined_df["sentiment_score"] = cleaned_reviews.apply(analyze_sentiment)

# ----------------------------
# 7. Train/test split
# ----------------------------
# 80/20 row-level split with a fixed seed
train_df, test_df = train_test_split(combined_df, test_size=0.2, random_state=42)

# ----------------------------
# 8. Save final datasets
# ----------------------------
# A PermissionError is reported and skipped rather than aborting the run;
# other errors propagate.
try:
    combined_df.to_csv("data/CleanedDataSet/combined_skincare_with_sentiment.csv", index=False)
    print("Combined dataset saved successfully")
except PermissionError:
    print("Could not save combined dataset CSV (file may be open)")

# Train and test share one try block, so a failure on the train file also
# skips writing the test file
try:
    train_df.to_csv("data/CleanedDataSet/train_skincare.csv", index=False)
    test_df.to_csv("data/CleanedDataSet/test_skincare.csv", index=False)
    print("Train/test datasets saved successfully")
except PermissionError:
    print("Could not save train/test CSV files (files may be open)")

# Final summary
print("Full preprocessing and train/test split completed!")
print(f"Combined dataframe shape: {combined_df.shape}")
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

In [None]:
import os
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import joblib
import matplotlib.pyplot as plt

# Fetch the "punkt" tokenizer data that word_tokenize relies on (quiet: no progress output).
# NOTE(review): recent NLTK releases may look for "punkt_tab" instead -- confirm
# against the installed NLTK version.
nltk.download("punkt", quiet=True)

# Paths
# Training split read by load_dataset()
TRAIN_PATH = "data/CleanedDataSet/train_skincare.csv"
# Directory that holds the saved model and embeddings
MODEL_DIR = "models"
# Default target of train_w2v()
MODEL_PATH = os.path.join(MODEL_DIR, "word2vec_skincare.model")
# Default target of save_embeddings()
EMBEDDINGS_PATH = os.path.join(MODEL_DIR, "product_embeddings.pkl")

# ---------------- 1) Load dataset ----------------
def load_dataset(path):
    """Read the CSV at ``path`` and add a ``tokens`` column.

    Rows with a missing ``combined_features`` value are dropped. ``tokens`` is
    the lowercased ``combined_features`` text run through ``word_tokenize``.
    """
    frame = pd.read_csv(path, low_memory=False)
    frame = frame.dropna(subset=["combined_features"]).copy()

    lowered_features = frame["combined_features"].astype(str).str.lower()
    frame["tokens"] = lowered_features.apply(word_tokenize)
    return frame

# ---------------- 2) Extract tertiary category ----------------
def extract_tertiary(feature_str):
    """
    Guess a product's tertiary category from its combined-features text.

    The text is lowercased and scanned for keywords in a fixed priority
    order; the category of the first keyword found as a substring is
    returned, or 'Other' when none matches. Non-string input is converted
    with ``str()`` first.
    """
    feature_str = str(feature_str).lower()

    # (keyword, category) pairs in priority order -- first match wins.
    # 'eye mask' and 'sheet mask' must come before the generic 'mask':
    # any text containing them also contains 'mask', so listed after it
    # they could never be selected.
    keyword_categories = (
        ('moisturizer', 'Moisturizers'),
        ('serum', 'Face Serums'),
        ('eye cream', 'Eye Creams & Treatments'),
        ('treatment', 'Blemish & Acne Treatments'),
        ('lip balm', 'Lip Balms & Treatments'),
        ('sunscreen', 'Face Sunscreen'),
        ('cleanser', 'Face Wash & Cleansers'),
        ('face wash', 'Face Wash & Cleansers'),
        ('oil', 'Face Oils'),
        ('toner', 'Toners'),
        ('eye mask', 'Eye Masks'),
        ('sheet mask', 'Sheet Masks'),
        ('mask', 'Face Masks'),
        ('peel', 'Facial Peels'),
        ('exfoliator', 'Exfoliators'),
        ('wipe', 'Face Wipes'),
        ('night cream', 'Night Creams'),
        ('mist', 'Mists & Essences'),
        ('essence', 'Mists & Essences'),
        ('makeup remover', 'Makeup Removers'),
    )

    for keyword, category in keyword_categories:
        if keyword in feature_str:
            return category
    return 'Other'

# ---------------- 3) Train Word2Vec ----------------
def train_w2v(token_lists, model_path=MODEL_PATH):
    """Train a skip-gram Word2Vec model on ``token_lists`` and save it.

    Args:
        token_lists: the tokenised sentences passed to Word2Vec as ``sentences``.
        model_path: file the trained model is saved to; its parent directory
            is created when needed.

    Returns:
        The trained Word2Vec model.
    """
    # A bare file name has an empty dirname, and os.makedirs("") raises;
    # only create the directory when there is one to create.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    model = Word2Vec(
        sentences=token_lists,
        vector_size=100,
        window=5,
        min_count=2,
        workers=4,
        sg=1,       # Skip-gram
        epochs=80
    )
    model.save(model_path)
    print(f"Word2Vec model saved at: {model_path}")
    return model

# ---------------- 4) Build embeddings ----------------
def sentence_vec(tokens, model):
    """Average the word vectors of the tokens that ``model.wv`` knows.

    Tokens missing from ``model.wv`` are skipped. When no token is known the
    result is a zero vector of length ``model.vector_size``.
    """
    known_vectors = []
    for token in tokens:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

def build_embeddings(df, model):
    """Collapse ``df`` to one row per product and embed each product.

    If ``df`` has no ``tertiary_category`` column, one is derived from
    ``combined_features`` and written onto ``df`` itself.

    Returns:
        (prod, emb): ``prod`` is the per-product frame with an added
        ``embedding`` column; ``emb`` is those embeddings stacked row-wise
        into a single array.
    """
    # Make sure df carries a tertiary_category column before grouping
    if 'tertiary_category' not in df.columns:
        df['tertiary_category'] = df['combined_features'].astype(str).apply(extract_tertiary)

    # Every carried column uses the "first" aggregate within its product group
    carried_columns = ["brand_name", "product_name", "price_usd", "tokens", "tertiary_category"]
    first_of_each = {column: "first" for column in carried_columns}
    prod = df.groupby("product_id", as_index=False).agg(first_of_each)

    def embed(toks):
        return sentence_vec(toks, model)

    prod["embedding"] = prod["tokens"].apply(embed)
    emb = np.vstack(prod["embedding"].values)
    return prod, emb

# ---------------- 5) Save embeddings ----------------
def save_embeddings(prod_df, prod_embeds, path=EMBEDDINGS_PATH):
    """Dump ``(prod_df, prod_embeds)`` as one tuple to ``path`` with joblib.

    The parent directory of ``path`` is created when needed.
    """
    # A bare file name has an empty dirname, and os.makedirs("") raises;
    # only create the directory when there is one to create.
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    joblib.dump((prod_df, prod_embeds), path)
    print(f"Embeddings saved at: {path}")

# ---------------- 6) Run pipeline ----------------
# Pipeline entry point: load the training split, train Word2Vec on its tokens,
# build one embedding per product and save both artefacts.
if __name__ == "__main__":
    df = load_dataset(TRAIN_PATH)
    # Derive the category heuristically from the feature text; this overwrites
    # any tertiary_category column the CSV may already carry.
    df['tertiary_category'] = df['combined_features'].astype(str).apply(extract_tertiary)

    w2v = train_w2v(df["tokens"].tolist())
    # prod_df / prod_embeds stay at module level and are passed to
    # recommend_same_category() in the example usage further down.
    prod_df, prod_embeds = build_embeddings(df, w2v)
    save_embeddings(prod_df, prod_embeds)

    print(f"{len(prod_df)} unique products embedded successfully")

Word2Vec model saved at: models\word2vec_skincare.model
Embeddings saved at: models\product_embeddings.pkl
1760 unique products embedded successfully


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Recommendation function: content-based + same tertiary category
def recommend_same_category(prod_df, prod_embeds, product_id, top_n=10):
    """Recommend the products most similar to ``product_id`` within its tertiary category.

    Similarity is the cosine similarity between the ``embedding`` column
    values of ``prod_df``; ``prod_embeds`` is accepted but not read.

    Returns:
        Up to ``top_n`` rows (target excluded) ordered by descending
        ``similarity_score``; an empty frame with the same columns when the
        category holds at most one product.

    Raises:
        ValueError: if ``product_id`` is not in ``prod_df`` or no
            ``tertiary_category`` value is available for it.
    """
    output_columns = [
        "product_id", "brand_name", "product_name", "price_usd", "tertiary_category", "similarity_score"
    ]

    # Locate the query product
    is_target = prod_df["product_id"] == product_id
    target = prod_df[is_target]
    if target.empty:
        raise ValueError(f"Product ID {product_id} not found.")

    # Read its tertiary category (None when the column does not exist)
    tc = None
    if "tertiary_category" in target:
        tc = target["tertiary_category"].values[0]
    if tc is None:
        raise ValueError(f"Product ID {product_id} has no tertiary_category information.")

    # Candidate pool: every product sharing that category (target included)
    same_category = prod_df["tertiary_category"] == tc
    candidates = prod_df[same_category]

    # Nothing to recommend when the pool is just the target itself
    if len(candidates) <= 1:
        return pd.DataFrame(columns=list(output_columns))

    # Cosine similarity of the target against every candidate
    query = target["embedding"].values[0].reshape(1, -1)
    candidate_matrix = np.vstack(candidates["embedding"].values)
    scores = cosine_similarity(query, candidate_matrix)[0]

    # Attach scores, remove the target, rank best-first
    ranked = candidates.copy()
    ranked["similarity_score"] = scores
    ranked = ranked[ranked["product_id"] != product_id]
    ranked = ranked.sort_values("similarity_score", ascending=False)

    return ranked.head(top_n)[output_columns]

# -------- Example usage --------
# Uses the prod_df / prod_embeds globals produced by the embedding pipeline
# above; raises ValueError if product "P454095" is not in prod_df.
recs = recommend_same_category(prod_df, prod_embeds, product_id="P454095", top_n=5)
print(recs)

Loading data...


  df = pd.read_csv("data/CleanedDataSet/combined_skincare_with_sentiment.csv")


=== Correct Content-Based Evaluation ===
Average recommendation similarity: 0.940
Similarity > 0.8: 100.0%
Category consistency: 50.0%
Diversity (cross-category recommendations): 82.0%

=== Recommendation Example Test ===

1. Recommendations for product 'HydraKate Recharging Water Cream Moisturizer':
   → Water Drench Hyaluronic Cloud Rich Barrier Moisturizer (similarity: 0.950)
   → The ZenBubble Gel Cream (similarity: 0.942)

2. Recommendations for product 'High-Potency Night-a-Mins Resurfacing Cream with Fruit-Derived AHAs':
   → High-Potency Night-A-Mins Oil-Free Resurfacing Cream with Fruit Derived AHAs (similarity: 0.988)
   → Plantscription Youth-Renewing Power Night Cream (similarity: 0.958)

3. Recommendations for product 'All About Clean Liquid Facial Soap':
   → All About Clean 2-in-1 Cleansing + Exfoliating Jelly (similarity: 0.946)
   → All About Clean Foaming Facial Soap (similarity: 0.920)


# Surprise SVD

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from surprise import Dataset, Reader, SVD, accuracy, dump
from surprise.model_selection import train_test_split
import os

print("Using Surprise-based collaborative filtering...")

# ---------------- 1) Load data ----------------
df = pd.read_csv("data/CleanedDataSet/combined_skincare_with_sentiment.csv", low_memory=False)

# recommend_products_surprise() looks users and products up as strings
# (str(user_id), str(product_id)), so the ids the model is trained on must be
# strings too; a purely numeric id column would otherwise be parsed as int
# and never match.
df["author_id"] = df["author_id"].astype(str)
df["product_id"] = df["product_id"].astype(str)

print(f"Data loaded: {df.shape}")
print("Columns available:", df.columns.tolist())

# ---------------- 2) Prepare final rating ----------------
if "sentiment_score" in df.columns:
    # Normalize sentiment_score (-1 to 1) → (0 to 5)
    df["sentiment_normalized"] = (df["sentiment_score"] + 1) * 2.5
    # Blend: 70% star rating, 30% normalised review sentiment
    df["final_rating"] = 0.7 * df["rating"] + 0.3 * df["sentiment_normalized"]
else:
    df["final_rating"] = df["rating"]

print(f"Final rating range: {df['final_rating'].min():.2f} to {df['final_rating'].max():.2f}")

# ---------------- 3) Build Surprise dataset ----------------
# The blended rating can fall below 1 (a 1-star review with fully negative
# sentiment gives 0.7). Widen the declared scale to cover the observed data
# so that it matches the values actually fed to the model.
rating_floor = min(1.0, float(df["final_rating"].min()))
rating_ceiling = max(5.0, float(df["final_rating"].max()))
reader = Reader(rating_scale=(rating_floor, rating_ceiling))
data = Dataset.load_from_df(df[["author_id", "product_id", "final_rating"]], reader)

# ---------------- 4) Train-test split ----------------
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# ---------------- 5) Train SVD model ----------------
print("Training Surprise SVD model...")
model = SVD(
    n_factors=50,   # latent dimensions
    lr_all=0.005,   # learning rate
    reg_all=0.02,   # regularization
    random_state=42
)
model.fit(trainset)

# ---------------- 6) Save Model ----------------
print("Saving model...")
model_path = "models/surprise_svd_model.pkl"
# Create directory if not exists
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump.dump(model_path, algo=model)
print(f"✅ Model saved to {model_path}")

# ---------------- 7) Evaluate with RMSE ----------------
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print(f"✅ Model evaluation completed. Test RMSE: {rmse:.4f}")

# ---------------- 8) Top-N helper functions ----------------
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        n: maximum number of items kept per user.

    Returns:
        A defaultdict mapping each uid to a list of ``(iid, est)`` pairs,
        highest estimate first, at most ``n`` long.
    """
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        top_n[uid].append((iid, est))

    # Sort each user's items by estimate and keep the best n. Only existing
    # keys are reassigned, so iterating over the dict here is safe.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n


def precision_recall_f1_coverage(predictions, n=10, threshold=3.5):
    """
    Compute precision, recall, F1 and coverage at Top-N recommendations.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        n: size of each user's recommendation list.
        threshold: an item is relevant for a user when its true rating is
            greater than or equal to this value.

    Returns:
        ``(precision, recall, f1, coverage)``. Precision and recall are means
        over users (users with no recommended / no relevant items are left out
        of the respective mean). Coverage is the share of distinct items in
        ``predictions`` that appear in at least one user's Top-N. Every value
        is 0 when ``predictions`` is empty.
    """
    top_n = get_top_n(predictions, n=n)

    # Single pass: collect each user's relevant items and the item universe,
    # instead of rescanning every prediction once per user.
    relevant_by_user = defaultdict(set)
    all_items = set()
    for uid, iid, true_r, _est, _details in predictions:
        all_items.add(iid)
        if true_r >= threshold:
            relevant_by_user[uid].add(iid)

    precisions = []
    recalls = []
    all_recommended_items = set()

    for uid, user_ratings in top_n.items():
        # Relevant items for this user (ground truth)
        true_items = relevant_by_user.get(uid, set())
        recommended_items = {iid for (iid, _est) in user_ratings}

        all_recommended_items.update(recommended_items)

        n_rel = len(true_items)  # relevant
        n_rec_k = len(recommended_items)  # recommended in top N
        n_rel_and_rec_k = len(true_items & recommended_items)

        if n_rec_k > 0:
            precisions.append(n_rel_and_rec_k / n_rec_k)
        if n_rel > 0:
            recalls.append(n_rel_and_rec_k / n_rel)

    precision = np.mean(precisions) if precisions else 0
    recall = np.mean(recalls) if recalls else 0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0

    # Coverage = proportion of unique items ever recommended
    # (guarded so an empty prediction list yields 0 rather than dividing by zero)
    coverage = len(all_recommended_items) / len(all_items) if all_items else 0

    return precision, recall, f1, coverage

# ---------------- 9) Compute evaluation metrics ----------------
# Top-5 metrics over the held-out predictions; an item counts as relevant when
# its true (blended) rating is at least 3.5
precision, recall, f1, coverage = precision_recall_f1_coverage(predictions, n=5, threshold=3.5)

# Report each metric to four decimal places
print("\n📊 Evaluation Metrics (Top-5):")
print(f"   Precision: {precision:.4f}")
print(f"   Recall:    {recall:.4f}")
print(f"   F1-score:  {f1:.4f}")
print(f"   Coverage:  {coverage:.4f}")

# ---------------- 10) Cold Start Helper Functions ----------------
def get_popular_products(df, top_n=10, min_ratings=5):
    """Rank products by popularity for the cold-start fallback.

    Counts and averages ``final_rating`` per ``product_id`` (rounded to two
    decimals), keeps products with at least ``min_ratings`` ratings and orders
    them by rating count, then average rating, both descending.

    Returns:
        The first ``top_n`` rows, with columns ``product_id``,
        ``rating_count`` and ``avg_rating``.
    """
    per_product = (
        df.groupby('product_id')['final_rating']
        .agg(rating_count='count', avg_rating='mean')
        .round(2)
        .reset_index()
    )

    # Keep only products with enough ratings
    enough_ratings = per_product['rating_count'] >= min_ratings
    ranked = per_product[enough_ratings].sort_values(
        ['rating_count', 'avg_rating'], ascending=False
    )

    return ranked.head(top_n)

def format_popular_recommendations(popular_products, df):
    """Print the popular-product fallback list and return it.

    Args:
        popular_products: frame with ``product_id``, ``rating_count`` and
            ``avg_rating`` columns, already in display order.
        df: frame supplying ``product_name`` and ``brand_name`` per
            ``product_id``.

    Returns:
        A list of ``(product_id, avg_rating)`` tuples, one per printed product.
    """
    # One descriptive row per product, the same way recommend_products_surprise
    # builds its lookup. De-duplicating on all three columns would list a
    # product once per distinct name/brand spelling.
    product_info = df.drop_duplicates("product_id")[["product_id", "product_name", "brand_name"]]
    result = pd.merge(popular_products, product_info, on='product_id')

    print("\n🔥 Popular Products (Cold Start Fallback):")
    print("-" * 60)
    recommendations = []

    # Number the entries with an explicit counter rather than the index label
    for rank, (_, row) in enumerate(result.iterrows(), start=1):
        print(f"{rank}. {row['product_name']}")
        print(f"   Brand: {row['brand_name']}")
        print(f"   Avg Rating: {row['avg_rating']} ({row['rating_count']} ratings)")
        print()
        recommendations.append((row['product_id'], row['avg_rating']))

    return recommendations

# ---------------- 11) Enhanced Recommendation function with Cold Start ----------------
def recommend_products_surprise(user_id, df, model, top_n=5):
    """Recommend top N products with cold start handling.

    Args:
        user_id: user to recommend for; compared as a string.
        df: ratings frame with ``author_id``, ``product_id``,
            ``product_name`` and ``brand_name`` columns.
        model: object whose ``predict(user_id, product_id)`` returns a
            prediction exposing ``.est``.
        top_n: number of recommendations to return.

    Returns:
        A list of ``(product_id, score)`` tuples, best first. For a user with
        no rows in ``df`` the popular-products fallback list is returned
        instead.
    """
    user_id = str(user_id)

    # Compare ids as strings: a numeric author_id column would otherwise never
    # equal the stringified user_id and every user would look like a new one.
    is_user = df["author_id"].astype(str) == user_id

    # Check if user exists in dataset (cold start detection)
    if not is_user.any():
        print(f"🎯 User {user_id} not found - using popular products fallback")
        popular_products = get_popular_products(df, top_n)
        return format_popular_recommendations(popular_products, df)

    # Candidate items are all products the user has not rated yet; a set keeps
    # the membership test cheap inside the loop.
    all_products = df["product_id"].unique()
    user_products = set(df.loc[is_user, "product_id"])

    predictions = []
    for product_id in all_products:
        if product_id in user_products:
            continue
        try:
            pred = model.predict(user_id, str(product_id))
            predictions.append((product_id, pred.est))
        except Exception:
            # Best effort: skip an item whose prediction fails, but let
            # KeyboardInterrupt / SystemExit through.
            continue

    # Sort by predicted rating
    top_predictions = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_n]

    # Get product details
    product_info = df.drop_duplicates("product_id")[["product_id", "product_name", "brand_name"]]

    print(f"\n🎯 Top {top_n} Recommendations for User {user_id}:")
    print("-" * 60)
    for i, (product_id, predicted_rating) in enumerate(top_predictions, 1):
        product_row = product_info[product_info["product_id"] == product_id]
        if not product_row.empty:
            name = product_row["product_name"].values[0]
            brand = product_row["brand_name"].values[0]
            print(f"{i}. {name}")
            print(f"   Brand: {brand}")
            print(f"   Predicted Rating: {predicted_rating:.2f}")
        else:
            print(f"{i}. Product {product_id} - Details not available")

    return top_predictions

# ---------------- 12) Test recommendation with both existing and new user ----------------
# Exercise both paths of the recommender: a user that exists in the data and
# an id that does not (cold start)
if df["author_id"].empty:
    print("No users found in dataset")
else:
    # Known user: the author of the first row
    sample_user = df["author_id"].iloc[0]
    print("Testing with existing user:")
    recs = recommend_products_surprise(sample_user, df, model, top_n=5)

    # Unknown user: should take the popular-products fallback
    print("\n" + "="*60)
    print("Testing Cold Start with new user:")
    new_user = "new_user_12345"
    new_user_recs = recommend_products_surprise(new_user, df, model, top_n=5)