In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------------
# Sample data schemas (replace with your actual data)
# -----------------------------

# Products: dairy catalog
products = pd.DataFrame([
    {"product_id": 1, "name": "Whole Milk 1L", "category": "Milk", "fat_content": "Full", "brand": "FarmFresh",
     "price": 60, "tags": "milk full-fat pasteurized cow"},
    {"product_id": 2, "name": "Toned Milk 1L", "category": "Milk", "fat_content": "Toned", "brand": "FarmFresh",
     "price": 55, "tags": "milk toned pasteurized cow"},
    {"product_id": 3, "name": "Greek Yogurt 200g", "category": "Yogurt", "fat_content": "Low", "brand": "CreamCo",
     "price": 80, "tags": "yogurt high-protein cultured"},
    {"product_id": 4, "name": "Paneer 200g", "category": "Cheese", "fat_content": "Full", "brand": "CreamCo",
     "price": 120, "tags": "paneer fresh cheese"},
    {"product_id": 5, "name": "Ghee 500g", "category": "Ghee", "fat_content": "Full", "brand": "PureGold",
     "price": 450, "tags": "ghee clarified-butter"},
    {"product_id": 6, "name": "Lassi 250ml", "category": "Beverage", "fat_content": "Low", "brand": "FarmFresh",
     "price": 35, "tags": "lassi sweet beverage yogurt-based"}
])

# Users: demographics/preferences
users = pd.DataFrame([
    {"user_id": 101, "age": 28, "gender": "F", "location": "Noida", "diet_pref": "High-Protein"},
    {"user_id": 102, "age": 42, "gender": "M", "location": "Noida", "diet_pref": "Low-Fat"},
    {"user_id": 103, "age": 35, "gender": "F", "location": "Delhi", "diet_pref": "Balanced"},
])

# Interactions: purchases/ratings (implicit count or explicit rating)
interactions = pd.DataFrame([
    {"user_id": 101, "product_id": 1, "quantity": 5, "rating": 4.0},
    {"user_id": 101, "product_id": 3, "quantity": 2, "rating": 5.0},
    {"user_id": 102, "product_id": 2, "quantity": 6, "rating": 4.5},
    {"user_id": 102, "product_id": 6, "quantity": 3, "rating": 3.5},
    {"user_id": 103, "product_id": 4, "quantity": 1, "rating": 4.0},
    {"user_id": 103, "product_id": 5, "quantity": 1, "rating": 4.5},
])


In [19]:
#Content based Filter
class ContentBasedRecommender:
    def __init__(self, products):
        self.products = products.reset_index(drop=True)
        self.text_cols = ['name', 'category', 'fat_content', 'brand', 'tags']
        self.vectorizer = TfidfVectorizer(stop_words="english")
        self.product_vectors = None

    def _build_corpus(self):
        # Combine selected text columns into a single string per product
        corpus = (
            self.products[self.text_cols]
            .fillna("")
            .agg(" ".join, axis=1)
            .tolist()
        )
        return corpus

    def fit(self):
        corpus = self._build_corpus()
        self.product_vectors = self.vectorizer.fit_transform(corpus)

    def recommend_similar_products(self, product_id, top_k=5):
        idx_map = {pid: i for i, pid in enumerate(self.products["product_id"])}
        if product_id not in idx_map:
            return pd.DataFrame(columns=["product_id", "name", "score"])

        idx = idx_map[product_id]
        sims = cosine_similarity(
            self.product_vectors[idx],
            self.product_vectors
        ).flatten()

        result = self.products.copy()
        result["score"] = sims

        return (
            result[result["product_id"] != product_id]
            .sort_values("score", ascending=False)
            .head(top_k)[["product_id", "name", "score"]]
        )

    def recommend_for_user_profile(self, liked_product_ids, top_k=5):
        idx_map = {pid: i for i, pid in enumerate(self.products["product_id"])}
        liked_indices = [idx_map[pid] for pid in liked_product_ids if pid in idx_map]

        if not liked_indices:
            return pd.DataFrame(columns=["product_id", "name", "score"])

        user_profile = np.mean(
            self.product_vectors[liked_indices].toarray(),
            axis=0
        ).reshape(1, -1)

        sims = cosine_similarity(user_profile, self.product_vectors).flatten()

        result = self.products.copy()
        result["score"] = sims

        return (
            result[~result["product_id"].isin(liked_product_ids)]
            .sort_values("score", ascending=False)
            .head(top_k)[["product_id", "name", "score"]]
        )


In [20]:
cb = ContentBasedRecommender(products)
cb.fit()

print(cb.recommend_similar_products(product_id=1, top_k=3))
print(cb.recommend_for_user_profile(liked_product_ids=[1, 3], top_k=3))

   product_id               name     score
1           2      Toned Milk 1L  0.660390
5           6        Lassi 250ml  0.043314
2           3  Greek Yogurt 200g  0.000000
   product_id           name     score
1           2  Toned Milk 1L  0.466966
5           6    Lassi 250ml  0.183502
3           4    Paneer 200g  0.085106


In [24]:
#Collaborative Filtering


class CollaborativeFiltering:
    def __init__(self, interactions_df, products_df, users_df, use_rating=True):
        self.interactions = interactions_df.copy()
        self.products = products_df.copy()
        self.users = users_df.copy()
        self.use_rating = use_rating

        self.user_item = None
        self.user_index = None
        self.item_index = None
        self.item_sim = None

    def fit(self):
        value_col = (
            "rating"
            if self.use_rating and "rating" in self.interactions.columns
            else "quantity"
        )

        pivot = (
            self.interactions
            .pivot_table(
                index="user_id",
                columns="product_id",
                values=value_col,
                aggfunc="mean"
            )
            .fillna(0)
        )

        # Normalize (important)
        pivot_norm = pivot.subtract(pivot.mean(axis=1), axis=0).fillna(0)

        self.user_item = pivot_norm.values
        self.user_index = pivot.index.tolist()
        self.item_index = pivot.columns.tolist()

        # Precompute item similarity
        self.item_sim = cosine_similarity(self.user_item.T)

    def _get_user_vector(self, user_id):
        if user_id not in self.user_index:
            return None
        return self.user_item[self.user_index.index(user_id)]

    def recommend_for_user(self, user_id, top_k=5):
        user_vec = self._get_user_vector(user_id)

        # Cold start â†’ popular items
        if user_vec is None or np.count_nonzero(user_vec) == 0:
            popular = (
                self.interactions
                .groupby("product_id")["quantity"]
                .sum()
                .sort_values(ascending=False)
                .head(top_k)
                .reset_index()
                .merge(self.products[["product_id", "name"]], on="product_id")
            )
            popular["score"] = popular["quantity"]
            return popular[["product_id", "name", "score"]]

        scores = user_vec @ self.item_sim

        interacted = {
            self.item_index[i]
            for i in np.where(user_vec != 0)[0]
        }

        recs = [
            (pid, scores[i])
            for i, pid in enumerate(self.item_index)
            if pid not in interacted
        ]

        recs = sorted(recs, key=lambda x: x[1], reverse=True)[:top_k]

        return (
            pd.DataFrame(recs, columns=["product_id", "score"])
            .merge(self.products[["product_id", "name"]], on="product_id")
        )

In [23]:
cf = CollaborativeFiltering(interactions, products, users, use_rating=True)
cf.fit()
print(cf.recommend_for_user(user_id=101, top_k=3))

Empty DataFrame
Columns: [product_id, score, name]
Index: []


In [26]:
#Hybrid Filter
class HybridRecommender:
    def __init__(self, cb_model: ContentBasedRecommender, cf_model: CollaborativeFiltering,
                 w_content=0.5, w_collab=0.5):
        self.cb = cb_model
        self.cf = cf_model
        self.wc = w_content
        self.wf = w_collab
        self.scaler = MinMaxScaler()

    def recommend(self, user_id, liked_product_ids=None, top_k=5):
        # Content-based scores
        if liked_product_ids:
            cb_recs = self.cb.recommend_for_user_profile(liked_product_ids, top_k=100)
        else:
            # If no likes, use popularity proxy via CF or return empty
            cb_recs = self.cb.products.copy()
            cb_recs["score"] = 0.0
            cb_recs = cb_recs[["product_id", "name", "score"]]

        # Collaborative scores
        cf_recs = self.cf.recommend_for_user(user_id, top_k=100)

        # Merge and scale
        merged = pd.merge(cb_recs, cf_recs, on=["product_id", "name"], how="outer", suffixes=("_cb", "_cf")).fillna(0.0)
        merged[["score_cb", "score_cf"]] = self.scaler.fit_transform(merged[["score_cb", "score_cf"]])
        merged["hybrid_score"] = self.wc * merged["score_cb"] + self.wf * merged["score_cf"]

        return merged.sort_values("hybrid_score", ascending=False).head(top_k)[["product_id", "name", "hybrid_score"]]

# Usage
hyb = HybridRecommender(cb, cf, w_content=0.4, w_collab=0.6)
print(hyb.recommend(user_id=101, liked_product_ids=[1,3], top_k=5))


   product_id           name  hybrid_score
0           2  Toned Milk 1L      0.400000
3           6    Lassi 250ml      0.157187
1           4    Paneer 200g      0.072901
2           5      Ghee 500g      0.000000


  merged = pd.merge(cb_recs, cf_recs, on=["product_id", "name"], how="outer", suffixes=("_cb", "_cf")).fillna(0.0)


In [27]:
#Knowledge Based Filtering
class KnowledgeBasedRecommender:
    def __init__(self, products_df):
        self.products = products_df.copy()

    def apply_rules(self, constraints: dict, top_k=5):
        df = self.products.copy()

        # Example constraints:
        # constraints = {
        #   "max_price": 100,
        #   "fat_content": ["Low", "Toned"],
        #   "category_in": ["Milk", "Yogurt"],
        #   "exclude_tags": ["sweet"]
        # }

        if "max_price" in constraints:
            df = df[df["price"] <= constraints["max_price"]]

        if "fat_content" in constraints:
            df = df[df["fat_content"].isin(constraints["fat_content"])]

        if "category_in" in constraints:
            df = df[df["category"].isin(constraints["category_in"])]

        if "brand_in" in constraints:
            df = df[df["brand"].isin(constraints["brand_in"])]

        if "exclude_tags" in constraints:
            excl = set(constraints["exclude_tags"])
            df = df[~df["tags"].apply(lambda t: any(tag in t for tag in excl))]

        # Simple scoring: cheaper first, then brand preference
        df["kb_score"] = 1.0 / (df["price"] + 1)
        return df.sort_values("kb_score", ascending=False).head(top_k)[["product_id", "name", "kb_score"]]

# Usage
kb = KnowledgeBasedRecommender(products)
print(kb.apply_rules({"max_price": 100, "fat_content": ["Low", "Toned"], "category_in": ["Milk", "Yogurt"]}, top_k=5))



   product_id               name  kb_score
1           2      Toned Milk 1L  0.017857
2           3  Greek Yogurt 200g  0.012346


In [28]:
#Demographic based Filtering
class DemographicRecommender:
    def __init__(self, products_df, users_df, interactions_df=None):
        self.products = products_df.copy()
        self.users = users_df.copy()
        self.interactions = interactions_df.copy() if interactions_df is not None else None

    def _segment(self, user_row):
        # Example segmentation logic
        age = user_row["age"]
        diet = user_row.get("diet_pref", "Balanced")

        if diet == "Low-Fat":
            return {"fat_content": ["Low", "Toned"], "category_in": ["Milk", "Yogurt"]}
        if diet == "High-Protein":
            return {"category_in": ["Yogurt", "Cheese"]}
        # Age-based tweak
        if age < 30:
            return {"category_in": ["Beverage", "Yogurt", "Milk"]}
        return {"category_in": ["Milk", "Cheese", "Ghee"]}

    def recommend(self, user_id, top_k=5):
        user_row = self.users[self.users["user_id"] == user_id]
        if user_row.empty:
            return pd.DataFrame(columns=["product_id", "name", "score"])
        user_row = user_row.iloc[0]

        seg_rules = self._segment(user_row)
        df = self.products.copy()
        if "category_in" in seg_rules:
            df = df[df["category"].isin(seg_rules["category_in"])]
        if "fat_content" in seg_rules:
            df = df[df["fat_content"].isin(seg_rules["fat_content"])]

        # Optional: popularity by segment using interactions
        if self.interactions is not None:
            pop = (self.interactions.groupby("product_id")["quantity"].sum().rename("popularity"))
            df = df.merge(pop, on="product_id", how="left").fillna({"popularity": 0})
        else:
            df["popularity"] = 0

        # Score: segment match + popularity + affordability
        df["afford"] = 1.0 / (df["price"] + 1)
        # Normalize
        scaler = MinMaxScaler()
        df[["popularity", "afford"]] = scaler.fit_transform(df[["popularity", "afford"]])
        df["demo_score"] = 0.6 * df["popularity"] + 0.4 * df["afford"]

        return df.sort_values("demo_score", ascending=False).head(top_k)[["product_id", "name", "demo_score"]]

# Usage
demo = DemographicRecommender(products, users, interactions)
print(demo.recommend(user_id=102, top_k=5))


   product_id               name  demo_score
0           2      Toned Milk 1L         1.0
1           3  Greek Yogurt 200g         0.0
