# KerasRS Recommender System Example

This notebook demonstrates how to build retrieval and ranking recommender models using the KerasRS API, following official KerasRS examples and best practices.

In [22]:
import pandas as pd
import numpy as np
import keras
import keras_rs
import tensorflow as tf
from keras import layers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Data Loading and Preprocessing

In [23]:
# Load data
# (Update the path if needed)
df = pd.read_csv('../data/data.csv', encoding='ISO-8859-1')

# Remove cancelled orders (those starting with 'C')
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]
# Remove rows with missing CustomerID
df = df.dropna(subset=['CustomerID'])
# Convert InvoiceDate to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
# Create interaction strength
df['Interaction'] = df['Quantity'] * df['UnitPrice']
df = df[df['Interaction'] > 0]

# Encode users and products
customer_ids = df['CustomerID'].unique().tolist()
product_ids = df['StockCode'].unique().tolist()
customer2idx = {x: i for i, x in enumerate(customer_ids)}
product2idx = {x: i for i, x in enumerate(product_ids)}
df['customer_idx'] = df['CustomerID'].map(customer2idx)
df['product_idx'] = df['StockCode'].map(product2idx)
num_users = len(customer2idx)
num_products = len(product2idx)
df['normalized_interaction'] = df['Interaction'] / df['Interaction'].max()

# Prepare X and y
X = df[['customer_idx', 'product_idx']].values
y = df['normalized_interaction'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Retrieval Model (Two-Tower)

In [24]:
class RetrievalModel(keras.Model):
    def __init__(self, num_users, num_products, embedding_dim=32, **kwargs):
        super().__init__(**kwargs)
        self.user_embedding = layers.Embedding(num_users, embedding_dim)
        self.product_embedding = layers.Embedding(num_products, embedding_dim)
        self.retrieval = keras_rs.layers.BruteForceRetrieval(k=10, return_scores=False)
        self.loss_fn = keras.losses.MeanSquaredError()
        self._candidates_set = False

    def update_candidates(self):
        # Set candidate embeddings for retrieval
        product_indices = np.arange(self.product_embedding.input_dim)
        product_embs = self.product_embedding(product_indices)
        self.retrieval.update_candidates(product_embs, product_indices)
        self._candidates_set = True

    def call(self, inputs, training=False):
        user_emb = self.user_embedding(inputs[:, 0])
        product_emb = self.product_embedding(inputs[:, 1])
        result = {"user_emb": user_emb, "product_emb": product_emb}
        if not training and self._candidates_set:
            # Only call retrieval if candidates are set
            result["predictions"] = self.retrieval(user_emb)
        return result

    def compute_loss(self, x, y, y_pred, sample_weight, training=True):
        user_emb = y_pred["user_emb"]
        product_emb = self.product_embedding(x[:, 1])
        labels = keras.ops.expand_dims(y, -1)
        scores = keras.ops.sum(keras.ops.multiply(user_emb, product_emb), axis=1, keepdims=True)
        return self.loss_fn(labels, scores, sample_weight)

In [25]:
retrieval_model = RetrievalModel(num_users, num_products, embedding_dim=32)
retrieval_model.compile(optimizer=keras.optimizers.Adagrad(learning_rate=0.05))
retrieval_history = retrieval_model.fit(
    X_train, y_train, batch_size=64, epochs=5, validation_data=(X_test, y_test), verbose=1
)

# IMPORTANT: Update candidates before using retrieval for inference
retrieval_model.update_candidates()

Epoch 1/5




[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 2.8586e-05 - val_loss: 2.2233e-05
Epoch 2/5
[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 2.8586e-05 - val_loss: 2.2233e-05
Epoch 2/5
[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 2.3347e-05 - val_loss: 2.2067e-05
Epoch 3/5
[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 2.3347e-05 - val_loss: 2.2067e-05
Epoch 3/5
[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 2.2985e-05 - val_loss: 2.1905e-05
Epoch 4/5
[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 2.2985e-05 - val_loss: 2.1905e-05
Epoch 4/5
[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 2.5163e-05 - val_loss: 2.1748e-05
Epoch 5/5
[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 2.51

## Ranking Model (Deep Neural Network)

In [26]:
class RankingModel(keras.Model):
    def __init__(self, num_users, num_products, embedding_dim=32, **kwargs):
        super().__init__(**kwargs)
        self.user_embedding = layers.Embedding(num_users, embedding_dim)
        self.product_embedding = layers.Embedding(num_products, embedding_dim)
        self.ratings = keras.Sequential([
            layers.Dense(256, activation="relu"),
            layers.Dense(64, activation="relu"),
            layers.Dense(1)
        ])

    def call(self, inputs):
        user_emb = self.user_embedding(inputs[:, 0])
        product_emb = self.product_embedding(inputs[:, 1])
        concat = keras.ops.concatenate([user_emb, product_emb], axis=1)
        return self.ratings(concat)

In [27]:
ranking_model = RankingModel(num_users, num_products, embedding_dim=32)
ranking_model.compile(
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.RootMeanSquaredError()],
    optimizer=keras.optimizers.Adagrad(learning_rate=0.05),
)
ranking_history = ranking_model.fit(X_train, y_train, batch_size=64, epochs=5, validation_data=(X_test, y_test), verbose=1)

Epoch 1/5
[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - loss: 3.1762e-05 - root_mean_squared_error: 0.0054 - val_loss: 2.3276e-06 - val_root_mean_squared_error: 0.0015
Epoch 2/5
[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - loss: 3.1762e-05 - root_mean_squared_error: 0.0054 - val_loss: 2.3276e-06 - val_root_mean_squared_error: 0.0015
Epoch 2/5
[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 5.3604e-06 - root_mean_squared_error: 0.0023 - val_loss: 9.5828e-07 - val_root_mean_squared_error: 9.7892e-04
Epoch 3/5
[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 5.3604e-06 - root_mean_squared_error: 0.0023 - val_loss: 9.5828e-07 - val_root_mean_squared_error: 9.7892e-04
Epoch 3/5
[1m4974/4974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 6.2525e-06 - root_mean_squared_error: 0.0024 - val_loss: 6.3531e-07 - val_root_mean_squa

## Beyond-Accuracy Metrics with KerasRS

KerasRS provides advanced metrics such as Mean Reciprocal Rank (MRR), nDCG, and coverage/diversity. For production, consider using these for a more holistic evaluation.

In [28]:
from keras_rs.metrics import MeanReciprocalRank , NDCG

# Example instantiation (adapt for your evaluation pipeline)
mrr_metric = MeanReciprocalRank()
ndcg_metric = NDCG()

## References
- [KerasRS API Documentation](https://keras.io/keras_rs/api/)
- [KerasRS Basic Retrieval Example](https://keras.io/keras_rs/examples/basic_retrieval/)
- [KerasRS Basic Ranking Example](https://keras.io/keras_rs/examples/basic_ranking/)
- [KerasRS Deep Recommender Example](https://keras.io/keras_rs/examples/deep_recommender/)

## Model Performance Evaluation with KerasRS

Let's evaluate our models using advanced ranking metrics provided by KerasRS.

In [29]:
from keras_rs.metrics import MeanReciprocalRank, MeanAveragePrecision, DCG, NDCG

def get_ranking_eval_data(model, X, y, num_products, group_size=20):
    # For a subset of users, get predictions for all products
    user_indices = np.unique(X[:, 0])
    np.random.shuffle(user_indices)
    user_indices = user_indices[:group_size]
    y_true, y_pred = [], []
    
    for user in user_indices:
        # All products for this user
        user_mask = X[:, 0] == user
        user_products = X[user_mask][:, 1]
        # Build a full list of all products for this user
        all_products = np.arange(num_products)
        labels = np.zeros(num_products)
        labels[user_products] = y[user_mask]
        # Predict for all products
        user_input = np.column_stack([np.repeat(user, num_products), all_products])
        scores = model.predict(user_input, verbose=0).flatten()
        y_true.append(labels)
        y_pred.append(scores)
    
    return np.array(y_true), np.array(y_pred)

# Evaluate Ranking Model
y_true, y_pred = get_ranking_eval_data(ranking_model, X_test, y_test, num_products)

# Initialize and compute metrics
mrr = MeanReciprocalRank()(y_true, y_pred)
map_score = MeanAveragePrecision()(y_true, y_pred)
dcg = DCG()(y_true, y_pred)
ndcg = NDCG()(y_true, y_pred)

print("\nRanking Model Performance:")
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
print(f"Mean Average Precision (MAP): {map_score:.4f}")
print(f"Discounted Cumulative Gain (DCG): {dcg:.4f}")
print(f"Normalized DCG (nDCG): {ndcg:.4f}")


Ranking Model Performance:
Mean Reciprocal Rank (MRR): 0.0000
Mean Average Precision (MAP): 0.0000
Discounted Cumulative Gain (DCG): 0.0001
Normalized DCG (nDCG): 0.1811


## Generate and Evaluate Recommendations

Let's generate recommendations for sample users and evaluate their quality.

In [30]:
def get_recommendations(model, user_id, df, customer2idx, product2idx, top_n=10):
    """Generate top-N recommendations for a specific user."""
    user_idx = customer2idx[user_id]
    all_products = np.arange(len(product2idx))
    user_input = np.column_stack([np.repeat(user_idx, len(product2idx)), all_products])
    scores = model.predict(user_input, verbose=0).flatten()
    
    # Get top N recommendations
    top_indices = np.argsort(scores)[-top_n:][::-1]
    idx2product = {v: k for k, v in product2idx.items()}
    recommended_products = [idx2product[idx] for idx in top_indices]
    
    # Create recommendations dataframe
    recommendations_data = []
    for idx, product_id in zip(top_indices, recommended_products):
        product_info = df[df['StockCode'] == product_id].iloc[0]
        recommendations_data.append({
            'StockCode': product_id,
            'Description': product_info['Description'],
            'UnitPrice': product_info['UnitPrice'],
            'Predicted_Score': scores[idx]
        })
    
    recommendations = pd.DataFrame(recommendations_data)
    return recommendations

# Generate recommendations for a sample user
sample_user_id = df['CustomerID'].iloc[0]
recommendations = get_recommendations(ranking_model, sample_user_id, df, customer2idx, product2idx)

print(f"\nTop 10 Recommendations for User {sample_user_id}:")
print(recommendations)

# Get user's actual purchases for comparison
user_actual = df[df['CustomerID'] == sample_user_id][
    ['StockCode', 'Description', 'UnitPrice', 'Quantity']
].drop_duplicates()
print(f"\nUser's Actual Purchases:")
print(user_actual)


Top 10 Recommendations for User 17850.0:
  StockCode                         Description  UnitPrice  Predicted_Score
0     22291      HANGING CHICK CREAM DECORATION       1.45         0.002746
1    35610C      WHITE CHRISTMAS FLOCK DROPLET        1.25         0.002401
2        C2                            CARRIAGE      50.00         0.002212
3     84813       SET OF 4 DIAMOND NAPKIN RINGS      12.75         0.001945
4     20983  12 PENCILS TALL TUBE RED RETROSPOT       0.85         0.001808
5     23166      MEDIUM CERAMIC TOP STORAGE JAR       1.04         0.001723
6     21503                       TOYBOX  WRAP        0.42         0.001700
7     22367     CHILDRENS APRON SPACEBOY DESIGN       1.95         0.001650
8     22649        STRAWBERRY FAIRY CAKE TEAPOT       4.95         0.001642
9     22307    GOLD MUG BONE CHINA TREE OF LIFE       1.95         0.001534

User's Actual Purchases:
     StockCode                          Description  UnitPrice  Quantity
0       85123A   WHITE 