In [1]:
import warnings
warnings.filterwarnings('ignore')

import sys
import time
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from datetime import datetime

from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.evaluation.python_evaluation import (
    map, ndcg_at_k, precision_at_k, recall_at_k
)
from recommenders.utils.notebook_utils import store_metadata

# Report the interpreter and key library versions so the run can be reproduced.
for label, version in (
    ("System version:", sys.version),
    ("Pandas version:", pd.__version__),
    ("Tensorflow version:", tf.__version__),
):
    print(label, version)


System version: 3.12.4 (main, Jun 16 2024, 01:58:47) [Clang 15.0.0 (clang-1500.3.9.4)]
Pandas version: 2.2.3
Tensorflow version: 2.18.0


## Set the default parameters

In [2]:
# Training hyperparameters (passed to the NCF constructor).
BATCH_SIZE = 256  # mini-batch size
EPOCHS = 50       # number of training epochs

# Evaluation and reproducibility settings.
TOP_K = 10  # length of the recommendation list used by the @K metrics
SEED = 42   # seed handed to both the dataset and the model

## Load data

In [3]:
# Column names are supplied explicitly through `names`, so pandas treats the
# first line of each TSV file as data rather than as a header.
behaviors_columns = ["impression_id", "user_id", "timestamp", "history", "impressions"]
news_columns = [
    "news_id",
    "category",
    "subcategory",
    "title",
    "abstract",
    "url",
    "entity_list",
    "relation_list",
]

# One row per impression log entry.
behaviors_df = pd.read_csv(
    "data/MIND/MINDsmall_train/behaviors.tsv",
    sep="\t",
    names=behaviors_columns,
)

# One row per news article.
news_df = pd.read_csv(
    "data/MIND/MINDsmall_train/news.tsv",
    sep="\t",
    names=news_columns,
)

In [4]:
# print(interactions_df["timestamp"].head())

# print(interactions_df["timestamp"].dtype)
# print(interactions_df["timestamp"].apply(type).value_counts())

# print(pd.to_datetime(interactions_df["timestamp"].head(100)))

# print(interactions_df["timestamp"].isna().sum())
# print((interactions_df["timestamp"].astype(str) == "").sum())

# interactions_df["timestamp"] = pd.to_datetime(interactions_df["timestamp"], infer_datetime_format=True)

## Preprocessing

In [5]:
# Process interactions
def process_impressions(row):
    """Expand one behaviors row into per-impression interaction tuples.

    Args:
        row: Mapping-like object (e.g. a pandas Series or a dict) providing the
            keys "user_id", "timestamp" and "impressions". "impressions" is a
            whitespace-separated string of "<news_id>-<label>" tokens.

    Returns:
        list[tuple]: One ``(user_id, news_id, label, timestamp)`` tuple per
        token, with ``label`` converted to ``int``. The list is empty when
        "impressions" is not a string (e.g. NaN) or contains no tokens.

    Raises:
        ValueError: If a token has no "-" separator or its label is not an
            integer.
    """
    raw_impressions = row["impressions"]
    # Missing values read by pandas arrive as float NaN, not as strings.
    if not isinstance(raw_impressions, str):
        return []

    user_id, timestamp = row["user_id"], row["timestamp"]
    interactions = []
    # split() without an argument ignores leading/trailing/repeated whitespace,
    # so an empty or padded string never yields an empty token.
    for token in raw_impressions.split():
        # Split on the last "-" only: the label is always the final field.
        news_id, label = token.rsplit("-", 1)
        interactions.append((user_id, news_id, int(label), timestamp))
    return interactions

# Expand every behaviors row into (user, item, label, timestamp) records.
interactions = [
    record
    for _, row in behaviors_df.iterrows()
    for record in process_impressions(row)
]
interactions_df = pd.DataFrame(interactions, columns=["userID", "itemID", "rating", "timestamp"])

# NCFDataset reads user and item IDs as integers, so the string IDs are
# replaced by integer indices assigned in order of first appearance.
user_mapping = {user: idx for idx, user in enumerate(interactions_df["userID"].unique())}
item_mapping = {item: idx for idx, item in enumerate(interactions_df["itemID"].unique())}
interactions_df["userID"] = interactions_df["userID"].map(user_mapping)
interactions_df["itemID"] = interactions_df["itemID"].map(item_mapping)

# Apply the same item mapping to the news table. News IDs that never occur in
# an impression have no entry in item_mapping and become NaN.
# NOTE: news_df is rebound to the processed table here, so re-run the data
# loading cell before re-running this cell.
news_df = news_df.assign(news_id=news_df["news_id"].map(item_mapping))

# Keep only interactions whose item is present in the news table. The explicit
# copy gives an independent frame, so the timestamp assignment below does not
# write into a slice of the unfiltered frame.
interactions_df = interactions_df[interactions_df["itemID"].isin(news_df["news_id"])].copy()

# Convert timestamp to datetime
interactions_df["timestamp"] = pd.to_datetime(interactions_df["timestamp"])

# Sort interactions by userID before saving.
# NOTE(review): NCFDataset is understood to require a user-sorted file -- confirm.
interactions_df = interactions_df.sort_values(by="userID")

# Use the earliest impression timestamp of each item as its "upload_time".
item_min_timestamps = (
    interactions_df.groupby("itemID")["timestamp"]
    .min()
    .reset_index()
    .rename(columns={"itemID": "news_id", "timestamp": "upload_time"})
)

# Attach upload_time to the news table, drop news without any impression
# (no upload_time) and order the rest from oldest to newest.
news_df = (
    news_df.merge(item_min_timestamps, on="news_id", how="left")
    .dropna(subset=["upload_time"])
    .sort_values(by="upload_time")
)

# Mark 20% latest uploaded news as "new items"
n_new_items = int(0.2 * len(news_df))
# Rows with a NaN news_id were removed by the dropna above, so the cast to int
# is safe and makes the IDs the same type as interactions_df["itemID"].
new_items = set(news_df.tail(n_new_items)["news_id"].astype(int))

# Save all processed interactions and index them for NCF.
train_file = "data/MIND/mind_interactions_train.csv"
interactions_df.to_csv(train_file, index=False)
data = NCFDataset(train_file=train_file, seed=SEED)


INFO:recommenders.models.ncf.dataset:Indexing data/MIND/mind_interactions_train.csv ...


In [6]:
# print(item_min_timestamps.head())
# print(news_df.columns)
# print(news_df.filter(like="upload_time").head())

## Model Training

In [7]:
# Train NCF model
model = NCF(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16,8,4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)

with Timer() as train_time:
    model.fit(data)

print("Training time:", train_time)

# Get top-k recommendations: score every (user, item) pair built from the
# users and items present in the training interactions.
# `train_file` is only the CSV path (a str), so the IDs are read from
# interactions_df, the frame that was written to that file.
# NOTE(review): this materialises n_users * n_items rows -- confirm that it
# fits in memory for the full dataset.
with Timer() as test_time:
    users, items, preds = [], [], []
    item = list(interactions_df["itemID"].unique())
    for user in interactions_df["userID"].unique():
        user_column = [user] * len(item)
        users.extend(user_column)
        items.extend(item)
        preds.extend(model.predict(user_column, item, is_list=True))

    all_predictions = pd.DataFrame({"userID": users, "itemID": items, "prediction": preds})
    # Best-scored items first within each user.
    all_predictions = all_predictions.sort_values(by=['userID', 'prediction'], ascending=[True, False])

print("Prediction time:", test_time)

I0000 00:00:1738526869.750389 1216378 mlir_graph_optimization_pass.cc:401] MLIR V1 optimization pass is not enabled


KeyboardInterrupt: 

In [None]:
# Adjust batch size and learning rate
BATCH_SIZE = 256
LEARNING_RATE = 1e-4

# Simplify model architecture.
# n_epochs=1 so that each model.fit(data) call in the loop below runs a single
# epoch; with n_epochs=EPOCHS every call would run the full schedule.
# NOTE(review): this relies on NCF.fit() continuing from the current weights
# on repeated calls instead of re-initialising them -- confirm.
model = NCF(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[8, 4],  # Reduced layer sizes
    n_epochs=1,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    verbose=10,
    seed=SEED
)

# Rows used to monitor the loss between epochs.
# NOTE: these rows are sampled from the same interactions the model is trained
# on, so the value is an optimistic proxy rather than a true held-out loss.
# TODO: split the interactions before building NCFDataset and validate on the
# held-out part.
VALIDATION_ROWS = 100_000
validation_data = interactions_df.sample(
    n=min(VALIDATION_ROWS, len(interactions_df)), random_state=SEED
)


def evaluate_model(model, validation_df):
    """Return the mean squared error between model scores and labels.

    Args:
        model: Fitted model exposing ``predict(users, items, is_list=True)``.
        validation_df (pd.DataFrame): Non-empty frame with the columns
            'userID', 'itemID' and 'rating' (the integer label parsed from the
            impression string).

    Returns:
        float: Mean of ``(score - rating) ** 2`` over all rows.
    """
    scores = model.predict(
        validation_df["userID"].tolist(),
        validation_df["itemID"].tolist(),
        is_list=True,
    )
    labels = validation_df["rating"].tolist()
    return sum((score - label) ** 2 for score, label in zip(scores, labels)) / len(labels)


# Early stopping is implemented manually: the model is driven through
# model.fit(data), not through a Keras training loop with callbacks.
patience = 3  # Number of epochs to wait for improvement
best_loss = float('inf')  # Track the best validation loss
wait = 0  # Counter for epochs without improvement

# Training loop with early stopping, timed as a whole.
# The weights of the best epoch are not restored after stopping.
with Timer() as train_time:
    for epoch in range(EPOCHS):
        # Train the model for one epoch (n_epochs=1 above)
        model.fit(data)

        # Evaluate the model on the monitoring rows
        val_loss = evaluate_model(model, validation_data)
        print(f"Epoch {epoch}: validation loss {val_loss:.6f}")

        # Early stopping logic
        if val_loss < best_loss:
            best_loss = val_loss
            wait = 0  # Reset the wait counter
        else:
            wait += 1  # Increment the wait counter
            if wait >= patience:
                print(f"Early stopping at epoch {epoch}")
                break  # Stop training

print("Training time:", train_time)

# Score every (user, item) pair in a single predict call.
# `train_file` is only the CSV path (a str), so the IDs are read from
# interactions_df, the frame that was written to that file.
# NOTE(review): this materialises n_users * n_items rows -- confirm that it
# fits in memory for the full dataset.
with Timer() as test_time:
    users, items = [], []
    item = list(interactions_df["itemID"].unique())
    user_batch = interactions_df["userID"].unique()

    # Build the full user x item cross product
    for user in user_batch:
        users.extend([user] * len(item))
        items.extend(item)

    # Predict all at once
    preds = model.predict(users, items, is_list=True)

    all_predictions = pd.DataFrame({"userID": users, "itemID": items, "prediction": preds})
    # Best-scored items first within each user.
    all_predictions = all_predictions.sort_values(by=['userID', 'prediction'], ascending=[True, False])

print("Prediction time:", test_time)

## Model Evaluation

In [46]:
def hit_rate_at_k(ground_truth, predictions, k=TOP_K):
    """
    Calculate Hit Rate at K (HR@K).

    HR@K is the share of ground-truth users for whom at least one of their
    top-K predicted items also appears in the ground truth.

    Args:
        ground_truth (pd.DataFrame): Ground truth data with columns 'userID' and 'itemID'.
        predictions (pd.DataFrame): Predicted recommendations with columns 'userID', 'itemID', and 'prediction'.
            Rows may be in any order; they are ranked by 'prediction' here.
            If the 'prediction' column is absent, the existing row order is
            taken as the ranking.
        k (int): Number of top recommendations to consider.

    Returns:
        float: Hit Rate at K (0 when the ground truth contains no users).
    """
    # Rank by score so the result does not depend on the caller pre-sorting.
    # The stable sort keeps the incoming order among equal scores, so input
    # that is already ranked gives the same top-K as before.
    if 'prediction' in predictions.columns:
        ranked = predictions.sort_values('prediction', ascending=False, kind='stable')
    else:
        ranked = predictions

    # First k rows of each user in ranked order = that user's top-K items
    top_k_predictions = ranked.groupby('userID').head(k)

    # Merge predictions with ground truth to find hits
    hits = pd.merge(
        top_k_predictions,
        ground_truth,
        on=['userID', 'itemID'],
        how='inner'
    )

    # Count the number of users with at least one hit
    total_users = ground_truth['userID'].nunique()
    if total_users == 0:
        return 0
    return hits['userID'].nunique() / total_users

In [None]:
# Ground truth for the ranking metrics: impressions with a positive label,
# one row per (user, item). `train_file` is only the CSV path (a str), so the
# interactions frame that was written to it is used instead.
# NOTE: these are the interactions the model was trained on, so the metrics
# are optimistic; evaluate on a held-out split for a fair estimate.
ground_truth = (
    interactions_df.loc[interactions_df["rating"] > 0, ["userID", "itemID", "rating"]]
    .drop_duplicates(subset=["userID", "itemID"])
)

# Top-K recommendations per user (sorted here so that the result does not
# depend on the order in which all_predictions was left).
top_k_predictions = (
    all_predictions.sort_values(by=["userID", "prediction"], ascending=[True, False])
    .groupby("userID")
    .head(TOP_K)
)

# Compute metrics
eval_ndcg = ndcg_at_k(ground_truth, all_predictions, col_prediction='prediction', k=TOP_K)
eval_hr = hit_rate_at_k(ground_truth, top_k_predictions, k=TOP_K)  # Calculate HR@K

# Coverage metrics, computed on the top-K lists only: all_predictions scores
# every (user, item) pair, so coverage over it would not reflect what is
# actually recommended.
exposed_new_items = top_k_predictions[top_k_predictions["itemID"].isin(new_items)]["itemID"].nunique()
total_new_items = len(new_items)
new_item_coverage = exposed_new_items / total_new_items if total_new_items > 0 else 0

total_items = top_k_predictions["itemID"].nunique()
overall_coverage = total_items / data.n_items if data.n_items > 0 else 0

# Print metrics
print("NDCG@K:", eval_ndcg)
print("HR@K:", eval_hr)  # Print HR@K
print("New Item Coverage@K:", new_item_coverage)
print("Overall Item Coverage@K:", overall_coverage)

# Store results
store_metadata("ndcg", eval_ndcg)
store_metadata("hr_at_k", eval_hr)  # Store HR@K
store_metadata("new_item_coverage", new_item_coverage)
store_metadata("overall_coverage", overall_coverage)
store_metadata("train_time", train_time.interval)
store_metadata("test_time", test_time.interval)