



#Bertopic & LightFM Hybrid Recommender

## Get the Environment Ready

Installing & Importing Packages

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!python --version


Python 3.12.11


In [2]:
!pip install bertopic sentence-transformers annoy umap-learn hdbscan scikit-learn joblib scipy pandas json

[31mERROR: Could not find a version that satisfies the requirement json (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for json[0m[31m
[0m

In [3]:
!pip install annoy



In [4]:
!pip install bertopic



In [5]:
!apt-get install -y build-essential
!pip install cython
!pip install git+https://github.com/daviddavo/lightfm

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.
Collecting git+https://github.com/daviddavo/lightfm
  Cloning https://github.com/daviddavo/lightfm to /tmp/pip-req-build-4ayh38f0
  Running command git clone --filter=blob:none --quiet https://github.com/daviddavo/lightfm /tmp/pip-req-build-4ayh38f0
  Resolved https://github.com/daviddavo/lightfm to commit f0eb500ead54ab65eb8e1b3890337a7223a35114
  Preparing metadata (setup.py) ... [?25l[?25hdone


Here we define some Paths and Constants that can be useful later.

In [6]:
import os, gc, logging, json
from pathlib import Path
import pandas as pd
import numpy as np
from typing import List, Tuple
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix, hstack, save_npz, load_npz
import joblib

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
import hdbscan
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score


# Paths
BASE_DIR = Path("/content/drive/MyDrive/recommender")
BASE_DIR.mkdir(parents=True, exist_ok=True)


DATAFILE = BASE_DIR / "deezer_train_clean.parquet"

# Constants
EMBED_BATCH = 128
LF_NO_COMPONENTS = 32
LF_EPOCHS = 20
LF_NUM_THREADS = 4


  axis.set_ylabel('$\lambda$ value')
  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


In [None]:
import logging
logger = logging.getLogger("hybrid_pipeline")
logger.setLevel(logging.INFO)
if not logger.handlers:
    sh = logging.StreamHandler()
    sh.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
    logger.addHandler(sh)

def save_joblib(obj, path): joblib.dump(obj, path); logger.info(f"Saved {path}")
def load_joblib(path): logger.info(f"Loading {path}"); return joblib.load(path)


## Prepare the Data

Read Parquet file, check consistency, deduplicate only unique user-items

In [6]:
from pathlib import Path
from typing import Tuple
import pandas as pd
from sklearn.model_selection import train_test_split


def load_and_prepare_data(path: Path) -> pd.DataFrame:
    """
    Loads dataset (CSV or Parquet), renames `media_id` to `item_id`,
    and returns a deduplicated dataframe (one row per user-item).
    """
    if not path.exists():
        raise FileNotFoundError(f"Data file not found at {path}")

    # Load
    if path.suffix == ".parquet":
        df = pd.read_parquet(path)
    else:
        df = pd.read_csv(path)

    print(f"Loaded dataframe: {df.shape}")

    if "media_id" not in df.columns:
        raise KeyError("Dataset must contain a `media_id` column.")
    df = df.rename(columns={"media_id": "item_id"})

    # Deduplicate per user-item
    if "user_id" in df.columns and "item_id" in df.columns:
        df = df.drop_duplicates(subset=["user_id", "item_id"]).reset_index(drop=True)
    else:
        raise KeyError("Dataset must contain `user_id` and `media_id`/`item_id` columns.")

    return df



This part is for testing if topic reranking has really improved our dataset. So we include only the items with topics for the evaluation part between LightFM and BERTopic reranking.

In [8]:
from pathlib import Path
from typing import Tuple
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

def load_and_prepare_data_with_topics(path: Path, topic_mask: np.ndarray) -> pd.DataFrame:
    """
    Loads dataset (CSV or Parquet), renames `media_id` to `item_id`,
    filters only items with topic embeddings, and returns a deduplicated dataframe.

    Parameters
    ----------
    path : Path
        Path to CSV or Parquet dataset.
    topic_mask : np.ndarray
        Boolean array, True for items with topic embeddings. Index corresponds to item_id.

    Returns
    -------
    pd.DataFrame
        Deduplicated dataframe with only items that have topics.
    """
    if not path.exists():
        raise FileNotFoundError(f"Data file not found at {path}")

    # Load
    if path.suffix == ".parquet":
        df = pd.read_parquet(path)
    else:
        df = pd.read_csv(path)

    print(f"Loaded dataframe: {df.shape}")

    if "media_id" not in df.columns:
        raise KeyError("Dataset must contain a `media_id` column.")
    df = df.rename(columns={"media_id": "item_id"})

    # Deduplicate per user-item
    if "user_id" in df.columns and "item_id" in df.columns:
        df = df.drop_duplicates(subset=["user_id", "item_id"]).reset_index(drop=True)
    else:
        raise KeyError("Dataset must contain `user_id` and `media_id`/`item_id` columns.")

    # Filter items that have topic embeddings
    df = df[df["item_id"].isin(np.where(topic_mask)[0])].reset_index(drop=True)
    print(f"Filtered dataframe (items with topics): {df.shape}")

    return df


## Embeddings for Bertopic model

We generate embeddings for artist, album, track name combined using HuggingFace light embedding model: all-MiniLM-L6-v2

In [8]:
import numpy as np
import joblib
from pathlib import Path
from sentence_transformers import SentenceTransformer
import torch
import os

ARTIFACTS_DIR = Path("/content/drive/MyDrive/deezer/artifacts")
ARTIFACTS_DIR.mkdir(exist_ok=True, parents=True)


MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME, device="cuda" if torch.cuda.is_available() else "cpu")
print(f"Using model: {MODEL_NAME} on device: {model.device}")

# Deduplicate items
items = items.drop_duplicates(subset=["item_id"]).reset_index(drop=True)
print("After deduplication:")
print("items shape:", items.shape)
print("unique media_id:", items["item_id"].nunique())

def generate_texts(items_df):
    return (items_df["artist_name"].fillna("") + " - " +
            items_df["album_name"].fillna("") + " - " +
            items_df["track_name"].fillna("")).tolist()

texts = generate_texts(items)
print(f"Total unique items to embed: {len(texts)}")

# Save file
SAVE_FILE = ARTIFACTS_DIR / "item_embeddings.joblib"

# Initialize or resume
if SAVE_FILE.exists():
    loaded = joblib.load(SAVE_FILE)
    if isinstance(loaded, list):
        all_emb = loaded
        start_idx = sum(a.shape[0] for a in all_emb)
    else:
        all_emb = [loaded]
        start_idx = loaded.shape[0]
    print(f"Resuming from {start_idx} embeddings...")
else:
    all_emb = []
    start_idx = 0

BATCH_SIZE = 32
CHUNK_SIZE = 500

# Embed in chunks
for i in range(start_idx, len(texts), CHUNK_SIZE):
    batch_texts = texts[i:i+CHUNK_SIZE]
    emb = model.encode(batch_texts, batch_size=BATCH_SIZE, convert_to_numpy=True, show_progress_bar=True)
    all_emb.append(emb)

    # Save partial embeddings
    joblib.dump(all_emb, SAVE_FILE)
    print(f"Saved {i + len(batch_texts)} / {len(texts)} embeddings")

# Final stack and save as a single array
all_emb = np.vstack(all_emb)
joblib.dump(all_emb, SAVE_FILE)
print(f"✅ Done! Final embeddings shape: {all_emb.shape}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Using model: sentence-transformers/all-MiniLM-L6-v2 on device: cpu


NameError: name 'items' is not defined

In [None]:
emb.shape[0]

165

Load the saved embedding model

In [None]:
import joblib

emb = joblib.load("/content/drive/MyDrive/deezer/artifacts/item_embeddings.joblib")


## Bertopic Topic Modeling

Here, using the embedding model created in the previous step, with UMAP and BERTOPIC and hdbscan we create topics based on artist, track, album name. Reason we choose UMAP and HDBSCAN is they decrease the computation time.

In [None]:
from umap import UMAP
import hdbscan
from bertopic import BERTopic
import numpy as np


SAMPLE_SIZE = 400_000
rng = np.random.default_rng(42)
sample_idx = rng.choice(len(texts), size=SAMPLE_SIZE, replace=False)

texts_sample = [texts[i] for i in sample_idx]
emb_sample = emb[sample_idx]

# Models
umap_model = UMAP(n_components=10, n_neighbors=7, metric="cosine", random_state=42)
cluster_model = hdbscan.HDBSCAN(min_cluster_size=40, prediction_data=True)

topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=cluster_model,
    verbose=True
)

topics_sample, _ = topic_model.fit_transform(texts_sample, emb_sample)

topics_full, _ = topic_model.transform(texts, emb)

joblib.dump(topic_model, "/content/drive/MyDrive/deezer/artifacts/bertopic_model.joblib")
joblib.dump(topics_full, "/content/drive/MyDrive/deezer/artifacts/item_topics.joblib")

print("✅ BERTopic complete. Topics saved.")




2025-09-26 13:03:47,226 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-26 13:08:30,121 - BERTopic - Dimensionality - Completed ✓
2025-09-26 13:08:30,131 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-26 13:09:06,641 - BERTopic - Cluster - Completed ✓
2025-09-26 13:09:06,701 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-26 13:09:09,742 - BERTopic - Representation - Completed ✓
2025-09-26 13:09:13,416 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-09-26 13:18:39,911 - BERTopic - Dimensionality - Completed ✓
2025-09-26 13:18:39,914 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-09-26 13:23:24,282 - BERTopic - Cluster - Completed ✓


FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/bertopic_model.joblib'

In [None]:
save_joblib(topic_model, ARTIFACTS_DIR / "bertopic_model.joblib")
save_joblib(topics_full, ARTIFACTS_DIR / "item_topics.joblib")


2025-09-26 13:25:47,508 - INFO - Saved /content/drive/MyDrive/deezer/artifacts/bertopic_model.joblib
INFO:hybrid_pipeline:Saved /content/drive/MyDrive/deezer/artifacts/bertopic_model.joblib
2025-09-26 13:25:48,520 - INFO - Saved /content/drive/MyDrive/deezer/artifacts/item_topics.joblib
INFO:hybrid_pipeline:Saved /content/drive/MyDrive/deezer/artifacts/item_topics.joblib


Load the topic model.

In [7]:
import joblib
from pathlib import Path

ARTIFACTS_DIR = Path("/content/drive/MyDrive/deezer/artifacts")

topic_model = joblib.load(ARTIFACTS_DIR / "bertopic_model.joblib")

topics_full = joblib.load(ARTIFACTS_DIR / "item_topics.joblib")

print(type(topic_model))   # <class 'bertopic._bertopic.BERTopic'>


<class 'bertopic._bertopic.BERTopic'>


We check some of the topics generated. In this example it is symphony, Bethoven etc. In another one (3) it is christmas themed words.

In [None]:
# topics_full: numeric topic IDs per item
# topic_model: the fitted BERTopic model

topic_info = topic_model.get_topic_info()
print(topic_info.head(40))  # first 10 topics

# Example: see top words for topic 3
topic_words = topic_model.get_topic(10)
print(f"Topic 3 top words: {topic_words}")



   Topic  Count                                   Name  \
0     -1  59491                       -1_ep_love_la_me   
1      0    914             0_remix_remixes_remixed_dj   
2      1    735                 1_lisa_kate_tina_sarah   
3      2    646             2_feat_alvarez_yankee_kayz   
4      3    458         3_christmas_merry_jingle_santa   
5      4    431                 4_cumbia_los_exitos_el   
6      5    412  5_dtente_mditation_relaxation_musique   
7      6    376           6_bir_mzikleri_orijinal_dizi   
8      7    361           7_rue_ktanou_bonheur_paranoa   
9      8    330            8_zouk_session_lanne_marvin   

                                      Representation  \
0  [ep, love, la, me, you, the, feat, original, m...   
1  [remix, remixes, remixed, dj, djkicks, mixed, ...   
2  [lisa, kate, tina, sarah, stacey, turner, katy...   
3  [feat, alvarez, yankee, kayz, enrique, nicky, ...   
4  [christmas, merry, jingle, santa, claus, xmas,...   
5  [cumbia, los, exitos, 

In [None]:

import numpy as np

# For csr_matrix
topic_sums = topic_embeddings.sum(axis=1).A1  # sum over topics
missing_topic_indices = np.where(topic_sums == 0)[0]

print(f"Number of items with missing topic embeddings: {len(missing_topic_indices)}")

There are many missing topics (~200000), since artist track and album name are not long, they might be generalized as outlier or noise (-1 score) and also the built-in GPU in collab was not enough so we had to sample the full item list. For fair evaluation of if reranking improves LightFm we will also make a new dataset that includes items with topics only. This is only for the last part of the evaluation part.

Run the preprocessing function.

In [8]:
df = load_and_prepare_data(Path('/content/drive/MyDrive/recommender/deezer_train_clean.parquet'))

print("Full dataframe shape:", df.shape)
#print("Unique items shape:", items.shape)
#print("Total interactions:", interactions.shape)


df.head()



Loaded dataframe: (7528468, 20)
Full dataframe shape: (3962482, 20)


Unnamed: 0,genre_id_x,ts_listen,item_id,album_id,context_type,release_date_x,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened,track_name,artist_name,album_name,bpm,gain
0,25471,2016-12-01 13:00:15,222606,41774,12,20040704,1,0,223,0,0,9241,55164,29,0,Seul au monde,Malin Plaisir,Malin plaisir,100.1,-10.3
1,25571,2016-11-30 22:25:35,250467,43941,0,20060301,2,1,171,0,0,16547,55830,30,1,You're not ok,SoldouT,Stop talking / dead tapes,160.2,-9.9
2,16,2016-11-19 13:59:13,305197,48078,1,20140714,2,1,149,1,1,7665,2704,29,1,Samuel,René Aubry,Dérives,90.1,-17.7
3,7,2016-11-26 09:21:38,900502,71521,0,20001030,0,0,240,0,1,1580,938,30,0,Mujer Amiga Mia,Eros Ramazzotti,Estilolibre (Spanish Version),124.9,-8.7
4,7,2016-11-05 18:02:54,542335,71718,0,20080215,0,0,150,0,1,1812,2939,24,1,Mr. Tambourine Man,The Byrds,Collections - The Byrds Play Dylan,120.5,-12.5


In [10]:
topic_mask = np.array(topics_full) != -1

df_topic = load_and_prepare_data_with_topics(Path('/content/drive/MyDrive/recommender/deezer_train_clean.parquet'), topic_mask)



Loaded dataframe: (7528468, 20)
Filtered dataframe (items with topics): (10514, 20)


In [None]:
def cpu_quick_cleanup(kill_joblib: bool = True):
    """Free memory and kill background threads."""
    try:
        if kill_joblib:
            from joblib.externals.loky import get_reusable_executor
            ex = get_reusable_executor()
            if ex is not None:
                ex.shutdown(wait=True, kill_workers=True)
    except Exception:
        pass
    gc.collect()


In [None]:
cpu_quick_cleanup(kill_joblib=True)


## Computing Weights


Optional: Here we prepare our dataset for the recommender system . We assign weights to our data. We won't just depend on binary is_listened but also take into consideration the recency of the item.

In [9]:
import pandas as pd
import numpy as np

def compute_interaction_weights(
    interactions_df: pd.DataFrame,
    user_col="user_id",
    item_col="item_id",
    listened_col="is_listened",
    ts_col="ts_listen",
    use_recency=False,
    recency_weight=1.0,
    recency_halflife_days=90,
    clip_min=0.05,
    clip_max=3.0,
):
    """
    Compute interaction weights for LightFM.

    Modes:
    - Binary (use_recency=False): keep only is_listened == 1, weight=1
    - Recency (use_recency=True): exponential decay by recency per user

    Returns
    -------
    df : pd.DataFrame with "_final_weight"
    diag : dict with summary stats
    """

    df = interactions_df.copy()

    # Keep only positive interactions
    df = df[df[listened_col] > 0].copy()

    if not use_recency:
        # ------------------------------
        # Mode 1: Binary (uniform weight)
        # ------------------------------
        df["_final_weight"] = 1.0

    else:
        # ------------------------------
        # Mode 2: Recency-based weighting
        # ------------------------------
        df[ts_col] = pd.to_datetime(df[ts_col], errors="coerce")

        # Compare to each user's most recent listen
        latest_ts = df.groupby(user_col)[ts_col].transform("max")
        age_days = (latest_ts - df[ts_col]).dt.days.to_numpy().astype(np.float32)

        # Exponential decay with half-life
        tau = float(recency_halflife_days)
        recency_score = np.exp(-age_days / tau).astype(np.float32)
        df["_recency_score"] = recency_score

        # Raw weight
        raw_weight = 1.0 + recency_weight * recency_score

        # Normalize into [clip_min, clip_max]
        if raw_weight.sum() > 0:
            wmin, wmax = raw_weight.min(), raw_weight.max()
            if wmax > wmin:
                final_weight = (raw_weight - wmin) / (wmax - wmin)
                final_weight = final_weight * (clip_max - clip_min) + clip_min
            else:
                final_weight = np.full_like(raw_weight, clip_min)
        else:
            final_weight = np.full_like(raw_weight, clip_min)

        df["_final_weight"] = final_weight.astype(np.float32)

    # Diagnostics
    diag = {
        "n_interactions": len(df),
        "final_weight_min": float(df["_final_weight"].min()) if len(df) > 0 else None,
        "final_weight_max": float(df["_final_weight"].max()) if len(df) > 0 else None,
        "final_weight_mean": float(df["_final_weight"].mean()) if len(df) > 0 else None,
        "mode": "recency" if use_recency else "binary",
    }

    return df, diag



In [10]:
weighted_df, diag = compute_interaction_weights(df, use_recency=False)







Sanity check:

In [None]:
weighted_df.head()

Unnamed: 0,genre_id_x,ts_listen,item_id,album_id,context_type,release_date_x,platform_name,platform_family,media_duration,listen_type,...,user_id,artist_id,user_age,is_listened,track_name,artist_name,album_name,bpm,gain,_final_weight
1,25571,2016-11-30 22:25:35,250467,43941,0,20060301,2,1,171,0,...,16547,55830,30,1,You're not ok,SoldouT,Stop talking / dead tapes,160.2,-9.9,1.0
2,16,2016-11-19 13:59:13,305197,48078,1,20140714,2,1,149,1,...,7665,2704,29,1,Samuel,René Aubry,Dérives,90.1,-17.7,1.0
4,7,2016-11-05 18:02:54,542335,71718,0,20080215,0,0,150,0,...,1812,2939,24,1,Mr. Tambourine Man,The Byrds,Collections - The Byrds Play Dylan,120.5,-12.5,1.0
5,7,2016-11-05 13:48:29,542335,71718,1,20080215,1,0,150,1,...,10325,2939,29,1,Mr. Tambourine Man,The Byrds,Collections - The Byrds Play Dylan,120.5,-12.5,1.0
6,7,2016-11-15 12:51:44,542335,71718,1,20080215,1,2,150,1,...,51,2939,28,1,Mr. Tambourine Man,The Byrds,Collections - The Byrds Play Dylan,120.5,-12.5,1.0


## Create LightFM Dataset with BERTopic Features

We define a function that creates a dataset that could be fit to a lightfm hybrid model. We will make use of all our features and keep them. We will have user, item matrices and topics will be in the item matrix.

In [11]:
import pandas as pd
from lightfm.data import Dataset

def build_lightfm_dataset_from_interactions(
    interactions_df: pd.DataFrame,
    user_feature_cols: list = None,
    item_feature_cols: list = None
):
    """
    Build a LightFM dataset using only the interactions DataFrame.

    - Users & items come from interactions_df
    - User/item features are taken from columns in the same DataFrame
    """
    dataset = Dataset()

    # Users & items
    users = interactions_df['user_id'].astype(str).unique()
    items_for_fit = interactions_df['item_id'].astype(str).unique()

    # --- User features ---
    user_feature_set = set()
    if user_feature_cols:
        for col in user_feature_cols:
            if col in interactions_df.columns:
                user_feature_set.update([
                    f"{col}:{val}" for val in interactions_df[col].dropna().unique()
                ])

    # --- Item features ---
    item_feature_set = set()
    if item_feature_cols:
        for col in item_feature_cols:
            if col in interactions_df.columns:
                item_feature_set.update([
                    f"{col}:{val}" for val in interactions_df[col].dropna().unique()
                ])

    # Fit dataset
    dataset.fit(
        users=users,
        items=items_for_fit,
        user_features=list(user_feature_set),
        item_features=list(item_feature_set)
    )

    # --- User features matrix ---
    def user_features_gen():
        for uid, row in interactions_df.drop_duplicates('user_id').iterrows():
            feats = []
            if user_feature_cols:
                for col in user_feature_cols:
                    if col in row and pd.notna(row[col]):
                        feats.append(f"{col}:{row[col]}")
            yield (str(row["user_id"]), feats)

    user_features_matrix = dataset.build_user_features(user_features_gen(), normalize=False)

    # --- Item features matrix ---
    def item_features_gen():
        for iid, row in interactions_df.drop_duplicates('item_id').iterrows():
            feats = []
            if item_feature_cols:
                for col in item_feature_cols:
                    if col in row and pd.notna(row[col]):
                        feats.append(f"{col}:{row[col]}")
            yield (str(row["item_id"]), feats)

    item_features_matrix = dataset.build_item_features(item_features_gen(), normalize=False)

    return dataset, user_features_matrix, item_features_matrix



In [12]:

feature_cols = ["release_date_x","context_type","genre_id_x","platform_name","listen_type","bpm","gain"]

dataset, user_features_matrix, item_features_matrix = build_lightfm_dataset_from_interactions(
    interactions_df=df,
    user_feature_cols=["user_gender", "user_age"],
    item_feature_cols=feature_cols
)

print("User features shape:", user_features_matrix.shape)
print("Item features shape:", item_features_matrix.shape)


User features shape: (19911, 19926)
Item features shape: (452508, 466861)


## Train Validation Split for Evaluation

We use scikit-learn function to create 0.8/0.2 train validation split.

In [13]:


train_weighted_df, valid_weighted_df = train_test_split(
    weighted_df,
    test_size=0.2,
    random_state=42
)


In [None]:
train_weighted_df.head()

Unnamed: 0,genre_id_x,ts_listen,item_id,album_id,context_type,release_date_x,platform_name,platform_family,media_duration,listen_type,...,artist_id,user_age,is_listened,track_name,artist_name,album_name,bpm,gain,_recency_score,_final_weight
3092619,0,2016-11-06 18:58:39,128546535,13580161,8,20160715,2,1,237,0,...,249,30,1,"Deeper And Deeper (Live At Deezer, Paris)",Ben Harper,"Deezer Sessions (Live At Deezer, Paris)",115.81,-21.5,0.783139,2.360261
2595816,7,2016-11-02 18:51:40,79875066,7964062,0,20140620,0,0,314,0,...,384236,18,1,Afire Love,Ed Sheeran,x (Deluxe Edition),98.0,-9.4,0.85594,2.575022
203297,0,2016-11-27 17:20:41,127468321,13448813,1,20160715,2,1,158,1,...,130398,29,1,Cry (Just A Little) (A-Trak and Phantoms Remix...,Bingo Players,Cry (Just A Little) (A-Trak and Phantoms Remix),126.05,-8.0,1.0,3.0
1035394,829,2016-11-14 22:41:42,1267910,132836,0,20080801,0,2,142,0,...,10133,21,1,African Herbsman,Bob Marley & The Wailers,The Anthology,107.7,-14.6,0.846482,2.547121
1918552,0,2016-11-17 23:26:18,125972317,13257415,0,20160603,0,0,196,0,...,5306539,23,1,Perfect Strangers,Jonas Blue,Perfect Strangers,118.12,-8.0,0.967216,2.903287


In [None]:
dataset

<lightfm.data.Dataset at 0x7f948e8fed80>

Now we need to prepare train and validation datasets for LightFM. LightFM accepts sparse matrices.

In [14]:
def build_interaction_matrix(df, dataset, weight_col='_final_weight'):
    user_map = dataset.mapping()[0]
    item_map = dataset.mapping()[2]
    rows, cols, data = [], [], []

    for u, i, w in df[['user_id', 'item_id', weight_col]].itertuples(index=False, name=None):
        u_idx = user_map.get(str(u))
        i_idx = item_map.get(str(i))
        if u_idx is not None and i_idx is not None:
            rows.append(u_idx)
            cols.append(i_idx)
            data.append(w)

    from scipy.sparse import coo_matrix
    return coo_matrix((data, (rows, cols)), shape=(len(user_map), len(item_map)))


train_interactions = build_interaction_matrix(train_weighted_df, dataset)
valid_interactions = build_interaction_matrix(valid_weighted_df, dataset)

print("Train interactions nnz:", train_interactions.nnz)
print("Valid interactions nnz:", valid_interactions.nnz)

Train interactions nnz: 2008115
Valid interactions nnz: 502029


## LightFM Model Training

In [17]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score
import joblib

model = LightFM(no_components=128, loss='warp')
model.fit(
    train_interactions,
    user_features=user_features_matrix,
    item_features=item_features_matrix,
    epochs=60,
    num_threads=8,
    verbose=True
)


Epoch: 100%|██████████| 60/60 [00:08<00:00,  7.24it/s]


<lightfm.lightfm.LightFM at 0x7d549c59c740>

In [None]:
MODEL_FILE = "/content/drive/MyDrive/deezer/artifacts/lightfm_model3_bertopic_train.joblib"
joblib.dump(model, MODEL_FILE)
print(f"LightFM model saved to {MODEL_FILE}")

USER_FEATS_FILE = "/content/drive/MyDrive/deezer/artifacts/user_features_matrix1.npz"
ITEM_FEATS_FILE = "/content/drive/MyDrive/deezer/artifacts/item_features_matrix1.npz"

from scipy.sparse import save_npz
save_npz(USER_FEATS_FILE, user_features_matrix)
save_npz(ITEM_FEATS_FILE, item_features_matrix)

print(f"User features saved to {USER_FEATS_FILE}")
print(f"Item features saved to {ITEM_FEATS_FILE}")

✅ LightFM model saved to /content/drive/MyDrive/deezer/artifacts/lightfm_model2_bertopic_train.joblib
✅ User features saved to /content/drive/MyDrive/deezer/artifacts/user_features_matrix1.npz
✅ Item features saved to /content/drive/MyDrive/deezer/artifacts/item_features_matrix1.npz


In [15]:
import joblib
from scipy.sparse import load_npz

# --- Load LightFM model ---
MODEL_FILE = "/content/drive/MyDrive/deezer/artifacts/lightfm_model3_bertopic_train.joblib"
model = joblib.load(MODEL_FILE)
print("✅ LightFM model loaded")

# --- Load user features matrix ---
USER_FEATS_FILE = "/content/drive/MyDrive/deezer/artifacts/user_features_matrix1.npz"
user_features_matrix = load_npz(USER_FEATS_FILE)
print("✅ User features matrix loaded")

# --- Load item features matrix ---
ITEM_FEATS_FILE = "/content/drive/MyDrive/deezer/artifacts/item_features_matrix1.npz"
item_features_matrix = load_npz(ITEM_FEATS_FILE)
print("✅ Item features matrix loaded")


✅ LightFM model loaded
✅ User features matrix loaded
✅ Item features matrix loaded


## Retrieval with Annoy (ANN)

In [16]:
from scipy.sparse import csr_matrix
import numpy as np

# topics_full: list/array of topic IDs per item
num_items = len(topics_full)
num_topics = max(topics_full) + 1  # total number of topics

rows, cols, data = [], [], []

for idx, topic_id in enumerate(topics_full):
    if topic_id >= 0:  # only valid topics
        rows.append(idx)
        cols.append(topic_id)
        data.append(1.0)

# Create sparse topic embedding matrix
topic_embeddings = csr_matrix((data, (rows, cols)), shape=(num_items, num_topics))


In [None]:

from scipy.sparse import save_npz
save_npz("/content/drive/MyDrive/deezer/artifacts/topic_embeddings.npz", topic_embeddings)
print("✅ Topic embeddings saved")


✅ Topic embeddings saved


There are many missing topics, since artist track and album name are not long, they might be generalized as outlier or noise (-1 score). So

Here we make a HybridRecommender class with LightFM- annoy retrieval and Bertopic similarity reranking

In [17]:
import annoy
class HybridRecommender:
    def __init__(self, model, user_features_matrix, item_features_matrix,
                 topic_embeddings, user_topic_matrix,
                 top_k=100, rerank_k=10, num_trees=50):
        self.model = model
        self.user_features_matrix = user_features_matrix
        self.item_features_matrix = item_features_matrix
        self.topic_embeddings = topic_embeddings
        self.user_topic_matrix = user_topic_matrix
        self.top_k = top_k
        self.rerank_k = rerank_k

        self.annoy_index = self._build_annoy_index(num_trees)

    def _build_annoy_index(self, num_trees=500):
        from annoy import AnnoyIndex
        n_items = self.item_features_matrix.shape[0]
        dim = self.model.no_components
        annoy_index = AnnoyIndex(dim, metric='dot')

        _, item_reps = self.model.get_item_representations(self.item_features_matrix)
        for i in range(n_items):
            annoy_index.add_item(i, item_reps[i])
        annoy_index.build(num_trees)
        return annoy_index

    def recommend(self, user_idx):
        # LightFM embedding for retrieval
        _, user_reps = self.model.get_user_representations(self.user_features_matrix)
        user_embedding = user_reps[user_idx]

    # Retrieve top-k candidates
        candidate_indices = self.annoy_index.get_nns_by_vector(user_embedding, self.top_k,search_k=10000)

        if self.user_topic_matrix is not None:
            user_topic_vector = self.user_topic_matrix[user_idx]  # dense, no .toarray()
            valid_candidates = [i for i in candidate_indices if i < self.topic_embeddings.shape[0]]

            if valid_candidates:
                candidate_topics = self.topic_embeddings[valid_candidates].toarray()
                sim = candidate_topics @ user_topic_vector
                reranked = np.array(valid_candidates)[np.argsort(-sim)]
            else:
                reranked = []

        # Append remaining candidates without topic embeddings
            missing_candidates = [i for i in candidate_indices if i not in valid_candidates]
            final_ranking = np.concatenate([reranked, missing_candidates])
        else:
            final_ranking = candidate_indices

        return final_ranking[:self.rerank_k]



In [None]:
def build_user_topic_profiles(interactions_df, topic_embeddings, user_map, item_map):
    """
    Build user topic vectors from interactions and item topic embeddings.
    Safely handles missing items (no topic).
    """
    num_users = len(user_map)
    num_topics = topic_embeddings.shape[1]
    user_topic_matrix = np.zeros((num_users, num_topics), dtype=np.float32)

    for user_id, group in interactions_df.groupby('user_id'):
        u_idx = user_map.get(str(user_id))
        if u_idx is None:
            continue

        valid_items = []
        for item_id in group['item_id']:
            i_idx = item_map.get(str(item_id))
            if i_idx is not None and i_idx < topic_embeddings.shape[0]:
                valid_items.append(i_idx)

        if not valid_items:
            continue

        topics = topic_embeddings[valid_items].toarray()
        user_topic_matrix[u_idx] = topics.mean(axis=0)

    return user_topic_matrix



In [None]:

user_map = dataset.mapping()[0]  # user_id -> index
item_map = dataset.mapping()[2]  # item_id -> index

user_topic_matrix = build_user_topic_profiles(weighted_df, topic_embeddings, user_map, item_map)



In [20]:
recommender = HybridRecommender(
    model=model,
    user_features_matrix=user_features_matrix,
    item_features_matrix=item_features_matrix,
    topic_embeddings=topic_embeddings,
    user_topic_matrix=user_topic_matrix,
    top_k=5000,
    rerank_k=10,
    num_trees=500
)

# Recommend for a user index
recommended_items = recommender.recommend(user_idx=2)
print(recommended_items)

[179167 383064 126008 309586 133249 272808  72406 294000 202299  45571]


 ***LightFM Model (Collaborative + Content Hybrid)**

LightFM uses a matrix factorization approach, where:

Each **user** has a latent vector (embedding)

Each **item** has a latent vector (embedding) -two tower architecture.

The predicted interaction is roughly the dot product of these vectors. LightFM is fit on the train dataset, using user_features and item_features. Evaluation is made on validation dataset using metrics like precision_at_k. Retrieval is with Annoy (Fast Candidate Generation). This approach solves the problem of speed with approximate nearest neighbors indexing since scoring all items for a user is slow when there are millions of items.

For a given user embedding, Annoy quickly retrieves the top-k nearest items in embedding space (dot product). Then reranking is done with BERTopic (Topic-Aware Relevance). The comparison is made between user embedding with topic vectors of retrieved candidate items. Cosine similarity is computed to rerank the candidates. To summarize:

**Collaborative filtering:** LightFM embeddings capture user-item interactions.

**Content-based:** Topic embeddings add reranking based on item content
.

In [None]:
topic_embeddings

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 166395 stored elements and shape (445665, 961)>

In [27]:
import numpy as np

def verify_annoy_index(recommender, model, user_features_matrix, item_features_matrix, topic_embeddings=None, n_samples=10):
    """
    Verify that the Annoy index in the recommender matches LightFM inner IDs and embeddings.

    Parameters
    ----------
    recommender : object
        Your HybridRecommender with .annoy_index and .model
    model : LightFM
        Trained LightFM model
    user_features_matrix : csr_matrix
        LightFM user features matrix
    item_features_matrix : csr_matrix
        LightFM item features matrix
    topic_embeddings : csr_matrix or None
        Optional topic embeddings for reranking
    n_samples : int
        Number of items and users to spot-check
    """
    # --- 1) Verify item embeddings in Annoy ---
    _, item_embeddings = model.get_item_representations(item_features_matrix)
    n_items = item_embeddings.shape[0]

    print(f"Checking first {n_samples} items in Annoy...")
    for i in range(min(n_samples, n_items)):
        annoy_vec = recommender.annoy_index.get_item_vector(i)
        if not np.allclose(item_embeddings[i], annoy_vec):
            print(f"Mismatch in Annoy index {i}")
    print("Item embeddings verified (spot check).")

    # --- 2) Check candidate indices validity ---
    test_user_idx = 0
    _, user_embeddings = model.get_user_representations(user_features_matrix)
    user_vec = user_embeddings[test_user_idx]

    candidates = recommender.annoy_index.get_nns_by_vector(user_vec, recommender.top_k)
    invalid = [i for i in candidates if i >= n_items]
    if invalid:
        print("Invalid candidate indices found:", invalid)
    else:
        print("All candidate indices are valid.")

    # --- 3) Compare Annoy retrieval with brute-force ---
    scores = item_embeddings @ user_vec
    top_brute = np.argsort(-scores)[:recommender.top_k]
    print("Annoy top-k:", candidates)
    print("Brute-force top-k:", top_brute.tolist())

    # --- 4) Verify topic embedding limits ---
    if topic_embeddings is not None:
        valid_candidates = [i for i in candidates if i < topic_embeddings.shape[0]]
        dropped = set(candidates) - set(valid_candidates)
        if dropped:
            print("Candidates dropped due to topic embeddings limit:", dropped)
        else:
            print("All candidates valid for topic rerank.")


    print("Annoy verification complete.")
    return candidates, top_brute.tolist()

In [None]:
annoy, top_brute=verify_annoy_index(recommender, model, user_features_matrix, item_features_matrix, topic_embeddings=None, n_samples=10)


In [None]:
def recall_at_k(annoy, brute, k):
    return len(set(annoy[:k]) & set(brute[:k])) / k


In [None]:
recall_at_k(annoy, top_brute, k=10)

0.9

## Evaluation

In [None]:
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix

mapping = dataset.mapping()     # tuple of dicts
user_map = mapping[0]           # user_id (str) -> idx
item_map = mapping[2]           # item_id (str) -> idx

n_users = len(user_map)
n_items = len(item_map)

# --- 2) Build binary validation CSR from a validation DataFrame

val_df = valid_weighted_df.copy()


rows = []
cols = []

for u, i in zip(val_df['user_id'].astype(str), val_df['item_id'].astype(str)):
    u_idx = user_map.get(u)
    i_idx = item_map.get(i)
    if (u_idx is not None) and (i_idx is not None):
        rows.append(u_idx)
        cols.append(i_idx)

if len(rows) == 0:
    raise RuntimeError("No validation (user,item) pairs were found that map to your dataset ids. "
                       "Check dataset.mapping() and the user/item types (strings).")

coo = coo_matrix((np.ones(len(rows), dtype=np.int8), (rows, cols)), shape=(n_users, n_items))
valid_csr = coo.tocsr()
valid_csr = (valid_csr > 0).astype(np.int8)

print("Built validation CSR:", valid_csr.shape, "nnz:", valid_csr.nnz)


import numpy as np
from scipy.sparse import coo_matrix, csr_matrix

# Build train CSR
rows, cols = [], []
for u, i in zip(train_weighted_df['user_id'].astype(str), train_weighted_df['item_id'].astype(str)):
    u_idx = user_map.get(u)
    i_idx = item_map.get(i)
    if u_idx is not None and i_idx is not None:
        rows.append(u_idx)
        cols.append(i_idx)

train_coo = coo_matrix((np.ones(len(rows), dtype=np.int8), (rows, cols)),
                       shape=(len(user_map), len(item_map)))
train_csr = train_coo.tocsr()
train_csr = (train_csr > 0).astype(np.int8)  # binarize

print("Built train CSR:", train_csr.shape, "nnz:", train_csr.nnz)






Built validation CSR: (19911, 452508) nnz: 502029
Built train CSR: (19911, 452508) nnz: 2008115


### Precision @10

In [24]:
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm

def sampled_precision_at_k(model, train_csr, valid_csr, user_map,
                           recommender=None,
                           user_features=None, item_features=None,
                           k=5, n_users=5000, n_negatives=5000,
                           seed=42, show_progress=True):
    """
    Compute Precision@K using sampled users and negative sampling.
    Supports optional reranking pipeline.

    Parameters
    ----------
    model : LightFM
        Trained LightFM model.
    train_csr, valid_csr : csr_matrix
        Interaction matrices.
    user_map : dict
        Mapping raw user_id -> inner index.
    recommender : object or None
        Optional reranking pipeline with .recommend(user_idx) method.
    user_features, item_features : csr_matrix or None
        Feature matrices.
    k : int
        Top-K cutoff for precision.
    n_users : int
        Number of users to sample.
    n_negatives : int
        Number of negative items to sample per user.
    seed : int
        Random seed.
    show_progress : bool
        Show tqdm progress bar.

    Returns
    -------
    dict : {'baseline': float, 'rerank': float or None}
    """
    rng = np.random.default_rng(seed)
    all_users = list(user_map.values())
    sampled_users = rng.choice(all_users, size=min(n_users, len(all_users)), replace=False)
    n_items = train_csr.shape[1]

    baseline_precisions = []
    rerank_precisions = []

    users_iter = sampled_users
    if show_progress:
        users_iter = tqdm(sampled_users, desc="Evaluating Precision@K")

    for u_idx in users_iter:
        true_items = set(valid_csr[u_idx].indices)
        if not true_items:
            continue
        seen_items = set(train_csr[u_idx].indices)

        # ------------------------
        # Baseline: random negatives
        # ------------------------
        neg_candidates = np.array(list(set(range(n_items)) - seen_items - true_items))
        if len(neg_candidates) > n_negatives:
            neg_candidates = rng.choice(neg_candidates, size=n_negatives, replace=False)
        candidates = np.array(list(true_items) + list(neg_candidates))

        scores = model.predict(
            np.full(len(candidates), u_idx, dtype=np.int32),
            candidates,
            user_features=user_features,
            item_features=item_features
        )

        top_k = [iid for _, iid in sorted(zip(scores, candidates), key=lambda x: -x[0])[:k]]
        baseline_precisions.append(sum(iid in true_items for iid in top_k) / k)

        # ------------------------
        # Optional rerank
        # ------------------------
        if recommender is not None:
            reranked_candidates = recommender.recommend(u_idx)
            reranked_candidates = [i for i in reranked_candidates if i not in seen_items]
            if reranked_candidates:
                rerank_scores = model.predict(
                    np.full(len(reranked_candidates), u_idx, dtype=np.int32),
                    np.array(reranked_candidates, dtype=np.int32),
                    user_features=user_features,
                    item_features=item_features
                )
                top_k_rerank = [iid for _, iid in sorted(zip(rerank_scores, reranked_candidates), key=lambda x: -x[0])[:k]]
                rerank_precisions.append(sum(iid in true_items for iid in top_k_rerank) / k)

    return {
        'baseline': np.mean(baseline_precisions) if baseline_precisions else 0.0,
        'rerank': np.mean(rerank_precisions) if rerank_precisions else None
    }



### AUC @ 10- Precision @ 10 for items with topics dataset




In [27]:
import numpy as np
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm

def full_auc_precision_rerank(model, train_csr, valid_csr,
                              recommender=None,
                              user_features=None, item_features=None,
                              k=10, show_progress=True):
    """
    Compute AUC and Precision@K over all users/items with topics.
    Supports optional reranking pipeline.

    Parameters
    ----------
    model : LightFM
        Trained LightFM model.
    train_csr, valid_csr : csr_matrix
        Interaction matrices.
    recommender : object or None
        Optional reranking pipeline with .recommend(user_idx) method.
    user_features, item_features : csr_matrix or None
        Feature matrices.
    k : int
        Top-K cutoff for precision.
    show_progress : bool
        Show tqdm progress bar.

    Returns
    -------
    dict : {'baseline_auc': float, 'rerank_auc': float or None,
            'baseline_precision': float, 'rerank_precision': float or None}
    """
    n_users, n_items = train_csr.shape
    baseline_aucs = []
    rerank_aucs = []
    baseline_precisions = []
    rerank_precisions = []

    users_iter = range(n_users)
    if show_progress:
        users_iter = tqdm(users_iter, desc="Evaluating AUC & Precision@K")

    for u_idx in users_iter:
        true_items = set(valid_csr[u_idx].indices)
        if not true_items:
            continue
        seen_items = set(train_csr[u_idx].indices)

        # Candidate items: all items not seen in training
        candidate_items = np.array([i for i in range(n_items) if i not in seen_items])
        if len(candidate_items) == 0:
            continue

        # Ground truth labels
        y_true = np.array([1 if i in true_items else 0 for i in candidate_items])

        # Baseline predictions
        scores = model.predict(
            np.full(len(candidate_items), u_idx, dtype=np.int32),
            candidate_items,
            user_features=user_features,
            item_features=item_features
        )

        # Compute AUC
        if len(set(y_true)) > 1:
            baseline_aucs.append(roc_auc_score(y_true, scores))

        # Compute Precision@K
        top_k_items = [iid for _, iid in sorted(zip(scores, candidate_items), key=lambda x: -x[0])[:k]]
        baseline_precisions.append(sum(iid in true_items for iid in top_k_items) / k)

        # Optional rerank
        if recommender is not None:
            reranked_candidates = recommender.recommend(u_idx)
            reranked_candidates = [i for i in reranked_candidates if i not in seen_items]
            if reranked_candidates:
                y_true_rerank = np.array([1 if i in true_items else 0 for i in reranked_candidates])
                rerank_scores = model.predict(
                    np.full(len(reranked_candidates), u_idx, dtype=np.int32),
                    np.array(reranked_candidates, dtype=np.int32),
                    user_features=user_features,
                    item_features=item_features
                )
                if len(set(y_true_rerank)) > 1:
                    rerank_aucs.append(roc_auc_score(y_true_rerank, rerank_scores))
                top_k_rerank = [iid for _, iid in sorted(zip(rerank_scores, reranked_candidates), key=lambda x: -x[0])[:k]]
                rerank_precisions.append(sum(iid in true_items for iid in top_k_rerank) / k)

    return {
        'baseline_auc': np.mean(baseline_aucs) if baseline_aucs else 0.0,
        'rerank_auc': np.mean(rerank_aucs) if rerank_aucs else None,
        'baseline_precision': np.mean(baseline_precisions) if baseline_precisions else 0.0,
        'rerank_precision': np.mean(rerank_precisions) if rerank_precisions else None
    }



In [28]:


from tqdm.notebook import tqdm

# Run evaluation
metrics = full_auc_precision_rerank(
    model=model,
    train_csr=train_csr,
    valid_csr=valid_csr,
    recommender=recommender,  # pass None if no rerank
    user_features=user_features_matrix,      # pass None if not using features
    item_features=item_features_matrix,      # pass None if not using features
    k=10,                             # top-K for precision
    show_progress=True
)

print("Baseline AUC:", metrics['baseline_auc'])
print("Rerank AUC:", metrics['rerank_auc'])
print("Baseline Precision@10:", metrics['baseline_precision'])
print("Rerank Precision@10:", metrics['rerank_precision'])


Evaluating AUC & Precision@K:   0%|          | 0/5078 [00:00<?, ?it/s]

Baseline AUC: 0.9189379758891082
Rerank AUC: 0.7356852188858558
Baseline Precision@10: 0.06554121151936444
Rerank Precision@10: 0.017693836978131212


We see that reranking with topics does not improve the pipeline and LigthFM itself enriched with metadata is better.

In [23]:
import numpy as np
from collections import defaultdict
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm

def sampled_auc_rerank(model, train_csr, valid_csr, user_map,
                       recommender=None,
                       user_features=None, item_features=None,
                       n_users=5000, n_negatives=5000,
                       seed=42, show_progress=True):
    """
    Compute AUC using sampled users and negative sampling.
    Supports optional reranking pipeline.

    Parameters
    ----------
    model : LightFM
        Trained LightFM model.
    train_csr, valid_csr : csr_matrix
        Interaction matrices.
    user_map : dict
        Mapping raw user_id -> inner index.
    recommender : object or None
        Optional reranking pipeline with .recommend(user_idx) method.
    user_features, item_features : csr_matrix or None
        Feature matrices.
    n_users : int
        Number of users to sample.
    n_negatives : int
        Number of negative items to sample per user.
    seed : int
        Random seed.
    show_progress : bool
        Show tqdm progress bar.

    Returns
    -------
    dict : {'baseline': float, 'rerank': float or None}
    """
    rng = np.random.default_rng(seed)
    all_users = list(user_map.values())
    sampled_users = rng.choice(all_users, size=min(n_users, len(all_users)), replace=False)
    n_items = train_csr.shape[1]

    baseline_aucs = []
    rerank_aucs = []

    users_iter = sampled_users
    if show_progress:
        users_iter = tqdm(sampled_users, desc="Evaluating AUC")

    for u_idx in users_iter:
        true_items = set(valid_csr[u_idx].indices)
        if not true_items:
            continue
        seen_items = set(train_csr[u_idx].indices)

        # ------------------------
        # Baseline: sample negatives
        # ------------------------
        neg_candidates = np.array(list(set(range(n_items)) - seen_items - true_items))
        if len(neg_candidates) > n_negatives:
            neg_candidates = rng.choice(neg_candidates, size=n_negatives, replace=False)
        candidates = np.array(list(true_items) + list(neg_candidates))

        y_true = np.array([1 if i in true_items else 0 for i in candidates])
        scores = model.predict(
            np.full(len(candidates), u_idx, dtype=np.int32),
            candidates,
            user_features=user_features,
            item_features=item_features
        )
        if len(set(y_true)) > 1:
            baseline_aucs.append(roc_auc_score(y_true, scores))

        # ------------------------
        # Optional rerank
        # ------------------------
        if recommender is not None:
            reranked_candidates = recommender.recommend(u_idx)
            reranked_candidates = [i for i in reranked_candidates if i not in seen_items]
            if reranked_candidates:
                y_true_rerank = np.array([1 if i in true_items else 0 for i in reranked_candidates])
                rerank_scores = model.predict(
                    np.full(len(reranked_candidates), u_idx, dtype=np.int32),
                    np.array(reranked_candidates, dtype=np.int32),
                    user_features=user_features,
                    item_features=item_features
                )
                if len(set(y_true_rerank)) > 1:
                    rerank_aucs.append(roc_auc_score(y_true_rerank, rerank_scores))

    return {
        'baseline': np.mean(baseline_aucs) if baseline_aucs else 0.0,
        'rerank': np.mean(rerank_aucs) if rerank_aucs else None
    }



In [26]:
rerank_precision = sampled_precision_at_k(
    model, train_csr, valid_csr, user_map,recommender=recommender,
    user_features=user_features_matrix,
    item_features=item_features_matrix,
    k=10, n_users=5000, n_negatives=5000
)

print("Sampled rerank Precision@K:", rerank_precision)



Evaluating Precision@K:   0%|          | 0/5000 [00:00<?, ?it/s]

Sampled rerank Precision@K: {'baseline': np.float64(0.5357845233005558), 'rerank': np.float64(0.0076117169125507825)}


In [26]:

rerank_auc = sampled_auc_rerank(
    model, train_csr, valid_csr, user_map,recommender=recommender,
    user_features=user_features_matrix,
    item_features=item_features_matrix,
    n_users=5000
)
print("Sampled Baseline AUC:", rerank_auc)

Evaluating AUC:   0%|          | 0/5000 [00:00<?, ?it/s]

Sampled Baseline AUC: {'baseline': np.float64(0.9585578009573187), 'rerank': np.float64(0.8835837571274463)}


Since dataset has ~400000 items it is really costly to run the evaluation algorithm with all the items so here, we sample it to 5000 users and 5000 unseen/ not listened items. Precision is quite high for the LightFM only model. Half of the items recommended are relevant for users and it's AUC score shows that ranking for relevant items is above random. Again we see that reranking with topic embeddings does not help the model.