In [1]:
import random
import warnings
import zipfile
from pathlib import Path

import pandas as pd
import tensorflow as tf
import tqdm
# Install a blanket "ignore" filter: every Python warning raised after this
# point is hidden for the rest of the kernel session, including ones that may
# point at real problems (e.g. deprecations or parser fallbacks).
warnings.filterwarnings("ignore")


2025-09-25 13:48:28.038779: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-09-25 13:48:28.038843: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-09-25 13:48:28.040265: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-09-25 13:48:28.049420: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

def load_ml_1m():
    """Download MovieLens-1M (if needed) and return it as one shuffled table.

    The archive is fetched with ``tf.keras.utils.get_file`` using the current
    working directory as cache, unpacked next to the downloaded file, and the
    ratings, users and movies files are inner-joined on ``user`` and ``item``.

    Returns:
        pandas.DataFrame: one row per rating with the columns ``user``,
        ``item``, ``label`` (the original rating), ``time``, ``sex``, ``age``,
        ``occupation``, ``genre1``, ``genre2`` and ``genre3``. Only the first
        three genres of a movie are kept; movies with fewer genres are padded
        with the string ``"missing"``. Rows are shuffled with a fixed seed and
        re-indexed from 0, so the order is reproducible.
    """
    # Download only, and unpack explicitly below. Relying on ``extract=True``
    # ties the code to wherever a given Keras release decides to put the
    # extracted files; using the returned archive path avoids that assumption.
    archive_path = Path(
        tf.keras.utils.get_file(
            "ml-1m.zip",
            "http://files.grouplens.org/datasets/movielens/ml-1m.zip",
            cache_dir=".",
            cache_subdir=".",
        )
    ).resolve()
    data_dir = archive_path.parent / "ml-1m"
    required_files = ("ratings.dat", "users.dat", "movies.dat")
    # Skip unpacking when a previous run already produced all three files.
    if not all((data_dir / name).is_file() for name in required_files):
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(archive_path.parent)

    def _read_dat(name, **kwargs):
        # "::" is a multi-character separator, which only the python parser
        # engine supports; naming it explicitly avoids the implicit fallback
        # (and its ParserWarning) from the default C engine.
        return pd.read_csv(data_dir / name, sep="::", engine="python", **kwargs)

    # read and merge data into same table
    ratings = _read_dat(
        "ratings.dat",
        usecols=[0, 1, 2, 3],
        names=["user", "item", "rating", "time"],
    )
    users = _read_dat(
        "users.dat",
        usecols=[0, 1, 2, 3],
        names=["user", "sex", "age", "occupation"],
    )
    items = _read_dat(
        "movies.dat",
        usecols=[0, 2],
        names=["item", "genre"],
        encoding="iso-8859-1",
    )

    # Keep exactly three genre columns: extra genres are dropped, and absent
    # ones (including whole columns, if no movie has that many genres) are
    # filled with "missing".
    genres = (
        items["genre"]
        .str.split("|", expand=True)
        .reindex(columns=range(3))
        .fillna("missing")
    )
    genres.columns = ["genre1", "genre2", "genre3"]
    items = items.drop(columns="genre").join(genres)

    data = (
        ratings.merge(users, on="user")
        .merge(items, on="item")
        .rename(columns={"rating": "label"})
        # random shuffle data
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )
    return data

In [3]:
# Build the merged, shuffled ratings table (downloads the archive if needed).
data = load_ml_1m()
# Sanity check: show (n_rows, n_columns) of the merged table.
print("data shape:", data.shape)

Downloading data from http://files.grouplens.org/datasets/movielens/ml-1m.zip
data shape: (1000209, 10)


In [4]:
# randomly select 10 rows; seeded and without replacement, so the preview is
# the same on every run and never shows the same row twice
data.sample(n=10, random_state=42)

Unnamed: 0,user,item,label,time,sex,age,occupation,genre1,genre2,genre3
570749,1125,1911,2,975625119,F,18,4,Comedy,missing,missing
375697,3756,1801,4,966101466,M,18,12,Action,Drama,Romance
279993,3103,3801,5,969566597,M,25,20,Drama,Mystery,missing
618961,5137,1265,5,964337733,M,18,18,Comedy,Romance,missing
51686,4456,1566,4,965230295,F,35,2,Adventure,Animation,Children's
463716,4141,628,5,965349185,M,35,17,Drama,Thriller,missing
503364,2608,1957,5,973728054,F,25,1,Drama,missing,missing
99385,2419,3506,3,974248046,M,25,0,Comedy,Drama,missing
363722,5026,3113,3,962586716,M,25,17,Action,Thriller,missing
144824,1387,3685,2,974768055,F,50,13,Comedy,Drama,Romance


In [5]:
from libreco.data import random_split

# Keep only the first half of the (already shuffled) table, then divide that
# half into train / eval / test partitions with an 80% / 10% / 10% ratio.
first_half_data = data.iloc[: len(data) // 2]
train_data, eval_data, test_data = random_split(
    first_half_data,
    multi_ratios=[0.8, 0.1, 0.1],
    seed=42,
)

In [6]:
# Show how many rows remain after keeping only the first half of `data`.
print("first half data shape:", first_half_data.shape)

first half data shape: (500104, 10)


In [7]:
from libreco.data import DatasetFeat

# Feature columns grouped by the side they describe ...
user_col = ["sex", "age", "occupation"]
item_col = ["genre1", "genre2", "genre3"]
# ... and by how they are handed to the library (sparse vs dense features).
sparse_col = ["sex", "occupation", "genre1", "genre2", "genre3"]
dense_col = ["age"]

# NOTE: `train_data`, `eval_data` and `test_data` are rebound here from the
# split DataFrames to the objects returned by DatasetFeat, so re-run the split
# cell before re-running this one.
train_data, data_info = DatasetFeat.build_trainset(
    train_data,
    user_col,
    item_col,
    sparse_col,
    dense_col,
)
eval_data = DatasetFeat.build_evalset(eval_data)
test_data = DatasetFeat.build_testset(test_data)

In [8]:
# Print the summary object returned alongside the training set by
# `DatasetFeat.build_trainset`.
print(data_info)

n_users: 6040, n_items: 3580, data density: 1.8502 %


In [9]:

from libreco.algorithms import WideDeep

Instructions for updating:
non-resource variables are not supported in the long term


In [10]:
# Configure a Wide & Deep ranking model on top of the training-set metadata.
model = WideDeep(
    task="ranking",
    data_info=data_info,
    loss_type="cross_entropy",
    # network shape
    embed_size=16,
    hidden_units=(128, 64, 32),
    use_bn=True,
    # optimisation settings; the wide and deep parts get separate learning rates
    n_epochs=2,
    batch_size=2048,
    lr={"wide": 0.05, "deep": 7e-4},
)

# Train, reporting the listed metrics on the eval split.
model.fit(
    train_data,
    eval_data=eval_data,
    metrics=["loss", "roc_auc", "precision", "recall", "ndcg"],
    neg_sampling=True,  # perform negative sampling on training and eval data
    shuffle=True,
    verbose=2,
)

Training start time: [35m2025-09-25 13:48:45[0m
Instructions for updating:
Colocations handled automatically by placer.


2025-09-25 13:48:45.901786: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-25 13:48:45.903364: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-25 13:48:45.903495: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

total params: [33m192,481[0m | embedding params: [33m165,177[0m | network params: [33m27,304[0m


2025-09-25 13:48:46.737169: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled
train:   0%|          | 0/391 [00:00<?, ?it/s]2025-09-25 13:48:47.377102: W external/local_xla/xla/stream_executor/gpu/asm_compiler.cc:225] Falling back to the CUDA driver for PTX compilation; ptxas does not support CC 8.9
2025-09-25 13:48:47.377122: W external/local_xla/xla/stream_executor/gpu/asm_compiler.cc:228] Used ptxas at ptxas
2025-09-25 13:48:47.377170: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile generated PTX with ptxas. Falling back to compilation by driver.
2025-09-25 13:48:47.543457: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile generated PTX with ptxas. Falling back to compilation by driver.
2025-09-25 13:48:47.543485: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile g

Epoch 1 elapsed: 2.704s
	 [32mtrain_loss: 0.9671[0m


eval_pointwise: 100%|██████████| 13/13 [00:00<00:00, 169.42it/s]
eval_listwise: 100%|██████████| 2817/2817 [00:05<00:00, 553.24it/s]


	 eval log_loss: 0.5922
	 eval roc_auc: 0.7903
	 eval precision@10: 0.0246
	 eval recall@10: 0.0372
	 eval ndcg@10: 0.0983


train: 100%|██████████| 391/391 [00:01<00:00, 248.79it/s]


Epoch 2 elapsed: 1.574s
	 [32mtrain_loss: 0.4991[0m


eval_pointwise: 100%|██████████| 13/13 [00:00<00:00, 429.85it/s]
eval_listwise: 100%|██████████| 2817/2817 [00:05<00:00, 553.31it/s]


	 eval log_loss: 0.4917
	 eval roc_auc: 0.8365
	 eval precision@10: 0.0322
	 eval recall@10: 0.0525
	 eval ndcg@10: 0.1341


In [11]:
from libreco.evaluation import evaluate

# Score the fitted model on the test split with the same metric list that was
# used during training; the returned value is displayed as the cell result.
evaluate(
    data=test_data,
    model=model,
    metrics=["loss", "roc_auc", "precision", "recall", "ndcg"],
    neg_sampling=True,  # perform negative sampling on test data
)

eval_pointwise: 100%|██████████| 13/13 [00:00<00:00, 347.88it/s]
eval_listwise: 100%|██████████| 2798/2798 [00:05<00:00, 556.23it/s]


{'loss': 0.49328231155373836,
 'roc_auc': 0.8353801910786779,
 'precision': 0.030879199428162977,
 'recall': 0.04892803975512679,
 'ndcg': 0.12823252881266994}

In [12]:
# Ask the trained model for 3 recommendations for user 1.
model.recommend_user(user=1, n_rec=3)

{1: array([3751, 1097, 2355])}

In [13]:

# Same request for several users at once: pass a list of user ids.
model.recommend_user(user=[1, 2, 3], n_rec=3)

{1: array([3751, 1097, 2355]),
 2: array([1198, 2858, 2028]),
 3: array([1580, 1197, 2028])}

In [14]:
# Recommend for user 1 while supplying explicit `sex` and `age` values via
# `user_feats` (presumably these replace the stored features for this call
# only; confirm against the libreco documentation).
model.recommend_user(user=1, n_rec=3, user_feats={"sex": "M", "age": 33})

{1: array([2858, 1197,  110])}

In [15]:
# Recommend for user 1 while supplying an explicit `occupation` value via
# `user_feats` (presumably a per-call override; confirm against the libreco
# documentation).
model.recommend_user(user=1, n_rec=3, user_feats={"occupation": 17})

{1: array([3751, 2858, 2355])}