In [1]:
import torch
# Size of PyTorch's CPU thread pools (intra-op and inter-op) for this notebook.
NUM_THREADS = 16

torch.set_num_threads(NUM_THREADS)

# set_num_interop_threads() may only be called once per process, before any
# inter-op parallel work has started; a second call raises RuntimeError.
# Skipping the call when the pool already has the requested size keeps this
# cell safe to re-run in a live kernel.
if torch.get_num_interop_threads() != NUM_THREADS:
    torch.set_num_interop_threads(NUM_THREADS)

In [2]:
import cornac
from cornac.eval_methods import RatioSplit
from cornac.models import MF, PMF, BPR, SANSA, BiVAECF, LightGCN, RecVAE, EASE, NGCF, VAECF, IBPR, NeuMF, HPF, WBPR
from cornac.metrics import Precision, Recall, NDCG, MAP, MRR

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### 0. Import datasets

In [3]:
from typing import Optional

def _split_entity_ids(entities, test_size, random_state, stratify=None):
    """Split a collection of entity ids into ``(train_ids, test_ids)`` NumPy arrays.

    A group with fewer than two entities cannot be split, so it is returned
    entirely as the training part together with an empty test array of the
    same dtype.
    """
    entity_ids = np.asarray(entities)
    if len(entity_ids) < 2:
        # Slice instead of using a bare `[]`: an empty Python list becomes a
        # float64 array, and concatenating it with integer ids would upcast
        # the ids to float64 (lossy above 2**53).
        return entity_ids, entity_ids[:0]

    train_ids, test_ids = train_test_split(
        entity_ids,
        test_size=test_size,
        stratify=stratify,
        random_state=random_state,
    )
    return np.asarray(train_ids), np.asarray(test_ids)


def stratified_ranking_split(
    df: pd.DataFrame,
    entity_field: str,
    test_size: float = 0.1,
    random_state: Optional[int] = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits a ranking-based dataset into training and test sets while preserving the entity distribution.

    Entities (e.g., users) are split as a whole: all rows of an entity end up in exactly one of the
    two returned frames. Entities are stratified by their number of interactions so that light and
    heavy entities are represented in both splits. Interaction counts shared by fewer than two
    entities cannot be stratified; those entities are split separately without stratification.

    Parameters:
        df: The ranking dataset, where each row represents an interaction between an entity (e.g., user)
            and an item (e.g., game, offer).
        entity_field: The column representing the entity to be stratified.
        test_size: Fraction of unique entities to allocate to the test set (forwarded to
            ``train_test_split``).
        random_state: Random seed for reproducibility.

    Returns:
        ``(train_df, test_df)``: the rows of ``df`` whose entity was assigned to the training and
        test side respectively. A group of fewer than two entities is never split and goes to the
        training side.

    Raises:
        Any exception raised by ``train_test_split`` is propagated unchanged (for example when
        ``test_size`` is invalid for the number of entities or strata).

    Example:
        >>> train_validation_df, test_df = stratified_ranking_split(df, entity_field='user_id', test_size=0.1)
        >>> train_df, validation_df = stratified_ranking_split(
        ...     train_validation_df, entity_field='user_id', test_size=0.1
        ... )
        >>> print(train_df.shape, validation_df.shape, test_df.shape)
    """
    # Interactions per entity (index: entity id, value: number of rows).
    entity_interaction_counts = df[entity_field].value_counts()

    # How many entities share each interaction count; a count is usable as a
    # stratum only if at least two entities have it.
    entities_per_count = entity_interaction_counts.value_counts()
    stratifiable_counts = entities_per_count[entities_per_count >= 2].index
    is_stratifiable = entity_interaction_counts.isin(stratifiable_counts)

    # Stratified part: the stratum label of each entity is its interaction count.
    strata = entity_interaction_counts[is_stratifiable]
    train_strat, test_strat = _split_entity_ids(
        strata.index, test_size, random_state, stratify=strata
    )

    # Remaining entities (unique interaction counts): plain random split.
    leftover_entities = entity_interaction_counts[~is_stratifiable].index
    train_rest, test_rest = _split_entity_ids(leftover_entities, test_size, random_state)

    train_entities = np.concatenate([train_strat, train_rest])
    test_entities = np.concatenate([test_strat, test_rest])

    entity_column = df[entity_field]
    return df[entity_column.isin(train_entities)], df[entity_column.isin(test_entities)]

In [4]:
# Seed shared by the data splits and by every model that accepts one.
SEED = 123

# Cut-off used by all top-K ranking metrics.
TOP_K = 10

# Ranking metrics reported for every experiment in this notebook.
metrics = [
    Precision(k=TOP_K),
    Recall(k=TOP_K),
    NDCG(k=TOP_K),
    MAP(),
    MRR(),
]

In [5]:
# Location of the raw Last.fm 360K play-count file.
# NOTE: machine-specific absolute path; point it at your local copy of the dataset.
LASTFM_PLAYS_PATH = "/Users/a-shyraliev/phd/rec-sys-research/collab_filtering_battlefield/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv"

# The TSV has no header row. Only columns 0, 2 and 3 are read and they are named
# user_id / item_id / play_count (judging by the file name these are presumably
# the user hash, artist name and play count; verify against the dataset docs).
# `usecols` + `names` already yield exactly these three columns in this order,
# so no further column selection (and no extra full-frame copy) is needed.
lastfm_dataset = pd.read_csv(
    LASTFM_PLAYS_PATH,
    sep="\t",
    header=None,
    usecols=[0, 2, 3],
    names=["user_id", "item_id", "play_count"],
)

In [6]:
# Number of distinct users and distinct items in the raw interactions.
tuple(lastfm_dataset[column].nunique() for column in ('user_id', 'item_id'))

(358868, 292363)

In [7]:
# Work on a stratified 10% sample of users: only the "test" side of the split
# is kept, the 90% "train" side is discarded right away.
lastfm_dataset_sample_str = stratified_ranking_split(
    lastfm_dataset, entity_field='user_id', test_size=0.1, random_state=SEED
)[1]

# The full dataset is not used after this point; drop the name so its memory can be reclaimed.
del lastfm_dataset

In [8]:
# Cornac evaluation protocol built from the sampled interactions:
# 20% of them are held out for testing, the split is seeded with SEED.
rs = RatioSplit(
    data=lastfm_dataset_sample_str.to_numpy(),
    test_size=0.2,
    rating_threshold=0.0,
    seed=SEED,
)

# Show the train / test interaction matrices as a quick sanity check of their sizes.
(rs.train_set.csr_matrix, rs.test_set.csr_matrix)



(<35887x93612 sparse matrix of type '<class 'numpy.float64'>'
 	with 1402848 stored elements in Compressed Sparse Row format>,
 <35887x93612 sparse matrix of type '<class 'numpy.float64'>'
 	with 339585 stored elements in Compressed Sparse Row format>)

In [10]:
# The sampled DataFrame is not referenced by any later cell (everything below
# works on `rs`); drop the name, presumably to release its memory.
del lastfm_dataset_sample_str

### 1. MF hyperparams optimization

In [13]:
# MF baseline: a single configuration with bias terms enabled; every other
# hyperparameter is left at the library default.
mf_models = [MF(use_bias=True, seed=SEED)]

In [14]:
# Train and evaluate the MF baseline on the shared split `rs` with the common metric list.
mf_experiment = cornac.Experiment(
    eval_method=rs, models=mf_models, metrics=metrics, user_based=True, save_dir=None
)
mf_experiment.run()


TEST:
...
   | MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
-- + --- + ------ + ------- + ------------ + --------- + --------- + --------
MF | nan | 0.0003 |  0.0000 |       0.0000 |    0.0000 |    0.4127 | 143.9471



### 2. PMF hyperparams optimization

In [15]:
# PMF baseline: library-default hyperparameters, seeded for reproducibility.
pmf_models = [PMF(seed=SEED)]

In [16]:
# Train and evaluate the PMF baseline on the shared split `rs` with the common metric list.
pmf_experiment = cornac.Experiment(
    eval_method=rs, models=pmf_models, metrics=metrics, user_based=True, save_dir=None
)
pmf_experiment.run()


TEST:
...
    |    MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ + ------ + ------- + ------------ + --------- + --------- + --------
PMF | 0.0001 | 0.0001 |  0.0000 |       0.0000 |    0.0000 |   12.6263 | 338.8230



### 3. BPR hyperparams optimization

In [17]:
# BPR baseline: library-default hyperparameters, seeded for reproducibility.
bpr_models = [BPR(seed=SEED)]

In [18]:
# Train and evaluate the BPR baseline on the shared split `rs` with the common metric list.
bpr_experiment = cornac.Experiment(
    eval_method=rs, models=bpr_models, metrics=metrics, user_based=True, save_dir=None
)
bpr_experiment.run()


TEST:
...
    |    MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ + ------ + ------- + ------------ + --------- + --------- + --------
BPR | 0.0273 | 0.1294 |  0.0424 |       0.0343 |    0.0358 |   11.6591 | 451.1734



### 4. BiVAECF hyperparams optimization

In [19]:
# BiVAECF baseline: library-default hyperparameters, GPU use disabled.
bivaecf_models = [BiVAECF(use_gpu=False, seed=SEED)]

In [20]:
# Train and evaluate the BiVAECF baseline on the shared split `rs` with the common metric list.
bivaecf_experiment = cornac.Experiment(
    eval_method=rs, models=bivaecf_models, metrics=metrics, user_based=True, save_dir=None
)
bivaecf_experiment.run()


TEST:
...
        |    MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
------- + ------ + ------ + ------- + ------------ + --------- + --------- + --------
BiVAECF | 0.0637 | 0.2451 |  0.0930 |       0.0757 |    0.0796 | 7616.4112 | 398.0724



### 5. RecVAE hyperparams optimization

In [21]:
# RecVAE baseline: 10 training epochs, GPU use disabled, other hyperparameters at library defaults.
recvae_models = [RecVAE(n_epochs=10, use_gpu=False, seed=SEED)]

In [22]:
# Train and evaluate the RecVAE baseline on the shared split `rs` with the common metric list.
recvae_experiment = cornac.Experiment(
    eval_method=rs, models=recvae_models, metrics=metrics, user_based=True, save_dir=None
)
recvae_experiment.run()


TEST:
...
       |    MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
------ + ------ + ------ + ------- + ------------ + --------- + --------- + --------
RecVae | 0.0763 | 0.2847 |  0.1142 |       0.0925 |    0.0979 | 6903.3554 | 813.5656



### 6. EASE hyperparams optimization

In [9]:
# EASE baseline: library-default hyperparameters, seeded for reproducibility.
ease_models = [EASE(seed=SEED)]

In [10]:
# Train and evaluate the EASE baseline on the shared split `rs`.
# NOTE: no result table was recorded for this run; the note following this cell
# estimates the memory needed for the 93,612 x 93,612 item matrix.
ease_experiment = cornac.Experiment(
    eval_method=rs, models=ease_models, metrics=metrics, user_based=True, save_dir=None
)
ease_experiment.run()

: 

EASE builds a dense item–item matrix of 93,612 × 93,612 = 8,763,206,544 elements.

Assuming each element is an 8-byte float (float64), that matrix alone would need:
8,763,206,544 × 8 bytes ≈ 70.1 GB

That’s just one matrix — intermediate copies made during the matrix operations (inversion, etc.)
can easily push actual memory usage to 100–150 GB of RAM or more.

### 7. HPF hyperparams optimization

In [11]:
# HPF baseline: library-default hyperparameters, seeded for reproducibility.
hpf_models = [HPF(seed=SEED)]

In [12]:
# Train and evaluate the HPF baseline on the shared split `rs` with the common metric list.
hpf_experiment = cornac.Experiment(
    eval_method=rs, models=hpf_models, metrics=metrics, user_based=True, save_dir=None
)
hpf_experiment.run()

Learning...
Learning completed!

TEST:
...
    |    MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ + ------ + ------- + ------------ + --------- + --------- + --------
HPF | 0.0422 | 0.1825 |  0.0641 |       0.0517 |    0.0538 |  370.1086 | 386.1145



### 8. IBPR hyperparams optimization

In [17]:
# IBPR baseline: library-default hyperparameters (no seed is passed for this model).
ibpr_models = [IBPR()]

In [None]:
# Train and evaluate the IBPR baseline on the shared split `rs`.
# NOTE: no result was recorded for this cell; the note after it says training takes too long.
ibpr_experiment = cornac.Experiment(
    eval_method=rs, models=ibpr_models, metrics=metrics, user_based=True, save_dir=None
)
ibpr_experiment.run()

In [None]:
## IBPR takes too long to train on this dataset; the run above was not completed.

### 9. SANSA hyperparams optimization

In [13]:
# SANSA baseline: library-default hyperparameters, model verbosity switched off.
sansa_models = [SANSA(verbose=False, seed=SEED)]

In [14]:
# Train and evaluate the SANSA baseline on the shared split `rs` with the common metric list.
sansa_experiment = cornac.Experiment(
    eval_method=rs, models=sansa_models, metrics=metrics, user_based=True, save_dir=None
)
sansa_experiment.run()

INFO:sansa.core.factorizers:Computing incomplete Cholesky decomposition of X^TX + 1.0*I...
INFO:sansa.core.factorizers:Finding a fill-in reducing ordering (method = colamd)...
INFO:sansa.core.factorizers:Computing X^TX...
INFO:sansa.core.factorizers:
                X^TX info:
                    shape = (93612, 93612) 
                    nnz = 26274852 
                    density = 0.299831% 
                    size = 210.6 MB
                
INFO:sansa.core.factorizers:Sorting indices of A...
INFO:sansa.core.factorizers:Casting indptr of A to int64...
INFO:sansa.core.factorizers:Casting indices of A to int64...
INFO:sansa.core.factorizers:Computing approximate Cholesky decomposition (method = ICF)...
INFO:sansa.core.factorizers:Scaling columns and creating diagonal matrix D (LL^T -> L'DL'^T)...
INFO:sansa.core.inverters:Calculating initial guess using 1 step of Schultz method...
INFO:sansa.core.inverters:Calculating approximate inverse using Uniform Minimal Residual algorithm...



TEST:
...
      |    MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
----- + ------ + ------ + ------- + ------------ + --------- + --------- + --------
SANSA | 0.0659 | 0.2583 |  0.1029 |       0.0832 |    0.0880 |  584.8892 | 786.6559



### 10. VAECF hyperparams optimization

In [15]:
# VAECF baseline: library-default hyperparameters, GPU use disabled.
vaecf_models = [VAECF(use_gpu=False, seed=SEED)]

In [16]:
# Train and evaluate the VAECF baseline on the shared split `rs` with the common metric list.
vaecf_experiment = cornac.Experiment(
    eval_method=rs, models=vaecf_models, metrics=metrics, user_based=True, save_dir=None
)
vaecf_experiment.run()


TEST:
...
      |    MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
----- + ------ + ------ + ------- + ------------ + --------- + --------- + --------
VAECF | 0.0675 | 0.2632 |  0.1019 |       0.0809 |    0.0851 | 3870.0494 | 498.0539



### 11. NeuMF hyperparams optimization

In [10]:
# NeuMF baseline on the PyTorch backend: library-default hyperparameters, verbosity off.
neumf_models = [NeuMF(verbose=False, backend="pytorch", seed=SEED)]

In [11]:
# Train and evaluate the NeuMF baseline on the shared split `rs` with the common metric list.
neumf_experiment = cornac.Experiment(
    eval_method=rs, models=neumf_models, metrics=metrics, user_based=True, save_dir=None
)
neumf_experiment.run()


TEST:
...
      |    MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
----- + ------ + ------ + ------- + ------------ + --------- + --------- + --------
NeuMF | 0.0459 | 0.1531 |  0.0567 |       0.0503 |    0.0529 | 1231.0541 | 616.6095



### 12. LightGCN hyperparams optimization

In [11]:
# LightGCN baseline: 10 training epochs, verbosity off, other hyperparameters at library defaults.
lightgcn_models = [LightGCN(verbose=False, num_epochs=10, seed=SEED)]

In [12]:
# Train and evaluate the LightGCN baseline on the shared split `rs` with the common metric list.
lightgcn_experiment = cornac.Experiment(
    eval_method=rs, models=lightgcn_models, metrics=metrics, user_based=True, save_dir=None
)
lightgcn_experiment.run()

################################################################################
The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a
future torchdata release! Please see https://github.com/pytorch/data/issues/1196
to learn more and leave feedback.
################################################################################




TEST:
...
         |    MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 |  Train (s) |  Test (s)
-------- + ------ + ------ + ------- + ------------ + --------- + ---------- + ---------
LightGCN | 0.0538 | 0.2174 |  0.0799 |       0.0647 |    0.0677 | 20095.9606 | 1072.9381



### 13. NGCF hyperparams optimization

In [14]:
# NGCF baseline: 10 training epochs, verbosity off, other hyperparameters at library defaults.
ngcf_models = [NGCF(verbose=False, num_epochs=10, seed=SEED)]

In [15]:
# Train and evaluate the NGCF baseline on the shared split `rs`.
# NOTE: the recorded run ended with a KeyboardInterrupt and produced no result table.
ngcf_experiment = cornac.Experiment(
    eval_method=rs, models=ngcf_models, metrics=metrics, user_based=True, save_dir=None
)
ngcf_experiment.run()

KeyboardInterrupt: 

NGCF training was interrupted manually (see the KeyboardInterrupt above): it is too slow to finish on this dataset.

### 14. WBPR hyperparams optimization

In [9]:
# WBPR baseline: library-default hyperparameters, seeded for reproducibility.
wbpr_models = [WBPR(seed=SEED)]

In [10]:
# Train and evaluate the WBPR baseline on the shared split `rs` with the common metric list.
wbpr_experiment = cornac.Experiment(
    eval_method=rs, models=wbpr_models, metrics=metrics, user_based=True, save_dir=None
)
wbpr_experiment.run()


TEST:
...
     |    MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
---- + ------ + ------ + ------- + ------------ + --------- + --------- + --------
WBPR | 0.0024 | 0.0151 |  0.0037 |       0.0033 |    0.0034 |   11.5198 | 363.8079



In [None]:
from elsa import ELSA
import torch
import numpy as np

# ELSA is trained and queried on the CPU.
device = torch.device("cpu")

# Model / training configuration: one embedding row per item column of the
# training matrix, 256 latent dimensions, 5 epochs, mini-batches of 128 rows.
items_cnt = rs.train_set.csr_matrix.shape[1]
factors, num_epochs, batch_size = 256, 5, 128

model = ELSA(n_items=items_cnt, device=device, n_dims=factors)
model.fit(rs.train_set.csr_matrix, batch_size=batch_size, epochs=num_epochs)

# Item embeddings, L2-normalised along the last dimension, as a NumPy array.
A = torch.nn.functional.normalize(model.get_items_embeddings(), dim=-1).cpu().numpy()

# Scores can alternatively be computed directly from A in NumPy (kept for reference):
#   predictions = ((rs.test_set.csr_matrix @ A) @ (A.T)) - rs.test_set.csr_matrix

# Score every item for every row of the test matrix, then order the item
# positions of each row by descending score and map them through
# `rs.test_set.item_ids`.
predictions = model.predict(rs.test_set.csr_matrix, batch_size=batch_size).numpy()
predictions_idx = np.array(rs.test_set.item_ids)[np.argsort(-predictions)]

************************** [START] **************************
Runing on cpu.
Total steps 281
Epoch: 1/5; nmse_train: 0.0; cosine_train: 0.8837; training time: 43.314320s3.314159s
Epoch: 2/5; nmse_train: 0.0; cosine_train: 0.839; training time: 43.276342s.276242s6s
Epoch: 3/5; nmse_train: 0.0; cosine_train: 0.8272; training time: 43.718410s3.718304s
Epoch: 4/5; nmse_train: 0.0; cosine_train: 0.8211; training time: 44.072291s4.072195s
Epoch: 5/5; nmse_train: 0.0; cosine_train: 0.8167; training time: 44.138961s4.138862s

************************** [END] **************************


{'nmse_train': [np.float64(1.8880895251537476e-05),
  np.float64(1.7924249936367494e-05),
  np.float64(1.7673339722651135e-05),
  np.float64(1.754322696395974e-05),
  np.float64(1.7448970143810856e-05)],
 'cosine_train': [np.float64(0.8837340938072781),
  np.float64(0.8389571728814539),
  np.float64(0.8272125980289805),
  np.float64(0.8211221527374511),
  np.float64(0.816709994792514)]}