# Imports & Data

In [1]:
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, diags, identity
from scipy.spatial.distance import cdist

from pathlib import Path
from tqdm import tqdm

import scipy.sparse as sp
from scipy.sparse.linalg import svds, spsolve_triangular, eigsh, splu


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler, OneHotEncoder


from dataprep import (
    generate_interactions_matrix,
    leave_last_out,
    transform_indices,
    verify_time_split,
    reindex_data,
)
from evaluation import (
    topn_recommendations,
    model_evaluate,
    downvote_seen_items,
    calculate_rmse,
    postprocess_scores,
)

In [2]:
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.simplefilter("ignore", category=ConvergenceWarning)

In [3]:
np.random.seed(42)

# Helpers

In [4]:
def split_by_time(data: pd.DataFrame, split_time, userid="userid", timeid="timestamp"):

    test_users = data[data[timeid] > split_time][userid].unique()

    return data[~data[userid].isin(test_users)], data[data[userid].isin(test_users)]

In [5]:
def get_gower_sim(A, w=None):

    mins = A.min(axis=0)
    maxes = A.max(axis=0)
    ranges = maxes - mins
    ranges[ranges == 0] = 1.0

    An = (A - mins) / ranges

    _, d = An.shape
    w = w if w is not None else np.ones(d)
    assert w.size == d

    diffs = An[:, None, :] - An[None, :, :]
    diffs = np.abs(diffs)
    diffs *= w

    similarity = 1 - np.sum(diffs, axis=2) / np.sum(w)

    similarity = csr_matrix(similarity)
    similarity.setdiag(0)
    similarity.eliminate_zeros()

    return similarity

# Models

In [6]:
def cosine_similarity_zd(A):
    """Build cosine similarity matrix with zero diagonal."""
    A = csr_matrix(A)
    similarity = cosine_similarity(A, dense_output=False)
    similarity.setdiag(0)
    similarity.eliminate_zeros()
    return similarity


def get_gower_sim(A, w=None):
    mins = A.min(axis=0)
    maxes = A.max(axis=0)
    ranges = maxes - mins
    ranges[ranges == 0] = 1.0

    A = (A - mins) / ranges

    n, d = A.shape
    w = w if w is not None else np.ones(d)
    assert w.size == d

    Aw = A * w
    D = cdist(Aw, Aw, metric="cityblock") / np.sum(w)

    similarity = 1 - D
    similarity = csr_matrix(similarity)
    similarity.setdiag(0)
    similarity.eliminate_zeros()
    return similarity


def sparse_dropout(A, p=0.5):
    # Randomly zero p of all elements
    A_coo = A.tocoo()
    nnz = A_coo.nnz
    keep_indices = np.random.choice(nnz, size=int(nnz * (1 - p)), replace=False)
    return csr_matrix(
        (A_coo.data[keep_indices], (A_coo.row[keep_indices], A_coo.col[keep_indices])),
        shape=A.shape,
    )


def topk(A, p=0.5):
    # Leave only p of all elements per row
    A = A.copy()
    for i in range(A.shape[0]):
        start, stop = A.indptr[i], A.indptr[i + 1]
        row = A.data[start:stop]
        k = int(len(row) * p)
        keep_mask = np.zeros(len(row), dtype=bool)
        keep_mask[np.argpartition(row, -k)[-k:]] = True
        A.data[start:stop][~keep_mask] = 0
    A.eliminate_zeros()
    return A


def build_iknn_model(config, data, data_description):
    item_similarity = None
    if config.get("gower", False):
        feats = data_description["item_features_mtx"]
        item_similarity = get_gower_sim(feats, config.get("gower_w", None))
    else:
        item_similarity = cosine_similarity_zd(data_description["item_features_mtx"])
    if config.get("sampling", True):
        # we leave half of the samples and then only highest rated half
        item_similarity = sparse_dropout(item_similarity, config.get("p_dropout", 0.5))
        item_similarity = topk(item_similarity, config.get("p_topk", 0.5))
    return item_similarity


@postprocess_scores
def iknn_model_scoring(params, testset, testset_description):
    item_similarity = params
    test_mtx = generate_interactions_matrix(
        testset, testset_description, rebase_users=True
    )
    scores = test_mtx @ item_similarity
    return scores.toarray()

In [7]:
def random_scoring(testset, testset_description):
    test_mtx = generate_interactions_matrix(
        testset, testset_description, rebase_users=True
    )
    return np.random.rand(*test_mtx.shape)

In [8]:
def easer_build(config, data, data_description):

    gower = config["gower"]
    alpha = config["alpha"]
    l = config["l"]

    A = generate_interactions_matrix(data, data_description)
    I = csr_matrix(np.eye(data_description["n_items"]))

    Y = data_description["item_features_mtx"]
    if gower:
        F = get_gower_sim(Y)
    else:
        F = cosine_similarity_zd(Y)
        # F = csr_matrix(Y @ Y.T)
        # F.setdiag(0)

    G = A.T @ A + alpha * F + l * I

    P = np.linalg.inv(G.toarray())
    W = np.eye(A.shape[1]) - P / np.diag(P)
    return W


@postprocess_scores
def easer_scoring(params, testset, testset_description):
    W = params
    A = generate_interactions_matrix(testset, testset_description, rebase_users=True)

    return A @ W

In [9]:
def knn(Y, mode, nnbrs=10):
    if mode == "gower":
        S = get_gower_sim(Y).toarray()
    else:
        S = cosine_similarity(Y, Y)

    np.fill_diagonal(S, 0.0)

    neighbors = np.argpartition(-S, nnbrs, axis=1)[:, :nnbrs]
    return neighbors


def fsSLIM_build(config, data, data_description):
    nnbrs = config["nnbrs"]
    alpha_reg = config["alpha_reg"]
    beta_reg = config["beta_reg"]
    gower = config["gower"]

    assert alpha_reg + beta_reg > 0

    A = generate_interactions_matrix(data, data_description)
    Y = data_description["item_features_mtx"]

    neighbours = knn(Y, "gower", nnbrs) if gower else knn(Y, "cos_sim", nnbrs)

    n_items = data_description["n_items"]
    W = np.zeros((n_items, n_items))

    model = ElasticNet(
        alpha=alpha_reg + beta_reg,
        l1_ratio=alpha_reg / (alpha_reg + beta_reg),
        fit_intercept=False,
        positive=True,
        max_iter=50,
        tol=config.get("tol", 1e-2),
    )

    for j in range(n_items):
        idx = neighbours[j]

        X = A[:, idx].toarray()
        y = A[:, j].toarray().ravel()

        model.fit(X, y)

        W[idx, j] = model.coef_

    return W


@postprocess_scores
def fsSLIM_scoring(params, testset, testset_description):
    W = params
    A = generate_interactions_matrix(testset, testset_description, rebase_users=True)
    return A @ W

In [10]:
def make_spd_similarity(sim, gamma=0.9, eps=1e-6):
    n = sim.shape[0]
    I = sp.eye(n, format="csr")

    sim = 0.5 * (sim + sim.T)

    sim = sim.copy()
    sim.setdiag(1.0)

    K = gamma * sim + (1 - gamma) * I

    K = K + eps * I

    return K


def safe_dot_kernel(Y, eps=1e-6):
    Y = Y.astype(np.float64)
    norms = np.linalg.norm(Y, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    Y = Y / norms
    K = csr_matrix(Y @ Y.T)
    K.setdiag(1.0)
    K = K + eps * sp.eye(K.shape[0], format="csr")
    return K


def sparse_cholesky(A):
    sparse_matrix = A.T @ A
    sparse_matrix += 1e-6 * identity(sparse_matrix.shape[0])
    n = sparse_matrix.shape[0]
    LU = splu(sparse_matrix, diag_pivot_thresh=0.0, permc_spec="NATURAL")

    L = LU.L @ diags(LU.U.diagonal() ** 0.5)

    return L


def hysvd_build(config, data, data_description):
    rank = config.get("rank", 25)
    gamma_s = config.get("gamma_s", 1)
    gamma_k = config.get("gamma_k", 1)
    gower = config.get("gower", False)

    A = generate_interactions_matrix(data, data_description).tocsr()

    Y = data_description["item_features_mtx"]
    X = data_description["user_features_mtx"]
    if gower:
        S_sim = get_gower_sim(Y)
        S = make_spd_similarity(S_sim, gamma=gamma_s)
        K_sim = get_gower_sim(X)
        K = make_spd_similarity(K_sim, gamma=gamma_k)
    else:
        S_sim = safe_dot_kernel(Y)
        S = make_spd_similarity(S_sim, gamma=gamma_s)
        K_sim = safe_dot_kernel(X)
        K = make_spd_similarity(K_sim, gamma=gamma_k)

    Ls = csr_matrix(np.linalg.cholesky(S.toarray()))
    Lk = csr_matrix(np.linalg.cholesky(K.toarray()))

    # Ls = sparse_cholesky(S)
    # Lk = sparse_cholesky(K)

    M = Lk.T @ A @ Ls

    U_hat, s, VT_hat = svds(M, k=rank)
    idx = np.argsort(-s)
    s = s[idx]
    U_hat = U_hat[:, idx]
    V_hat = VT_hat[idx, :].T

    V = spsolve_triangular(Ls.T.tocsr(), V_hat, lower=False)

    LV = Ls @ V
    RV = spsolve_triangular(Ls.T.tocsr(), V, lower=False)

    return {"LV": LV, "RV": RV, "singular_values": s}


@postprocess_scores
def hysvd_scoring(params, testset, testset_description):
    LV = params["LV"]
    RV = params["RV"]

    A_test = generate_interactions_matrix(
        testset, testset_description, rebase_users=True
    )

    return A_test @ LV @ RV.T

In [11]:
def popularity_scaling(A, alpha):
    item_pop = np.array(A.sum(axis=0)).ravel()
    item_pop[item_pop == 0] = 1.0
    D_alpha = sp.diags(item_pop ** (-alpha))
    return D_alpha


def eigenrec_build(config, data, data_description):
    rank = config.get("rank", 100)
    alpha = config.get("alpha", 0.5)
    gamma = config.get("gamma", 0.8)
    gower = config.get("gower", True)

    A = generate_interactions_matrix(data, data_description).tocsr()

    D_alpha = popularity_scaling(A, alpha)
    A_scaled = A @ D_alpha

    C = A_scaled.T @ A_scaled
    C.setdiag(0)

    Y = data_description["item_features_mtx"]

    if gower:
        S = get_gower_sim(Y)
    else:
        S = cosine_similarity_zd(Y)

    W_sim = gamma * S + (1.0 - gamma) * C

    W_sim = W_sim + 1e-6 * sp.eye(W_sim.shape[0], format="csr")

    vals, vecs = eigsh(W_sim, k=rank, which="LA")

    idx = np.argsort(-vals)
    vals = vals[idx]
    vecs = vecs[:, idx]

    return {"Q": vecs, "Lambda": vals}


@postprocess_scores
def eigenrec_scoring(params, testset, testset_description):
    Q = params["Q"]
    Lambda = params["Lambda"]

    A = generate_interactions_matrix(testset, testset_description, rebase_users=True)

    AQ = A @ Q
    scores = (AQ * Lambda) @ Q.T

    return scores

# Dataprep

## MovieLens1M & BX

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer


def encode_features(
    X: pd.DataFrame,
    element_id: str,
    movie_lens_flg: bool = False,
    categorical: list[str] = None,
    numerical: list[str] = None,
    text: list[str] = None,
    max_tfidf_features: int = 50,
    scale_numeric: bool = True,
    limit_categorical: dict = None,
):
    """
    Encode heterogeneous features into a feature DataFrame.

    limit_categorical:
        dict like {"Publisher": 100, "Book-Author": 200}
        keeps top-K categories, others -> 'OTHER'
    """

    parts = []
    if movie_lens_flg and "genres" in X.columns:
        genre_ohe = X["genres"].fillna("").str.get_dummies(sep="|").add_prefix("genre_")
        parts.append(genre_ohe)
    if categorical:
        X_cat_src = X[categorical].copy()

        if limit_categorical:
            for col, top_k in limit_categorical.items():
                if col in X_cat_src.columns:
                    top_values = X_cat_src[col].value_counts().head(top_k).index
                    X_cat_src[col] = X_cat_src[col].where(
                        X_cat_src[col].isin(top_values), "OTHER"
                    )

        ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        X_cat = ohe.fit_transform(X_cat_src.fillna("NA"))
        cat_cols = ohe.get_feature_names_out(categorical)

        X_cat_df = pd.DataFrame(X_cat, columns=cat_cols, index=X.index)
        parts.append(X_cat_df)
    if numerical:
        X_num_df = X[numerical].copy()

        for col in numerical:
            X_num_df[col] = pd.to_numeric(X_num_df[col], errors="coerce")
            mean_val = X_num_df[col].mean()
            X_num_df[col] = X_num_df[col].fillna(mean_val)

        X_num = X_num_df.to_numpy(dtype=float)

        if scale_numeric:
            scaler = StandardScaler()
            X_num = scaler.fit_transform(X_num)

        X_num_df = pd.DataFrame(X_num, columns=numerical, index=X.index)
        parts.append(X_num_df)
    if text:
        for col in text:
            tfidf = TfidfVectorizer(
                max_features=max_tfidf_features, stop_words="english"
            )
            X_txt = tfidf.fit_transform(X[col].fillna("").astype(str))
            txt_cols = [f"{col}_tfidf_{t}" for t in tfidf.get_feature_names_out()]

            X_txt_df = pd.DataFrame(X_txt.toarray(), columns=txt_cols, index=X.index)
            parts.append(X_txt_df)
    if not parts:
        raise ValueError("No features were encoded")
    features_df = pd.concat(parts, axis=1)
    result = pd.concat(
        [X[[element_id]].reset_index(drop=True), features_df.reset_index(drop=True)],
        axis=1,
    )

    return result

In [13]:
def cold_start_scenario(
    feedback: pd.DataFrame,
    time_quantile: float = 0.8,
    user_col: str = "userid",
    item_col: str = "movieid",
    time_col=None,
):
    if time_col is None:
        time_col = "timestamp"
        feedback[time_col] = np.random.rand(len(feedback))

    training_, testing_h_ = split_by_time(
        feedback, feedback[time_col].quantile(time_quantile), userid=user_col
    )

    training, training_index = transform_indices(
        training_, users=user_col, items=item_col
    )

    _, testset_h_index = transform_indices(testing_h_, users=user_col, items=item_col)

    data_index = {}
    data_index["users"] = pd.Index(
        list(training_index["users"]) + list(testset_h_index["users"]), name=user_col
    )
    data_index["items"] = training_index["items"]

    testset_h = reindex_data(
        data=testing_h_,
        data_index=data_index,
        entities=["items", "users"],
        filter_invalid=True,
    )

    testset, holdout_ = leave_last_out(testset_h, userid=user_col)
    holdout_ = holdout_[holdout_[user_col].isin(testset[user_col])]
    holdout = holdout_.sort_values(user_col)

    return training, data_index, testset, holdout, training_

In [14]:
data_dir = Path("data/")

feedback_ml = pd.read_csv(
    data_dir / "ml-1m/ratings.dat",
    sep="::",
    engine="python",
    header=None,
    names=["userid", "movieid", "rating", "timestamp"],
    encoding="latin-1",
)

items_ml = pd.read_csv(
    data_dir / "ml-1m/movies.dat",
    sep="::",
    engine="python",
    header=None,
    names=["movieid", "title", "genres"],
    encoding="latin-1",
)

users_ml = pd.read_csv(
    data_dir / "ml-1m/users.dat",
    sep="::",
    engine="python",
    header=None,
    names=["userid", "sex", "age", "occupation", "zipcode"],
    encoding="latin-1",
).drop(columns=["zipcode"])


feedback_bx = pd.read_csv(
    data_dir / "bx/BX-Book-Ratings.csv", delimiter=";", encoding="latin1"
)

items_bx = pd.read_csv(
    data_dir / "bx/BX-Books.csv",
    delimiter=";",
    encoding="latin-1",
    on_bad_lines="skip",
    low_memory=False,
)

users_bx = pd.read_csv(
    data_dir / "bx/BX-Users.csv",
    delimiter=";",
    encoding="latin-1",
    on_bad_lines="skip",
    low_memory=False,
)
users_bx[["Location_1", "Location_2", "Location_3"]] = (
    users_bx["Location"].str.split(r"\s*,\s*", expand=True).iloc[:, :3]
)
users_bx = users_bx.drop(columns=["Location"])
users_bx["Age"] = users_bx["Age"].fillna(users_bx["Age"].median())

isbn_index = pd.Index(items_bx["ISBN"].unique(), name="ISBN")
isbn_to_id = {isbn: i for i, isbn in enumerate(isbn_index)}
items_bx = items_bx.copy()
items_bx["ISBN"] = items_bx["ISBN"].map(isbn_to_id)
feedback_bx = feedback_bx.copy()
feedback_bx["ISBN"] = feedback_bx["ISBN"].map(isbn_to_id)


training, data_index, testset, holdout, feedback_val = cold_start_scenario(
    feedback_ml, time_col="timestamp"
)
item_features_ = encode_features(
    items_ml, element_id="movieid", categorical=[], text=["title"], movie_lens_flg=True
)
user_features_ = encode_features(
    users_ml,
    element_id="userid",
    categorical=["sex", "occupation"],
    text=[],
    numerical=["age"],
)

item_features = reindex_data(
    data=item_features_, data_index=data_index, entities=["items"], filter_invalid=True
)
user_features = reindex_data(
    data=user_features_,
    data_index=data_index,
    entities=["users"],
    filter_invalid=True,
)

training_val, data_index_val, testset_val, holdout_val, _ = cold_start_scenario(
    feedback_val, time_quantile=0.5
)
item_features_val = reindex_data(
    data=item_features_,
    data_index=data_index_val,
    entities=["items"],
    filter_invalid=True,
)


training_bx, data_index_bx, testset_bx, holdout_bx, feedback_bx_val = (
    cold_start_scenario(
        feedback_bx, user_col="UserID", item_col="ISBN", time_quantile=0.05
    )
)
training_bx = training_bx[training_bx["ISBN"] > 0]
item_features_bx_ = encode_features(
    items_bx,
    element_id="ISBN",
    categorical=["Book-Author", "Publisher"],
    numerical=["Year-Of-Publication"],
    text=["Book-Title"],
    limit_categorical={
        "Publisher": 10,
        "Book-Author": 10,
    },
)
user_features_bx_ = encode_features(
    users_bx,
    element_id="UserID",
    categorical=["Location_1", "Location_2", "Location_3"],
    text=[],
    numerical=["Age"],
    limit_categorical={
        "Location_1": 10,
        "Location_2": 10,
        "Location_3": 10,
    },
)

item_features_bx = reindex_data(
    data=item_features_bx_,
    data_index=data_index_bx,
    entities=["items"],
    filter_invalid=True,
)
user_features_bx = reindex_data(
    data=user_features_bx_,
    data_index=data_index_bx,
    entities=["users"],
    filter_invalid=True,
)


training_bx_val, data_index_bx_val, testset_bx_val, holdout_bx_val, _ = (
    cold_start_scenario(
        feedback_bx_val, item_col="ISBN", user_col="UserID", time_quantile=0.5
    )
)
item_features_bx_val = reindex_data(
    data=item_features_bx_,
    data_index=data_index_bx_val,
    entities=["items"],
    filter_invalid=True,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feedback[time_col] = np.random.rand(len(feedback))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feedback[time_col] = np.random.rand(len(feedback))


In [15]:
data_description = dict(
    users=data_index["users"].name,
    items=data_index["items"].name,
    feedback="rating",
    n_users=len(data_index["users"]),
    n_items=len(data_index["items"]),
    test_users=holdout[data_index["users"].name].values,
    item_features_mtx=item_features.to_numpy(dtype=float),
    user_features_mtx=user_features.to_numpy(dtype=float),
)

data_description_val = dict(
    users=data_index_val["users"].name,
    items=data_index_val["items"].name,
    feedback="rating",
    n_users=len(data_index_val["users"]),
    n_items=len(data_index_val["items"]),
    test_users=holdout_val[data_index_val["users"].name].values,
    item_features_mtx=item_features_val.to_numpy(dtype=float),
    user_features_mtx=None,
)

data_description_bx = dict(
    users=data_index_bx["users"].name,
    items=data_index_bx["items"].name,
    feedback="Book-Rating",
    n_users=len(data_index_bx["users"]),
    n_items=len(data_index_bx["items"]),
    test_users=holdout_bx[data_index_bx["users"].name].values,
    item_features_mtx=item_features_bx.to_numpy(dtype=float),
    user_features_mtx=user_features_bx.to_numpy(dtype=float),
)

# data_description_val_bx = dict(
#     users = data_index_val['users'].name,
#     items = data_index_val['items'].name,
#     feedback = 'rating',
#     n_users = len(data_index_val['users']),
#     n_items = len(data_index_val['items']),
#     test_users = holdout_val[data_index_val['users'].name].values,
#     item_features_mtx = item_features_val.to_numpy(dtype=float),
#     user_features_mtx=None,
# )

# Evaluation

In [16]:
def compare_models(
    configs,
    build_fn,
    scoring_fn,
    training,
    testset,
    holdout,
    data_description,
    topn: int = 10,
    model_name: str = None,
):
    res = []

    for config in tqdm(
        configs, desc=f"Running {model_name}" if model_name else "Running ..."
    ):
        params = build_fn(
            config=config, data=training, data_description=data_description
        )

        scores = scoring_fn(params, testset, data_description)

        recs = topn_recommendations(scores, topn=topn)

        metrics = model_evaluate(recs, holdout, data_description, topn=topn)

        row = {}
        if model_name is not None:
            row["model"] = model_name
        else:
            row["model"] = "noname"

        row["is_gower"] = config["gower"]
        row.update(metrics._asdict())

        res.append(row)

    return res

In [17]:
models = {
    "knn": (build_iknn_model, iknn_model_scoring),
    "EASER": (easer_build, easer_scoring),
    "fsSLIM": (fsSLIM_build, fsSLIM_scoring),
    "HySVD": (hysvd_build, hysvd_scoring),
    "EigenRec": (eigenrec_build, eigenrec_scoring),
}

model_configs = {
    "knn": [
        {"sampling": False, "gower": True},
        {"sampling": False, "gower": False},
    ],
    "EASER": [
        {"alpha": 1, "l": 100, "gower": True},
        {"alpha": 1, "l": 100, "gower": False},
    ],
    "fsSLIM": [
        {"nnbrs": 50, "alpha_reg": 1e-3, "beta_reg": 1e-1, "gower": True},
        {"nnbrs": 50, "alpha_reg": 1e-3, "beta_reg": 1e-1, "gower": False},
    ],
    "HySVD": [
        {"gower": True, "rank": 35},
        {"gower": False, "rank": 35},
    ],
    "EigenRec": [{"gower": True, "rank": 50}, {"gower": False, "rank": 50}],
}

In [18]:
result = []

for name, (build_fn, scoring_fn) in models.items():
    result += compare_models(
        configs=model_configs[name],
        build_fn=build_fn,
        scoring_fn=scoring_fn,
        training=training,
        testset=testset,
        holdout=holdout,
        data_description=data_description,
        model_name=name,
    )

pd.DataFrame(result)

Running knn: 100%|██████████| 2/2 [00:03<00:00,  1.52s/it]
Running EASER: 100%|██████████| 2/2 [00:03<00:00,  1.74s/it]
Running fsSLIM: 100%|██████████| 2/2 [00:18<00:00,  9.22s/it]
Running HySVD: 100%|██████████| 2/2 [00:44<00:00, 22.32s/it]
Running EigenRec: 100%|██████████| 2/2 [00:05<00:00,  2.80s/it]


Unnamed: 0,model,is_gower,HR,MRR,Coverage
0,knn,True,0.003926,0.001064,0.105968
1,knn,False,0.006169,0.002085,0.270496
2,EASER,True,0.064498,0.023526,0.288622
3,EASER,False,0.064498,0.023526,0.288622
4,fsSLIM,True,0.065059,0.021077,0.258784
5,fsSLIM,False,0.060572,0.021082,0.257111
6,HySVD,True,0.023556,0.009088,0.013385
7,HySVD,False,0.008974,0.002344,0.012828
8,EigenRec,True,0.047672,0.017062,0.098438
9,EigenRec,False,0.04599,0.018376,0.085332


In [19]:
model_evaluate(
    topn_recommendations(random_scoring(testset, data_description), 10),
    holdout,
    data_description,
    10,
)

EvaluationMetrics(HR=0.002243409983174425, MRR=0.0011217049915872126, Coverage=0.9947016174010039)

In [20]:
# we omit HybridSVD on Book Crossing due to technical constraints

models = {
    "knn": (build_iknn_model, iknn_model_scoring),
    "EASER": (easer_build, easer_scoring),
    "fsSLIM": (fsSLIM_build, fsSLIM_scoring),
    "EigenRec": (eigenrec_build, eigenrec_scoring),
}

model_configs = {
    "knn": [
        {"sampling": False, "gower": True},
        {"sampling": False, "gower": False},
    ],
    "EASER": [
        {"alpha": 1, "l": 100, "gower": True},
        {"alpha": 1, "l": 100, "gower": False},
    ],
    "fsSLIM": [
        {
            "nnbrs": 50,
            "alpha_reg": 0,
            "beta_reg": 1e-3,
            "gower": True,
        },
        {"nnbrs": 50, "alpha_reg": 0, "beta_reg": 1e-3, "gower": False},
    ],
    "EigenRec": [{"gower": True, "rank": 50}, {"gower": False, "rank": 50}],
}

In [21]:
result = []

for name, (build_fn, scoring_fn) in models.items():
    result += compare_models(
        configs=model_configs[name],
        build_fn=build_fn,
        scoring_fn=scoring_fn,
        training=training_bx,
        testset=testset_bx,
        holdout=holdout_bx,
        data_description=data_description_bx,
        model_name=name,
    )

pd.DataFrame(result)

Running knn: 100%|██████████| 2/2 [00:00<00:00,  2.51it/s]
Running EASER: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s]
Running fsSLIM: 100%|██████████| 2/2 [03:41<00:00, 110.98s/it]
Running EigenRec: 100%|██████████| 2/2 [00:01<00:00,  1.31it/s]


Unnamed: 0,model,is_gower,HR,MRR,Coverage
0,knn,True,0.017882,0.006551,0.823908
1,knn,False,0.017351,0.00623,0.848276
2,EASER,True,0.022486,0.008475,0.896552
3,EASER,False,0.021335,0.007361,0.25977
4,fsSLIM,True,0.004338,0.001312,0.068046
5,fsSLIM,False,0.004338,0.001312,0.068046
6,EigenRec,True,0.007436,0.0021,0.435862
7,EigenRec,False,0.009738,0.002605,0.191264


In [22]:
model_evaluate(
    topn_recommendations(random_scoring(testset_bx, data_description_bx), 10),
    holdout_bx,
    data_description_bx,
    10,
)

EvaluationMetrics(HR=0.0048689801699716715, MRR=0.0011833799856108636, Coverage=1.0)