In [1]:
import numpy as np
import pandas as pd
import optuna
from scipy.sparse import csr_matrix
from weighting_strategies import (
    bm25_weight, tfidf_weight, normalized_weight,
    log_weight, confidence_weight, power_weight,
    pmi_weight, robust_user_centric_weight, sigmoid_propensity_weight, power_lift_weight, robust_user_centric_weight_v2
)
from implicit.nearest_neighbours import CosineRecommender
from implicit.evaluation import train_test_split, ranking_metrics_at_k

In [2]:
import sys
import os

# Add the parent directory to sys.path to resolve imports from sibling directories
sys.path.append(os.path.abspath(".."))

from utils.sparse import transform_dataframe_to_sparse

In [3]:
steam_df = (
    pd.read_csv(
        "/home/coder/projects/rec-sys-research/data/steam/steam_recommendations.csv",
        usecols=['user_id', 'app_id', 'hours'],
    )
    .loc[:, ['user_id', 'app_id', 'hours']]
    .drop_duplicates()
    .dropna()
    .rename(columns={'app_id': 'item_id', 'hours': 'target'})
)
steam_df['user_id'].nunique(), steam_df['item_id'].nunique(), steam_df.shape[0]

(13781059, 37610, 41154773)

In [4]:
user_item_matrix, user_mapping, item_mapping = transform_dataframe_to_sparse(
    steam_df, row_field='user_id', col_field='item_id', data_field='target'
)


train_val_mat, test_mat = train_test_split(user_item_matrix, train_percentage=0.9, random_state=42)
train_mat, val_mat = train_test_split(train_val_mat, train_percentage=0.9, random_state=42)

print(f"Train Shape: {train_mat.shape}, Val Shape: {val_mat.shape}, Test Shape: {test_mat.shape}")

Train Shape: (13781059, 37610), Val Shape: (13781059, 37610), Test Shape: (13781059, 37610)


In [5]:
results_folder = "results/steam_knn"
results_filename = "steam_knn_results.csv"

import time

def run_hyperparameter_optimization(
    train_mat: csr_matrix,
    val_mat: csr_matrix,
    train_val_mat: csr_matrix,
    test_mat: csr_matrix,
    weighting_strategy: str,
    algorithm: str,
    n_trials: int = 20,
    output_dir: str = None,
) -> pd.DataFrame:
    results = []
    algorithms = {
        "KNN_k=20": lambda: CosineRecommender(K=20),
        "KNN_k=100": lambda: CosineRecommender(K=100),
    }
    strategies = [
        "no_weighting",
        "bm25",
        "tfidf",
        "log", 
        "confidence",
        "power",
        "normalized",
        "pmi",
        "robust_user_centric",
        "robust_user_centric_weight_v2",
        "sigmoid_propensity",
        "power_lift"
    ]
    if weighting_strategy not in strategies:
        raise ValueError(f"Weighting strategy '{weighting_strategy}' is not recognized.")
    strategy = weighting_strategy

    if algorithm not in algorithms:
        raise ValueError(f"Algorithm '{algorithm}' is not recognized.")
    algo_name = algorithm
    AlgoFactory = algorithms[algorithm]

    print(f"Running optimization for {algo_name} with {strategy}...")

    def get_weighted_matrix(matrix, params):
        weighted = matrix.copy()
        if strategy == "bm25":
            weighted = bm25_weight(weighted, K1=params.get("bm25_k1"), B=params.get("bm25_b"))
        elif strategy == "confidence":
            weighted = confidence_weight(weighted, alpha=params.get("conf_alpha"))
        elif strategy == "power":
            weighted = power_weight(weighted, p=params.get("power_p"))
        elif strategy == "tfidf":
            weighted = tfidf_weight(weighted)
        elif strategy == "log":
            weighted = log_weight(weighted)
        elif strategy == "normalized":
            weighted = normalized_weight(weighted)
        elif strategy == "pmi":
            weighted = pmi_weight(weighted)
        elif strategy == "robust_user_centric":
            weighted = robust_user_centric_weight(weighted, scale_factor=params.get("scale_factor"))
        elif strategy == "sigmoid_propensity":
            weighted = sigmoid_propensity_weight(weighted, p=params.get("p"), beta=params.get("beta"))
        elif strategy == "power_lift":
            weighted = power_lift_weight(weighted, p=params.get("p"))
        elif strategy == "robust_user_centric_weight_v2":
            weighted = robust_user_centric_weight_v2(weighted, lower_q=params.get("lower_q"), upper_q=params.get("upper_q"))
        return weighted

    def objective(trial):
        params = {}
        # Suggest weighting strategy parameters
        if strategy == "bm25":
            params["bm25_k1"] = trial.suggest_float("bm25_k1", 0.1, 1000)
            params["bm25_b"] = trial.suggest_float("bm25_b", 0.0, 1.0)
        elif strategy == "confidence":
            params["conf_alpha"] = trial.suggest_float("conf_alpha", 1.0, 150.0)
        elif strategy == "power":
            params["power_p"] = trial.suggest_float("power_p", 0.1, 1.5)
        elif strategy == "robust_user_centric":
            params["scale_factor"] = trial.suggest_float("scale_factor", 0.1, 10.0)
        elif strategy == "robust_user_centric_weight_v2":
            params["lower_q"] = trial.suggest_float("lower_q", 5.0, 45.0)
            params["upper_q"] = trial.suggest_float("upper_q", 55.0, 95.0)
        elif strategy == "sigmoid_propensity":
            params["p"] = trial.suggest_float("p", 0.1, 5.0)
            params["beta"] = trial.suggest_float("beta", 0.0, 1.0)
        elif strategy == "power_lift":
            # params["p"] = trial.suggest_float("p", 0.1, 1.5)
            params["p"] = trial.suggest_float("p", 0.2, 1.3)
        weighted_train = get_weighted_matrix(train_mat, params)

        # Train Model
        model = AlgoFactory()
        model.fit(weighted_train, show_progress=False)

        # Evaluate on Validation Set
        return ranking_metrics_at_k(model, train_mat, val_mat, K=20, show_progress=False)['ndcg']

    # Optimize only if strategy has parameters
    current_trials = n_trials if strategy in ["bm25", "confidence", "power", "robust_user_centric", "robust_user_centric_weight_v2", "sigmoid_propensity", "power_lift"] else 1
    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=current_trials, n_jobs=-1)

    # --- Final Retraining & Testing ---
    # Use best params to weight the full train_val matrix
    best_params = study.best_params
    weighted_train_val = get_weighted_matrix(train_val_mat, best_params)

    # Train Final Model
    final_model = AlgoFactory()
    
    start_time = time.time()
    final_model.fit(weighted_train_val, show_progress=False)
    end_time = time.time()
    
    # Evaluate on Test Set
    metrics_at_10 = ranking_metrics_at_k(final_model, train_val_mat, test_mat, K=10, show_progress=False)
    metrics_at_20 = ranking_metrics_at_k(final_model, train_val_mat, test_mat, K=20, show_progress=False)

    results.append({
        "Algorithm": algo_name,
        "Strategy": strategy,
        "Number of Optimization Trials": current_trials,
        "Best Val NDCG@20": study.best_value,
        "Test NDCG@10": metrics_at_10['ndcg'],
        "Test NDCG@20": metrics_at_20['ndcg'],
        "Test Precision@10": metrics_at_10['precision'],
        "Test Precision@20": metrics_at_20['precision'],
        "Final Train Time (s)": end_time - start_time,
        "Best Params": best_params
    })

    if output_dir:
        output_path = os.path.join(output_dir, f"{algo_name}_{strategy}_results.csv")
        pd.DataFrame(results).to_csv(output_path, index=False)
    return pd.DataFrame(results)

In [6]:
if not os.path.exists(results_folder):
    os.makedirs(results_folder)

In [7]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="no_weighting", algorithm="KNN_k=20", n_trials=10, output_dir=results_folder)

[I 2026-02-10 10:29:27,025] A new study created in memory with name: no-name-3ccf8bb3-3610-4824-9b6a-7a71ce012cdc


Running optimization for KNN_k=20 with no_weighting...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 10:32:28,695] Trial 0 finished with value: 0.03317366163332138 and parameters: {}. Best is trial 0 with value: 0.03317366163332138.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,no_weighting,1,0.033174,0.032947,0.039046,0.054574,0.077333,2.659427,{}


In [9]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="bm25", algorithm="KNN_k=20", n_trials=10, output_dir=results_folder)

[I 2026-02-10 10:49:53,482] A new study created in memory with name: no-name-3033a303-0c90-4d03-bb95-6cd6dc2bc543


Running optimization for KNN_k=20 with bm25...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 11:30:13,097] Trial 9 finished with value: 0.05057881392267363 and parameters: {'bm25_k1': 31.33329274964659, 'bm25_b': 0.7770367948199423}. Best is trial 9 with value: 0.05057881392267363.
[I 2026-02-10 11:30:25,752] Trial 6 finished with value: 0.047706728624800146 and parameters: {'bm25_k1': 197.16155583672443, 'bm25_b': 0.12324519127078215}. Best is trial 9 with value: 0.05057881392267363.
[I 2026-02-10 11:30:32,060] Trial 5 finished with value: 0.04430933560111045 and parameters: {'bm25_k1': 780.6301029905442, 'bm25_b': 0.3633229975708161}. Best is trial 9 with value: 0.05057881392267363.
[I 2026-02-10 11:30:36,141] Trial 1 finished with value: 0.04413966150711353 and parameters: {'bm25_k1': 372.3545683306342, 'bm25_b': 0.8566742299225406}. Best is trial 9 with value: 0.05057881392267363.
[I 2026-02-10 11:30:37,060] Trial 8 finished with value: 0.04167842066424512 and parameters: {'bm25_k1': 902.145214212935

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,bm25,10,0.050579,0.046935,0.056762,0.080643,0.11796,2.696452,"{'bm25_k1': 31.33329274964659, 'bm25_b': 0.777..."


In [10]:

run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="tfidf", algorithm="KNN_k=20", n_trials=10, output_dir=results_folder)


[I 2026-02-10 11:38:24,912] A new study created in memory with name: no-name-234d60e3-a3ac-4bd8-8e89-cb74e5dc8efd


Running optimization for KNN_k=20 with tfidf...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 11:41:27,373] Trial 0 finished with value: 0.04625727221801684 and parameters: {}. Best is trial 0 with value: 0.04625727221801684.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,tfidf,1,0.046257,0.043153,0.051849,0.074355,0.107704,2.658919,{}


In [11]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="log", algorithm="KNN_k=20", n_trials=10, output_dir=results_folder)

[I 2026-02-10 11:48:03,721] A new study created in memory with name: no-name-7ba4244d-6181-4d03-b570-6aa124fb98b4


Running optimization for KNN_k=20 with log...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 11:51:06,241] Trial 0 finished with value: 0.049217863420257676 and parameters: {}. Best is trial 0 with value: 0.049217863420257676.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,log,1,0.049218,0.045598,0.054935,0.078724,0.114667,2.669255,{}


In [12]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="confidence", algorithm="KNN_k=20", n_trials=10, output_dir=results_folder)

[I 2026-02-10 11:57:42,026] A new study created in memory with name: no-name-a19b1076-e555-4baf-bb0d-8948f9a7951e


Running optimization for KNN_k=20 with confidence...


[I 2026-02-10 12:37:49,481] Trial 0 finished with value: 0.05166755533348305 and parameters: {'conf_alpha': 40.09762102737804}. Best is trial 0 with value: 0.05166755533348305.
[I 2026-02-10 12:39:01,759] Trial 5 finished with value: 0.051666210024467286 and parameters: {'conf_alpha': 121.37006301403984}. Best is trial 0 with value: 0.05166755533348305.
[I 2026-02-10 12:39:16,326] Trial 4 finished with value: 0.05166655487100263 and parameters: {'conf_alpha': 59.38987039111477}. Best is trial 0 with value: 0.05166755533348305.
[I 2026-02-10 12:39:19,552] Trial 6 finished with value: 0.051666270116179344 and parameters: {'conf_alpha': 115.89880197999125}. Best is trial 0 with value: 0.05166755533348305.
[I 2026-02-10 12:39:21,344] Trial 9 finished with value: 0.05166661000799903 and parameters: {'conf_alpha': 56.454954758371805}. Best is trial 0 with value: 0.05166755533348305.
[I 2026-02-10 12:39:32,551] Trial 8 finished with value: 0.05166624606529235 and parameters: {'conf_alpha': 14

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,confidence,10,0.051668,0.045684,0.055217,0.078853,0.115229,2.677147,{'conf_alpha': 39.30074639498613}


In [13]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="power", algorithm="KNN_k=20", n_trials=10, output_dir=results_folder)

[I 2026-02-10 12:46:16,766] A new study created in memory with name: no-name-413144d9-68df-4009-a3f4-bf772178671a


Running optimization for KNN_k=20 with power...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 13:27:03,809] Trial 0 finished with value: 0.03961969333370199 and parameters: {'power_p': 0.7485169857093352}. Best is trial 0 with value: 0.03961969333370199.
[I 2026-02-10 13:27:06,808] Trial 7 finished with value: 0.027279800483805933 and parameters: {'power_p': 1.2343449449248376}. Best is trial 0 with value: 0.03961969333370199.
[I 2026-02-10 13:27:22,182] Trial 1 finished with value: 0.04977093384043038 and parameters: {'power_p': 0.16643832838443454}. Best is trial 1 with value: 0.04977093384043038.
[I 2026-02-10 13:27:23,077] Trial 8 finished with value: 0.034038326447379884 and parameters: {'power_p': 0.9615557580198211}. Best is trial 1 with value: 0.04977093384043038.
[I 2026-02-10 13:27:23,758] Trial 4 finished with value: 0.0282876569191582 and parameters: {'power_p': 1.2004169746524196}. Best is trial 1 with value: 0.04977093384043038.
[I 2026-02-10 13:27:26,220] Trial 9 finished with value: 0.0253

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,power,10,0.049771,0.045984,0.055527,0.079498,0.116189,2.734898,{'power_p': 0.16643832838443454}


In [14]:

run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="normalized", algorithm="KNN_k=20", n_trials=10, output_dir=results_folder)

[I 2026-02-10 13:34:13,588] A new study created in memory with name: no-name-2323d629-e021-445c-b2b8-2954201a0dbc


Running optimization for KNN_k=20 with normalized...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 13:37:17,122] Trial 0 finished with value: 0.0021026602927233505 and parameters: {}. Best is trial 0 with value: 0.0021026602927233505.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,normalized,1,0.002103,0.003591,0.002727,0.006848,0.005739,2.839982,{}


In [15]:

run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="pmi", algorithm="KNN_k=20", n_trials=10, output_dir=results_folder)

[I 2026-02-10 13:44:24,953] A new study created in memory with name: no-name-85674416-7b46-42be-9584-b6975e6c0fd3


Running optimization for KNN_k=20 with pmi...


  pmi = log((X.data * N) / denominator) # we could use np.power(X.data, p) instead of log for a softer effect
  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 13:47:29,275] Trial 0 finished with value: 0.04768733965605798 and parameters: {}. Best is trial 0 with value: 0.04768733965605798.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,pmi,1,0.047687,0.044153,0.053316,0.076539,0.111868,2.815338,{}


In [16]:

run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="robust_user_centric", algorithm="KNN_k=20", n_trials=10, output_dir=results_folder)

[I 2026-02-10 13:54:24,935] A new study created in memory with name: no-name-5a6729fb-17c2-437e-b7fe-5b358abb4c67


Running optimization for KNN_k=20 with robust_user_centric...


  weights = 1 / (1 + np.exp(-z_scores))
[I 2026-02-10 14:33:45,355] Trial 0 finished with value: 0.05104108861975967 and parameters: {'scale_factor': 9.607177688936702}. Best is trial 0 with value: 0.05104108861975967.
[I 2026-02-10 14:35:05,787] Trial 9 finished with value: 0.05104108861975967 and parameters: {'scale_factor': 0.18679194942379201}. Best is trial 0 with value: 0.05104108861975967.
[I 2026-02-10 14:35:05,842] Trial 4 finished with value: 0.05104108861975967 and parameters: {'scale_factor': 4.099596666976707}. Best is trial 0 with value: 0.05104108861975967.
[I 2026-02-10 14:35:22,983] Trial 6 finished with value: 0.05104108861975967 and parameters: {'scale_factor': 0.7337888337379402}. Best is trial 0 with value: 0.05104108861975967.
[I 2026-02-10 14:35:24,351] Trial 3 finished with value: 0.05104108861975967 and parameters: {'scale_factor': 5.042861580512368}. Best is trial 0 with value: 0.05104108861975967.
[I 2026-02-10 14:35:30,450] Trial 2 finished with value: 0.051

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,robust_user_centric,10,0.051041,0.045036,0.054587,0.077673,0.11412,2.819693,{'scale_factor': 9.607177688936702}


In [11]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="robust_user_centric_weight_v2", algorithm="KNN_k=20", n_trials=10, output_dir=results_folder)

[I 2026-02-11 14:06:06,057] A new study created in memory with name: no-name-936ecf02-46ee-4bf8-9fd0-35eadb1a9869


Running optimization for KNN_k=20 with robust_user_centric_weight_v2...


  weights = 1 / (1 + np.exp(-z_scores))
[I 2026-02-11 14:48:52,577] Trial 1 finished with value: 0.05079662449566239 and parameters: {'lower_q': 42.654022850205145, 'upper_q': 80.61176300615404}. Best is trial 1 with value: 0.05079662449566239.
[I 2026-02-11 14:48:57,649] Trial 8 finished with value: 0.050872876812807714 and parameters: {'lower_q': 39.16143802692331, 'upper_q': 80.51752942273106}. Best is trial 8 with value: 0.050872876812807714.
[I 2026-02-11 14:49:30,435] Trial 3 finished with value: 0.05118837915574804 and parameters: {'lower_q': 35.396665974594015, 'upper_q': 87.05576664524745}. Best is trial 3 with value: 0.05118837915574804.
[I 2026-02-11 14:49:55,671] Trial 7 finished with value: 0.05022382884786496 and parameters: {'lower_q': 40.526390537016866, 'upper_q': 64.2747792291893}. Best is trial 3 with value: 0.05118837915574804.
[I 2026-02-11 14:50:03,678] Trial 2 finished with value: 0.05163171012102409 and parameters: {'lower_q': 16.77507398129061, 'upper_q': 94.50

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,robust_user_centric_weight_v2,10,0.051632,0.045473,0.055058,0.07842,0.115005,2.973098,"{'lower_q': 16.77507398129061, 'upper_q': 94.5..."


In [8]:

run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="sigmoid_propensity", algorithm="KNN_k=20", n_trials=10, output_dir=results_folder)

[I 2026-02-10 17:31:23,624] A new study created in memory with name: no-name-1fac9482-1d15-43c4-b4fc-8bc7d65f4a88


Running optimization for KNN_k=20 with sigmoid_propensity...


  sum_log += np.sum(np.log(X.data[start:end]))
  term = C * np.power(data_chunk, neg_p)
  term = C * np.power(data_chunk, neg_p)
[I 2026-02-10 18:09:42,331] Trial 1 finished with value: 0.00010805872153133262 and parameters: {'p': 1.4962287565556627, 'beta': 0.30789820083645314}. Best is trial 1 with value: 0.00010805872153133262.
[I 2026-02-10 18:10:02,943] Trial 0 finished with value: 0.00010808033611370411 and parameters: {'p': 4.429911784997341, 'beta': 0.2570127997899406}. Best is trial 0 with value: 0.00010808033611370411.
[I 2026-02-10 18:10:12,719] Trial 6 finished with value: 0.00010808317746843375 and parameters: {'p': 0.995471324344374, 'beta': 0.8693745425744215}. Best is trial 6 with value: 0.00010808317746843375.
[I 2026-02-10 18:10:13,274] Trial 2 finished with value: 0.00010811209915694863 and parameters: {'p': 0.5536633300907005, 'beta': 0.08149765515557539}. Best is trial 2 with value: 0.00010811209915694863.
[I 2026-02-10 18:10:13,854] Trial 7 finished with value: 0.

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,sigmoid_propensity,10,0.000108,5.4e-05,9.7e-05,0.000154,0.000373,2.740127,"{'p': 4.571656888585181, 'beta': 0.46188283140..."


In [7]:

run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="power_lift", algorithm="KNN_k=20", n_trials=10, output_dir=results_folder)

[I 2026-02-10 16:43:50,995] A new study created in memory with name: no-name-82ebfcba-9765-4f92-b654-f4b8201ab252


Running optimization for KNN_k=20 with power_lift...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 17:24:14,441] Trial 8 finished with value: 0.050992517563183945 and parameters: {'p': 0.1410362558015314}. Best is trial 8 with value: 0.050992517563183945.
[I 2026-02-10 17:24:26,933] Trial 0 finished with value: 0.05084135686957546 and parameters: {'p': 0.3003341359392696}. Best is trial 8 with value: 0.050992517563183945.
[I 2026-02-10 17:24:30,756] Trial 1 finished with value: 0.050718898657294974 and parameters: {'p': 0.11284986796701375}. Best is trial 8 with value: 0.050992517563183945.
[I 2026-02-10 17:24:34,572] Trial 3 finished with value: 0.0423668751716062 and parameters: {'p': 0.788500758870356}. Best is trial 8 with value: 0.050992517563183945.
[I 2026-02-10 17:24:44,080] Trial 9 finished with value: 0.051026176485000706 and parameters: {'p': 0.3221406843716537}. Best is trial 9 with value: 0.051026176485000706.
[I 2026-02-10 17:24:45,936] Trial 4 finished with value: 0.040666353087975844 and parame

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,power_lift,10,0.051308,0.047369,0.057228,0.08163,0.119258,2.755537,{'p': 0.24434787681681283}


In [8]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="no_weighting", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)

[I 2026-02-10 10:39:03,658] A new study created in memory with name: no-name-581a908b-dcb0-4101-9033-0e43418fc7e0


Running optimization for KNN_k=100 with no_weighting...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 10:42:26,124] Trial 0 finished with value: 0.03503657093469969 and parameters: {}. Best is trial 0 with value: 0.03503657093469969.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,no_weighting,1,0.035037,0.032108,0.038481,0.054769,0.078682,2.847643,{}


In [8]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="bm25", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)

[I 2026-02-10 21:21:40,768] A new study created in memory with name: no-name-03fe3213-4a95-4850-8d1b-ddf25ec14e51


Running optimization for KNN_k=100 with bm25...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 22:06:18,598] Trial 8 finished with value: 0.04537021178415411 and parameters: {'bm25_k1': 527.2178571823598, 'bm25_b': 0.87710676795688}. Best is trial 8 with value: 0.04537021178415411.
[I 2026-02-10 22:06:18,952] Trial 9 finished with value: 0.045971302734703234 and parameters: {'bm25_k1': 801.5914725953826, 'bm25_b': 0.5737927438967196}. Best is trial 9 with value: 0.045971302734703234.
[I 2026-02-10 22:06:21,875] Trial 2 finished with value: 0.05014843884987901 and parameters: {'bm25_k1': 75.51376956944377, 'bm25_b': 0.9570921651658091}. Best is trial 2 with value: 0.05014843884987901.
[I 2026-02-10 22:06:26,881] Trial 7 finished with value: 0.04648376355731938 and parameters: {'bm25_k1': 418.2107344355659, 'bm25_b': 0.8141862525429516}. Best is trial 2 with value: 0.05014843884987901.
[I 2026-02-10 22:06:30,057] Trial 6 finished with value: 0.04657233936352849 and parameters: {'bm25_k1': 893.8088296527357, 

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,bm25,10,0.052653,0.044927,0.055103,0.079599,0.118547,2.879537,"{'bm25_k1': 47.35533899208951, 'bm25_b': 0.324..."


In [9]:

run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="tfidf", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)


[I 2026-02-10 22:13:42,207] A new study created in memory with name: no-name-8b165c34-699f-4fb4-b3f0-52cd1b006347


Running optimization for KNN_k=100 with tfidf...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 22:17:01,386] Trial 0 finished with value: 0.04800290142885423 and parameters: {}. Best is trial 0 with value: 0.04800290142885423.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,tfidf,1,0.048003,0.041996,0.051024,0.074192,0.1089,2.881835,{}


In [9]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="log", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)

[I 2026-02-10 18:17:00,030] A new study created in memory with name: no-name-80569e20-ac51-4f47-ba4b-860e44f8a6cf


Running optimization for KNN_k=100 with log...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 18:20:22,573] Trial 0 finished with value: 0.051434579943995674 and parameters: {}. Best is trial 0 with value: 0.051434579943995674.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,log,1,0.051435,0.044641,0.054344,0.078939,0.116149,2.953212,{}


In [10]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="confidence", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)

[I 2026-02-11 13:08:32,883] A new study created in memory with name: no-name-186c57b4-b751-4d68-b4a0-7955467ed00d


Running optimization for KNN_k=100 with confidence...


[I 2026-02-11 13:58:10,615] Trial 2 finished with value: 0.05488313702999749 and parameters: {'conf_alpha': 98.78881115823468}. Best is trial 2 with value: 0.05488313702999749.
[I 2026-02-11 13:58:15,931] Trial 9 finished with value: 0.05488303619257179 and parameters: {'conf_alpha': 145.89825548083905}. Best is trial 2 with value: 0.05488313702999749.
[I 2026-02-11 13:58:21,345] Trial 6 finished with value: 0.05488409699288221 and parameters: {'conf_alpha': 63.60460506757278}. Best is trial 6 with value: 0.05488409699288221.
[I 2026-02-11 13:58:21,805] Trial 4 finished with value: 0.0548836072087075 and parameters: {'conf_alpha': 76.35041681312498}. Best is trial 6 with value: 0.05488409699288221.
[I 2026-02-11 13:58:21,986] Trial 0 finished with value: 0.05488431039248341 and parameters: {'conf_alpha': 54.954464354080116}. Best is trial 0 with value: 0.05488431039248341.
[I 2026-02-11 13:58:22,182] Trial 7 finished with value: 0.05488423626629647 and parameters: {'conf_alpha': 66.742

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,confidence,10,0.054884,0.048238,0.058631,0.084611,0.124065,3.178591,{'conf_alpha': 54.954464354080116}


In [10]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="power", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)

[I 2026-02-10 18:27:35,198] A new study created in memory with name: no-name-3003f95b-4ae4-4967-bd7f-b1bd25f61414


Running optimization for KNN_k=100 with power...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 19:12:03,989] Trial 9 finished with value: 0.023780824569619975 and parameters: {'power_p': 1.492722743281366}. Best is trial 9 with value: 0.023780824569619975.
[I 2026-02-10 19:12:18,141] Trial 6 finished with value: 0.0344118383827114 and parameters: {'power_p': 1.0072856569572282}. Best is trial 6 with value: 0.0344118383827114.
[I 2026-02-10 19:12:21,461] Trial 3 finished with value: 0.03389269026192602 and parameters: {'power_p': 1.0427256284580615}. Best is trial 6 with value: 0.0344118383827114.
[I 2026-02-10 19:12:30,966] Trial 8 finished with value: 0.042708196500420174 and parameters: {'power_p': 0.7371183716057305}. Best is trial 8 with value: 0.042708196500420174.
[I 2026-02-10 19:12:32,390] Trial 4 finished with value: 0.050970728418190836 and parameters: {'power_p': 0.2540575635986686}. Best is trial 4 with value: 0.050970728418190836.
[I 2026-02-10 19:12:33,906] Trial 0 finished with value: 0.0300

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,power,10,0.050971,0.044707,0.054421,0.07919,0.116518,2.847769,{'power_p': 0.2540575635986686}


In [7]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="normalized", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)

[I 2026-02-11 16:58:12,679] A new study created in memory with name: no-name-9adca430-8533-48e4-97e2-64202db67bff


Running optimization for KNN_k=100 with normalized...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-11 17:01:43,512] Trial 0 finished with value: 0.00027654626799082535 and parameters: {}. Best is trial 0 with value: 0.00027654626799082535.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,normalized,1,0.000277,0.000488,0.000421,0.001064,0.001159,3.118231,{}


In [11]:

run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="pmi", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)

[I 2026-02-10 19:19:50,225] A new study created in memory with name: no-name-f7783399-91ca-476e-b02c-b29dcfa0f876


Running optimization for KNN_k=100 with pmi...


  pmi = log((X.data * N) / denominator) # we could use np.power(X.data, p) instead of log for a softer effect
  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 19:23:12,660] Trial 0 finished with value: 0.05020722359980607 and parameters: {}. Best is trial 0 with value: 0.05020722359980607.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,pmi,1,0.050207,0.043481,0.053079,0.077446,0.114489,2.848038,{}


In [10]:

run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="robust_user_centric", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)

[I 2026-02-10 22:24:09,382] A new study created in memory with name: no-name-d7776091-6395-4daa-9707-d2460f80de0b


Running optimization for KNN_k=100 with robust_user_centric...


  weights = 1 / (1 + np.exp(-z_scores))
[I 2026-02-10 23:08:08,975] Trial 6 finished with value: 0.054128985121973505 and parameters: {'scale_factor': 1.2109369175291178}. Best is trial 6 with value: 0.054128985121973505.
[I 2026-02-10 23:08:48,306] Trial 3 finished with value: 0.054128985121973505 and parameters: {'scale_factor': 2.095365935685995}. Best is trial 6 with value: 0.054128985121973505.
[I 2026-02-10 23:08:59,539] Trial 0 finished with value: 0.054128985121973505 and parameters: {'scale_factor': 8.789611219039372}. Best is trial 6 with value: 0.054128985121973505.
[I 2026-02-10 23:09:03,095] Trial 2 finished with value: 0.054128985121973505 and parameters: {'scale_factor': 3.383244747575725}. Best is trial 6 with value: 0.054128985121973505.
[I 2026-02-10 23:09:03,569] Trial 4 finished with value: 0.054128985121973505 and parameters: {'scale_factor': 5.311745987088769}. Best is trial 6 with value: 0.054128985121973505.
[I 2026-02-10 23:09:04,101] Trial 9 finished with valu

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,robust_user_centric,10,0.054129,0.047525,0.057806,0.083225,0.122233,2.904375,{'scale_factor': 1.2109369175291178}


In [6]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="robust_user_centric_weight_v2", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)

[I 2026-02-11 16:00:13,646] A new study created in memory with name: no-name-cebd0d43-7290-4a1d-b612-abf520d693d4


Running optimization for KNN_k=100 with robust_user_centric_weight_v2...


  weights = 1 / (1 + np.exp(-z_scores))
[I 2026-02-11 16:49:46,009] Trial 2 finished with value: 0.054363438956735 and parameters: {'lower_q': 35.57548545435593, 'upper_q': 89.37936177138793}. Best is trial 2 with value: 0.054363438956735.
[I 2026-02-11 16:49:58,719] Trial 3 finished with value: 0.05415449925439028 and parameters: {'lower_q': 12.569891218425994, 'upper_q': 67.09401918158422}. Best is trial 2 with value: 0.054363438956735.
[I 2026-02-11 16:50:04,395] Trial 8 finished with value: 0.05465276352089224 and parameters: {'lower_q': 12.889732489092145, 'upper_q': 90.13093054344188}. Best is trial 8 with value: 0.05465276352089224.
[I 2026-02-11 16:50:06,950] Trial 5 finished with value: 0.05470028640983565 and parameters: {'lower_q': 14.726291257870932, 'upper_q': 92.01540796337524}. Best is trial 5 with value: 0.05470028640983565.
[I 2026-02-11 16:50:15,362] Trial 0 finished with value: 0.05396304701283849 and parameters: {'lower_q': 42.42118545443038, 'upper_q': 82.537683160

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,robust_user_centric_weight_v2,10,0.0547,0.04793,0.058268,0.083896,0.123102,3.143589,"{'lower_q': 14.726291257870932, 'upper_q': 92...."


In [7]:

run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="sigmoid_propensity", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)

[I 2026-02-11 12:11:36,473] A new study created in memory with name: no-name-76b6f171-aca5-4c6a-8638-b8fb332b1a73


Running optimization for KNN_k=100 with sigmoid_propensity...


  sum_log += np.sum(np.log(X.data[start:end]))
  term = C * np.power(data_chunk, neg_p)
  term = C * np.power(data_chunk, neg_p)
[I 2026-02-11 12:57:12,203] Trial 6 finished with value: 7.392331497447478e-05 and parameters: {'p': 3.153823758518224, 'beta': 0.7672450596109225}. Best is trial 6 with value: 7.392331497447478e-05.
[I 2026-02-11 12:57:29,394] Trial 4 finished with value: 7.392331497447478e-05 and parameters: {'p': 3.9834823983995125, 'beta': 0.3753049055357984}. Best is trial 6 with value: 7.392331497447478e-05.
[I 2026-02-11 12:58:41,457] Trial 8 finished with value: 7.392331497447478e-05 and parameters: {'p': 3.068126378260519, 'beta': 0.013083166799415014}. Best is trial 6 with value: 7.392331497447478e-05.
[I 2026-02-11 12:58:57,666] Trial 2 finished with value: 7.392331497447478e-05 and parameters: {'p': 0.2728539683470476, 'beta': 0.49199574056717943}. Best is trial 6 with value: 7.392331497447478e-05.
[I 2026-02-11 12:59:00,138] Trial 1 finished with value: 7.3923314

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,sigmoid_propensity,10,7.4e-05,3.7e-05,6.3e-05,0.000101,0.000228,3.070615,"{'p': 3.153823758518224, 'beta': 0.76724505961..."


In [6]:
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="power_lift", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)

[I 2026-02-11 11:13:54,842] A new study created in memory with name: no-name-21d3860a-4a05-4229-89b8-a6835adfb7aa


Running optimization for KNN_k=100 with power_lift...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-11 12:02:59,552] Trial 9 finished with value: 0.028814352641721133 and parameters: {'p': 1.275265429489131}. Best is trial 9 with value: 0.028814352641721133.
[I 2026-02-11 12:03:18,697] Trial 8 finished with value: 0.035678554114474605 and parameters: {'p': 1.0829560035002268}. Best is trial 8 with value: 0.035678554114474605.
[I 2026-02-11 12:03:29,309] Trial 1 finished with value: 0.04806064920997718 and parameters: {'p': 0.6647057944230309}. Best is trial 1 with value: 0.04806064920997718.
[I 2026-02-11 12:03:44,243] Trial 3 finished with value: 0.04997313118284417 and parameters: {'p': 0.56504078016851}. Best is trial 3 with value: 0.04997313118284417.
[I 2026-02-11 12:03:47,213] Trial 7 finished with value: 0.0487735973328784 and parameters: {'p': 0.633399806161067}. Best is trial 3 with value: 0.04997313118284417.
[I 2026-02-11 12:03:47,351] Trial 0 finished with value: 0.04934319792807656 and parameters: {'p

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,power_lift,10,0.053267,0.045968,0.056272,0.081386,0.120801,3.157452,{'p': 0.29080951981015246}


In [9]:
import glob

all_results = []
# Match any CSV in the result folder
for f in glob.glob(f"{results_folder}/*.csv"):
    all_results.append(pd.read_csv(f))

if all_results:
    experiment_results = pd.concat(all_results)
    experiment_results = experiment_results.sort_values("Test NDCG@20", ascending=False)
    experiment_results.to_csv(results_filename, index=False)
else:
    print("No results found.")

experiment_results

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,confidence,10,0.054884,0.048238,0.058631,0.084611,0.124065,3.178591,{'conf_alpha': 54.954464354080116}
0,KNN_k=100,robust_user_centric_weight_v2,10,0.0547,0.04793,0.058268,0.083896,0.123102,3.143589,"{'lower_q': 14.726291257870932, 'upper_q': 92...."
0,KNN_k=100,robust_user_centric,10,0.054129,0.047525,0.057806,0.083225,0.122233,2.904375,{'scale_factor': 1.2109369175291178}
0,KNN_k=20,power_lift,10,0.051308,0.047369,0.057228,0.08163,0.119258,2.755537,{'p': 0.24434787681681283}
0,KNN_k=20,bm25,10,0.050579,0.046935,0.056762,0.080643,0.11796,2.696452,"{'bm25_k1': 31.33329274964659, 'bm25_b': 0.777..."
0,KNN_k=100,power_lift,10,0.053807,0.045878,0.056078,0.081257,0.120402,3.230722,{'p': 0.259185148987678}
0,KNN_k=20,power,10,0.049771,0.045984,0.055527,0.079498,0.116189,2.734898,{'power_p': 0.16643832838443454}
0,KNN_k=20,confidence,10,0.051668,0.045684,0.055217,0.078853,0.115229,2.677147,{'conf_alpha': 39.30074639498613}
0,KNN_k=100,bm25,10,0.052653,0.044927,0.055103,0.079599,0.118547,2.879537,"{'bm25_k1': 47.35533899208951, 'bm25_b': 0.324..."
0,KNN_k=20,robust_user_centric_weight_v2,10,0.051632,0.045473,0.055058,0.07842,0.115005,2.973098,"{'lower_q': 16.77507398129061, 'upper_q': 94.5..."
