In [1]:
import numpy as np
import pandas as pd
import optuna
from scipy.sparse import csr_matrix
from weighting_strategies import (
    bm25_weight, tfidf_weight, normalized_weight,
    log_weight, log_idf_weight, power_weight,
    pmi_weight, robust_user_centric_weight, sigmoid_propensity_weight, power_lift_weight, robust_user_centric_weight_v2
)
from implicit.nearest_neighbours import CosineRecommender
from implicit.evaluation import train_test_split, ranking_metrics_at_k

In [2]:
import os
import sys

# Make the parent directory importable so that sibling packages (here `utils`)
# resolve. The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from utils.sparse import transform_dataframe_to_sparse

In [3]:
# Location of the Taste Profile triplets file. The default is the original absolute
# path; set the TASTE_PROFILE_PATH environment variable to run the notebook on a
# machine where the data lives elsewhere.
TASTE_PROFILE_PATH = os.environ.get(
    "TASTE_PROFILE_PATH",
    "/home/coder/projects/rec-sys-research/data/The Echo Nest Taste Profile Subset.txt",
)

# Tab-separated file without a header row; the first three columns are read as
# (user_id, item_id, target).
taste_profile_df = (
    pd.read_table(
        TASTE_PROFILE_PATH,
        sep="\t",
        header=None,
        usecols=[0, 1, 2],
        names=['user_id', 'item_id', 'target'],
    )
)

# Size check displayed as the cell output: (#distinct users, #distinct items, #rows).
taste_profile_df['user_id'].nunique(), taste_profile_df['item_id'].nunique(), taste_profile_df.shape[0]

(1019318, 384546, 48373586)

In [4]:
# Convert the long-format frame into a sparse matrix with users on rows and items on
# columns; the two mappings are returned alongside it by the helper.
user_item_matrix, user_mapping, item_mapping = transform_dataframe_to_sparse(
    taste_profile_df,
    row_field='user_id',
    col_field='item_id',
    data_field='target',
)

# Both splits use the same settings: 90% kept on the "train" side, fixed seed.
split_kwargs = dict(train_percentage=0.9, random_state=42)

# First split holds out the test set; the second carves a validation set out of the
# remaining train+val portion.
train_val_mat, test_mat = train_test_split(user_item_matrix, **split_kwargs)
train_mat, val_mat = train_test_split(train_val_mat, **split_kwargs)

print(f"Train Shape: {train_mat.shape}, Val Shape: {val_mat.shape}, Test Shape: {test_mat.shape}")

Train Shape: (1019318, 384546), Val Shape: (1019318, 384546), Test Shape: (1019318, 384546)


In [5]:
import gc

# The raw frame is no longer needed once the sparse matrices exist; drop it to free
# memory. `pop` with a default (rather than `del`) keeps the cell idempotent, so
# re-running it does not raise NameError when the name is already gone.
globals().pop("taste_profile_df", None)
gc.collect()

10

In [6]:
import time

# Directory (relative to the working directory) that receives the per-run CSV files.
results_folder = "results/taste_profile_knn"

# File name for a results table.
# NOTE(review): not referenced by any cell visible in this notebook section - confirm it is still needed.
results_filename = "taste_profile_knn_results.csv"

def run_hyperparameter_optimization(
    train_mat: csr_matrix,
    val_mat: csr_matrix,
    train_val_mat: csr_matrix,
    test_mat: csr_matrix,
    weighting_strategy: str,
    algorithm: str,
    n_trials: int = 20,
    output_dir: str = None,
) -> pd.DataFrame:
    """Tune one weighting strategy for one KNN configuration and score it on the test set.

    The weighting parameters (if the strategy has any) are tuned with a seeded Optuna
    TPE study that maximises validation NDCG@20: each trial weights ``train_mat``,
    fits a fresh model on it and evaluates against ``val_mat``. The best parameters
    are then used to weight ``train_val_mat``, a final model is fitted on it, and
    NDCG/precision at K=10 and K=20 are measured against ``test_mat``.

    Args:
        train_mat: Training interactions used while tuning.
        val_mat: Held-out interactions scored during tuning.
        train_val_mat: Training + validation interactions used for the final fit.
        test_mat: Held-out interactions used for the final metrics.
        weighting_strategy: ``"no_weighting"`` or one of the keys of ``weighters`` below.
        algorithm: ``"KNN_k=20"`` or ``"KNN_k=100"``.
        n_trials: Number of Optuna trials for strategies that have tunable
            parameters. Parameter-free strategies always run exactly one trial.
        output_dir: If truthy, the one-row result table is also written to
            ``<output_dir>/<algorithm>_<weighting_strategy>_results.csv``. The
            directory is created when it does not exist.

    Returns:
        A one-row DataFrame with the validation score, test metrics, final fit time
        and best parameters.

    Raises:
        ValueError: If the strategy or algorithm name is not recognised, or if
            ``n_trials`` is below 1 for a tunable strategy.
    """
    # Model factories keyed by the public algorithm name; a factory is used so that
    # every trial (and the final fit) starts from a fresh, unfitted model.
    algorithms = {
        "KNN_k=20": lambda: CosineRecommender(K=20),
        "KNN_k=100": lambda: CosineRecommender(K=100),
    }

    # Weighting function per strategy. Each entry receives a private copy of the
    # matrix plus the parameter dict; parameter-free strategies ignore the dict.
    # "no_weighting" has no entry and leaves the copy untouched.
    weighters = {
        "bm25": lambda mat, params: bm25_weight(
            mat, K1=params.get("bm25_k1"), B=params.get("bm25_b")
        ),
        "tfidf": lambda mat, params: tfidf_weight(mat),
        "log": lambda mat, params: log_weight(mat),
        "log_idf": lambda mat, params: log_idf_weight(mat, alpha=params.get("conf_alpha")),
        "power": lambda mat, params: power_weight(mat, p=params.get("power_p")),
        "normalized": lambda mat, params: normalized_weight(mat),
        "pmi": lambda mat, params: pmi_weight(mat),
        "robust_user_centric": lambda mat, params: robust_user_centric_weight(
            mat, scale_factor=params.get("scale_factor")
        ),
        "robust_user_centric_weight_v2": lambda mat, params: robust_user_centric_weight_v2(
            mat, lower_q=params.get("lower_q"), upper_q=params.get("upper_q")
        ),
        "sigmoid_propensity": lambda mat, params: sigmoid_propensity_weight(
            mat, p=params.get("p"), beta=params.get("beta")
        ),
        "power_lift": lambda mat, params: power_lift_weight(mat, p=params.get("p")),
    }

    # Search space per tunable strategy: parameter name -> (low, high) bounds passed
    # to trial.suggest_float. This table is the single source of truth for "does the
    # strategy have parameters"; entry order fixes the order of the suggest calls.
    search_spaces = {
        "bm25": {"bm25_k1": (0.1, 1000), "bm25_b": (0.0, 1.0)},
        "log_idf": {"conf_alpha": (1.0, 150.0)},
        "power": {"power_p": (0.1, 1.5)},
        "robust_user_centric": {"scale_factor": (0.1, 10.0)},
        "robust_user_centric_weight_v2": {"lower_q": (5.0, 45.0), "upper_q": (55.0, 95.0)},
        "sigmoid_propensity": {"p": (0.1, 5.0), "beta": (0.0, 1.0)},
        # A wider range of (0.1, 1.5) was used previously for power_lift.
        "power_lift": {"p": (0.2, 1.3)},
    }

    if weighting_strategy != "no_weighting" and weighting_strategy not in weighters:
        raise ValueError(f"Weighting strategy '{weighting_strategy}' is not recognized.")
    strategy = weighting_strategy

    if algorithm not in algorithms:
        raise ValueError(f"Algorithm '{algorithm}' is not recognized.")
    algo_name = algorithm
    make_model = algorithms[algorithm]

    search_space = search_spaces.get(strategy, {})
    # Fail fast: with zero trials the study would have no best trial to read back.
    if search_space and n_trials < 1:
        raise ValueError(
            f"n_trials must be >= 1 for tunable strategy '{strategy}', got {n_trials}."
        )

    print(f"Running optimization for {algo_name} with {strategy}...")

    def get_weighted_matrix(matrix, params):
        """Return a weighted copy of ``matrix``; the input itself is never passed on."""
        weighted = matrix.copy()
        apply_weighting = weighters.get(strategy)
        if apply_weighting is None:
            return weighted
        return apply_weighting(weighted, params)

    def objective(trial):
        """Fit on the weighted training matrix and return validation NDCG@20."""
        params = {
            name: trial.suggest_float(name, low, high)
            for name, (low, high) in search_space.items()
        }
        weighted_train = get_weighted_matrix(train_mat, params)

        model = make_model()
        model.fit(weighted_train, show_progress=False)

        return ranking_metrics_at_k(model, train_mat, val_mat, K=20, show_progress=False)['ndcg']

    # Parameter-free strategies need a single evaluation only.
    current_trials = n_trials if search_space else 1
    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=current_trials, n_jobs=1)

    # --- Final retraining & testing ---
    # Re-weight the full train+val matrix with the best parameters found.
    best_params = study.best_params
    weighted_train_val = get_weighted_matrix(train_val_mat, best_params)

    final_model = make_model()

    # perf_counter is monotonic, so the measured duration cannot be distorted by
    # system clock adjustments.
    fit_start = time.perf_counter()
    final_model.fit(weighted_train_val, show_progress=False)
    fit_seconds = time.perf_counter() - fit_start

    metrics_at_10 = ranking_metrics_at_k(final_model, train_val_mat, test_mat, K=10, show_progress=False)
    metrics_at_20 = ranking_metrics_at_k(final_model, train_val_mat, test_mat, K=20, show_progress=False)

    results_df = pd.DataFrame([{
        "Algorithm": algo_name,
        "Strategy": strategy,
        "Number of Optimization Trials": current_trials,
        "Best Val NDCG@20": study.best_value,
        "Test NDCG@10": metrics_at_10['ndcg'],
        "Test NDCG@20": metrics_at_20['ndcg'],
        "Test Precision@10": metrics_at_10['precision'],
        "Test Precision@20": metrics_at_20['precision'],
        "Final Train Time (s)": fit_seconds,
        "Best Params": best_params
    }])

    if output_dir:
        # Create the target directory on demand so a fresh output_dir does not make
        # to_csv fail after all the expensive work is done.
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"{algo_name}_{strategy}_results.csv")
        results_df.to_csv(output_path, index=False)
    return results_df

In [6]:
# Create the results directory if needed. exist_ok=True replaces the separate
# existence check, which could race with another process creating the directory.
os.makedirs(results_folder, exist_ok=True)

In [7]:
# K=20 baseline: raw interaction values without re-weighting. The strategy has no
# tunable parameters, so a single trial runs regardless of n_trials.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="no_weighting",
    algorithm="KNN_k=20",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-10 10:29:27,025] A new study created in memory with name: no-name-3ccf8bb3-3610-4824-9b6a-7a71ce012cdc


Running optimization for KNN_k=20 with no_weighting...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 10:32:28,695] Trial 0 finished with value: 0.03317366163332138 and parameters: {}. Best is trial 0 with value: 0.03317366163332138.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,no_weighting,1,0.033174,0.032947,0.039046,0.054574,0.077333,2.659427,{}


In [9]:
# K=20 with BM25 weighting; tunes bm25_k1 and bm25_b over 10 trials.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="bm25",
    algorithm="KNN_k=20",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-10 10:49:53,482] A new study created in memory with name: no-name-3033a303-0c90-4d03-bb95-6cd6dc2bc543


Running optimization for KNN_k=20 with bm25...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 11:30:13,097] Trial 9 finished with value: 0.05057881392267363 and parameters: {'bm25_k1': 31.33329274964659, 'bm25_b': 0.7770367948199423}. Best is trial 9 with value: 0.05057881392267363.
[I 2026-02-10 11:30:25,752] Trial 6 finished with value: 0.047706728624800146 and parameters: {'bm25_k1': 197.16155583672443, 'bm25_b': 0.12324519127078215}. Best is trial 9 with value: 0.05057881392267363.
[I 2026-02-10 11:30:32,060] Trial 5 finished with value: 0.04430933560111045 and parameters: {'bm25_k1': 780.6301029905442, 'bm25_b': 0.3633229975708161}. Best is trial 9 with value: 0.05057881392267363.
[I 2026-02-10 11:30:36,141] Trial 1 finished with value: 0.04413966150711353 and parameters: {'bm25_k1': 372.3545683306342, 'bm25_b': 0.8566742299225406}. Best is trial 9 with value: 0.05057881392267363.
[I 2026-02-10 11:30:37,060] Trial 8 finished with value: 0.04167842066424512 and parameters: {'bm25_k1': 902.145214212935

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,bm25,10,0.050579,0.046935,0.056762,0.080643,0.11796,2.696452,"{'bm25_k1': 31.33329274964659, 'bm25_b': 0.777..."


In [10]:

# K=20 with TF-IDF weighting. Parameter-free, so only one trial is executed.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="tfidf",
    algorithm="KNN_k=20",
    n_trials=10,
    output_dir=results_folder,
)


[I 2026-02-10 11:38:24,912] A new study created in memory with name: no-name-234d60e3-a3ac-4bd8-8e89-cb74e5dc8efd


Running optimization for KNN_k=20 with tfidf...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 11:41:27,373] Trial 0 finished with value: 0.04625727221801684 and parameters: {}. Best is trial 0 with value: 0.04625727221801684.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,tfidf,1,0.046257,0.043153,0.051849,0.074355,0.107704,2.658919,{}


In [11]:
# K=20 with log weighting. Parameter-free, so only one trial is executed.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="log",
    algorithm="KNN_k=20",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-10 11:48:03,721] A new study created in memory with name: no-name-7ba4244d-6181-4d03-b570-6aa124fb98b4


Running optimization for KNN_k=20 with log...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 11:51:06,241] Trial 0 finished with value: 0.049217863420257676 and parameters: {}. Best is trial 0 with value: 0.049217863420257676.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,log,1,0.049218,0.045598,0.054935,0.078724,0.114667,2.669255,{}


In [None]:
# K=20 with log-IDF weighting; tunes conf_alpha over 10 trials.
# NOTE(review): the saved output of this cell reports the strategy as "confidence",
# a name the function would now reject - presumably the output predates a rename.
# Re-run the cell to refresh it.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="log_idf",
    algorithm="KNN_k=20",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-10 11:57:42,026] A new study created in memory with name: no-name-a19b1076-e555-4baf-bb0d-8948f9a7951e


Running optimization for KNN_k=20 with confidence...


[I 2026-02-10 12:37:49,481] Trial 0 finished with value: 0.05166755533348305 and parameters: {'conf_alpha': 40.09762102737804}. Best is trial 0 with value: 0.05166755533348305.
[I 2026-02-10 12:39:01,759] Trial 5 finished with value: 0.051666210024467286 and parameters: {'conf_alpha': 121.37006301403984}. Best is trial 0 with value: 0.05166755533348305.
[I 2026-02-10 12:39:16,326] Trial 4 finished with value: 0.05166655487100263 and parameters: {'conf_alpha': 59.38987039111477}. Best is trial 0 with value: 0.05166755533348305.
[I 2026-02-10 12:39:19,552] Trial 6 finished with value: 0.051666270116179344 and parameters: {'conf_alpha': 115.89880197999125}. Best is trial 0 with value: 0.05166755533348305.
[I 2026-02-10 12:39:21,344] Trial 9 finished with value: 0.05166661000799903 and parameters: {'conf_alpha': 56.454954758371805}. Best is trial 0 with value: 0.05166755533348305.
[I 2026-02-10 12:39:32,551] Trial 8 finished with value: 0.05166624606529235 and parameters: {'conf_alpha': 14

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,confidence,10,0.051668,0.045684,0.055217,0.078853,0.115229,2.677147,{'conf_alpha': 39.30074639498613}


In [13]:
# K=20 with power weighting; tunes the exponent power_p over 10 trials.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="power",
    algorithm="KNN_k=20",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-10 12:46:16,766] A new study created in memory with name: no-name-413144d9-68df-4009-a3f4-bf772178671a


Running optimization for KNN_k=20 with power...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 13:27:03,809] Trial 0 finished with value: 0.03961969333370199 and parameters: {'power_p': 0.7485169857093352}. Best is trial 0 with value: 0.03961969333370199.
[I 2026-02-10 13:27:06,808] Trial 7 finished with value: 0.027279800483805933 and parameters: {'power_p': 1.2343449449248376}. Best is trial 0 with value: 0.03961969333370199.
[I 2026-02-10 13:27:22,182] Trial 1 finished with value: 0.04977093384043038 and parameters: {'power_p': 0.16643832838443454}. Best is trial 1 with value: 0.04977093384043038.
[I 2026-02-10 13:27:23,077] Trial 8 finished with value: 0.034038326447379884 and parameters: {'power_p': 0.9615557580198211}. Best is trial 1 with value: 0.04977093384043038.
[I 2026-02-10 13:27:23,758] Trial 4 finished with value: 0.0282876569191582 and parameters: {'power_p': 1.2004169746524196}. Best is trial 1 with value: 0.04977093384043038.
[I 2026-02-10 13:27:26,220] Trial 9 finished with value: 0.0253

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,power,10,0.049771,0.045984,0.055527,0.079498,0.116189,2.734898,{'power_p': 0.16643832838443454}


In [14]:

# K=20 with normalized weighting. Parameter-free, so only one trial is executed.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="normalized",
    algorithm="KNN_k=20",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-10 13:34:13,588] A new study created in memory with name: no-name-2323d629-e021-445c-b2b8-2954201a0dbc


Running optimization for KNN_k=20 with normalized...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 13:37:17,122] Trial 0 finished with value: 0.0021026602927233505 and parameters: {}. Best is trial 0 with value: 0.0021026602927233505.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,normalized,1,0.002103,0.003591,0.002727,0.006848,0.005739,2.839982,{}


In [15]:

# K=20 with PMI weighting. Parameter-free, so only one trial is executed.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="pmi",
    algorithm="KNN_k=20",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-10 13:44:24,953] A new study created in memory with name: no-name-85674416-7b46-42be-9584-b6975e6c0fd3


Running optimization for KNN_k=20 with pmi...


  pmi = log((X.data * N) / denominator) # we could use np.power(X.data, p) instead of log for a softer effect
  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 13:47:29,275] Trial 0 finished with value: 0.04768733965605798 and parameters: {}. Best is trial 0 with value: 0.04768733965605798.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,pmi,1,0.047687,0.044153,0.053316,0.076539,0.111868,2.815338,{}


In [16]:

# K=20 with robust user-centric weighting; tunes scale_factor over 10 trials.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="robust_user_centric",
    algorithm="KNN_k=20",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-10 13:54:24,935] A new study created in memory with name: no-name-5a6729fb-17c2-437e-b7fe-5b358abb4c67


Running optimization for KNN_k=20 with robust_user_centric...


  weights = 1 / (1 + np.exp(-z_scores))
[I 2026-02-10 14:33:45,355] Trial 0 finished with value: 0.05104108861975967 and parameters: {'scale_factor': 9.607177688936702}. Best is trial 0 with value: 0.05104108861975967.
[I 2026-02-10 14:35:05,787] Trial 9 finished with value: 0.05104108861975967 and parameters: {'scale_factor': 0.18679194942379201}. Best is trial 0 with value: 0.05104108861975967.
[I 2026-02-10 14:35:05,842] Trial 4 finished with value: 0.05104108861975967 and parameters: {'scale_factor': 4.099596666976707}. Best is trial 0 with value: 0.05104108861975967.
[I 2026-02-10 14:35:22,983] Trial 6 finished with value: 0.05104108861975967 and parameters: {'scale_factor': 0.7337888337379402}. Best is trial 0 with value: 0.05104108861975967.
[I 2026-02-10 14:35:24,351] Trial 3 finished with value: 0.05104108861975967 and parameters: {'scale_factor': 5.042861580512368}. Best is trial 0 with value: 0.05104108861975967.
[I 2026-02-10 14:35:30,450] Trial 2 finished with value: 0.051

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,robust_user_centric,10,0.051041,0.045036,0.054587,0.077673,0.11412,2.819693,{'scale_factor': 9.607177688936702}


In [11]:
# K=20 with robust user-centric weighting (v2); tunes lower_q and upper_q over 10 trials.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="robust_user_centric_weight_v2",
    algorithm="KNN_k=20",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-11 14:06:06,057] A new study created in memory with name: no-name-936ecf02-46ee-4bf8-9fd0-35eadb1a9869


Running optimization for KNN_k=20 with robust_user_centric_weight_v2...


  weights = 1 / (1 + np.exp(-z_scores))
[I 2026-02-11 14:48:52,577] Trial 1 finished with value: 0.05079662449566239 and parameters: {'lower_q': 42.654022850205145, 'upper_q': 80.61176300615404}. Best is trial 1 with value: 0.05079662449566239.
[I 2026-02-11 14:48:57,649] Trial 8 finished with value: 0.050872876812807714 and parameters: {'lower_q': 39.16143802692331, 'upper_q': 80.51752942273106}. Best is trial 8 with value: 0.050872876812807714.
[I 2026-02-11 14:49:30,435] Trial 3 finished with value: 0.05118837915574804 and parameters: {'lower_q': 35.396665974594015, 'upper_q': 87.05576664524745}. Best is trial 3 with value: 0.05118837915574804.
[I 2026-02-11 14:49:55,671] Trial 7 finished with value: 0.05022382884786496 and parameters: {'lower_q': 40.526390537016866, 'upper_q': 64.2747792291893}. Best is trial 3 with value: 0.05118837915574804.
[I 2026-02-11 14:50:03,678] Trial 2 finished with value: 0.05163171012102409 and parameters: {'lower_q': 16.77507398129061, 'upper_q': 94.50

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,robust_user_centric_weight_v2,10,0.051632,0.045473,0.055058,0.07842,0.115005,2.973098,"{'lower_q': 16.77507398129061, 'upper_q': 94.5..."


In [8]:

# K=20 with sigmoid-propensity weighting; tunes p and beta over 10 trials.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="sigmoid_propensity",
    algorithm="KNN_k=20",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-10 17:31:23,624] A new study created in memory with name: no-name-1fac9482-1d15-43c4-b4fc-8bc7d65f4a88


Running optimization for KNN_k=20 with sigmoid_propensity...


  sum_log += np.sum(np.log(X.data[start:end]))
  term = C * np.power(data_chunk, neg_p)
  term = C * np.power(data_chunk, neg_p)
[I 2026-02-10 18:09:42,331] Trial 1 finished with value: 0.00010805872153133262 and parameters: {'p': 1.4962287565556627, 'beta': 0.30789820083645314}. Best is trial 1 with value: 0.00010805872153133262.
[I 2026-02-10 18:10:02,943] Trial 0 finished with value: 0.00010808033611370411 and parameters: {'p': 4.429911784997341, 'beta': 0.2570127997899406}. Best is trial 0 with value: 0.00010808033611370411.
[I 2026-02-10 18:10:12,719] Trial 6 finished with value: 0.00010808317746843375 and parameters: {'p': 0.995471324344374, 'beta': 0.8693745425744215}. Best is trial 6 with value: 0.00010808317746843375.
[I 2026-02-10 18:10:13,274] Trial 2 finished with value: 0.00010811209915694863 and parameters: {'p': 0.5536633300907005, 'beta': 0.08149765515557539}. Best is trial 2 with value: 0.00010811209915694863.
[I 2026-02-10 18:10:13,854] Trial 7 finished with value: 0.

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,sigmoid_propensity,10,0.000108,5.4e-05,9.7e-05,0.000154,0.000373,2.740127,"{'p': 4.571656888585181, 'beta': 0.46188283140..."


In [7]:

# K=20 with power-lift weighting; tunes the exponent p over 10 trials.
# NOTE(review): the saved trial log shows p values below the current lower bound of
# 0.2 - presumably produced with the earlier (0.1, 1.5) range; re-run to refresh.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="power_lift",
    algorithm="KNN_k=20",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-10 16:43:50,995] A new study created in memory with name: no-name-82ebfcba-9765-4f92-b654-f4b8201ab252


Running optimization for KNN_k=20 with power_lift...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-10 17:24:14,441] Trial 8 finished with value: 0.050992517563183945 and parameters: {'p': 0.1410362558015314}. Best is trial 8 with value: 0.050992517563183945.
[I 2026-02-10 17:24:26,933] Trial 0 finished with value: 0.05084135686957546 and parameters: {'p': 0.3003341359392696}. Best is trial 8 with value: 0.050992517563183945.
[I 2026-02-10 17:24:30,756] Trial 1 finished with value: 0.050718898657294974 and parameters: {'p': 0.11284986796701375}. Best is trial 8 with value: 0.050992517563183945.
[I 2026-02-10 17:24:34,572] Trial 3 finished with value: 0.0423668751716062 and parameters: {'p': 0.788500758870356}. Best is trial 8 with value: 0.050992517563183945.
[I 2026-02-10 17:24:44,080] Trial 9 finished with value: 0.051026176485000706 and parameters: {'p': 0.3221406843716537}. Best is trial 9 with value: 0.051026176485000706.
[I 2026-02-10 17:24:45,936] Trial 4 finished with value: 0.040666353087975844 and parame

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,power_lift,10,0.051308,0.047369,0.057228,0.08163,0.119258,2.755537,{'p': 0.24434787681681283}


In [13]:
# K=100 baseline: raw interaction values without re-weighting. The strategy has no
# tunable parameters, so a single trial runs regardless of n_trials.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="no_weighting",
    algorithm="KNN_k=100",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-23 10:12:32,561] A new study created in memory with name: no-name-8c8556b1-1ed8-4e15-8833-effa53051ea3


Running optimization for KNN_k=100 with no_weighting...


[I 2026-02-23 10:14:54,125] Trial 0 finished with value: 0.0998855154039036 and parameters: {}. Best is trial 0 with value: 0.0998855154039036.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,no_weighting,1,0.099886,0.107938,0.119734,0.119193,0.144117,15.410982,{}


In [12]:
# K=100 with BM25 weighting; tunes bm25_k1 and bm25_b over 10 trials.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="bm25",
    algorithm="KNN_k=100",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-23 09:43:44,812] A new study created in memory with name: no-name-1a34fecc-8b20-4c97-b246-67894c43ae96


Running optimization for KNN_k=100 with bm25...


[I 2026-02-23 09:46:09,995] Trial 0 finished with value: 0.1348270478306797 and parameters: {'bm25_k1': 374.60266483547775, 'bm25_b': 0.9507143064099162}. Best is trial 0 with value: 0.1348270478306797.
[I 2026-02-23 09:48:34,581] Trial 1 finished with value: 0.13044038937127397 and parameters: {'bm25_k1': 732.0207424172239, 'bm25_b': 0.5986584841970366}. Best is trial 0 with value: 0.1348270478306797.
[I 2026-02-23 09:50:57,647] Trial 2 finished with value: 0.1256233292030479 and parameters: {'bm25_k1': 156.10303857839227, 'bm25_b': 0.15599452033620265}. Best is trial 0 with value: 0.1348270478306797.
[I 2026-02-23 09:53:21,006] Trial 3 finished with value: 0.13997703507901318 and parameters: {'bm25_k1': 58.17780380698264, 'bm25_b': 0.8661761457749352}. Best is trial 3 with value: 0.13997703507901318.
[I 2026-02-23 09:55:45,291] Trial 4 finished with value: 0.13210455271347138 and parameters: {'bm25_k1': 601.1549002420345, 'bm25_b': 0.7080725777960455}. Best is trial 3 with value: 0.1

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,bm25,10,0.144326,0.152103,0.167281,0.165946,0.198132,15.327767,"{'bm25_k1': 20.68243584637287, 'bm25_b': 0.969..."


In [11]:

# K=100 with TF-IDF weighting. Parameter-free, so only one trial is executed.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="tfidf",
    algorithm="KNN_k=100",
    n_trials=10,
    output_dir=results_folder,
)


[I 2026-02-23 09:36:55,876] A new study created in memory with name: no-name-711b8d95-362c-44f0-816f-1aaad3622675


Running optimization for KNN_k=100 with tfidf...


[I 2026-02-23 09:39:12,178] Trial 0 finished with value: 0.1396993549986942 and parameters: {}. Best is trial 0 with value: 0.1396993549986942.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,tfidf,1,0.139699,0.146597,0.160044,0.156631,0.184182,15.353258,{}


In [14]:
# K=100 with log weighting. Parameter-free, so only one trial is executed.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="log",
    algorithm="KNN_k=100",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-23 10:19:41,869] A new study created in memory with name: no-name-cbea28ef-1159-45cc-a017-a7c609927d91


Running optimization for KNN_k=100 with log...


[I 2026-02-23 10:21:57,576] Trial 0 finished with value: 0.14092595016235412 and parameters: {}. Best is trial 0 with value: 0.14092595016235412.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,log,1,0.140926,0.147697,0.161189,0.157501,0.185052,15.277404,{}


In [16]:
# K=100 with log-IDF weighting; tunes conf_alpha over 10 trials.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="log_idf",
    algorithm="KNN_k=100",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-23 10:33:37,060] A new study created in memory with name: no-name-ee21f67b-da5c-4c3f-af00-2c2b6ea34d81


Running optimization for KNN_k=100 with log_idf...


[I 2026-02-23 10:35:53,665] Trial 0 finished with value: 0.1409334859236783 and parameters: {'conf_alpha': 56.80647770825701}. Best is trial 0 with value: 0.1409334859236783.
[I 2026-02-23 10:38:09,318] Trial 1 finished with value: 0.14093158071231301 and parameters: {'conf_alpha': 142.6564316550775}. Best is trial 0 with value: 0.1409334859236783.
[I 2026-02-23 10:40:25,590] Trial 2 finished with value: 0.14093168031935988 and parameters: {'conf_alpha': 110.06709732989935}. Best is trial 0 with value: 0.1409334859236783.
[I 2026-02-23 10:42:42,005] Trial 3 finished with value: 0.14093306711355755 and parameters: {'conf_alpha': 90.20011414535846}. Best is trial 0 with value: 0.1409334859236783.
[I 2026-02-23 10:44:58,400] Trial 4 finished with value: 0.14094832079885572 and parameters: {'conf_alpha': 24.246777425923042}. Best is trial 4 with value: 0.14094832079885572.
[I 2026-02-23 10:47:14,770] Trial 5 finished with value: 0.1409484553872543 and parameters: {'conf_alpha': 24.24318353

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,log_idf,10,0.140995,0.147766,0.161249,0.157546,0.18507,15.286721,{'conf_alpha': 9.65445821306172}


In [17]:
# K=100 with power weighting; tunes the exponent power_p over 10 trials.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="power",
    algorithm="KNN_k=100",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-23 11:00:48,699] A new study created in memory with name: no-name-9018bf65-d77c-4fdd-a2c8-767d2ce480fe


Running optimization for KNN_k=100 with power...


[I 2026-02-23 11:03:06,773] Trial 0 finished with value: 0.1344979435138109 and parameters: {'power_p': 0.6243561663863074}. Best is trial 0 with value: 0.1344979435138109.
[I 2026-02-23 11:05:30,470] Trial 1 finished with value: 0.05295853187888493 and parameters: {'power_p': 1.4310000289738827}. Best is trial 0 with value: 0.1344979435138109.
[I 2026-02-23 11:07:54,301] Trial 2 finished with value: 0.08371425883242443 and parameters: {'power_p': 1.1247915185359671}. Best is trial 0 with value: 0.1344979435138109.
[I 2026-02-23 11:10:16,826] Trial 3 finished with value: 0.10746384441653496 and parameters: {'power_p': 0.9381218778758512}. Best is trial 0 with value: 0.1344979435138109.
[I 2026-02-23 11:12:31,459] Trial 4 finished with value: 0.1430784679273025 and parameters: {'power_p': 0.3184260966194111}. Best is trial 4 with value: 0.1430784679273025.
[I 2026-02-23 11:14:45,073] Trial 5 finished with value: 0.14307991927353975 and parameters: {'power_p': 0.31839232847068366}. Best 

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,power,10,0.143688,0.150346,0.163801,0.159753,0.187221,15.253011,{'power_p': 0.18131705703547923}


In [15]:
# K=100 with normalized weighting. Parameter-free, so only one trial is executed.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="normalized",
    algorithm="KNN_k=100",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-23 10:26:28,965] A new study created in memory with name: no-name-1ea861f1-45f3-4e87-9490-4c2fba856d65


Running optimization for KNN_k=100 with normalized...


[I 2026-02-23 10:28:51,744] Trial 0 finished with value: 0.14173275276122504 and parameters: {}. Best is trial 0 with value: 0.14173275276122504.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,normalized,1,0.141733,0.149469,0.164173,0.16318,0.194391,15.321922,{}


In [10]:

# K=100 with PMI weighting. Parameter-free, so only one trial is executed.
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="pmi",
    algorithm="KNN_k=100",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-23 09:30:14,177] A new study created in memory with name: no-name-54372408-cd64-4984-ada5-300c480e9ab0


Running optimization for KNN_k=100 with pmi...


[I 2026-02-23 09:32:28,774] Trial 0 finished with value: 0.14629254791918214 and parameters: {}. Best is trial 0 with value: 0.14629254791918214.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,pmi,1,0.146293,0.152881,0.167129,0.163907,0.193245,15.317664,{}


In [9]:

# Run the "robust_user_centric" weighting strategy on the KNN_k=100 setup.
# Only a single trial is requested here (n_trials=1), unlike the 10 requested
# in the neighbouring cells.
run_hyperparameter_optimization(
    train_mat,
    val_mat,
    train_val_mat,
    test_mat,
    weighting_strategy="robust_user_centric",
    algorithm="KNN_k=100",
    n_trials=1,
    output_dir=results_folder,
)

[I 2026-02-23 09:23:29,797] A new study created in memory with name: no-name-96222b5a-9f88-43c8-bbf9-5080d01447a2


Running optimization for KNN_k=100 with robust_user_centric...


[I 2026-02-23 09:25:45,547] Trial 0 finished with value: 0.14234480722461104 and parameters: {'scale_factor': 3.807947176588889}. Best is trial 0 with value: 0.14234480722461104.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,robust_user_centric,1,0.142345,0.14912,0.162463,0.158474,0.185772,15.354809,{'scale_factor': 3.807947176588889}


In [None]:
# Run the search for the v2 robust user-centric weighting on the KNN_k=100 setup.
# NOTE(review): this cell has no recorded execution. The other calls pass strategy
# keys without the `_weight` infix (e.g. "robust_user_centric", "power_lift"), so
# confirm that "robust_user_centric_weight_v2" is the key the function expects.
run_hyperparameter_optimization(train_mat, val_mat, train_val_mat, test_mat, weighting_strategy="robust_user_centric_weight_v2", algorithm="KNN_k=100", n_trials=10, output_dir=results_folder)

In [7]:

# Run the search for the "sigmoid_propensity" weighting strategy on the KNN_k=100 setup.
# The call is the cell's last expression, so its return value is displayed.
run_hyperparameter_optimization(
    train_mat,
    val_mat,
    train_val_mat,
    test_mat,
    weighting_strategy="sigmoid_propensity",
    algorithm="KNN_k=100",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-23 11:59:23,318] A new study created in memory with name: no-name-60fa1123-aa6e-4db8-a58b-7a862218b97d


Running optimization for KNN_k=100 with sigmoid_propensity...


[I 2026-02-23 12:01:52,777] Trial 0 finished with value: 0.14103911801257518 and parameters: {'p': 1.9352465823520764, 'beta': 0.9507143064099162}. Best is trial 0 with value: 0.14103911801257518.
[I 2026-02-23 12:04:18,296] Trial 1 finished with value: 0.1386496720212859 and parameters: {'p': 3.6867703148758855, 'beta': 0.5986584841970366}. Best is trial 0 with value: 0.14103911801257518.
[I 2026-02-23 12:06:32,756] Trial 2 finished with value: 0.14309384145093543 and parameters: {'p': 0.864491338167939, 'beta': 0.15599452033620265}. Best is trial 2 with value: 0.14309384145093543.
[I 2026-02-23 12:08:46,238] Trial 3 finished with value: 0.14364372215666374 and parameters: {'p': 0.38460969962417735, 'beta': 0.8661761457749352}. Best is trial 3 with value: 0.14364372215666374.
[I 2026-02-23 12:11:04,301] Trial 4 finished with value: 0.13929217689613355 and parameters: {'p': 3.0454635575417233, 'beta': 0.7080725777960455}. Best is trial 3 with value: 0.14364372215666374.
[I 2026-02-23 1

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,sigmoid_propensity,10,0.143644,0.150289,0.163719,0.159719,0.187141,15.287412,"{'p': 0.38460969962417735, 'beta': 0.866176145..."


In [7]:
# Run the search for the "power_lift" weighting strategy on the KNN_k=100 setup.
# The call is the cell's last expression, so its return value is displayed.
run_hyperparameter_optimization(
    train_mat,
    val_mat,
    train_val_mat,
    test_mat,
    weighting_strategy="power_lift",
    algorithm="KNN_k=100",
    n_trials=10,
    output_dir=results_folder,
)

[I 2026-02-23 08:52:55,689] A new study created in memory with name: no-name-c3b372f6-6422-4742-8cda-7145d53342e4


Running optimization for KNN_k=100 with power_lift...


[I 2026-02-23 08:55:25,423] Trial 0 finished with value: 0.1460404323404775 and parameters: {'p': 0.6119941307320989}. Best is trial 0 with value: 0.1460404323404775.
[I 2026-02-23 08:58:00,645] Trial 1 finished with value: 0.11654326487226556 and parameters: {'p': 1.2457857370509078}. Best is trial 0 with value: 0.1460404323404775.
[I 2026-02-23 09:00:35,007] Trial 2 finished with value: 0.13167259415827226 and parameters: {'p': 1.0051933359925456}. Best is trial 0 with value: 0.1460404323404775.
[I 2026-02-23 09:03:08,411] Trial 3 finished with value: 0.1387663712069375 and parameters: {'p': 0.8585243326167402}. Best is trial 0 with value: 0.1460404323404775.
[I 2026-02-23 09:05:33,352] Trial 4 finished with value: 0.1479681244991293 and parameters: {'p': 0.37162050448668016}. Best is trial 4 with value: 0.1479681244991293.
[I 2026-02-23 09:07:58,910] Trial 5 finished with value: 0.14796815383298145 and parameters: {'p': 0.37159397236982294}. Best is trial 5 with value: 0.14796815383

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,power_lift,10,0.147968,0.154531,0.169288,0.166464,0.197192,15.257368,{'p': 0.37159397236982294}


In [8]:
import glob

# Gather every results CSV found directly inside the results folder.
# Sorting the paths keeps the load order independent of filesystem ordering.
result_paths = sorted(glob.glob(f"{results_folder}/*.csv"))
all_results = [pd.read_csv(path) for path in result_paths]

if all_results:
    # Concatenating the per-file frames as-is keeps their original row labels,
    # which leaves duplicate labels in the combined table. Build a fresh
    # 0..n-1 index that follows the ranking so label-based lookups stay unambiguous.
    experiment_results = (
        pd.concat(all_results, ignore_index=True)
        .sort_values("Test NDCG@20", ascending=False)
        .reset_index(drop=True)
    )
    # The index is not written, so the saved file only contains the data columns.
    experiment_results.to_csv(results_filename, index=False)
else:
    # Bind an empty frame so the display expression below neither raises
    # NameError on a fresh kernel nor shows a stale frame from an earlier run.
    experiment_results = pd.DataFrame()
    print("No results found.")

experiment_results

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,power_lift,10,0.147968,0.154531,0.169288,0.166464,0.197192,15.257368,{'p': 0.37159397236982294}
0,KNN_k=100,bm25,10,0.144326,0.152103,0.167281,0.165946,0.198132,15.327767,"{'bm25_k1': 20.68243584637287, 'bm25_b': 0.969..."
0,KNN_k=100,pmi,1,0.146293,0.152881,0.167129,0.163907,0.193245,15.317664,{}
0,KNN_k=100,normalized,1,0.141733,0.149469,0.164173,0.16318,0.194391,15.321922,{}
0,KNN_k=100,power,10,0.143688,0.150346,0.163801,0.159753,0.187221,15.253011,{'power_p': 0.18131705703547923}
0,KNN_k=100,sigmoid_propensity,10,0.143644,0.150289,0.163719,0.159719,0.187141,15.287412,"{'p': 0.38460969962417735, 'beta': 0.866176145..."
0,KNN_k=100,robust_user_centric,1,0.142345,0.14912,0.162463,0.158474,0.185772,15.354809,{'scale_factor': 3.807947176588889}
0,KNN_k=100,log_idf,10,0.140995,0.147766,0.161249,0.157546,0.18507,15.286721,{'conf_alpha': 9.65445821306172}
0,KNN_k=100,log,1,0.140926,0.147697,0.161189,0.157501,0.185052,15.277404,{}
0,KNN_k=100,tfidf,1,0.139699,0.146597,0.160044,0.156631,0.184182,15.353258,{}
