In [1]:
import numpy as np
import pandas as pd
import optuna
from scipy.sparse import csr_matrix
from weighting_strategies import (
    bm25_weight, tfidf_weight, normalized_weight,
    log_weight, confidence_weight, power_weight,
    pmi_weight, robust_user_centric_weight, sigmoid_propensity_weight, power_lift_weight, robust_user_centric_weight_v2
)
from implicit.nearest_neighbours import CosineRecommender
from implicit.evaluation import train_test_split, precision_at_k, ndcg_at_k

import cornac


In [2]:
import sys
import os

# Make the parent directory importable so sibling packages (e.g. `utils`) resolve.
# The membership check keeps re-runs of this cell from stacking duplicate entries.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from utils.sparse import transform_dataframe_to_sparse

In [3]:
# Load only the three interaction columns, drop exact duplicate rows and rows
# with missing values, then rename to the generic (user_id, item_id, target) schema.
steam_csv_path = "/home/coder/projects/rec-sys-research/data/steam/steam_recommendations.csv"
interaction_cols = ['user_id', 'app_id', 'hours']

# Re-binding the same name at each step avoids keeping the raw frame alive.
steam_df = pd.read_csv(steam_csv_path, usecols=interaction_cols)
steam_df = steam_df[interaction_cols].drop_duplicates().dropna()
steam_df = steam_df.rename(columns={'app_id': 'item_id', 'hours': 'target'})

# (number of users, number of items, number of interactions)
(steam_df['user_id'].nunique(), steam_df['item_id'].nunique(), len(steam_df))

(13781059, 37610, 41154773)

In [4]:
# Down-sample to a fixed fraction of users so the experiments stay tractable.
# The generator is seeded so the same users are drawn on every
# Restart & Run All; without a seed every reported metric depends on the draw.
# NOTE: this cell rebinds `steam_df`, so re-running it alone samples the sample
# again. Re-run the loading cell first.
SAMPLE_SEED = 42
keep_ratio = 0.2

unique_users = steam_df['user_id'].unique()
rng = np.random.default_rng(SAMPLE_SEED)
sampled_users = rng.choice(
    unique_users,
    size=int(len(unique_users) * keep_ratio),
    replace=False,
)
steam_df = steam_df[steam_df['user_id'].isin(sampled_users)]

# (number of users, number of items, number of interactions) after sampling
steam_df['user_id'].nunique(), steam_df['item_id'].nunique(), steam_df.shape[0]

(2756211, 34923, 8221106)

In [5]:
# Build the interaction matrix with users as rows and items as columns.
# (Presumably the two mappings translate raw ids to matrix indices; confirm
# against utils.sparse.)
user_item_matrix, user_mapping, item_mapping = transform_dataframe_to_sparse(
    steam_df,
    row_field='user_id',
    col_field='item_id',
    data_field='target',
)

# Two nested 90/10 splits with the same seed: first carve out the test set,
# then carve a validation set out of what remains.
split_options = {"train_percentage": 0.9, "random_state": 42}
train_val_mat, test_mat = train_test_split(user_item_matrix, **split_options)
train_mat, val_mat = train_test_split(train_val_mat, **split_options)

print(f"Train Shape: {train_mat.shape}, Val Shape: {val_mat.shape}, Test Shape: {test_mat.shape}")

Train Shape: (2756211, 34923), Val Shape: (2756211, 34923), Test Shape: (2756211, 34923)


In [6]:
# Directory passed as `output_dir` to each optimisation run; one CSV per run lands here.
results_folder = "results/steam_knn"
# File name of the combined, sorted results table written by the aggregation cell.
results_filename = "steam_knn_results.csv"

import time

def run_hyperparameter_optimization(
    train_mat: csr_matrix,
    val_mat: csr_matrix,
    train_val_mat: csr_matrix,
    test_mat: csr_matrix,
    weighting_strategy: str,
    algorithm: str,
    n_trials: int = 20,
    output_dir: str = None,
    n_jobs: int = -1,
) -> pd.DataFrame:
    """Tune one weighting strategy for one KNN variant and score it on the test set.

    The weighting hyperparameters are searched with Optuna (TPE sampler, seed 42),
    maximising NDCG@20 on ``val_mat`` for a model fitted on the weighted
    ``train_mat``. The best parameters are then used to weight ``train_val_mat``,
    a fresh model is fitted on it, and NDCG/Precision at 10 and 20 are computed
    against ``test_mat``.

    Args:
        train_mat: Training interactions used to fit the model in every trial.
        val_mat: Held-out interactions used as the optimisation target.
        train_val_mat: Train + validation interactions used for the final fit.
        test_mat: Held-out interactions used for the reported test metrics.
        weighting_strategy: Name of the weighting scheme (see ``weighters`` below).
        algorithm: Name of the recommender variant (``"KNN_k=20"`` or ``"KNN_k=100"``).
        n_trials: Number of Optuna trials for strategies that have tunable
            parameters; parameter-free strategies always run a single trial.
        output_dir: If given, the one-row result table is also written to
            ``<output_dir>/<algorithm>_<strategy>_results.csv``. The directory
            is created when missing.
        n_jobs: Number of parallel Optuna workers (``-1`` = all cores, the
            previous hard-coded behaviour). Pass ``1`` for a sequential search;
            Optuna's FAQ notes that a seeded sampler is only reproducible when
            trials run sequentially, and each parallel trial holds its own
            copy of the weighted matrix.

    Returns:
        A one-row DataFrame with the validation score, test metrics, final fit
        time and best parameters.

    Raises:
        ValueError: If ``weighting_strategy`` or ``algorithm`` is not recognised.
    """
    algorithms = {
        "KNN_k=20": lambda: CosineRecommender(K=20),
        "KNN_k=100": lambda: CosineRecommender(K=100),
    }

    # Single source of truth for the supported strategies: name -> function
    # applying the weighting to a matrix given a parameter dict. Lambdas keep the
    # lookup of the weighting functions lazy.
    weighters = {
        "no_weighting": lambda mat, params: mat,
        "bm25": lambda mat, params: bm25_weight(
            mat, K1=params.get("bm25_k1"), B=params.get("bm25_b")
        ),
        "tfidf": lambda mat, params: tfidf_weight(mat),
        "log": lambda mat, params: log_weight(mat),
        "confidence": lambda mat, params: confidence_weight(
            mat, alpha=params.get("conf_alpha")
        ),
        "power": lambda mat, params: power_weight(mat, p=params.get("power_p")),
        "normalized": lambda mat, params: normalized_weight(mat),
        "pmi": lambda mat, params: pmi_weight(mat),
        "robust_user_centric": lambda mat, params: robust_user_centric_weight(
            mat, scale_factor=params.get("scale_factor")
        ),
        "robust_user_centric_weight_v2": lambda mat, params: robust_user_centric_weight_v2(
            mat, lower_q=params.get("lower_q"), upper_q=params.get("upper_q")
        ),
        "sigmoid_propensity": lambda mat, params: sigmoid_propensity_weight(
            mat, p=params.get("p"), beta=params.get("beta")
        ),
        "power_lift": lambda mat, params: power_lift_weight(mat, p=params.get("p")),
    }

    # Search space per tunable strategy. Strategies absent from this table have
    # no hyperparameters and are evaluated with a single trial.
    search_spaces = {
        "bm25": lambda trial: {
            "bm25_k1": trial.suggest_float("bm25_k1", 0.1, 1000),
            "bm25_b": trial.suggest_float("bm25_b", 0.0, 1.0),
        },
        "confidence": lambda trial: {
            "conf_alpha": trial.suggest_float("conf_alpha", 1.0, 150.0),
        },
        "power": lambda trial: {
            "power_p": trial.suggest_float("power_p", 0.1, 1.5),
        },
        "robust_user_centric": lambda trial: {
            "scale_factor": trial.suggest_float("scale_factor", 0.1, 10.0),
        },
        "robust_user_centric_weight_v2": lambda trial: {
            "lower_q": trial.suggest_float("lower_q", 5.0, 45.0),
            "upper_q": trial.suggest_float("upper_q", 55.0, 95.0),
        },
        "sigmoid_propensity": lambda trial: {
            "p": trial.suggest_float("p", 0.1, 5.0),
            "beta": trial.suggest_float("beta", 0.0, 1.0),
        },
        "power_lift": lambda trial: {
            "p": trial.suggest_float("p", 0.1, 1.5),
        },
    }

    if weighting_strategy not in weighters:
        raise ValueError(f"Weighting strategy '{weighting_strategy}' is not recognized.")
    strategy = weighting_strategy

    if algorithm not in algorithms:
        raise ValueError(f"Algorithm '{algorithm}' is not recognized.")
    algo_name = algorithm
    AlgoFactory = algorithms[algorithm]

    print(f"Running optimization for {algo_name} with {strategy}...")

    def get_weighted_matrix(matrix, params):
        """Return a weighted copy of ``matrix``; the input is never modified."""
        return weighters[strategy](matrix.copy(), params)

    def objective(trial):
        """Fit on the weighted training matrix and return validation NDCG@20."""
        suggest = search_spaces.get(strategy)
        params = suggest(trial) if suggest is not None else {}
        weighted_train = get_weighted_matrix(train_mat, params)

        model = AlgoFactory()
        model.fit(weighted_train, show_progress=False)

        # The unweighted train matrix is passed as the users' known interactions.
        return ndcg_at_k(model, train_mat, val_mat, K=20, show_progress=False)

    # Optimize only if the strategy has parameters.
    current_trials = n_trials if strategy in search_spaces else 1
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=current_trials, n_jobs=n_jobs)

    # --- Final retraining & testing ---
    # Re-weight the full train+val matrix with the best parameters found.
    best_params = study.best_params
    weighted_train_val = get_weighted_matrix(train_val_mat, best_params)

    final_model = AlgoFactory()

    # perf_counter is monotonic, so the duration is immune to wall-clock changes.
    start_time = time.perf_counter()
    final_model.fit(weighted_train_val, show_progress=False)
    fit_seconds = time.perf_counter() - start_time

    test_ndcg_10 = ndcg_at_k(final_model, train_val_mat, test_mat, K=10, show_progress=False)
    test_precision_10 = precision_at_k(final_model, train_val_mat, test_mat, K=10, show_progress=False)
    test_ndcg_20 = ndcg_at_k(final_model, train_val_mat, test_mat, K=20, show_progress=False)
    test_precision_20 = precision_at_k(final_model, train_val_mat, test_mat, K=20, show_progress=False)

    results_df = pd.DataFrame([{
        "Algorithm": algo_name,
        "Strategy": strategy,
        "Number of Optimization Trials": current_trials,
        "Best Val NDCG@20": study.best_value,
        "Test NDCG@10": test_ndcg_10,
        "Test NDCG@20": test_ndcg_20,
        "Test Precision@10": test_precision_10,
        "Test Precision@20": test_precision_20,
        "Final Train Time (s)": fit_seconds,
        "Best Params": best_params,
    }])

    if output_dir:
        # Create the target directory on demand so a missing folder cannot
        # discard the results of a long optimisation run.
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"{algo_name}_{strategy}_results.csv")
        results_df.to_csv(output_path, index=False)
    return results_df

In [7]:
# Create the results directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(results_folder, exist_ok=True)

In [8]:
# Baseline: raw target values, K=20 (no tunable parameters, so a single trial runs).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="no_weighting",
    algorithm="KNN_k=20",
    n_trials=20,
    output_dir=results_folder,
)

[I 2026-02-08 20:57:13,081] A new study created in memory with name: no-name-4018c675-c844-416d-a827-7c51ce14e200


Running optimization for KNN_k=20 with no_weighting...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-08 20:57:48,601] Trial 0 finished with value: 0.022315868292687932 and parameters: {}. Best is trial 0 with value: 0.022315868292687932.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,no_weighting,1,0.022316,0.022292,0.026283,0.035798,0.05061,0.762561,{}


In [9]:
# BM25 weighting, K=20 (tunes bm25_k1 and bm25_b).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="bm25",
    algorithm="KNN_k=20",
    n_trials=20,
    output_dir=results_folder,
)

[I 2026-02-08 21:00:22,650] A new study created in memory with name: no-name-018f04eb-1fc1-41cb-89a6-2521c51eb6bc


Running optimization for KNN_k=20 with bm25...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-08 21:13:19,151] Trial 3 finished with value: 0.023037339534957107 and parameters: {'bm25_k1': 665.4144343657177, 'bm25_b': 0.7893638556798601}. Best is trial 3 with value: 0.023037339534957107.
[I 2026-02-08 21:13:19,426] Trial 12 finished with value: 0.03180185529494313 and parameters: {'bm25_k1': 676.5458042216974, 'bm25_b': 0.21172095877384534}. Best is trial 12 with value: 0.03180185529494313.
[I 2026-02-08 21:13:20,867] Trial 13 finished with value: 0.03428503437493566 and parameters: {'bm25_k1': 362.2190137117366, 'bm25_b': 0.24735720048872978}. Best is trial 13 with value: 0.03428503437493566.
[I 2026-02-08 21:13:23,906] Trial 1 finished with value: 0.02220871916040659 and parameters: {'bm25_k1': 590.1487655426472, 'bm25_b': 0.8561350101835346}. Best is trial 13 with value: 0.03428503437493566.
[I 2026-02-08 21:13:28,588] Trial 4 finished with value: 0.029849832034617094 and parameters: {'bm25_k1': 850.36374

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,bm25,20,0.047995,0.043466,0.052271,0.074397,0.107907,0.703722,"{'bm25_k1': 5.70943264168309, 'bm25_b': 0.6801..."


In [8]:

# TF-IDF weighting, K=20 (no tunable parameters, so a single trial runs).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="tfidf",
    algorithm="KNN_k=20",
    n_trials=20,
    output_dir=results_folder,
)


[I 2026-02-08 21:30:48,056] A new study created in memory with name: no-name-5fc16871-5ed5-45b9-ae1a-17c20c080d17


Running optimization for KNN_k=20 with tfidf...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-08 21:31:24,833] Trial 0 finished with value: 0.03940907499912941 and parameters: {}. Best is trial 0 with value: 0.03940907499912941.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,tfidf,1,0.039409,0.03685,0.044045,0.062813,0.090411,0.743291,{}


In [9]:
# Log weighting, K=20 (no tunable parameters, so a single trial runs).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="log",
    algorithm="KNN_k=20",
    n_trials=20,
    output_dir=results_folder,
)

[I 2026-02-08 21:34:01,614] A new study created in memory with name: no-name-6a56625c-f643-47cf-af10-c93a92fd9862


Running optimization for KNN_k=20 with log...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-08 21:34:38,016] Trial 0 finished with value: 0.04454237791769868 and parameters: {}. Best is trial 0 with value: 0.04454237791769868.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,log,1,0.044542,0.040929,0.048828,0.069961,0.100757,0.694295,{}


In [10]:
# Confidence weighting, K=20 (tunes conf_alpha).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="confidence",
    algorithm="KNN_k=20",
    n_trials=20,
    output_dir=results_folder,
)

[I 2026-02-08 21:37:15,895] A new study created in memory with name: no-name-392c79d4-a0a0-426e-94f5-761a672518c9


Running optimization for KNN_k=20 with confidence...


[I 2026-02-08 21:50:15,411] Trial 0 finished with value: 0.04585265074584159 and parameters: {'conf_alpha': 106.92631807853583}. Best is trial 0 with value: 0.04585265074584159.
[I 2026-02-08 21:50:26,125] Trial 2 finished with value: 0.045853714683484294 and parameters: {'conf_alpha': 71.9736299556975}. Best is trial 2 with value: 0.045853714683484294.
[I 2026-02-08 21:50:27,623] Trial 15 finished with value: 0.04585454511904639 and parameters: {'conf_alpha': 44.20800607354046}. Best is trial 15 with value: 0.04585454511904639.
[I 2026-02-08 21:50:30,396] Trial 8 finished with value: 0.04588111162951468 and parameters: {'conf_alpha': 9.722986878770948}. Best is trial 8 with value: 0.04588111162951468.
[I 2026-02-08 21:50:33,425] Trial 4 finished with value: 0.04585250184183873 and parameters: {'conf_alpha': 146.89043946176227}. Best is trial 8 with value: 0.04588111162951468.
[I 2026-02-08 21:50:35,282] Trial 5 finished with value: 0.04585371987993375 and parameters: {'conf_alpha': 58

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,confidence,20,0.045881,0.042271,0.050608,0.071883,0.103768,0.690609,{'conf_alpha': 9.722986878770948}


In [11]:
# Power weighting, K=20 (tunes power_p).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="power",
    algorithm="KNN_k=20",
    n_trials=20,
    output_dir=results_folder,
)

[I 2026-02-08 21:56:21,074] A new study created in memory with name: no-name-32b105bb-fa5e-4df3-a3bf-ccbb05204d89


Running optimization for KNN_k=20 with power...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-08 22:09:18,711] Trial 14 finished with value: 0.03339700292044895 and parameters: {'power_p': 0.6820831134842941}. Best is trial 14 with value: 0.03339700292044895.
[I 2026-02-08 22:09:22,619] Trial 1 finished with value: 0.016437978647781154 and parameters: {'power_p': 1.2878674566786075}. Best is trial 14 with value: 0.03339700292044895.
[I 2026-02-08 22:09:23,946] Trial 8 finished with value: 0.013659466626793355 and parameters: {'power_p': 1.4649598358988682}. Best is trial 14 with value: 0.03339700292044895.
[I 2026-02-08 22:09:24,209] Trial 10 finished with value: 0.014564330102794712 and parameters: {'power_p': 1.3993346603295296}. Best is trial 14 with value: 0.03339700292044895.
[I 2026-02-08 22:09:25,171] Trial 12 finished with value: 0.03545887416040226 and parameters: {'power_p': 0.6059527831733574}. Best is trial 12 with value: 0.03545887416040226.
[I 2026-02-08 22:09:27,611] Trial 7 finished with valu

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,power,20,0.046025,0.041837,0.050159,0.071557,0.103683,0.696797,{'power_p': 0.11935928797919947}


In [12]:

# Normalized weighting, K=20 (no tunable parameters, so a single trial runs).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="normalized",
    algorithm="KNN_k=20",
    n_trials=20,
    output_dir=results_folder,
)

[I 2026-02-08 22:15:22,622] A new study created in memory with name: no-name-8e8d494f-0a2f-48e4-ada5-9cfec0a64a5c


Running optimization for KNN_k=20 with normalized...


  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]
[I 2026-02-08 22:15:58,439] Trial 0 finished with value: 0.004930852450007813 and parameters: {}. Best is trial 0 with value: 0.004930852450007813.


Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=20,normalized,1,0.004931,0.005854,0.005162,0.009406,0.009787,0.686958,{}


In [None]:

# PMI weighting, K=20 (no tunable parameters, so a single trial runs).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="pmi",
    algorithm="KNN_k=20",
    n_trials=20,
    output_dir=results_folder,
)

[I 2026-02-08 22:18:32,579] A new study created in memory with name: no-name-7fc33c48-8206-44ee-8cd3-745b7e0e0ae2


Running optimization for KNN_k=20 with pmi...


  pmi = log((X.data * N) / denominator) # we could use np.power(X.data, p) instead of log for a softer effect
  X.data = X.data / sqrt(bincount(X.row, X.data**2))[X.row]


In [None]:

# Robust user-centric weighting, K=20 (tunes scale_factor).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="robust_user_centric",
    algorithm="KNN_k=20",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:

# Robust user-centric weighting v2, K=20 (tunes lower_q and upper_q).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="robust_user_centric_weight_v2",
    algorithm="KNN_k=20",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:

# Sigmoid propensity weighting, K=20 (tunes p and beta).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="sigmoid_propensity",
    algorithm="KNN_k=20",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:

# Power-lift weighting, K=20 (tunes p).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="power_lift",
    algorithm="KNN_k=20",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:
# Baseline: raw target values, K=100 (no tunable parameters, so a single trial runs).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="no_weighting",
    algorithm="KNN_k=100",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:
# BM25 weighting, K=100 (tunes bm25_k1 and bm25_b).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="bm25",
    algorithm="KNN_k=100",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:

# TF-IDF weighting, K=100 (no tunable parameters, so a single trial runs).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="tfidf",
    algorithm="KNN_k=100",
    n_trials=20,
    output_dir=results_folder,
)


In [None]:
# Log weighting, K=100 (no tunable parameters, so a single trial runs).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="log",
    algorithm="KNN_k=100",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:
# Confidence weighting, K=100 (tunes conf_alpha).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="confidence",
    algorithm="KNN_k=100",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:
# Power weighting, K=100 (tunes power_p).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="power",
    algorithm="KNN_k=100",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:

# Normalized weighting, K=100 (no tunable parameters, so a single trial runs).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="normalized",
    algorithm="KNN_k=100",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:

# PMI weighting, K=100 (no tunable parameters, so a single trial runs).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="pmi",
    algorithm="KNN_k=100",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:

# Robust user-centric weighting, K=100 (tunes scale_factor).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="robust_user_centric",
    algorithm="KNN_k=100",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:

# Robust user-centric weighting v2, K=100 (tunes lower_q and upper_q).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="robust_user_centric_weight_v2",
    algorithm="KNN_k=100",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:

# Sigmoid propensity weighting, K=100 (tunes p and beta).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="sigmoid_propensity",
    algorithm="KNN_k=100",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:

# Power-lift weighting, K=100 (tunes p).
run_hyperparameter_optimization(
    train_mat=train_mat,
    val_mat=val_mat,
    train_val_mat=train_val_mat,
    test_mat=test_mat,
    weighting_strategy="power_lift",
    algorithm="KNN_k=100",
    n_trials=20,
    output_dir=results_folder,
)

In [None]:
import glob

# Gather every per-run CSV in the results folder. Sorting the paths makes the
# concatenation order (and therefore tie ordering) deterministic.
result_frames = [
    pd.read_csv(csv_path)
    for csv_path in sorted(glob.glob(f"{results_folder}/*.csv"))
]

if result_frames:
    # ignore_index gives each run a unique row label instead of a column of 0s.
    experiment_results = (
        pd.concat(result_frames, ignore_index=True)
        .sort_values("Test NDCG@20", ascending=False)
        .reset_index(drop=True)
    )
    experiment_results.to_csv(results_filename, index=False)
else:
    # Bind the name anyway so the display expression below cannot raise NameError.
    experiment_results = pd.DataFrame()
    print("No results found.")

experiment_results

Unnamed: 0,Algorithm,Strategy,Number of Optimization Trials,Best Val NDCG@20,Test NDCG@10,Test NDCG@20,Test Precision@10,Test Precision@20,Final Train Time (s),Best Params
0,KNN_k=100,power_lift,20,0.252733,0.297673,0.314228,0.307439,0.33432,0.028892,{'p': 1.1627526935071715}
0,KNN_k=100,bm25,20,0.251625,0.293014,0.30985,0.30414,0.330886,0.028216,"{'bm25_k1': 338.7558204049336, 'bm25_b': 0.996..."
0,KNN_k=100,pmi,1,0.248729,0.29483,0.309707,0.311397,0.335505,0.028297,{}
0,KNN_k=20,pmi,1,0.248759,0.284989,0.301401,0.304965,0.332662,0.019414,{}
0,KNN_k=20,bm25,20,0.250745,0.287349,0.300805,0.30381,0.328754,0.019425,"{'bm25_k1': 843.4301155557006, 'bm25_b': 0.763..."
0,KNN_k=20,normalized,1,0.247134,0.283596,0.298867,0.301666,0.331715,0.019569,{}
0,KNN_k=20,power_lift,20,0.246454,0.283755,0.29838,0.296388,0.324609,0.020371,{'p': 0.874576236478479}
0,KNN_k=100,normalized,1,0.241655,0.279204,0.295025,0.287152,0.31324,0.027931,{}
0,KNN_k=20,robust_user_centric,20,0.229796,0.270866,0.285502,0.289461,0.318806,0.019487,{'scale_factor': 2.6717703265899693}
0,KNN_k=20,robust_user_centric_weight_v2,20,0.235247,0.269902,0.284681,0.288471,0.318333,0.020085,"{'lower_q': 13.009637863954161, 'upper_q': 57...."
