# SLIM-EN is trained here, but the same training skeleton can be reused with any other model.


In [None]:
# this one
%%cython
import numpy as np
cimport numpy as np
from libc.stdint cimport int32_t
import time

cdef inline double sign(double x):
    """Return the sign of ``x`` as a double: 1.0, -1.0, or 0.0 otherwise."""
    # Each comparison yields a C 0/1 flag, so their difference is the sign.
    # Both flags are 0 when x is neither above nor below zero, giving 0.0.
    return <double>(x > 0) - <double>(x < 0)

def do_some_training_EN(
    URM_train,
    double initial_learning_rate,
    double regularization_2,
    double l1_ratio,
    double decay_rate,
    int num_iterations,
    double[:, :] existing_item_item_S,
    double[:, :] S_icm=None,   # optional item-item prior (e.g. built from the ICM)
    double alpha=0.0           # strength of the pull towards S_icm
):
    """Run stochastic gradient steps on a dense item-item similarity matrix.

    Each step samples one stored interaction ``(user, item, rating)`` uniformly
    at random, predicts the rating as the sum of
    ``profile_rating * S[profile_item, item]`` over the user's profile, and
    updates the column ``S[:, item]`` on the profile items using the squared
    error gradient plus L2, L1 and (optionally) an ``alpha``-weighted penalty
    on the distance from ``S_icm``.

    Args:
        URM_train: sparse user-item matrix exposing ``tocsr()``/``tocoo()``.
        initial_learning_rate: starting step size; also restored on the single
            warm restart.
        regularization_2: L2 coefficient, scaled by ``1 - l1_ratio``.
        l1_ratio: weight of the sign (L1) term.
        decay_rate: factor applied to the learning rate every 5000 samples.
        num_iterations: number of sampled interactions to process.
        existing_item_item_S: matrix to continue training from, or ``None`` to
            start from zeros. When given it is updated IN PLACE.
        S_icm: optional matrix the similarity is pulled towards. May be omitted
            or ``None`` only when ``alpha`` is 0.
        alpha: weight of the ``S - S_icm`` penalty.

    Returns:
        ``(loss, item_item_S)`` where ``loss`` is the accumulated squared
        prediction error over all processed samples and ``item_item_S`` is the
        trained matrix (sharing memory with ``existing_item_item_S`` if given).

    Raises:
        ValueError: if ``alpha`` is non-zero but ``S_icm`` is ``None``.
    """
    cdef int n_items = URM_train.shape[1]
    URM_train_csr = URM_train.tocsr()
    URM_train_coo = URM_train.tocoo()
    # Keep the timestamp as a double: storing it in a C long truncated it to
    # whole seconds and skewed every elapsed-time / throughput figure.
    cdef double start_time = time.time()

    # Convert (rather than reinterpret with .view) so matrices stored with
    # other index/value dtypes are read correctly; no copy is made when the
    # dtype already matches.
    cdef int32_t[:] indices = np.ascontiguousarray(URM_train_csr.indices, dtype=np.int32)
    cdef int32_t[:] indptr = np.ascontiguousarray(URM_train_csr.indptr, dtype=np.int32)
    cdef double[:] data = np.ascontiguousarray(URM_train_csr.data, dtype=np.float64)
    cdef int32_t[:] coo_row = np.ascontiguousarray(URM_train_coo.row, dtype=np.int32)
    cdef int32_t[:] coo_col = np.ascontiguousarray(URM_train_coo.col, dtype=np.int32)
    cdef double[:] coo_data = np.ascontiguousarray(URM_train_coo.data, dtype=np.float64)

    cdef double[:, :] item_item_S
    if existing_item_item_S is not None:
        item_item_S = existing_item_item_S
    else:
        item_item_S = np.zeros((n_items, n_items), dtype=np.float64)

    cdef bint use_icm = S_icm is not None
    if not use_icm and alpha != 0.0:
        raise ValueError("alpha is non-zero but no S_icm matrix was provided")

    cdef double learning_rate = initial_learning_rate
    cdef double loss = 0.0
    cdef double current_loss = 0.0
    cdef double prediction_error, predicted_rating, profile_rating
    cdef double true_rating, s_value, gradient
    cdef double elapsed_time, samples_per_second
    cdef int user_id, item_id, profile_item_id, sample_index, index
    cdef int sample_num
    cdef int start_idx, end_idx
    cdef int32_t[:] random_indices = np.random.randint(0, URM_train_coo.nnz, size=num_iterations).astype(np.int32)

    # Early stopping variables
    cdef int patience_counter = 0
    cdef double last_loss = np.inf
    cdef int patience = 20
    cdef double min_delta = 1e-5
    cdef int warm_restart = 0

    # Exactly num_iterations samples: random_indices holds num_iterations
    # entries, so iterating one further would read past its end.
    for sample_num in range(num_iterations):
        sample_index = random_indices[sample_num]
        user_id = coo_row[sample_index]
        item_id = coo_col[sample_index]
        true_rating = coo_data[sample_index]

        predicted_rating = 0.0
        start_idx = indptr[user_id]
        end_idx = indptr[user_id + 1]

        # Prediction: weighted sum over the items in the user's profile.
        for index in range(start_idx, end_idx):
            profile_item_id = indices[index]
            profile_rating = data[index]
            predicted_rating += profile_rating * item_item_S[profile_item_id, item_id]

        prediction_error = true_rating - predicted_rating
        loss += prediction_error ** 2

        # Gradient step on every similarity that contributed to the prediction.
        for index in range(start_idx, end_idx):
            profile_item_id = indices[index]
            profile_rating = data[index]
            s_value = item_item_S[profile_item_id, item_id]
            gradient = (
                prediction_error * profile_rating
                - (1 - l1_ratio) * regularization_2 * s_value
                - l1_ratio * sign(s_value)
            )
            if use_icm:
                # Pull the learned similarity towards the prior matrix.
                gradient -= alpha * (s_value - S_icm[profile_item_id, item_id])
            item_item_S[profile_item_id, item_id] = s_value + learning_rate * gradient

        if sample_num % 5000 == 0:
            learning_rate *= decay_rate
            if sample_num > 0:
                current_loss = loss / sample_num
            else:
                current_loss = 0.0

            if sample_num % 1000000 == 0:
                elapsed_time = time.time() - start_time
                # Guard the throughput figure against a zero elapsed time.
                if elapsed_time > 0:
                    samples_per_second = sample_num / elapsed_time
                else:
                    samples_per_second = 0.0
                print(
                    "Iteration {} in {:.2f} seconds, loss is {:.4f}. Samples per second {:.2f}".format(
                        sample_num, elapsed_time, current_loss, samples_per_second
                    )
                )

            # Early stopping check: the running loss has plateaued or is above 1.
            if abs(last_loss - current_loss) < min_delta or current_loss > 1:
                patience_counter += 1
                if patience_counter >= patience:
                    if warm_restart == 0:
                        # One warm restart: reset the step size before giving up.
                        learning_rate = initial_learning_rate
                        warm_restart += 1
                        patience_counter = 0
                        print("warm restart")
                    else:
                        print("Early stopping at iteration {}. Loss has not improved significantly for {} iterations, or has stayed above 1 too much. loss was {}".format(sample_num, patience * 5000, current_loss))
                        break
            else:
                patience_counter = 0  # Reset patience counter if loss improves
            last_loss = current_loss

    return loss, item_item_S


performance hint: /root/.cache/ipython/cython/_cython_magic_793d7496f50a3060672d684fead44deb3ce1ba1a.pyx:56:38: Index should be typed for more efficient access


Content of stderr:
In file included from /usr/local/lib/python3.10/dist-packages/numpy/core/include/numpy/ndarraytypes.h:1929,
                 from /usr/local/lib/python3.10/dist-packages/numpy/core/include/numpy/ndarrayobject.h:12,
                 from /usr/local/lib/python3.10/dist-packages/numpy/core/include/numpy/arrayobject.h:5,
                 from /root/.cache/ipython/cython/_cython_magic_793d7496f50a3060672d684fead44deb3ce1ba1a.c:1250:
      |  ^~~~~~~

In [None]:
from Recommenders.BaseRecommender import BaseRecommender
import numpy as np
from scipy.sparse import csr_matrix, csc_matrix


class MyItemItemRecommender(BaseRecommender):
    """Recommender that scores items with a precomputed item-item matrix.

    Scores are obtained as ``URM_train[users].dot(item_item_S)``; no training
    happens inside this class.
    """

    RECOMMENDER_NAME = "MyItemItemRecommender"

    def __init__(self, URM_train, item_item_S):
        """Store the training URM (via the base class) and the similarity matrix."""
        super().__init__(URM_train)
        self.item_item_S = item_item_S

    def fit(self):
        """Do nothing: ``item_item_S`` is supplied already trained."""
        # No training is needed since we already have item_item_S
        pass

    def _compute_item_score(self, user_id_array, items_to_compute=None):
        """Return the score matrix (one row per requested user).

        When ``items_to_compute`` is given, every other item gets ``-inf``.
        """
        # Compute the scores for each user in user_id_array
        user_profiles = self.URM_train[user_id_array]
        scores = user_profiles.dot(self.item_item_S)

        # If items_to_compute is specified, filter the scores
        if items_to_compute is not None:
            # NOTE(review): items_to_compute is indexed by user id here, i.e. it
            # is treated as a per-user collection of item ids - confirm this is
            # how callers pass it.
            scores_all = np.full(
                (len(user_id_array), self.URM_train.shape[1]), -np.inf, dtype=np.float32
            )
            for idx, user_id in enumerate(user_id_array):
                allowed_items = items_to_compute[user_id]
                scores_all[idx, allowed_items] = scores[idx, allowed_items]
            scores = scores_all

        return scores

    def _remove_seen_on_scores(self, user_id_array, scores):
        """Set the score of every item a user already interacted with to ``-inf``.

        ``scores`` is modified in place and also returned.
        """
        assert self.URM_train.getformat() == "csr", "Recommender_Base_Class: URM_train is not CSR, this will cause errors in filtering seen items"

        indptr = self.URM_train.indptr
        indices = self.URM_train.indices

        # The CSR row of a user lists exactly the items seen in training.
        for idx, user_id in enumerate(user_id_array):
            seen_items = indices[indptr[user_id]:indptr[user_id + 1]]
            scores[idx, seen_items] = -np.inf

        return scores

    def recommend(
        self,
        user_id_array,
        cutoff=None,
        remove_seen_flag=True,
        remove_top_pop_flag=False,
        remove_custom_items_flag=False,
        return_scores=False,
    ):
        """Return the top-``cutoff`` item ids for each requested user.

        Args:
            user_id_array: a single user id or an array of user ids.
            cutoff: number of items to return per user; ``None`` means all
                items. Values larger than the number of items are clamped.
            remove_seen_flag: when true, items present in the user's training
                profile are excluded.
            remove_top_pop_flag: accepted for interface compatibility; not
                used by this implementation.
            remove_custom_items_flag: accepted for interface compatibility;
                not used by this implementation.
            return_scores: when true, also return the score matrix.

        Returns:
            An ``int32`` array of shape ``(n_users, cutoff)``, or the tuple
            ``(ranking, scores)`` when ``return_scores`` is true.
        """
        # Ensure user_id_array is an array
        if np.isscalar(user_id_array):
            user_id_array = np.array([user_id_array])

        # Compute scores for the users
        scores = self._compute_item_score(user_id_array)

        # Convert scores to dense array if necessary
        if isinstance(scores, (csr_matrix, csc_matrix)):
            scores = scores.toarray()

        # Exclude seen items
        if remove_seen_flag:
            scores = self._remove_seen_on_scores(user_id_array, scores)

        # A cutoff beyond the catalogue size cannot be filled; clamping avoids
        # a shape mismatch when writing the per-user ranking below.
        n_items = scores.shape[1]
        cutoff = n_items if cutoff is None else min(cutoff, n_items)

        # Get the top items (descending score) for each user
        ranking = np.zeros((scores.shape[0], cutoff), dtype=np.int32)
        for idx in range(scores.shape[0]):
            ranking[idx] = np.argsort(-scores[idx])[:cutoff]

        if return_scores:
            return ranking, scores
        return ranking

In [None]:
import optuna
from sklearn.model_selection import KFold
from Evaluation.Evaluator import EvaluatorHoldout
import numpy as np

# Timing note: 500 000 -> about 9 seconds (presumably 500 000 training iterations)
def objective(trial):
    """Optuna objective: mean MAP@10 of the trained model over 5 folds.

    The folds are taken over the stored interactions of ``URM_all_csr``, so
    every train/validation matrix keeps the full (n_users, n_items) shape and
    a row index always refers to the same user in both matrices. Splitting by
    rows instead would pair each validation user with the training profile of
    a different user.

    Args:
        trial: the Optuna trial used to sample the hyper-parameters.

    Returns:
        The mean MAP@10 across the folds (also stored as a trial attribute).
    """
    # Local import so this function does not rely on another cell's imports.
    from scipy.sparse import csr_matrix

    # Sample hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.002, log=True)
    regularization = trial.suggest_float("regularization", 1e-6, 1e-3, log=True)
    decay_rate = trial.suggest_float("decay_rate", 0.999999, 1.0)
    l1_ratio = trial.suggest_float("l1_ratio", 0.0, 0.5)

    URM_coo = URM_all_csr.tocoo()
    n_items = URM_coo.shape[1]

    # The training routine also expects an item-item prior and its weight.
    # This search does not tune them, so a zero weight disables the term
    # (the all-zero matrix is only a placeholder of the right shape).
    S_icm_placeholder = np.zeros((n_items, n_items), dtype=np.float64)
    alpha = 0.0

    def build_split(interaction_positions):
        # Rebuild a full-shape CSR matrix from a subset of the interactions.
        return csr_matrix(
            (
                URM_coo.data[interaction_positions],
                (URM_coo.row[interaction_positions], URM_coo.col[interaction_positions]),
            ),
            shape=URM_coo.shape,
        )

    # K-Fold Cross Validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_metrics = {"MAP@10": []}
    print(f"trying learning rate={learning_rate}, reg = {regularization}, l1_ratio={l1_ratio} and decay = {decay_rate} for this crossval round")
    for train_index, val_index in kf.split(np.arange(URM_coo.nnz)):
        # Create train and validation splits
        URM_train_split = build_split(train_index)
        URM_validation_split = build_split(val_index)

        # Train model on the split, starting from an empty similarity matrix
        loss, item_item_S = do_some_training_EN(
            URM_train_split,
            learning_rate,
            regularization,
            l1_ratio,
            decay_rate,
            5000000,
            None,
            S_icm_placeholder,
            alpha,
        )

        # Initialize and fit the recommender
        recommender = MyItemItemRecommender(URM_train_split, item_item_S)
        recommender.fit()

        # Evaluate on the validation split
        evaluator = EvaluatorHoldout(URM_validation_split, cutoff_list=[10])
        results, _ = evaluator.evaluateRecommender(recommender)

        # Collect MAP@10 metric
        fold_map = results.loc[10, 'MAP']
        fold_metrics["MAP@10"].append(fold_map)
        print(fold_map)

    # Return the mean MAP@10 across folds as the objective value
    mean_map_at_10 = np.mean(fold_metrics["MAP@10"])
    trial.set_user_attr("MAP@10", mean_map_at_10)  # Store MAP@10 in the trial

    return mean_map_at_10


# Create the study (MAP@10 is maximised) and run 100 trials of `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Report the winning configuration and its score.
print("Best Parameters:", study.best_params)
print("Best MAP@10:", study.best_value)

# Persist every trial, with its stored MAP@10 attribute, for later analysis.
df = study.trials_dataframe()
map_per_trial = []
for finished_trial in study.trials:
    map_per_trial.append(finished_trial.user_attrs.get("MAP@10", None))
df["MAP@10"] = map_per_trial
df.to_csv("optuna_results.csv", index=False)


In [None]:
# Train SLIM-EN with the best parameters found through Optuna, validating after
# every epoch and stopping early based on MAP@10.

last_map = -np.inf  # Best MAP@10 seen so far
patience_counter = 0  # Counter for early stopping
map_history = []  # To store MAP@10 scores over epochs
patience = 5  # Number of epochs to wait for improvement
max_epochs = 100  # Maximum number of epochs
best_item_item_S = None  # Snapshot of the best similarity matrix
current_item_item_S = None  # Matrix being trained (None -> start from zeros)
URM_train_split, URM_validation_split = split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.8)
num_iterations_per_epoch = 5000000  # Number of iterations per epoch

# Ensure train and validation splits are properly set
evaluator = EvaluatorHoldout(URM_validation_split, cutoff_list=[10])  # Use validation split evaluator

# The content-based similarity does not change between epochs: compute it once.
S_icm = cosine_similarity(ICM)

# An earlier configuration (trained without S_icm / alpha) used:
#   0.00010012085679135815, 2.668605667480152e-06, 0.24613633537942015, 0.9999991522253808

for epoch in range(max_epochs):
    print(f"Starting epoch {epoch + 1}")

    # Train for one epoch, continuing from the matrix of the previous epoch.
    loss, current_item_item_S = do_some_training_EN(
        URM_train_split, 0.00010202373981397776, 1.8374449620346475e-05, 0.034021854645934185, 0.9999996279382791, num_iterations_per_epoch, current_item_item_S, S_icm, 0.01808726112039536
    )
    # Initialize and fit the recommender with the updated similarity matrix
    recommender = MyItemItemRecommender(URM_train_split, current_item_item_S)
    recommender.fit()

    # Evaluate on the validation split
    results, _ = evaluator.evaluateRecommender(recommender)

    # Collect MAP@10 metric
    current_map = results.loc[10, "MAP"]
    map_history.append(current_map)
    print(f"Epoch {epoch + 1}: MAP@10 = {current_map:.4f}")

    # Check for convergence (plateauing or overfitting)
    if current_map > last_map:
        last_map = current_map
        patience_counter = 0  # Reset patience if MAP improves
        # Take a copy: the training routine keeps updating the matrix it is
        # given in place, so a plain reference would be overwritten by the
        # following epochs and would no longer be the best matrix.
        best_item_item_S = np.array(current_item_item_S, copy=True)
        print("Improvement detected. Saving the current similarity matrix.")
    else:
        patience_counter += 1
        print(f"No improvement in MAP@10. Patience counter: {patience_counter}/{patience}")

    # Stop if patience is exhausted
    if patience_counter >= patience:
        print("Early stopping: No improvement in MAP@10 for consecutive epochs.")
        break

print(f"Training complete. Best MAP@10: {max(map_history):.4f}")


EvaluatorHoldout: Ignoring 145 ( 0.4%) Users that have less than 1 test interactions
Starting epoch 1
Iteration 0 in 0.85 seconds, loss is 0.0000. Samples per second 0.00
Iteration 1000000 in 25.76 seconds, loss is 0.9757. Samples per second 38814.45
Iteration 2000000 in 39.71 seconds, loss is 0.9538. Samples per second 50367.25
Iteration 3000000 in 53.31 seconds, loss is 0.9337. Samples per second 56271.70
Iteration 4000000 in 66.91 seconds, loss is 0.9152. Samples per second 59781.16
EvaluatorHoldout: Processed 35591 (100.0%) in 1.83 min. Users per second: 325
Epoch 1: MAP@10 = 0.0258
Improvement detected. Saving the current similarity matrix.
Starting epoch 2
Iteration 0 in 0.91 seconds, loss is 0.0000. Samples per second 0.00
Iteration 1000000 in 14.49 seconds, loss is 0.8009. Samples per second 68993.36
Iteration 2000000 in 28.75 seconds, loss is 0.7881. Samples per second 69558.73
Iteration 3000000 in 42.33 seconds, loss is 0.7758. Samples per second 70878.82
Iteration 4000000 in

In [None]:
# Test-set evaluation

# Wrap the best similarity matrix in a recommender (use best_item_item_S here).
recommender = MyItemItemRecommender(URM_train, best_item_item_S)
recommender.fit()

# Evaluate MAP@10 (and the other metrics at cutoff 10) on the test set.
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])
results_test, _ = evaluator_test.evaluateRecommender(recommender)

# Print one line per metric for every evaluated cutoff.
print("Test Results:")
for cutoff in results_test.index:
    print(f"Cutoff {cutoff}:")
    for label, column in (("MAP", "MAP"), ("Precision", "PRECISION"), ("Recall", "RECALL")):
        print(f"{label}: {results_test.loc[cutoff, column]:.4f}")

EvaluatorHoldout: Ignoring 149 ( 0.4%) Users that have less than 1 test interactions
EvaluatorHoldout: Processed 35587 (100.0%) in 1.89 min. Users per second: 314
Test Results:
Cutoff 10:
MAP: 0.2944
Precision: 0.3697
Recall: 0.4656


In [None]:
import numpy as np
from Evaluation.Evaluator import EvaluatorHoldout

# Step 1: interactions per user = consecutive differences of the CSR row pointer
user_interaction_counts = np.diff(URM_all.tocsr().indptr)

# Step 2: activity thresholds at the 33rd and 66th percentiles
low_activity_threshold, high_activity_threshold = np.percentile(
    user_interaction_counts, [33, 66]
)

# Step 3: boolean masks per tier, then the matching user ids
user_ids = np.arange(URM_all.shape[0])

low_activity_users = user_interaction_counts <= low_activity_threshold
high_activity_users = user_interaction_counts > high_activity_threshold
# Medium tier: everyone who is neither in the low nor in the high tier.
medium_activity_users = ~(low_activity_users | high_activity_users)

low_activity_user_ids = user_ids[low_activity_users]
medium_activity_user_ids = user_ids[medium_activity_users]
high_activity_user_ids = user_ids[high_activity_users]

# Recommender built on the trained similarity matrix
recommender = MyItemItemRecommender(URM_train, best_item_item_S)
recommender.fit()

# Function to evaluate for a user group
def evaluate_for_user_group(user_group_ids, group_name):
    """Evaluate the module-level ``recommender`` on ``URM_test`` for one user group.

    Users outside the group are passed to the evaluator as users to ignore.
    Prints MAP, Precision, Recall and NDCG per cutoff; returns nothing.

    Args:
        user_group_ids: array of user ids belonging to the group.
        group_name: label used in the printed messages.
    """
    every_user = np.arange(URM_test.shape[0])
    # Keep only the group ids that are valid row indices of URM_test.
    group_users = np.intersect1d(user_group_ids, every_user)

    # Nothing to evaluate when the group has no test interactions at all.
    if URM_test[group_users].nnz == 0:
        print(f"No test interactions for {group_name} activity users.")
        return

    # Everyone outside the group is excluded from the evaluation.
    users_to_skip = np.setdiff1d(every_user, group_users).tolist()
    evaluator_group = EvaluatorHoldout(URM_test, cutoff_list=[10], ignore_users=users_to_skip)

    results_group, _ = evaluator_group.evaluateRecommender(recommender)

    print(f"Results for {group_name} Activity Users:")
    metric_columns = (
        ("MAP", "MAP"),
        ("Precision", "PRECISION"),
        ("Recall", "RECALL"),
        ("NDCG", "NDCG"),
    )
    for cutoff in results_group.index:
        print(f"Cutoff {cutoff}:")
        for label, column in metric_columns:
            print(f"{label}: {results_group.loc[cutoff, column]:.4f}")
    print("\n")

# Run the evaluation once per activity tier: low, then medium, then high.
for group_ids, group_label in (
    (low_activity_user_ids, "Low"),
    (medium_activity_user_ids, "Medium"),
    (high_activity_user_ids, "High"),
):
    evaluate_for_user_group(group_ids, group_label)


EvaluatorHoldout: Ignoring 149 ( 0.4%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 23483 Users
EvaluatorHoldout: Processed 12118 (100.0%) in 33.45 sec. Users per second: 362
Results for Low Activity Users:
Cutoff 10:
MAP: 0.1216
Precision: 0.1907
Recall: 0.4720
NDCG: 0.4128


EvaluatorHoldout: Ignoring 149 ( 0.4%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 24267 Users
EvaluatorHoldout: Processed 11456 (100.0%) in 36.20 sec. Users per second: 316
Results for Medium Activity Users:
Cutoff 10:
MAP: 0.2251
Precision: 0.3070
Recall: 0.5072
NDCG: 0.5053


EvaluatorHoldout: Ignoring 149 ( 0.4%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 23722 Users
EvaluatorHoldout: Processed 12013 (100.0%) in 1.14 min. Users per second: 175
Results for High Activity Users:
Cutoff 10:
MAP: 0.5348
Precision: 0.6099
Recall: 0.4195
NDCG: 0.6723


