In [1]:
import os

from utils import *
from Recommenders.DataIO import DataIO
from Evaluation.Evaluator import EvaluatorHoldout

In [2]:
data = load_data()
users = load_users()
data, num_users, num_items, mapping_user_id = preprocess_data(data)
data_train, data_validation, data_test = split_data(
    data,
    num_users=num_users,
    num_items=num_items,
    validation_percentage=0.1,
    testing_percentage=0.20
)

In [3]:
evaluator_validation = EvaluatorHoldout(data_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(data_test, cutoff_list=[10])

EvaluatorHoldout: Ignoring 4185 (33.1%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 2132 (16.9%) Users that have less than 1 test interactions


In [4]:
from skopt.space import Real, Integer, Categorical

hyperparameters_range_dictionary = {
    "topK": Integer(5, 1000),
    "shrink": Integer(0, 1000),
    "similarity": Categorical(["cosine"]),
    "normalize": Categorical([True, False]),
}

In [5]:
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

recommender_class = ItemKNNCFRecommender

hyperparameterSearch = SearchBayesianSkopt(recommender_class,
                                         evaluator_validation=evaluator_validation,
                                         evaluator_test=evaluator_test)

2023-11-16 12:19:06.220469: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-11-16 12:19:06.220487: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [6]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs

recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [data_train],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {},
    EARLYSTOPPING_KEYWORD_ARGS = {},
)

In [7]:
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [data_validation],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {},
    EARLYSTOPPING_KEYWORD_ARGS = {},
)

In [8]:
output_folder_path = "result_experiments/"

# If directory does not exist, create
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)
    
n_cases = 50
n_random_starts = int(n_cases*0.3)
metric_to_optimize = "MAP"   
cutoff_to_optimize = 10

In [9]:
hyperparameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       hyperparameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "best",
                       output_folder_path = output_folder_path, # Where to save the results
                       output_file_name_root = recommender_class.RECOMMENDER_NAME, # How to call the files
                       metric_to_optimize = metric_to_optimize,
                       cutoff_to_optimize = cutoff_to_optimize,
                      )

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'topK': 55, 'shrink': 606, 'similarity': 'cosine', 'normalize': False}
ItemKNNCFRecommender: URM Detected 338 ( 2.7%) users with no interactions.
ItemKNNCFRecommender: URM Detected 206 ( 0.9%) items with no interactions.
Similarity column 22222 (100.0%), 11168.07 column/sec. Elapsed time 1.99 sec
EvaluatorHoldout: Processed 8453 (100.0%) in 5.48 sec. Users per second: 1542
SearchBayesianSkopt: New best config found. Config 0: {'topK': 55, 'shrink': 606, 'similarity': 'cosine', 'normalize': False} - results: PRECISION: 0.0269845, PRECISION_RECALL_MIN_DEN: 0.0852423, RECALL: 0.0832942, MAP: 0.0106849, MAP_MIN_DEN: 0.0348020, MRR: 0.0912916, NDCG: 0.0607616, F1: 0.0407631, HIT_RATE: 0.2291494, ARHR_ALL_HITS: 0.0987236, NOVELTY: 0.0042917, AVERAGE_POPULARITY: 0.5163746, DIVERSITY_MEAN_INTER_LIST: 0.7442185, DIVERSITY_HERFINDAHL: 0.9744130, COVERAGE_ITEM: 0.1707317, COVERAGE_ITEM_HIT: 0.01512



Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.4091
Function value obtained: -0.0148
Current minimum: -0.0148
Iteration No: 27 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'topK': 38, 'shrink': 0, 'similarity': 'cosine', 'normalize': True}
ItemKNNCFRecommender: URM Detected 338 ( 2.7%) users with no interactions.
ItemKNNCFRecommender: URM Detected 206 ( 0.9%) items with no interactions.
Similarity column 22222 (100.0%), 11086.75 column/sec. Elapsed time 2.00 sec
EvaluatorHoldout: Processed 8453 (100.0%) in 2.99 sec. Users per second: 2830
SearchBayesianSkopt: Config 26 is suboptimal. Config: {'topK': 38, 'shrink': 0, 'similarity': 'cosine', 'normalize': True} - results: PRECISION: 0.0358808, PRECISION_RECALL_MIN_DEN: 0.1051336, RECALL: 0.1024235, MAP: 0.0147302, MAP_MIN_DEN: 0.0440403, MRR: 0.1166934, NDCG: 0.0770524, F1: 0.0531442, HIT_RATE: 0.2834497, ARHR_ALL_HITS: 0.1308862, NOVELTY: 0.0058302, AVERAGE_POPUL



Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 0.4632
Function value obtained: -0.0148
Current minimum: -0.0148
Iteration No: 32 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'topK': 464, 'shrink': 1, 'similarity': 'cosine', 'normalize': True}
ItemKNNCFRecommender: URM Detected 338 ( 2.7%) users with no interactions.
ItemKNNCFRecommender: URM Detected 206 ( 0.9%) items with no interactions.
Similarity column 22222 (100.0%), 10666.08 column/sec. Elapsed time 2.08 sec
EvaluatorHoldout: Processed 8453 (100.0%) in 5.42 sec. Users per second: 1561
SearchBayesianSkopt: Config 31 is suboptimal. Config: {'topK': 464, 'shrink': 1, 'similarity': 'cosine', 'normalize': True} - results: PRECISION: 0.0315391, PRECISION_RECALL_MIN_DEN: 0.0966356, RECALL: 0.0945400, MAP: 0.0125095, MAP_MIN_DEN: 0.0391845, MRR: 0.1003259, NDCG: 0.0686795, F1: 0.0472990, HIT_RATE: 0.2554123, ARHR_ALL_HITS: 0.1118657, NOVELTY: 0.0055866, AVERAGE_POP



Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 0.4534
Function value obtained: -0.0148
Current minimum: -0.0148
Iteration No: 36 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'topK': 5, 'shrink': 0, 'similarity': 'cosine', 'normalize': True}
SearchBayesianSkopt: Config 35 was already explored at index 21. Config: {'topK': 5, 'shrink': 0, 'similarity': 'cosine', 'normalize': True} - results: PRECISION: 0.0346741, PRECISION_RECALL_MIN_DEN: 0.1027179, RECALL: 0.1001089, MAP: 0.0148399, MAP_MIN_DEN: 0.0440241, MRR: 0.1186566, NDCG: 0.0764356, F1: 0.0515078, HIT_RATE: 0.2775346, ARHR_ALL_HITS: 0.1323148, NOVELTY: 0.0058926, AVERAGE_POPULARITY: 0.1244829, DIVERSITY_MEAN_INTER_LIST: 0.9863622, DIVERSITY_HERFINDAHL: 0.9986246, COVERAGE_ITEM: 0.5759158, COVERAGE_ITEM_HIT: 0.0563406, ITEMS_IN_GT: 0.5502655, COVERAGE_USER: 0.6688558, COVERAGE_USER_HIT: 0.1856306, USERS_IN_GT: 0.6688558, DIVERSITY_GINI: 0.1669047, SHANNON_ENTR



Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 0.4445
Function value obtained: -0.0148
Current minimum: -0.0148
Iteration No: 37 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'topK': 305, 'shrink': 209, 'similarity': 'cosine', 'normalize': True}
ItemKNNCFRecommender: URM Detected 338 ( 2.7%) users with no interactions.
ItemKNNCFRecommender: URM Detected 206 ( 0.9%) items with no interactions.
Similarity column 22222 (100.0%), 10056.32 column/sec. Elapsed time 2.21 sec
EvaluatorHoldout: Processed 8453 (100.0%) in 5.94 sec. Users per second: 1422
SearchBayesianSkopt: Config 36 is suboptimal. Config: {'topK': 305, 'shrink': 209, 'similarity': 'cosine', 'normalize': True} - results: PRECISION: 0.0303206, PRECISION_RECALL_MIN_DEN: 0.0955296, RECALL: 0.0933748, MAP: 0.0123373, MAP_MIN_DEN: 0.0395920, MRR: 0.1031366, NDCG: 0.0686593, F1: 0.0457766, HIT_RATE: 0.2541110, ARHR_ALL_HITS: 0.1126997, NOVELTY: 0.0043833, AVERAGE



Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 0.5197
Function value obtained: -0.0148
Current minimum: -0.0148
Iteration No: 43 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'topK': 174, 'shrink': 264, 'similarity': 'cosine', 'normalize': False}
ItemKNNCFRecommender: URM Detected 338 ( 2.7%) users with no interactions.
ItemKNNCFRecommender: URM Detected 206 ( 0.9%) items with no interactions.
Similarity column 22222 (100.0%), 11440.00 column/sec. Elapsed time 1.94 sec
EvaluatorHoldout: Processed 8453 (100.0%) in 6.09 sec. Users per second: 1388
SearchBayesianSkopt: Config 42 is suboptimal. Config: {'topK': 174, 'shrink': 264, 'similarity': 'cosine', 'normalize': False} - results: PRECISION: 0.0249497, PRECISION_RECALL_MIN_DEN: 0.0794511, RECALL: 0.0776245, MAP: 0.0096087, MAP_MIN_DEN: 0.0316814, MRR: 0.0828590, NDCG: 0.0557391, F1: 0.0377621, HIT_RATE: 0.2138886, ARHR_ALL_HITS: 0.0892089, NOVELTY: 0.0041457, AVERA



Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 0.5302
Function value obtained: -0.0148
Current minimum: -0.0148
Iteration No: 46 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'topK': 579, 'shrink': 993, 'similarity': 'cosine', 'normalize': True}
ItemKNNCFRecommender: URM Detected 338 ( 2.7%) users with no interactions.
ItemKNNCFRecommender: URM Detected 206 ( 0.9%) items with no interactions.
Similarity column 22222 (100.0%), 9966.95 column/sec. Elapsed time 2.23 sec
EvaluatorHoldout: Processed 8453 (100.0%) in 6.89 sec. Users per second: 1227
SearchBayesianSkopt: Config 45 is suboptimal. Config: {'topK': 579, 'shrink': 993, 'similarity': 'cosine', 'normalize': True} - results: PRECISION: 0.0247486, PRECISION_RECALL_MIN_DEN: 0.0791595, RECALL: 0.0772943, MAP: 0.0096059, MAP_MIN_DEN: 0.0316810, MRR: 0.0823194, NDCG: 0.0555134, F1: 0.0374926, HIT_RATE: 0.2119957, ARHR_ALL_HITS: 0.0888728, NOVELTY: 0.0042156, AVERAGE_



Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 0.6870
Function value obtained: -0.0148
Current minimum: -0.0148
SearchBayesianSkopt: Search complete. Best config is 21: {'topK': 5, 'shrink': 0, 'similarity': 'cosine', 'normalize': True}

ItemKNNCFRecommender: URM Detected 4185 (33.1%) users with no interactions.
ItemKNNCFRecommender: URM Detected 9994 (45.0%) items with no interactions.
SearchBayesianSkopt: Evaluation with constructor data for final test. Using best config: {'topK': 5, 'shrink': 0, 'similarity': 'cosine', 'normalize': True}
Similarity column 22222 (100.0%), 13710.42 column/sec. Elapsed time 1.62 sec
EvaluatorHoldout: Processed 10506 (100.0%) in 3.03 sec. Users per second: 3469
SearchBayesianSkopt: Best config evaluated with evaluator_test with constructor data for final test. Config: {'topK': 5, 'shrink': 0, 'similarity': 'cosine', 'normalize': True} - results:
CUTOFF: 10 - PRECISION: 0.0009328, PRECISION_RECALL_MIN_DEN: 0.0010987, RECA

In [10]:
data_loader = DataIO(folder_path = output_folder_path)
search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")

In [11]:
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'topK': 5, 'shrink': 0, 'similarity': 'cosine', 'normalize': True}

In [12]:
search_metadata['result_on_test_best']

Unnamed: 0_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
cutoff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.071283,0.123001,0.103423,0.035444,0.060188,0.211227,0.112319,0.084397,0.423187,0.270861,...,0.831302,0.351796,0.831302,0.163634,11.3511,0.998459,0.472097,0.874768,1.011607,0.370982


In [13]:
recommender_object = recommender_class(data_train + data_validation)

ItemKNNCFRecommender: URM Detected 239 ( 1.9%) users with no interactions.
ItemKNNCFRecommender: URM Detected 113 ( 0.5%) items with no interactions.


In [14]:
recommender_object.load_model(output_folder_path, 
                              file_name = recommender_object.RECOMMENDER_NAME + "_best_model_last.zip" )

ItemKNNCFRecommender: Loading model from file 'result_experiments/ItemKNNCFRecommender_best_model_last.zip'
ItemKNNCFRecommender: Loading complete


In [15]:
def prepare_submission(ratings: pd.DataFrame, users_to_recommend: np.array, urm_train: sp.csr_matrix, recommender: object):
    users_ids_and_mappings = ratings[ratings.user_id.isin(users_to_recommend)][["user_id", "mapped_user_id"]].drop_duplicates()

    mapping_to_item_id = dict(zip(ratings.mapped_item_id, ratings.item_id))
    item_ids = ratings.item_id.unique()


    recommendation_length = 10
    submission = dict()
    for idx, row in users_ids_and_mappings.iterrows():
        user_id = row.user_id
        mapped_user_id = row.mapped_user_id

        recommendations = recommender.recommend(mapped_user_id, cutoff = 10)

        submission[user_id] = [mapping_to_item_id[item_id] for item_id in recommendations]
    
    for user_id in users_to_recommend:
        if user_id not in submission:
            submission[user_id] = np.random.choice(item_ids, 10)

    return submission

In [16]:
from utils import write_submission

write_submission(prepare_submission(data, users, data_train + data_validation, recommender_object))