In [8]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.model_selection import train_test_split

DATA LOADING

In [9]:
URM = pd.read_csv(filepath_or_buffer="Data/data_train.csv",
                                sep=",",
                                header=0,
                                dtype={0:int, 1:int, 2:float},
                                engine='python')
URM.columns = ["user_id", "item_id", "data"]

users = pd.read_csv(filepath_or_buffer="Data/data_target_users_test.csv",
                    header=0)

ICM = pd.read_csv(filepath_or_buffer="Data/data_ICM_metadata.csv",
                  sep=",",
                  header=0,
                  dtype={0:int, 1:int, 2:float},
                  engine='python')
ICM.columns = ["item_id", "feature_id", "data"]

DATA PREPROCESSING

In [11]:
#create new sequential indexes for users and items
mapped_id, original_id = pd.factorize(URM["user_id"].unique())
user_original_ID_to_index = pd.Series(mapped_id, index=original_id)

mapped_id, original_id = pd.factorize(URM["item_id"].unique())
item_original_ID_to_index = pd.Series(mapped_id, index=original_id)

URM["user_id"] = URM["user_id"].map(user_original_ID_to_index)
URM["item_id"] = URM["item_id"].map(item_original_ID_to_index)

unique_users = URM["user_id"].unique()
unique_items = URM["item_id"].unique()

num_users = len(unique_users)
num_items = len(unique_items)
num_interactions = len(URM)

print ("Number of items\t {}, Number of users\t {}".format(num_items, num_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(unique_items), max(unique_users)))
print ("Average interactions per user {:.2f}".format(num_interactions/num_users))
print ("Average interactions per item {:.2f}\n".format(num_interactions/num_items))

print ("Sparsity {:.2f} %".format((1-float(num_interactions)/(num_items*num_users))*100))

Number of items	 38121, Number of users	 35736
Max ID items	 38120, Max Id users	 35735

Average interactions per user 49.38
Average interactions per item 46.29

Sparsity 99.87 %


DATASET SPLITTING

In [23]:
URM_sparse = sp.coo_matrix((URM["data"].values, 
                          (URM["user_id"].values, URM["item_id"].values)))

In [18]:
def dataset_splits_csr(ratings, n_users, n_items, validation_percentage: float, testing_percentage: float):
    seed = 1234

    (user_ids_training, user_ids_test,
     item_ids_training, item_ids_test,
     ratings_training, ratings_test) = train_test_split(ratings.user_id,
                                                        ratings.item_id,
                                                        ratings.data,
                                                        test_size=testing_percentage,
                                                        shuffle=True,
                                                        random_state=seed)

    (user_ids_training, user_ids_validation,
     item_ids_training, item_ids_validation,
     ratings_training, ratings_validation) = train_test_split(user_ids_training,
                                                              item_ids_training,
                                                              ratings_training,
                                                              test_size=validation_percentage,
                                                              )

    urm_train = sp.csr_matrix((ratings_training, (user_ids_training, item_ids_training)),
                              shape=(n_users, n_items))

    urm_validation = sp.csr_matrix((ratings_validation, (user_ids_validation, item_ids_validation)),
                                   shape=(n_users, n_items))

    urm_test = sp.csr_matrix((ratings_test, (user_ids_test, item_ids_test)),
                             shape=(n_users, n_items))

    return urm_train, urm_validation, urm_test


In [24]:
def dataset_splits_csc(ratings, n_users, n_items, validation_percentage: float, testing_percentage: float):
    seed = 1234

    (user_ids_training, user_ids_test,
     item_ids_training, item_ids_test,
     ratings_training, ratings_test) = train_test_split(ratings.user_id,
                                                        ratings.item_id,
                                                        ratings.data,
                                                        test_size=testing_percentage,
                                                        shuffle=True,
                                                        random_state=seed)

    (user_ids_training, user_ids_validation,
     item_ids_training, item_ids_validation,
     ratings_training, ratings_validation) = train_test_split(user_ids_training,
                                                              item_ids_training,
                                                              ratings_training,
                                                              test_size=validation_percentage,
                                                              )

    urm_train = sp.csr_matrix((ratings_training, (user_ids_training, item_ids_training)),
                              shape=(n_users, n_items))

    urm_validation = sp.csr_matrix((ratings_validation, (user_ids_validation, item_ids_validation)),
                                   shape=(n_users, n_items))

    urm_test = sp.csr_matrix((ratings_test, (user_ids_test, item_ids_test)),
                             shape=(n_users, n_items))

    return urm_train, urm_validation, urm_test


In [25]:
URM_train, URM_validation, URM_test = dataset_splits_csc(URM,
                                                     n_users=num_users,
                                                     n_items=num_items,
                                                     validation_percentage=0.10,
                                                     testing_percentage=0.20)

SIMILARITY MEASURE

In [26]:
def vector_similarity(urm: sp.csc_matrix, shrink: int):
    item_weights = np.sqrt(
        np.sum(urm.power(2), axis=0)
    ).A.flatten()

    num_items = urm.shape[1]
    urm_t = urm.T
    weights = np.empty(shape=(num_items, num_items))
    for item_id in range(num_items):
        numerator = urm_t.dot(urm[:, item_id]).A.flatten()
        denominator = item_weights[item_id] * item_weights + shrink + 1e-6

        weights[item_id] = numerator / denominator

    np.fill_diagonal(weights, 0.0)
    return weights

In [17]:
vector_similarity(URM_train, 0)

AttributeError: 'csc_matrix' object has no attribute 'A'