In [None]:
import numpy as np 
import pandas as pd 
import scipy.sparse as sp
import os
from typing import Tuple, Callable, Dict, Optional, List
from sklearn.model_selection import train_test_split
cwd = os.getcwd()

**LOADING DATA**

In [None]:
def load_data():
    """Read the training interactions CSV into a DataFrame.

    The file's own header row is discarded (``header=0``) and the columns are
    renamed to ``user_id``, ``item_id`` and ``rating``.

    Returns:
        pd.DataFrame with columns ``user_id`` (int32), ``item_id`` (int32)
        and ``rating`` (float64).
    """
    return pd.read_csv(
        "../input/recommender-system-2020-challenge-polimi/data_train.csv",
        sep=",",
        names=["user_id", "item_id", "rating"],
        header=0,
        # dtype keys must match the names given above; the previous keys
        # ("row"/"col"/"data") matched no column and were silently ignored.
        # NOTE(review): rating is kept as float64 (what pandas inferred while
        # the dtype spec was ignored) -- presumably the file stores values
        # such as "1.0", which an integer dtype would reject; confirm.
        dtype={"user_id": np.int32,
               "item_id": np.int32,
               "rating": np.float64},
    )
def load_ICM():
    """Read the title/abstract item-content CSV into a DataFrame.

    The file's own header row is discarded (``header=0``) and the columns are
    renamed to ``item_id``, ``feature_id`` and ``importance``.

    Returns:
        pd.DataFrame with columns ``item_id`` (int32), ``feature_id`` (int32)
        and ``importance`` (float64).
    """
    return pd.read_csv(
        "../input/recommender-system-2020-challenge-polimi/data_ICM_title_abstract.csv",
        sep=",",
        names=["item_id", "feature_id", "importance"],
        header=0,
        # dtype keys must match the names given above. np.float64 replaces the
        # np.float alias, which no longer exists in current NumPy releases.
        dtype={"item_id": np.int32,
               "feature_id": np.int32,
               "importance": np.float64},
    )

# Load the interaction table first, then the item-content table.
ratings, ICM = load_data(), load_ICM()

# Shift every importance value up by one.
# NOTE(review): the reason for the +1 offset is not visible here -- confirm.
ICM["importance"] += 1


**Mapping the original ITEM_ID and FEATURE_ID values to new, contiguous indices.**

Items that appear in the ICM but not in the URM are also added to the mapping.


In [None]:
# Build original-ID -> contiguous-index maps.
# Items seen in the URM get the first indices (in order of first appearance);
# items that only appear in the ICM are appended afterwards.
# NOTE: this cell overwrites the ID columns of `ratings` and `ICM` in place,
# so it must run exactly once after the loading cell.
item_original_ID_to_index_dict = {
    item_id: index
    for index, item_id in enumerate(ratings["item_id"].unique())
}

print("Unique item_id in the URM are {}".format(len(item_original_ID_to_index_dict)))

for item_id in ICM["item_id"].unique():
    if item_id not in item_original_ID_to_index_dict:
        item_original_ID_to_index_dict[item_id] = len(item_original_ID_to_index_dict)

print("Unique item_id in the URM and ICM are {}".format(len(item_original_ID_to_index_dict)))


feature_original_ID_to_index_dict = {
    feature_id: index
    for index, feature_id in enumerate(ICM["feature_id"].unique())
}

# The features come from the ICM (the old message said "URM").
print("Unique feature_id in the ICM are {}".format(len(feature_original_ID_to_index_dict)))


# Replace the original IDs with their new indices. Every value is a key of
# the corresponding dict by construction, so the mapping leaves no gaps.
ratings["item_id"] = ratings["item_id"].map(item_original_ID_to_index_dict)
ICM["item_id"] = ICM["item_id"].map(item_original_ID_to_index_dict)
ICM["feature_id"] = ICM["feature_id"].map(feature_original_ID_to_index_dict)


n_items = len(item_original_ID_to_index_dict)
n_items_URM = ratings["item_id"].nunique()
n_items_ICM = ICM["item_id"].nunique()
# NOTE(review): user IDs are not remapped; using the unique-user count as the
# matrix dimension presumably relies on user_id already being 0..n_users-1 -- confirm.
n_users = ratings["user_id"].nunique()
n_features = len(feature_original_ID_to_index_dict)

**TRAIN / VALIDATION / TEST SPLIT**

In [None]:
def dataset_splits(ratings, ICM, num_users, num_items, validation_percentage: float, testing_percentage: float,
                   num_features: Optional[int] = None, seed: int = 1234):
    """Split the interactions into train/validation/test URMs and build the sparse ICM.

    The test split is carved out of all interactions first; the validation
    split is then carved out of what remains, so ``validation_percentage`` is
    a fraction of the non-test interactions, not of the whole dataset.

    Args:
        ratings: DataFrame with ``user_id``, ``item_id`` and ``rating`` columns.
        ICM: DataFrame with ``item_id``, ``feature_id`` and ``importance`` columns.
        num_users: number of rows of every returned URM.
        num_items: number of columns of every URM and number of rows of the ICM.
        validation_percentage: fraction of the non-test interactions held out
            for validation.
        testing_percentage: fraction of all interactions held out for testing.
        num_features: number of ICM columns; when None it is inferred as
            ``max(feature_id) + 1``.
        seed: random state used for both splits.

    Returns:
        Tuple ``(urm_train, urm_validation, urm_test, ICM)`` of CSR matrices.
    """
    (user_ids_training, user_ids_test,
     item_ids_training, item_ids_test,
     ratings_training, ratings_test) = train_test_split(ratings.user_id,
                                                        ratings.item_id,
                                                        ratings.rating,
                                                        test_size=testing_percentage,
                                                        shuffle=True,
                                                        random_state=seed)

    # The seed is applied here as well; without it the validation split
    # changed on every run even though a seed was defined.
    (user_ids_training, user_ids_validation,
     item_ids_training, item_ids_validation,
     ratings_training, ratings_validation) = train_test_split(user_ids_training,
                                                              item_ids_training,
                                                              ratings_training,
                                                              test_size=validation_percentage,
                                                              shuffle=True,
                                                              random_state=seed)

    urm_shape = (num_users, num_items)

    urm_train = sp.csr_matrix((np.asarray(ratings_training),
                               (np.asarray(user_ids_training), np.asarray(item_ids_training))),
                              shape=urm_shape)

    urm_validation = sp.csr_matrix((np.asarray(ratings_validation),
                                    (np.asarray(user_ids_validation), np.asarray(item_ids_validation))),
                                   shape=urm_shape)

    urm_test = sp.csr_matrix((np.asarray(ratings_test),
                              (np.asarray(user_ids_test), np.asarray(item_ids_test))),
                             shape=urm_shape)

    # Size the ICM from the function's own inputs rather than notebook globals.
    if num_features is None:
        num_features = int(ICM["feature_id"].max()) + 1 if len(ICM) else 0

    icm_matrix = sp.csr_matrix((ICM["importance"].values,
                                (ICM["item_id"].values, ICM["feature_id"].values)),
                               shape=(num_items, num_features))

    return urm_train, urm_validation, urm_test, icm_matrix


# Hold out 10% of all interactions for testing, then 20% of the remainder for
# validation; the fourth result is the sparse item-content matrix.
urm_train, urm_validation, urm_test, ICM_final = dataset_splits(
    ratings,
    ICM,
    num_users=n_users,
    num_items=n_items,
    testing_percentage=0.10,
    validation_percentage=0.20,
)

In [None]:

def vector_similarity(ICM: sp.csc_matrix, shrink: int, block_size: Optional[int] = None):
    """Shrunk cosine similarity between items, computed one row at a time.

    Args:
        ICM: sparse item-by-feature matrix (converted to CSR internally).
        shrink: constant added to every denominator.
        block_size: unused; accepted only so the function can be passed to
            ``CBFItemKNN.fit``, which calls ``similarity_function(matrix,
            shrink, block_size)``.

    Returns:
        Dense ``(num_items, num_items)`` ndarray with a zero diagonal.
    """
    ICM = ICM.tocsr()  # faster on CSR (original timing note: 3.50 min vs 6.2)
    # L2 norm of every item (row) vector.
    item_norms = np.sqrt(np.asarray(ICM.power(2).sum(axis=1))).ravel()

    num_items = ICM.shape[0]
    ICM_t = ICM.T
    weights = np.empty(shape=(num_items, num_items))
    for item_id in range(num_items):
        # .toarray() instead of the sparse `.A` shortcut, which recent SciPy
        # releases no longer provide on sparse matrices.
        numerator_vector = ICM[item_id].dot(ICM_t).toarray().ravel()

        # The 1e-6 term keeps the denominator non-zero for all-zero items.
        denominator_vector = item_norms[item_id] * item_norms + shrink + 1e-6

        weights[item_id] = numerator_vector / denominator_vector

    # An item must not be its own neighbour.
    np.fill_diagonal(weights, 0.0)
    return weights


def matrix_similarity2(ICM: sp.csc_matrix, shrink: int, block_size: Optional[int] = None):
    """Shrunk cosine similarity between items, computed in one matrix product.

    Args:
        ICM: sparse item-by-feature matrix.
        shrink: constant added to every denominator.
        block_size: unused; accepted only so the function can be passed to
            ``CBFItemKNN.fit``, which calls ``similarity_function(matrix,
            shrink, block_size)``.

    Returns:
        Dense ``(num_items, num_items)`` ndarray with a zero diagonal.
    """
    # Column vector with the L2 norm of every item (row).
    item_norms = np.sqrt(np.asarray(ICM.power(2).sum(axis=1))).reshape(-1, 1)

    # Densify explicitly: dividing the sparse product by a dense array used to
    # yield an np.matrix, which behaves differently from the ndarray the other
    # similarity functions return.
    numerator = ICM.dot(ICM.T).toarray()

    # The 1e-6 term keeps the denominator non-zero for all-zero items.
    denominator = item_norms.dot(item_norms.T) + shrink + 1e-6

    weights = numerator / denominator
    # An item must not be its own neighbour.
    np.fill_diagonal(weights, 0.0)

    return weights

def matrix_similarity(ICM: sp.csc_matrix, shrink: int, block_size: int):
    """Shrunk cosine similarity between items, computed in blocks of rows.

    Args:
        ICM: sparse item-by-feature matrix (converted to CSR internally).
        shrink: constant added to every denominator.
        block_size: number of item rows processed per matrix product.

    Returns:
        Dense ``(n_items, n_items)`` ndarray with a zero diagonal.

    Raises:
        ValueError: if ``block_size`` is not positive.
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    ICM = ICM.tocsr()  # faster on CSR (original timing note: almost half the time)

    # L2 norm of every item (row) vector.
    item_norms = np.sqrt(np.asarray(ICM.power(2).sum(axis=1))).ravel()

    n_items = ICM.shape[0]
    ICM_t = ICM.T

    # Fill the preallocated result block by block. The previous version grew a
    # second array with np.concatenate from a hard-coded 25975-column seed,
    # which only worked for exactly that many items.
    weights = np.empty(shape=(n_items, n_items))

    for start_pos in range(0, n_items, block_size):
        end_pos = min(start_pos + block_size, n_items)
        numerator = ICM[start_pos:end_pos].dot(ICM_t).toarray()
        # Per-block denominator; the 1e-6 term keeps it non-zero for all-zero items.
        denominator = np.outer(item_norms[start_pos:end_pos], item_norms) + shrink + 1e-6
        weights[start_pos:end_pos] = numerator / denominator

    # An item must not be its own neighbour.
    np.fill_diagonal(weights, 0.0)
    return weights
    

In [None]:
class CBFItemKNN(object):
    """Item-based kNN recommender driven by a precomputed item-item weight matrix."""

    def __init__(self, shrink: int):
        # Weights stay None until fit() is called.
        self.weights = None
        self.shrink = shrink

    def fit(self, urm_train: sp.csc_matrix, block_size: int, similarity_function):
        """Compute and store the item-item weights.

        ``similarity_function`` is called as
        ``similarity_function(urm_train, self.shrink, block_size)`` and its
        result is stored in ``self.weights``.

        Raises:
            TypeError: if ``urm_train`` is not a CSC sparse matrix.
        """
        if sp.isspmatrix_csc(urm_train):
            self.weights = similarity_function(urm_train, self.shrink, block_size)
            return

        raise TypeError(f"We expected a CSC matrix, we got {type(urm_train)}")

    def recommend(self, user_id: int, urm_train: sp.csr_matrix, at: Optional[int] = None, remove_seen: bool = True):
        """Return item indices ordered by decreasing score for one user.

        Scores are the user's row of ``urm_train`` multiplied by the stored
        weights. With ``remove_seen`` the items stored in that row are given a
        score of ``-inf`` before ranking. Only the first ``at`` indices are
        returned (all of them when ``at`` is None).
        """
        scores = urm_train[user_id].dot(self.weights).flatten()

        if remove_seen:
            # Column indices stored for this user's row of the CSR matrix.
            row_span = slice(urm_train.indptr[user_id], urm_train.indptr[user_id + 1])
            scores[urm_train.indices[row_span]] = -np.inf

        best_first = np.argsort(scores)[::-1]
        return best_first[:at]

In [None]:
# Scratch check of argsort: B holds the positions that would sort A ascending.
A = np.asarray([2, 5, 1, 3, 7, 3, 89, 2])
B = A.argsort()
B

In [None]:
def recall(recommendations: np.ndarray, relevant_items: np.ndarray) -> float:
    """Fraction of the relevant items that appear among the recommendations.

    Both arrays are assumed to contain no duplicates (``assume_unique=True``).
    Returns 0.0 when there are no relevant items instead of dividing by zero.
    """
    num_relevant = relevant_items.shape[0]
    if num_relevant == 0:
        return 0.0

    # np.isin is the supported replacement for the deprecated np.in1d.
    is_relevant = np.isin(recommendations, relevant_items, assume_unique=True)

    recall_score = np.sum(is_relevant) / num_relevant

    return recall_score
    
    
def precision(recommendations: np.ndarray, relevant_items: np.ndarray) -> float:
    """Fraction of the recommendations that are relevant.

    Both arrays are assumed to contain no duplicates (``assume_unique=True``).
    Returns 0.0 for an empty recommendation list instead of dividing by zero.
    """
    num_recommended = recommendations.shape[0]
    if num_recommended == 0:
        return 0.0

    # np.isin is the supported replacement for the deprecated np.in1d.
    is_relevant = np.isin(recommendations, relevant_items, assume_unique=True)

    precision_score = np.sum(is_relevant) / num_recommended

    return precision_score

def mean_average_precision(recommendations: np.ndarray, relevant_items: np.ndarray) -> float:
    """Average precision of one ranked recommendation list.

    Precision is taken at every position holding a relevant item, summed, and
    divided by the smaller of the number of relevant items and the list
    length. Both arrays are assumed to contain no duplicates
    (``assume_unique=True``). Returns 0.0 when either array is empty instead
    of dividing by zero.
    """
    normaliser = min(relevant_items.shape[0], recommendations.shape[0])
    if normaliser == 0:
        return 0.0

    # np.isin is the supported replacement for the deprecated np.in1d.
    is_relevant = np.isin(recommendations, relevant_items, assume_unique=True)

    # Precision at each rank, kept only at ranks that hold a relevant item.
    precision_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    map_score = np.sum(precision_at_k) / normaliser

    return map_score

In [None]:
def evaluator(recommender: object, urm_train: sp.csr_matrix, urm_test: sp.csr_matrix,
              recommendation_length: int = 10):
    """Average precision, recall and MAP of a recommender over the test users.

    Users with no stored entries in their ``urm_test`` row are skipped. For
    every other user the recommender is asked for ``recommendation_length``
    items with seen items removed, and the three metrics are accumulated.

    Args:
        recommender: object exposing
            ``recommend(user_id=..., at=..., urm_train=..., remove_seen=...)``.
        urm_train: user-by-item matrix handed to the recommender.
        urm_test: user-by-item matrix whose stored entries are the relevant items.
        recommendation_length: number of recommendations requested per user.

    Returns:
        Tuple ``(precision, recall, map, num_users_evaluated, num_users_skipped)``;
        the three metrics are means over the evaluated users (0 if none).
    """
    # indptr/indices are read as row pointers below, so both matrices must be CSR.
    urm_train = urm_train.tocsr()
    urm_test = urm_test.tocsr()

    accum_precision = 0
    accum_recall = 0
    accum_map = 0

    num_users = urm_train.shape[0]

    num_users_evaluated = 0
    num_users_skipped = 0
    for user_id in range(num_users):
        user_profile_start = urm_test.indptr[user_id]
        user_profile_end = urm_test.indptr[user_id+1]

        relevant_items = urm_test.indices[user_profile_start:user_profile_end]

        if relevant_items.size == 0:
            num_users_skipped += 1
            continue

        recommendations = recommender.recommend(user_id=user_id,
                                                at=recommendation_length,
                                                urm_train=urm_train,
                                                remove_seen=True)

        accum_precision += precision(recommendations, relevant_items)
        accum_recall += recall(recommendations, relevant_items)
        accum_map += mean_average_precision(recommendations, relevant_items)

        num_users_evaluated += 1

    # max(..., 1) avoids a division by zero when every user was skipped.
    evaluated_or_one = max(num_users_evaluated, 1)
    accum_precision /= evaluated_or_one
    accum_recall /= evaluated_or_one
    accum_map /= evaluated_or_one

    return accum_precision, accum_recall, accum_map, num_users_evaluated, num_users_skipped

In [None]:
def hyperparameter_tuning(ICM_matrix, shrinks, block_size: int = 8000):
    """Fit and evaluate one CBFItemKNN per shrink value.

    Each model is fitted with ``matrix_similarity`` and evaluated with
    ``evaluator`` on the notebook-level ``urm_train`` / ``urm_validation``.

    Args:
        ICM_matrix: sparse item-content matrix; converted to CSC once.
        shrinks: iterable of shrink values to try.
        block_size: block size forwarded to the similarity computation.

    Returns:
        List of ``(shrink, (precision, recall, map))`` tuples, in input order.
    """
    # The CSC conversion does not depend on the shrink, so do it only once.
    icm_csc = ICM_matrix.tocsc()

    results = []
    for shrink in shrinks:
        print(f"Currently trying shrink {shrink}")

        itemknn_recommender = CBFItemKNN(shrink=shrink)
        itemknn_recommender.fit(icm_csc, block_size, matrix_similarity)

        ev_precision, ev_recall, ev_map, _, _ = evaluator(itemknn_recommender, urm_train, urm_validation)

        results.append((shrink, (ev_precision, ev_recall, ev_map)))

    return results


In [None]:
# Spot check: print the ICM entry stored at row (item index) 725, column (feature index) 96.
print(ICM_final[725,96])

In [None]:
#shrinks = [7, 8, 9, 10, 11, 12, 18, 20, 25]
#hyperparameter_results = hyperparameter_tuning(ICM_final, shrinks)
#hyperparameter_results
#best_shrink = 8

In [None]:

# Fit the content-based kNN with shrink 7, computing similarities 6000 rows at a time.
r3 = CBFItemKNN(shrink=7)
r3.fit(
    ICM_final.tocsc(),
    block_size=6000,
    similarity_function=matrix_similarity,
)

#vector_similarity statistics:
#3min 50 sec CSR
#6min 20 sec CSC

#matrix_similarity statistics:
#2min 5 sec block 100, CSC
#58 sec block 1000, CSC
#41 sec block 3000, CSC
#25 sec block 3000, CSR

In [None]:
# Display the largest similarity in the first row of the fitted weight matrix.
r3.weights[0].max()

In [None]:

# Final evaluation on the held-out test split, with train + validation
# interactions as the user profiles. The fitted model is `r3`; the previous
# name `recommender3` is never defined in this notebook.
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = evaluator(
    r3,
    urm_train + urm_validation,
    urm_test,
)

print(accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped)


