In [2]:
import numpy as np 
import pandas as pd 
import scipy.sparse as sp
import os
from typing import Tuple, Callable, Dict, Optional, List
from sklearn.model_selection import train_test_split
cwd = os.getcwd()

DATA LOADING

In [3]:
def load_data():
    return pd.read_csv("../input/recommender-system-2020-challenge-polimi/data_train.csv", 
                       sep=",", 
                       names=["user_id", "item_id", "rating"],
                       header=0,
                       dtype={"row": np.int32,
                               "col": np.int32,
                               "data": np.int32})
def load_ICM():    
    return pd.read_csv("../input/recommender-system-2020-challenge-polimi/data_ICM_title_abstract.csv", 
                       sep=",", 
                       names=["item_id", "feature_id", "importance"],
                       header=0,
                       dtype={"row": np.int32,
                              "col": np.int32,
                              "data": np.float})

ratings = load_data()
ICM = load_ICM()


DATA PREPROCESSING



In [4]:
def preprocess_data(ratings: pd.DataFrame, ICM: pd.DataFrame):
    unique_users = ratings.user_id.unique()
    unique_items = ratings.item_id.unique()
    
   
    
    num_users, min_user_id, max_user_id = unique_users.size, unique_users.min(), unique_users.max()
    num_items, min_item_id, max_item_id = unique_items.size, unique_items.min(), unique_items.max()
  
    print(num_users, min_user_id, max_user_id)
    print(num_items, min_item_id, max_item_id)
 
    
    mapping_user_id = pd.DataFrame({"mapped_user_id": np.arange(num_users), "user_id": unique_users})
    mapping_item_id = pd.DataFrame({"mapped_item_id": np.arange(num_items), "item_id": unique_items})
   
    #ratings = pd.merge(left=ratings, 
    #                   right=mapping_user_id,
    #                   how="inner",
    #                   on="user_id")
    
    ratings = pd.merge(left=ratings, 
                       right=mapping_item_id,
                       how="inner",
                       on="item_id")
    

    return ratings

ratings = preprocess_data(ratings, ICM)


7947 0 7946
24896 0 25974


DATASET SPLIT INTO TRAINING, VALIDATION AND TEST SET

In [5]:
def dataset_splits(ratings, num_users, num_items, validation_percentage: float, testing_percentage: float):
    seed = 1235
    
    (user_ids_training, user_ids_test,
     item_ids_training, item_ids_test,
     ratings_training, ratings_test) = train_test_split(ratings.user_id,
                                                        ratings.mapped_item_id,
                                                        ratings.rating,
                                                        test_size=testing_percentage,
                                                        shuffle=True,
                                                        random_state=seed)
    
    
    (user_ids_training, user_ids_validation,
     item_ids_training, item_ids_validation,
     ratings_training, ratings_validation) = train_test_split(user_ids_training,
                                                              item_ids_training,
                                                              ratings_training,
                                                              test_size=validation_percentage,
                                                             )
    
    urm_train = sp.csr_matrix((ratings_training, (user_ids_training, item_ids_training)), 
                              shape=(num_users, num_items))
    
    urm_validation = sp.csr_matrix((ratings_validation, (user_ids_validation, item_ids_validation)), 
                              shape=(num_users, num_items))
    
    urm_test = sp.csr_matrix((ratings_test, (user_ids_test, item_ids_test)), 
                              shape=(num_users, num_items))
    
    #ICM = sp.csr_matrix((IMC.importance, (ICM.mapped_feature_id)))
    
    
    return urm_train, urm_validation, urm_test


urm_train, urm_validation, urm_test = dataset_splits(ratings, 
                                                     num_users=7947, 
                                                     num_items=24896 , 
                                                     validation_percentage=0.20, 
                                                     testing_percentage=0.10)

COSINE SIMILARITY

In [6]:
def naive_similarity(urm: sp.csc_matrix, shrink: int):
    num_items = urm.shape[1]
    weights = np.empty(shape=(num_items, num_items))
    for item_i in range(num_items):
        item_i_profile = urm[:, item_i] # mx1 vector
        
        for item_j in range(num_items):
            item_j_profile = urm[:, item_j] # mx1 vector
            
            numerator = item_i_profile.T.dot(item_j_profile).todense()[0,0]
            denominator = (np.sqrt(np.sum(item_i_profile.power(2)))
                           * np.sqrt(np.sum(item_j_profile.power(2)))
                           + shrink
                           + 1e-6)
            
            weights[item_i, item_j] = numerator / denominator
    
    np.fill_diagonal(weights, 0.0)
    return weights



def vector_similarity(urm: sp.csc_matrix, shrink: int):
    item_weights = np.sqrt(
        np.sum(urm.power(2), axis=0)
    ).A.flatten()
    
    num_items = urm.shape[1]
    urm_t = urm.T
    weights = np.empty(shape=(num_items, num_items))
    for item_id in range(num_items):
        numerator = urm_t.dot(urm[:, item_id]).A.flatten()
        denominator = item_weights[item_id] * item_weights + shrink + 1e-6
        
        weights[item_id] = numerator / denominator
        
    np.fill_diagonal(weights, 0.0)
    return weights




def matrix_similarity(urm: sp.csc_matrix, shrink: int):
    item_weights = np.sqrt(
        np.sum(urm.power(2), axis=0)
    ).A
    
    numerator = urm.T.dot(urm)
    denominator = item_weights.T.dot(item_weights) + shrink + 1e-6
    weights = numerator / denominator
    np.fill_diagonal(weights, 0.0)
    
    return weights

In [None]:
#%%time 
#naive_weights = naive_similarity(urm_csc[:slice_size,:slice_size], shrink)

In [None]:
#%%time
#vector_weights = vector_similarity(urm_csc[:slice_size,:slice_size], shrink)


In [None]:
#%%time
#matrix_weights = matrix_similarity(urm_csc[:slice_size,:slice_size], shrink)

**COLLABORATIVE FILTERING ITEM KNN RECOMMENDER**

In [7]:
class CFItemKNN(object):
    def __init__(self, shrink: int):
        self.shrink = shrink
        self.weights = None
    
    
    def fit(self, urm_train: sp.csc_matrix, similarity_function):
        if not sp.isspmatrix_csc(urm_train):
            raise TypeError(f"We expected a CSC matrix, we got {type(urm_train)}")
        
        self.weights = similarity_function(urm_train, self.shrink)
        
    def recommend(self, user_id: int, urm_train: sp.csr_matrix, at: Optional[int] = None, remove_seen: bool = True):
        user_profile = urm_train[user_id]
        
        ranking = user_profile.dot(self.weights).flatten()
        
        if remove_seen:
            user_profile_start = urm_train.indptr[user_id]
            user_profile_end = urm_train.indptr[user_id+1]
            
            seen_items = urm_train.indices[user_profile_start:user_profile_end]
            
            ranking[seen_items] = -np.inf
            
        ranking = np.flip(np.argsort(ranking))
        return ranking[:at]

EVALUATION METRICS

In [8]:
def recall(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)
    
    recall_score = np.sum(is_relevant) / relevant_items.shape[0]
    
    return recall_score
    
    
def precision(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)
    
    precision_score = np.sum(is_relevant) / recommendations.shape[0]

    return precision_score

def mean_average_precision(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)
    
    precision_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    map_score = np.sum(precision_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score

In [9]:
def evaluator(recommender: object, urm_train: sp.csr_matrix, urm_test: sp.csr_matrix):
    recommendation_length = 10
    accum_precision = 0
    accum_recall = 0
    accum_map = 0
    
    num_users = urm_train.shape[0]
    
    num_users_evaluated = 0
    num_users_skipped = 0
    for user_id in range(num_users):
        user_profile_start = urm_test.indptr[user_id]
        user_profile_end = urm_test.indptr[user_id+1]
        
        relevant_items = urm_test.indices[user_profile_start:user_profile_end]
        
        if relevant_items.size == 0:
            num_users_skipped += 1
            continue
            
        recommendations = recommender.recommend(user_id=user_id, 
                                               at=recommendation_length, 
                                               urm_train=urm_train, 
                                               remove_seen=True)
        
        accum_precision += precision(recommendations, relevant_items)
        accum_recall += recall(recommendations, relevant_items)
        accum_map += mean_average_precision(recommendations, relevant_items)
        
        num_users_evaluated += 1
        
    
    accum_precision /= max(num_users_evaluated, 1)
    accum_recall /= max(num_users_evaluated, 1)
    accum_map /=  max(num_users_evaluated, 1)
    
    return accum_precision, accum_recall, accum_map, num_users_evaluated, num_users_skipped

HYPERPARAMETERS TUNING

In [10]:
def hyperparameter_tuning(shrinks):
    #shrinks = [0,1,5,10,50]
    #shrnks = [50, 180, 190, 195, 200, 205, 210, 220, 230, 300, 400, 500]
    results = []
    for shrink in shrinks:
        print(f"Currently trying shrink {shrink}")
        
        itemknn_recommender = CFItemKNN(shrink=shrink)
        itemknn_recommender.fit(urm_train.tocsc(), matrix_similarity)
        
        ev_precision, ev_recall, ev_map, _, _ = evaluator(itemknn_recommender, urm_train, urm_validation)
        
        results.append((shrink, (ev_precision, ev_recall, ev_map)))
        
    return results




In [11]:
%%time
shrinks = [5,10]
#hyperparameter_results = hyperparameter_tuning(shrinks)
#hyperparameter_results


CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.34 µs


In [12]:
#users_to_recommend = np.random.choice(ratings.user_id.unique(), size=100, replace=False)
#print(users_to_recommend)

users = pd.read_csv("../input/recommender-system-2020-challenge-polimi/data_target_users_test.csv", 
                       #sep=",", 
                       names=["user_id"],
                       header=0,
                       dtype={"user_id": np.int32,
                              })
utenti = users.user_id.unique()
print(utenti)
len(utenti)


[   0    1    2 ... 7944 7945 7946]


7944

In [13]:
mapping_to_item_id = dict(zip(ratings.mapped_item_id, ratings.item_id))

mapping_to_item_id[24895]

25953

In [14]:
def prepare_submission(ratings: pd.DataFrame, users_to_recommend: np.array, urm_train: sp.csr_matrix, recommender: object):
    users_ids_and_mappings = ratings[ratings.user_id.isin(users_to_recommend)][["user_id"]].drop_duplicates()
    items_ids_and_mappings = ratings[["item_id", "mapped_item_id"]].drop_duplicates()
    
    mapping_to_item_id = dict(zip(ratings.mapped_item_id, ratings.item_id))
    
    
    recommendation_length = 10
    submission = []
    for idx, row in users_ids_and_mappings.iterrows():
        user_id = row.user_id
        #mapped_user_id = row.mapped_user_id
        
        recommendations = recommender.recommend(user_id=user_id, #mapped_user_id,
                                                urm_train=urm_train,
                                                at=recommendation_length,
                                                remove_seen=True)
        
        submission.append((user_id, [mapping_to_item_id[item_id] for item_id in recommendations]))
        
    return submission


def write_submission(submissions):
    with open("./submission1.csv", "w") as f:
        f.write(f"{'user_id'},{'item_list'}\n")
        for user_id, items in submissions:
            f.write(f"{user_id},{' '.join([str(item) for item in items])}\n")

In [17]:
submission_recommender = CFItemKNN(shrink=50)
submission_recommender.fit((urm_train + urm_validation).tocsc(), vector_similarity)

accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = evaluator(submission_recommender, 
                                                                                            urm_train, 
                                                                                            urm_test)


accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

(0.016152897657213446, 0.09176057591193126, 0.03775676749898023, 4055, 3892)

In [None]:
#Create Submission

submission = prepare_submission(ratings, utenti, urm_train + urm_validation + urm_test, submission_recommender)
submission

#25974
            
write_submission(submission)

In [None]:
submission