In [1]:
# Mount Google Drive inside the Colab runtime at /gdrive so that files stored
# on the drive can be reached through ordinary file-system paths.
from google.colab import drive
drive.mount('/gdrive')

MessageError: ignored

In [None]:
# Make the project folder on the mounted drive the current working directory.
%cd /gdrive/My Drive/Recommender_Sys

In [None]:
from urllib.request import urlretrieve
import zipfile, os

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [None]:
# Load the training interactions. The first row of the file is skipped and the
# three columns are given explicit names right after reading.
# Items are the TV shows.
URM_training = pd.read_csv(
    "/gdrive/My Drive/Recommender_Sys/recommender-system-2021-challenge-polimi/data_train.csv",
    engine='python',
    skiprows=[0],
    header=None,
    sep=",",
)

URM_training.columns = ["UserID", "ItemID", "Interaction"]

In [None]:
# Preview the first 20 interaction rows.
URM_training.head(20)

In [None]:
# Wrap the loaded interactions in a second DataFrame object.
# The library is imported as `pd`; the bare name `pandas` is never bound in this
# notebook, so referring to it raises NameError.
URM_popular = pd.DataFrame(data=URM_training)

In [None]:
# One row of the frame corresponds to one interaction.
print("The number of interactions is {}".format(URM_training.shape[0]))

We can use this data to create a sparse matrix. Notice that we have read UserID and ItemID as integers.

Now we can extract the list of unique user id and item id and display some statistics

In [None]:
# Distinct user and item identifiers, in order of first appearance.
userID_unique = pd.unique(URM_training["UserID"])
itemID_unique = pd.unique(URM_training["ItemID"])

In [None]:
# Basic dataset dimensions: distinct users/items, their largest IDs, and the
# total number of interaction rows.
n_users, n_items = len(userID_unique), len(itemID_unique)
n_interactions = URM_training.shape[0]

print("Number of items\t {}".format(n_items))
print("Max ID items\t {}".format(itemID_unique.max()))
print("")
print("Number of users\t {}".format(n_users))
print("Max Id users\t {}".format(userID_unique.max()))
print("")
print("Number of interactions\t {}".format(n_interactions))

A user has no interaction data

An item has no interaction data

In [None]:
# Sparsity = share of (user, item) pairs that have no recorded interaction.
print("Sparsity {:.2f} %".format(100 * (1 - n_interactions / (n_items * n_users))))

To store the data we use a sparse matrix.

We build it as a COO matrix and then change its format.

The COO constructor expects (data, (row, column))

A sparse matrix is a matrix that is comprised of mostly zero values.

In [None]:
import scipy.sparse as sps

# Build the user-item matrix in COO form: each interaction row contributes its
# value at position (UserID, ItemID).
URM_all = sps.coo_matrix(
    (
        URM_training["Interaction"].to_numpy(),
        (URM_training["UserID"].to_numpy(), URM_training["ItemID"].to_numpy()),
    )
)

URM_all

In [None]:
# Show the CSR form of the matrix. The result is not assigned, so URM_all
# itself remains a COO matrix afterwards.
sps.csr_matrix(URM_all)

In [None]:
import numpy as np

# In CSC format, consecutive indptr entries delimit one column each, so their
# differences give the number of stored interactions per item.
item_popularity = np.diff(URM_all.tocsc().indptr)
item_popularity

In [None]:
# Ascending sort of the counts; after this the position in the array no longer
# corresponds to an item ID.
item_popularity = np.sort(item_popularity, axis=-1)
item_popularity

In [None]:
import matplotlib.pyplot as pyplot

# Popularity curve: sorted items on the x axis, interaction count on the y axis.
pyplot.plot(item_popularity, 'ro')
pyplot.xlabel('Sorted Item')
pyplot.ylabel('Num Interactions ')
pyplot.show()

In [None]:
# Summary statistics over the ascending-sorted per-item interaction counts.
ten_percent = n_items // 10

print("Average per-item interactions over the whole dataset {:.2f}".format(
    item_popularity.mean()))

# The tail of the sorted array holds the most popular items.
print("Average per-item interactions for the top 10% popular items {:.2f}".format(
    item_popularity[-ten_percent:].mean()))

# The head of the sorted array holds the least popular items.
print("Average per-item interactions for the least 10% popular items {:.2f}".format(
    item_popularity[:ten_percent].mean()))

print("Average per-item interactions for the median 10% popular items {:.2f}".format(
    item_popularity[int(n_items*0.45):int(n_items*0.55)].mean()))

print("Number of items with zero interactions {}".format(
    np.count_nonzero(item_popularity == 0)))

In [None]:
# In CSR format the indptr differences give the number of stored interactions
# per user; sort them ascending for plotting.
user_activity = np.sort(np.diff(URM_all.tocsr().indptr))

pyplot.plot(user_activity, 'ro')
pyplot.xlabel('Sorted User')
pyplot.ylabel('Num Interactions ')
pyplot.show()

In order to evaluate our recommender we have to define:

*   A splitting of the data in URM_train and URM_test
*   An evaluation metric
*   A function computing the evaluation for each user

The splitting of the data is very important to ensure your algorithm is evaluated in a realistic scenario, by testing it on data it has never seen. We create two splits:
- Train data: we will use this to train our model
- Test data: we will use this to evaluate our model

In [None]:
# Randomly assign each stored interaction to the training split with
# probability `train_test_split`; the remaining ones form the test split.
train_test_split = 0.80

# Fixed seed: without it every kernel restart yields a different split, so the
# evaluation numbers computed further down would not be reproducible.
split_seed = 42

n_interactions = URM_all.nnz

split_rng = np.random.default_rng(split_seed)
train_mask = split_rng.choice([True, False], n_interactions, p=[train_test_split, 1-train_test_split])
train_mask

In [None]:
# Training matrix: only the interactions selected by train_mask.
# The shape is passed explicitly; otherwise it would be inferred from the
# largest user/item index that happens to fall in this split, and the train and
# test matrices could end up with different dimensions.
URM_train = sps.csr_matrix((URM_all.data[train_mask],
                            (URM_all.row[train_mask], URM_all.col[train_mask])),
                           shape=URM_all.shape)

URM_train

In [None]:
# Test matrix: every interaction that was not selected for training.
test_mask = np.logical_not(train_mask)

# The shape is passed explicitly; otherwise it would be inferred from the
# largest user/item index that happens to fall in this split and could differ
# from the shape of the full matrix.
URM_test = sps.csr_matrix((URM_all.data[test_mask],
                           (URM_all.row[test_mask], URM_all.col[test_mask])),
                          shape=URM_all.shape)

URM_test

Precision: how many of the recommended items are relevant

Recall: how many of the relevant items I was able to recommend

Mean Average Precision

In [None]:
def precision(recommended_items, relevant_items):
    """Fraction of the recommended items that are relevant.

    Args:
        recommended_items: 1-D array-like of recommended item IDs.
        relevant_items: 1-D array-like of item IDs that are relevant for the user.

    Returns:
        Number of hits divided by the number of recommended items, or 0.0 when
        no item was recommended (instead of dividing by zero).
    """
    recommended_items = np.asarray(recommended_items)
    if recommended_items.size == 0:
        return 0.0

    # np.isin replaces the deprecated np.in1d; uniqueness of the inputs is not
    # assumed, so duplicated IDs cannot corrupt the membership test.
    is_relevant = np.isin(recommended_items, relevant_items)
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)
    return precision_score

def recall(recommended_items, relevant_items):
    """Fraction of the relevant items that were recommended.

    Args:
        recommended_items: 1-D array-like of recommended item IDs.
        relevant_items: 1-D array-like of item IDs that are relevant for the user.

    Returns:
        Number of hits divided by the number of relevant items, or 0.0 when
        there are no relevant items (instead of dividing by zero).
    """
    relevant_items = np.asarray(relevant_items)
    if relevant_items.shape[0] == 0:
        return 0.0

    # np.isin replaces the deprecated np.in1d; uniqueness of the inputs is not
    # assumed, so duplicated IDs cannot corrupt the membership test.
    is_relevant = np.isin(recommended_items, relevant_items)
    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]
    return recall_score

def MAP(recommended_items, relevant_items):
    """Average precision of one ranked recommendation list.

    Precision is computed at every rank that holds a relevant item; the sum is
    normalized by min(number of relevant items, list length).

    Args:
        recommended_items: 1-D array-like of recommended item IDs, best first.
        relevant_items: 1-D array-like of item IDs that are relevant for the user.

    Returns:
        The average precision, or 0.0 when either input is empty (instead of
        dividing by zero).
    """
    relevant_items = np.asarray(relevant_items)

    # np.isin replaces the deprecated np.in1d; uniqueness is not assumed.
    is_relevant = np.isin(recommended_items, relevant_items)

    normalizer = min(relevant_items.shape[0], is_relevant.shape[0])
    if normalizer == 0:
        return 0.0

    # Cumulative sum: precision at 1, at 2, at 3 ...
    # Multiplying by is_relevant keeps only the ranks that are hits.
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    map_score = np.sum(p_at_k) / normalizer
    return map_score

Now that we have the data, we can build our first recommender. We need two things:
-  a 'fit' function to train our model
-  a 'recommend' function that uses our model to recommend

In [None]:
class TopPopRecommender(object):
    """Non-personalized recommender that ranks items by interaction count."""

    def fit(self, URM_all):
        """Compute the popularity ranking from a user-item interaction matrix.

        Args:
            URM_all: scipy sparse matrix with users on rows and items on
                columns, in any sparse format.
        """
        # recommend() slices a user's row through indptr/indices. A COO matrix
        # has neither attribute, and on a CSC matrix they describe columns, so
        # the matrix is normalized to CSR here.
        self.URM_all = URM_all.tocsr()

        # Stored entries per column = number of interactions per item.
        item_popularity = np.ediff1d(self.URM_all.tocsc().indptr)

        # We are not interested in sorting the popularity value,
        # but to order the items according to it.
        # A stable sort on the negated counts gives descending popularity with
        # ties resolved deterministically by the lower item index.
        self.popular_items = np.argsort(-item_popularity, kind="stable")

    def recommend(self, user_id, at=1000, remove_seen=True):
        """Return up to `at` item indices, most popular first.

        Args:
            user_id: row index of the user in the fitted matrix.
            at: maximum number of items to return.
            remove_seen: when True, items the user has a stored interaction
                with in the fitted matrix are excluded.

        Returns:
            1-D numpy array of item indices.
        """
        if not remove_seen:
            return self.popular_items[0:at]

        row_start = self.URM_all.indptr[user_id]
        row_end = self.URM_all.indptr[user_id + 1]
        seen_items = self.URM_all.indices[row_start:row_end]

        # np.isin replaces the deprecated np.in1d.
        unseen_items_mask = np.isin(self.popular_items, seen_items, invert=True)

        return self.popular_items[unseen_items_mask][0:at]

In [None]:
# Fit on the training split only. Fitting on URM_all would count the held-out
# test interactions as "seen", so remove_seen would filter exactly the items
# the evaluation looks for. URM_train is also a CSR matrix, which is the format
# recommend() reads.
topPopRecommender_removeSeen = TopPopRecommender()
topPopRecommender_removeSeen.fit(URM_train)

# Show the top-10 recommendations for the first 50 user indices.
for user_id in range(50):
    print(user_id, topPopRecommender_removeSeen.recommend(user_id, at=10))

In [None]:
# We pass the recommender object as a parameter

def evaluate_algorithm(URM_test, recommender_object, at=10):
    """Print mean precision, recall and MAP of a recommender on held-out data.

    Only users with at least one stored interaction in URM_test are evaluated.

    Args:
        URM_test: scipy sparse user-item matrix holding the test interactions.
        recommender_object: object exposing recommend(user_id, at=...) that
            returns an array of item indices.
        at: length of the recommendation list requested for each user.
    """
    # Rows are sliced through indptr/indices below, which requires CSR.
    URM_test = URM_test.tocsr()

    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_MAP = 0.0

    num_eval = 0

    for user_id in range(URM_test.shape[0]):

        relevant_items = URM_test.indices[URM_test.indptr[user_id]:URM_test.indptr[user_id+1]]

        # Users without test interactions cannot be scored.
        if len(relevant_items) == 0:
            continue

        recommended_items = recommender_object.recommend(user_id, at=at)
        num_eval += 1

        cumulative_precision += precision(recommended_items, relevant_items)
        cumulative_recall += recall(recommended_items, relevant_items)
        cumulative_MAP += MAP(recommended_items, relevant_items)

    # Without this guard the averages below would divide by zero.
    if num_eval == 0:
        print("No user has test interactions: nothing to evaluate")
        return

    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    cumulative_MAP /= num_eval

    print("Recommender results are: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, cumulative_MAP))

In [None]:
# Score the top-popular recommender on the held-out interactions.
evaluate_algorithm(URM_test=URM_test, recommender_object=topPopRecommender_removeSeen)