# S05 - Recommender Systems - Functions

In [6]:
# Operating System
import os

import numpy as np
import pandas as pd

from numpy.linalg import norm
from scipy.sparse import csr_matrix, save_npz

from sklearn.metrics.pairwise import cosine_similarity

## 0.0 Loading the data

In [15]:
# The first line of the file is read as data (no header row), so the column
# names are supplied explicitly. Fields are tab-separated.
path = os.path.join('BLU12 - Workflow', 'data', 'train_play_counts.txt')
ratings_df = pd.read_csv(
    path,
    sep='\t',
    names=['user_id', 'item_id', 'rating'],
)

ratings_df.head()

Unnamed: 0,user_id,item_id,rating
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOBONKR12A58A7A7E0,1
1,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOEGIYH12A6D4FC0E3,1
2,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOFLJQZ12A6D4FADA6,1
3,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOHTKMO12AB01843B0,1
4,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SODQZCY12A6D4F9D11,1


#### Each user must not have more than one rating for the same item

In [16]:
# Number of rows whose (user_id, item_id) pair already appeared in an earlier
# row; 0 means no user has more than one rating for the same item.
ratings_df.duplicated(subset=['user_id', 'item_id']).sum()

0

## 0.1 Create the ratings matrix $R$  

In [17]:
# Build R directly in sparse form. Pivoting first materialises a dense
# users x items table (one float per pair, most of them filler zeros) only to
# compress it again right away.
if ratings_df.duplicated(subset=['user_id', 'item_id']).any():
    # DataFrame.pivot refuses duplicated (index, column) pairs; keep that guard,
    # because the sparse constructor would silently add duplicates together.
    raise ValueError("ratings_df has more than one rating for the same (user_id, item_id) pair")

# factorize(sort=True) maps every id to its position among the sorted unique
# ids, which is the row/column order the pivoted table had.
# user_ids[i] / item_ids[j] give the id behind row i / column j of R.
user_codes, user_ids = pd.factorize(ratings_df['user_id'], sort=True)
item_codes, item_ids = pd.factorize(ratings_df['item_id'], sort=True)

R = csr_matrix(
    (
        # Missing ratings are treated as 0, as fillna(0) did on the pivot.
        ratings_df['rating'].fillna(0).to_numpy(dtype=np.float64),
        (user_codes, item_codes),
    ),
    shape=(len(user_ids), len(item_ids)),
)
# A dense -> sparse conversion never stores zeros; drop explicit ones to match.
R.eliminate_zeros()
R

<7526x41194 sparse matrix of type '<class 'numpy.float64'>'
	with 100000 stored elements in Compressed Sparse Row format>

#### Confirmations

In [48]:
# Sanity check: the shape of R should be (distinct users, distinct items) as
# counted directly on ratings_df, and should match the repr shown above.
n_distinct_users = ratings_df.user_id.unique().shape[0]
n_distinct_items = ratings_df.item_id.unique().shape[0]

print(f"The shape is {R.shape}")
print(f"DataFrame ratings_df has {n_distinct_users} distinct users")
print(f"DataFrame ratings_df has {n_distinct_items} distinct items")

The shape is (7526, 41194)
DataFrame ratings_df has 7526 distinct users
DataFrame ratings_df has 41194 distinct items


In [49]:
# Every row of ratings_df should end up as exactly one stored entry of R.
n_stored_entries = R.nnz
n_rating_rows = ratings_df.shape[0]

print(f"The number of stored elements in R is {n_stored_entries}")
print(f"DataFrame ratings_df has {n_rating_rows} elements")

The number of stored elements in R is 100000
DataFrame ratings_df has 100000 elements


## 1. Non-personalized recommendations

In [58]:
def most_rated(R, n=1):
    """Return the column indices of the ``n`` items with the most ratings.

    Parameters
    ----------
    R : scipy.sparse matrix
        Ratings matrix with one column per item. An entry counts as a rating
        when it is strictly greater than zero.
    n : int, default 1
        How many item indices to return.

    Returns
    -------
    numpy.ndarray
        At most ``n`` column indices of ``R``, ordered from the most rated item
        to the least rated one.
    """
    # Count on the sparse matrix itself: converting to a dense array first
    # allocates one cell per (row, column) pair just to add up the non-zeros.
    rating_counts = np.asarray((R > 0).sum(axis=0)).ravel()

    # argsort is ascending, so sort the negated counts to get descending order.
    return np.negative(rating_counts).argsort()[:n]

# Column indices of the five items with the most ratings.
most_rated(R, n=5)

array([ 9622,  1394,  2648, 30752,  1557], dtype=int64)

In [59]:
def most_rated_above_thr(R, threshold=0, n=1):
    """Return the column indices of the ``n`` items with the most ratings above a threshold.

    Parameters
    ----------
    R : scipy.sparse matrix
        Ratings matrix with one column per item.
    threshold : number, default 0
        Only entries strictly greater than this value are counted.
    n : int, default 1
        How many item indices to return.

    Returns
    -------
    numpy.ndarray
        At most ``n`` column indices of ``R``, ordered from the item with the
        most qualifying ratings to the one with the fewest.
    """
    # Count on the sparse matrix itself: converting to a dense array first
    # allocates one cell per (row, column) pair just to add up the matches.
    counts_above = np.asarray((R > threshold).sum(axis=0)).ravel()

    # argsort is ascending, so sort the negated counts to get descending order.
    return np.negative(counts_above).argsort()[:n]

# Five items with the most ratings strictly greater than 3.
most_rated_above_thr(R, threshold=3, n=5)

array([ 2648,  1394, 30752,  9622,  1557], dtype=int64)

In [61]:
def best_mean_rating(R, n=1):
    """Return the column indices of the ``n`` items with the highest mean rating.

    Zeros in ``R`` mean "not rated" and are left out of the mean, so an item's
    score is the average over the users who actually rated it.

    Parameters
    ----------
    R : scipy.sparse matrix
        Ratings matrix with one column per item.
    n : int, default 1
        How many item indices to return.

    Returns
    -------
    numpy.ndarray
        At most ``n`` column indices of ``R``, best mean first. Items without
        any rating have a NaN mean and are sorted after all rated items.
    """
    # Work on the sparse matrix: per-column totals and per-column counts of
    # non-zero entries give the same mean as a NaN-masked dense copy would,
    # without allocating one float per (row, column) pair.
    ratings_per_item = np.asarray((R != 0).sum(axis=0)).ravel()
    rating_totals = np.asarray(R.sum(axis=0), dtype=np.float64).ravel()

    # Items nobody rated keep NaN (lower-case np.nan: the np.NaN alias no
    # longer exists in NumPy 2.0).
    item_means = np.full(rating_totals.shape, np.nan)
    np.divide(rating_totals, ratings_per_item, out=item_means, where=ratings_per_item > 0)

    # argsort is ascending and places NaN last, so negating the means yields
    # "best first, unrated last".
    return np.negative(item_means).argsort()[:n]


# Five items with the highest mean rating.
best_mean_rating(R, n=5)

array([23181,  7847,  9348, 35329, 12890], dtype=int64)

In [57]:
# Dense view of R, displayed for inspection only.
# NOTE(review): this allocates the full users x items array just to show its
# corners; looks like a leftover exploratory cell - confirm it is still needed.
R.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [53]:
# Column means with the zeros still in place: the dense array contains no NaN,
# so nanmean averages over every row rather than only over the rows that hold
# a rating for the item.
np.nanmean(R.toarray(), axis=0)

array([0.00026575, 0.00013287, 0.00013287, ..., 0.00026575, 0.00053149,
       0.00013287])

In [52]:
# Copy of R, displayed only; the result is not assigned to any name.
R.copy()

<7526x41194 sparse matrix of type '<class 'numpy.float64'>'
	with 100000 stored elements in Compressed Sparse Row format>

Cosine similarity between two users' rating vectors, rounded to two decimals.

In [2]:
def similarity(u, v):
    """Cosine similarity between vectors ``u`` and ``v``, rounded to two decimals.

    Parameters
    ----------
    u, v : array-like
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        ``dot(u, v) / (norm(u) * norm(v))`` rounded to two decimals, or ``0.0``
        when either vector has zero norm.
    """
    norm_product = norm(u) * norm(v)

    # A zero vector has no direction; report "no similarity" instead of
    # dividing by zero, which would give NaN and a RuntimeWarning.
    if norm_product == 0:
        return 0.0

    return round(np.dot(u, v) / norm_product, 2)

Complete matrix (symmetric) of users' similarities.

In [3]:
def make_user_similarities(R):
    """Pairwise cosine similarities between the rows of ``R``.

    The result is requested with ``dense_output=False``.
    """
    user_similarities = cosine_similarity(R, dense_output=False)
    return user_similarities

Complete matrix (symmetric) of items' similarities.

In [4]:
def make_item_similarities(R):
    """Pairwise cosine similarities between the columns of ``R``.

    ``R`` is transposed first so that its columns become the rows that are
    compared; the result is requested with ``dense_output=False``.
    """
    columns_as_rows = R.transpose()
    return cosine_similarity(columns_as_rows, dense_output=False)

## Making predictions

### Based on user-similarities, $S_u$

In [5]:
def make_user_predictions(S, R):
    """Predict ratings as a similarity-weighted average of the rows of ``R``.

    For every row ``u`` and column ``i`` the prediction is
    ``sum_v S[u, v] * R[v, i] / sum_v |S[u, v]|``. Entries that are already
    non-zero in ``R`` are set to zero in the result.

    Parameters
    ----------
    S : scipy.sparse matrix or array-like, shape (m, m)
        Similarities between the rows of ``R``.
    R : scipy.sparse matrix or array-like, shape (m, k)
        Ratings matrix; zero means "not rated".

    Returns
    -------
    scipy.sparse.csr_matrix, shape (m, k)
        Predictions for the entries that are zero in ``R``. Rows whose
        similarities sum to zero get no predictions (all zeros), not NaN.
    """
    # Normalise both inputs to CSR so that the operators below have one
    # well-defined meaning whether the caller passed dense or sparse data.
    S = csr_matrix(S)
    R = csr_matrix(R)

    # Use the matrix-product operator: np.dot does not understand sparse
    # operands and only gives the right answer for some input combinations.
    weighted_sum = (S @ R).tocoo()

    # We use the absolute value to support negative similarities.
    sum_of_weights = np.asarray(abs(S).sum(axis=1)).ravel()

    # Divide only the stored entries, each by the total weight of its row, so
    # the result stays sparse. Entries in rows with zero total weight are left
    # at 0 instead of becoming NaN.
    row_weights = sum_of_weights[weighted_sum.row]
    scaled = np.zeros(weighted_sum.data.shape, dtype=np.float64)
    np.divide(weighted_sum.data, row_weights, out=scaled, where=row_weights != 0)

    preds = csr_matrix(
        (scaled, (weighted_sum.row, weighted_sum.col)),
        shape=weighted_sum.shape,
    )

    # Exclude previously rated items: subtract from preds its own values at
    # the positions where R is non-zero.
    already_rated = (R != 0).astype(np.float64)
    preds = preds - preds.multiply(already_rated)
    preds.eliminate_zeros()

    return csr_matrix(preds)
 


`preds.nnz` gives the number of stored (non-zero) entries in the predictions matrix.

In the end, **we will be predicting over a test set of users**, who may or may not have ratings available in our training data. **For the test users for whom we have training data, we can use personalized recommendations**. **For all the others**, we need to fall back to **non-personalized** recommendations.