# COMM7380 Recommender Systems for Digital Media

In [1]:
# Install NetworkX, Matplotlib, Pandas, Numpy using pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy



# User Behaviour and the User-Item Matrix

## Importing and knowing your data 

In [2]:
import pandas as pd 
import numpy as np

In [3]:
evidence = pd.read_csv('../data/' + 'collector_log.csv')

In [4]:
# checkin the type and take a glance at the head 
print(type(evidence))
evidence.head(5)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,id,created,content_id,event,session_id,user_id
0,3,14/01/2020 17:54,4501244,details,794773,400003
1,4,14/01/2020 17:54,3521164,moreDetails,794773,400003
2,5,14/01/2020 17:54,3640424,details,441002,400005
3,6,14/01/2020 17:54,2823054,moreDetails,885440,400001
4,7,14/01/2020 17:54,3553976,genreView,441003,400005


In [5]:
users = evidence.user_id.unique()
content = evidence.content_id.unique()
print(type(content))
print(len(content))

<class 'numpy.ndarray'>
103


# Implicit Ratings
## Binary Matrix
Let's create a user-item binary matrix from the "buy" events

In [6]:
#Create a user-item binary matrix
uiBuyMatrix = pd.DataFrame(columns=content, index=users)
uiBuyMatrix.head(2)

Unnamed: 0,4501244,3521164,3640424,2823054,3553976,3470600,4513674,4698684,3315342,3874544,...,4196776,2948356,1355644,3300542,5247022,2140479,1083452,1179933,3410834,3553442
400003,,,,,,,,,,,...,,,,,,,,,,
400005,,,,,,,,,,,...,,,,,,,,,,


In [7]:
evidence.event.unique()

array(['details', 'moreDetails', 'genreView', 'addToList', 'buy'],
      dtype=object)

Select only the "buy" events

In [8]:
buyEvidence = evidence[evidence['event'] == 'buy']
buyEvidence.head(5)

Unnamed: 0,id,created,content_id,event,session_id,user_id
92,95,14/01/2020 17:54,4501244,buy,794776,400003
131,134,14/01/2020 17:54,2937696,buy,885441,400001
358,361,14/01/2020 17:54,3874544,buy,885444,400001
612,615,14/01/2020 17:54,3949660,buy,885445,400001
707,710,14/01/2020 17:54,5512872,buy,42460,400006


Create the user-item matrix `uiBuyMatrix` for the buy events

In [9]:
for index, row in buyEvidence.iterrows():
    currentUser = row['user_id']
    currentContent = row['content_id']
    uiBuyMatrix.at[currentUser, currentContent] = 1

In [10]:
print(uiBuyMatrix)

       4501244 3521164 3640424 2823054 3553976 3470600 4513674 4698684  \
400003       1       1       1       1       1       1       1       1   
400005     NaN     NaN       1     NaN       1     NaN       1     NaN   
400001       1       1       1     NaN       1       1       1       1   
400006       1     NaN       1       1     NaN       1       1       1   
400002     NaN     NaN       1       1       1       1       1     NaN   
400004     NaN     NaN       1     NaN     NaN     NaN     NaN     NaN   

       3315342 3874544  ... 4196776 2948356 1355644 3300542 5247022 2140479  \
400003       1       1  ...     NaN     NaN       1     NaN     NaN       1   
400005       1     NaN  ...     NaN     NaN       1     NaN     NaN       1   
400001       1       1  ...     NaN     NaN       1     NaN       1       1   
400006     NaN       1  ...     NaN     NaN     NaN     NaN       1     NaN   
400002       1     NaN  ...       1       1       1     NaN       1     NaN   
400004 

## Behavioural Implicit Ratings

Using the formula introduced during lecture

$${IR}_(i,u) = \left(w_1*{\#event}_1\right)+\left(w_2*{\#event}_2\right)+\dots+\left(w_n*{\#event}_n\right)$$

In [11]:
#Create a user-item matrix
uiMatrix = pd.DataFrame(columns=content, index=users)
uiMatrix.head(2)

Unnamed: 0,4501244,3521164,3640424,2823054,3553976,3470600,4513674,4698684,3315342,3874544,...,4196776,2948356,1355644,3300542,5247022,2140479,1083452,1179933,3410834,3553442
400003,,,,,,,,,,,...,,,,,,,,,,
400005,,,,,,,,,,,...,,,,,,,,,,


Type of events recorded in the logs

In [12]:
eventTypes = evidence.event.unique()
print(eventTypes)

['details' 'moreDetails' 'genreView' 'addToList' 'buy']


Give a weight to each of them

In [13]:
eventWeights = {
    'details': 15,
    'moreDetails': 50,
    'genreView': 0,
    'addToList': 0,
    'buy': 100}

Compute the Implicit Rating for each user-item combination.
Populate the user-item matrix `uiMatrix` with the IR values.

In [14]:
# Iterate the evidence
for index, row in evidence.iterrows():
    # Select the user and items involved
    currentUser = row['user_id']
    currentContent = row['content_id']
    
    # Extract the appropriate weight for the event
    w = eventWeights[row['event']]
    
    # Find the value eventually stored for the current user-item combination
    currentValue = uiMatrix.at[currentUser, currentContent]
    if np.isnan(currentValue):
        currentValue = 0
        
    # Compute the new value and update the user-item matrix
    updatedValue = currentValue + w #+ (1 * w)
    uiMatrix.at[currentUser, currentContent] = updatedValue
    

# Normalise the matrix
Update the user-item matrix by normalizing the values between 0 and 10. 

**Note**: NaN values should be maintained as NaN

In [15]:
uiMatrixNorm = uiMatrix.apply(
    lambda x: ((x - np.nanmin(uiMatrix.values))/(np.nanmax(uiMatrix.values) - np.nanmin(uiMatrix.values)))*10
    )

In [16]:
uiMatrixNorm

Unnamed: 0,4501244,3521164,3640424,2823054,3553976,3470600,4513674,4698684,3315342,3874544,...,4196776,2948356,1355644,3300542,5247022,2140479,1083452,1179933,3410834,3553442
400003,4.94342,4.76337,4.21296,3.97119,2.84979,4.26955,7.25823,4.25926,4.01749,4.30556,...,1.65638,2.08333,2.80864,1.65123,3.6677,2.90638,2.95267,2.72634,2.03189,3.74486
400005,,,7.7572,,7.99383,,8.05041,,8.17901,,...,0.0360082,0.0668724,7.83436,0.0,,8.30761,9.38272,8.15844,0.0,
400001,4.93313,4.70679,4.36214,4.09979,2.38683,5.36523,7.00617,4.63477,4.34156,4.55761,...,1.66152,1.74897,2.1965,1.59979,4.71708,2.5,2.37654,2.59259,1.86728,4.11008
400006,7.81893,7.88066,0.185185,8.93519,,9.49074,9.07407,8.8323,0.0154321,9.79938,...,0.0462963,0.221193,,0.138889,7.9321,,,,0.0154321,8.44136
400002,2.29424,2.47428,6.04424,2.88066,1.74383,2.39712,4.11523,1.9393,6.32202,2.20679,...,4.92284,4.16152,1.69753,3.71914,2.58745,2.13477,1.83642,1.41975,4.34671,2.46914
400004,,,8.54424,,,,,,8.42078,,...,8.65226,8.33848,,9.01235,,,,,8.95576,


# User-based Collaborative Filtering

## Step 1: Compute Similarity between the active user and the rest of the users
Select a user.

For our needs we need to find one without all the ratings already filled 

In [17]:
uiMatrixNorm.isnull().count(axis=1)

400003    103
400005    103
400001    103
400006    103
400002    103
400004    103
dtype: int64

We can choose freerly, in our dataset not all the users rated all the movies

In [18]:
currentUser = 400005

To avoid possible problem with measures, especially with Pearson correlation, let's convert all the values in the matrix as floating point numbers (previusly were `object` type)

In [19]:
uiMatrixNorm = uiMatrixNorm.astype(float)

Select the current user list of movies in a Pandas Series

In [20]:
cuDf = uiMatrixNorm.loc[currentUser]

Compute correlation using [`corrwith`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.corrwith.html)

In [21]:
corrDf = uiMatrixNorm.corrwith(cuDf, axis=1, method='pearson')

Sort and print the results

In [22]:
corrDf.sort_values(ascending=False, inplace=True)

In [23]:
corrDf

400005    1.000000
400003    0.379975
400001    0.375150
400006    0.305177
400004   -0.226089
400002   -0.509538
dtype: float64

The results are correct if the correlation between the current user and itself is equal to 1. (We can remove its row for next steps)

In [24]:
corrDf.drop(labels=[currentUser], inplace=True)

Let's select the top-*k* with *k*=2

In [25]:
corrDf = corrDf.head(2)
corrDf

400003    0.379975
400001    0.375150
dtype: float64

Select the items whose values are to be predicted for current user

In [26]:
toPredict = cuDf[cuDf.isna()]
toPredict

4501244   NaN
3521164   NaN
2823054   NaN
3470600   NaN
4698684   NaN
3874544   NaN
475290    NaN
2387499   NaN
1700841   NaN
1608290   NaN
2709768   NaN
1489889   NaN
1473832   NaN
4438848   NaN
2277860   NaN
4139124   NaN
1292566   NaN
4136084   NaN
4034354   NaN
1711525   NaN
1985949   NaN
1860213   NaN
4624424   NaN
4048272   NaN
1679335   NaN
2937696   NaN
4901306   NaN
4651520   NaN
5247022   NaN
3553442   NaN
Name: 400005, dtype: float64

Select all the ratings from the other users selected for rating prediction

In [27]:
ratings = uiMatrixNorm.loc[corrDf.index]
ratings

Unnamed: 0,4501244,3521164,3640424,2823054,3553976,3470600,4513674,4698684,3315342,3874544,...,4196776,2948356,1355644,3300542,5247022,2140479,1083452,1179933,3410834,3553442
400003,4.943416,4.763374,4.212963,3.971193,2.849794,4.269547,7.25823,4.259259,4.01749,4.305556,...,1.656379,2.083333,2.808642,1.651235,3.667695,2.906379,2.952675,2.726337,2.031893,3.744856
400001,4.933128,4.70679,4.36214,4.099794,2.386831,5.365226,7.006173,4.634774,4.341564,4.557613,...,1.661523,1.748971,2.196502,1.599794,4.717078,2.5,2.376543,2.592593,1.867284,4.110082


Select only their ratings for the items we need

In [28]:
ratingsToPredict = ratings[toPredict.index]
ratingsToPredict

Unnamed: 0,4501244,3521164,2823054,3470600,4698684,3874544,475290,2387499,1700841,1608290,...,1985949,1860213,4624424,4048272,1679335,2937696,4901306,4651520,5247022,3553442
400003,4.943416,4.763374,3.971193,4.269547,4.259259,4.305556,4.480453,4.403292,4.686214,4.207819,...,4.243827,4.022634,4.722222,3.986626,5.061728,3.647119,4.290123,5.24177,3.667695,3.744856
400001,4.933128,4.70679,4.099794,5.365226,4.634774,4.557613,4.531893,4.161523,4.130658,4.593621,...,4.212963,4.223251,3.796296,4.861111,4.398148,4.465021,4.12037,3.734568,4.717078,4.110082


Compute the predicted ratings for current user.

Using mean value of the other high-similarity users.

In [29]:
predictedRatings = ratingsToPredict.mean()
predictedRatings.sort_values(ascending=False, inplace=True)
predictedRatings

4501244    4.938272
1489889    4.909979
3470600    4.817387
3521164    4.735082
1679335    4.729938
4438848    4.699074
1711525    4.596193
2709768    4.580761
475290     4.506173
4651520    4.488169
2277860    4.452160
4698684    4.447016
3874544    4.431584
4048272    4.423868
1700841    4.408436
1608290    4.400720
4034354    4.393004
1292566    4.387860
1473832    4.295267
2387499    4.282407
4624424    4.259259
1985949    4.228395
4901306    4.205247
5247022    4.192387
4136084    4.166667
1860213    4.122942
4139124    4.063786
2937696    4.056070
2823054    4.035494
3553442    3.927469
dtype: float64

Select the top-*k* with *k*=5 for recommendation.

In [30]:
predictedRatings.head(5)

4501244    4.938272
1489889    4.909979
3470600    4.817387
3521164    4.735082
1679335    4.729938
dtype: float64

- Course Instructor: Dr. Paolo Mengoni (Visiting Scholar, School of Communication, Hong Kong Baptist University) 
  - pmengoni@hkbu.edu.hk

- The codes in this notebook take insipiration from various sources. All codes are for educational purposes only and released under the CC1.0. 