# COMM7380 Recommender Systems for Digital Media

In [1]:
# Install NetworkX, Matplotlib, Pandas, Numpy using pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy



# User Behaviour and the User-Item Matrix

## Importing and knowing your data 

In [2]:
import pandas as pd 
import numpy as np

In [3]:
evidence = pd.read_csv('../data/' + 'collector_log.csv')

In [4]:
# checkin the type and take a glance at the head 
print(type(evidence))
evidence.head(5)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,id,created,content_id,event,session_id,user_id
0,3,14/01/2020 17:54,4501244,details,794773,400003
1,4,14/01/2020 17:54,3521164,moreDetails,794773,400003
2,5,14/01/2020 17:54,3640424,details,441002,400005
3,6,14/01/2020 17:54,2823054,moreDetails,885440,400001
4,7,14/01/2020 17:54,3553976,genreView,441003,400005


In [6]:
users = evidence.user_id.unique()
content = evidence.content_id.unique()
print(type(content))
print(len(content))

<class 'numpy.ndarray'>
103


# Implicit Ratings
## Binary Matrix
Let's create a user-item binary matrix from the "buy" events

In [7]:
#Create a user-item binary matrix
uiBuyMatrix = pd.DataFrame(columns=content, index=users)
uiBuyMatrix.head(2)

Unnamed: 0,4501244,3521164,3640424,2823054,3553976,3470600,4513674,4698684,3315342,3874544,...,4196776,2948356,1355644,3300542,5247022,2140479,1083452,1179933,3410834,3553442
400003,,,,,,,,,,,...,,,,,,,,,,
400005,,,,,,,,,,,...,,,,,,,,,,


In [8]:
evidence.event.unique()

array(['details', 'moreDetails', 'genreView', 'addToList', 'buy'],
      dtype=object)

Select only the "buy" events

In [9]:
buyEvidence = evidence[evidence['event'] == 'buy']
buyEvidence.head(5)

Unnamed: 0,id,created,content_id,event,session_id,user_id
92,95,14/01/2020 17:54,4501244,buy,794776,400003
131,134,14/01/2020 17:54,2937696,buy,885441,400001
358,361,14/01/2020 17:54,3874544,buy,885444,400001
612,615,14/01/2020 17:54,3949660,buy,885445,400001
707,710,14/01/2020 17:54,5512872,buy,42460,400006


Create the user-item matrix `uiBuyMatrix` for the buy events

In [10]:
for index, row in buyEvidence.iterrows():
    currentUser = row['user_id']
    currentContent = row['content_id']
    uiBuyMatrix.at[currentUser, currentContent] = 1

In [11]:
print(uiBuyMatrix)

       4501244 3521164 3640424 2823054 3553976 3470600 4513674 4698684  \
400003       1       1       1       1       1       1       1       1   
400005     NaN     NaN       1     NaN       1     NaN       1     NaN   
400001       1       1       1     NaN       1       1       1       1   
400006       1     NaN       1       1     NaN       1       1       1   
400002     NaN     NaN       1       1       1       1       1     NaN   
400004     NaN     NaN       1     NaN     NaN     NaN     NaN     NaN   

       3315342 3874544  ... 4196776 2948356 1355644 3300542 5247022 2140479  \
400003       1       1  ...     NaN     NaN       1     NaN     NaN       1   
400005       1     NaN  ...     NaN     NaN       1     NaN     NaN       1   
400001       1       1  ...     NaN     NaN       1     NaN       1       1   
400006     NaN       1  ...     NaN     NaN     NaN     NaN       1     NaN   
400002       1     NaN  ...       1       1       1     NaN       1     NaN   
400004 

## Behavioural Implicit Ratings

Using the formula introduced during lecture

$${IR}_(i,u) = \left(w_1*{\#event}_1\right)+\left(w_2*{\#event}_2\right)+\dots+\left(w_n*{\#event}_n\right)$$

In [12]:
#Create a user-item matrix
uiMatrix = pd.DataFrame(columns=content, index=users)
uiMatrix.head(2)

Unnamed: 0,4501244,3521164,3640424,2823054,3553976,3470600,4513674,4698684,3315342,3874544,...,4196776,2948356,1355644,3300542,5247022,2140479,1083452,1179933,3410834,3553442
400003,,,,,,,,,,,...,,,,,,,,,,
400005,,,,,,,,,,,...,,,,,,,,,,


Type of events recorded in the logs

In [13]:
eventTypes = evidence.event.unique()
print(eventTypes)

['details' 'moreDetails' 'genreView' 'addToList' 'buy']


Give a weight to each of them

In [14]:
eventWeights = {
    'details': 15,
    'moreDetails': 50,
    'genreView': 0,
    'addToList': 0,
    'buy': 100}

Compute the Implicit Rating for each user-item combination.
Populate the user-item matrix `uiMatrix` with the IR values.

In [15]:
# Iterate the evidence
for index, row in evidence.iterrows():
    # Select the user and items involved
    currentUser = row['user_id']
    currentContent = row['content_id']
    
    # Extract the appropriate weight for the event
    w = eventWeights[row['event']]
    
    # Find the value eventually stored for the current user-item combination
    currentValue = uiMatrix.at[currentUser, currentContent]
    if np.isnan(currentValue):
        currentValue = 0
        
    # Compute the new value and update the user-item matrix
    updatedValue = currentValue + w #+ (1 * w)
    uiMatrix.at[currentUser, currentContent] = updatedValue
    

# Normalise the matrix
Update the user-item matrix by normalizing the values between 0 and 10. 

**Note**: NaN values should be maintained as NaN

In [16]:
uiMatrixNorm = uiMatrix.apply(
    lambda x: ((x - np.nanmin(uiMatrix.values))/(np.nanmax(uiMatrix.values) - np.nanmin(uiMatrix.values)))*10
    )

In [17]:
uiMatrixNorm

Unnamed: 0,4501244,3521164,3640424,2823054,3553976,3470600,4513674,4698684,3315342,3874544,...,4196776,2948356,1355644,3300542,5247022,2140479,1083452,1179933,3410834,3553442
400003,4.94342,4.76337,4.21296,3.97119,2.84979,4.26955,7.25823,4.25926,4.01749,4.30556,...,1.65638,2.08333,2.80864,1.65123,3.6677,2.90638,2.95267,2.72634,2.03189,3.74486
400005,,,7.7572,,7.99383,,8.05041,,8.17901,,...,0.0360082,0.0668724,7.83436,0.0,,8.30761,9.38272,8.15844,0.0,
400001,4.93313,4.70679,4.36214,4.09979,2.38683,5.36523,7.00617,4.63477,4.34156,4.55761,...,1.66152,1.74897,2.1965,1.59979,4.71708,2.5,2.37654,2.59259,1.86728,4.11008
400006,7.81893,7.88066,0.185185,8.93519,,9.49074,9.07407,8.8323,0.0154321,9.79938,...,0.0462963,0.221193,,0.138889,7.9321,,,,0.0154321,8.44136
400002,2.29424,2.47428,6.04424,2.88066,1.74383,2.39712,4.11523,1.9393,6.32202,2.20679,...,4.92284,4.16152,1.69753,3.71914,2.58745,2.13477,1.83642,1.41975,4.34671,2.46914
400004,,,8.54424,,,,,,8.42078,,...,8.65226,8.33848,,9.01235,,,,,8.95576,


# Compute similarities

## ${L_1}$-Norm

In [18]:
def l1norm(dfItems):
    numRows, numColumns = dfItems.shape

    # Compare two items only for all the users
    if numColumns == 2:
        # Drop rows with na
        dfItems = dfItems.apply (pd.to_numeric, errors='coerce')
        dfItems = dfItems.dropna()

        # Compute difference between items
        dfItems = dfItems.apply(np.diff, axis=1)
        # Convert to absolute value
        dfItems = dfItems.apply(abs)
        # Sum all the values
        distanceValue = dfItems.sum()
        simValue = distanceValue / numRows
    else:
        # Return NaN, cannot compute against multiple items
        simValue = np.nan
        
    return simValue

In [21]:
# Select the 2 columns to compare
#uiSelection = uiMatrixNorm[[4501244, 3521164]]
uiSelection = uiMatrixNorm[[4501244, 3640424]]
print(uiSelection)
l1value = l1norm(uiSelection)
print(l1value)

        4501244   3640424
400003  4.94342   4.21296
400005      NaN    7.7572
400001  4.93313   4.36214
400006  7.81893  0.185185
400002  2.29424   6.04424
400004      NaN   8.54424
[2.11419753]


- Course Instructor: Dr. Paolo Mengoni (Visiting Scholar, School of Communication, Hong Kong Baptist University) 
  - pmengoni@hkbu.edu.hk

- The codes in this notebook take insipiration from various sources. All codes are for educational purposes only and released under the CC1.0. 