# COMM7380 Recommender Systems for Digital Media

In [None]:
# Install NetworkX, Matplotlib, Pandas, Numpy using pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install scipy

# User Behaviour and the User-Item Matrix

## Importing and knowing your data 

In [None]:
import pandas as pd 
import numpy as np

In [None]:
# Load the collector log from the data directory (path relative to this notebook)
dataDir = '../data/'
logFile = 'collector_log.csv'
evidence = pd.read_csv(dataDir + logFile)

In [None]:
# Check the type of the loaded object and take a glance at its first rows
print(type(evidence))
evidence.head()

In [None]:
# Distinct user ids and content ids that appear in the log
users = evidence['user_id'].unique()
content = evidence['content_id'].unique()
print(type(content))
print(len(content))

# Implicit Ratings
## Binary Matrix
Let's create a user-item binary matrix from the "buy" events

In [None]:
# Empty user-item matrix: one row per user, one column per content item
uiBuyMatrix = pd.DataFrame(index=users, columns=content)
uiBuyMatrix.head(2)

In [None]:
# Distinct event names recorded in the log
evidence['event'].unique()

Select only the "buy" events

In [None]:
# Keep only the log rows whose event is exactly 'buy'
isBuy = evidence['event'] == 'buy'
buyEvidence = evidence.loc[isBuy]
buyEvidence.head()

Create the user-item matrix `uiBuyMatrix` for the buy events

In [None]:
# Mark every (user, item) pair that appears in a buy event with a 1;
# pairs that never appear keep their initial missing value.
for buyer, boughtItem in zip(buyEvidence['user_id'], buyEvidence['content_id']):
    uiBuyMatrix.at[buyer, boughtItem] = 1

In [None]:
# Show the matrix through the notebook's rich display (as the other cells do)
# instead of a plain-text print().
uiBuyMatrix

## Behavioural Implicit Ratings

Using the formula introduced during lecture

$${IR}_{(i,u)} = \left(w_1*{\#event}_1\right)+\left(w_2*{\#event}_2\right)+\dots+\left(w_n*{\#event}_n\right)$$

In [None]:
# Empty user-item matrix that will hold the implicit ratings
uiMatrix = pd.DataFrame(index=users, columns=content)
uiMatrix.head(2)

Type of events recorded in the logs

In [None]:
# Distinct event names found in the log
eventTypes = evidence['event'].unique()
print(eventTypes)

Give a weight to each of them

In [None]:
# Weight applied to each occurrence of an event when computing the implicit rating
eventWeights = dict(
    details=15,
    moreDetails=50,
    genreView=0,
    addToList=0,
    buy=100,
)

Compute the Implicit Rating for each user-item combination.
Populate the user-item matrix `uiMatrix` with the IR values.

In [None]:
# Accumulate the weighted events into uiMatrix, one log entry at a time
for userId, itemId, eventName in zip(evidence['user_id'], evidence['content_id'], evidence['event']):
    # Weight configured for this kind of event
    w = eventWeights[eventName]

    # Rating accumulated so far for this user-item pair; NaN means nothing stored yet
    soFar = uiMatrix.at[userId, itemId]
    base = 0 if np.isnan(soFar) else soFar

    # Store the running total back into the matrix
    uiMatrix.at[userId, itemId] = base + w
    

# Normalise the matrix
Update the user-item matrix by normalizing the values between 0 and 10. 

**Note**: NaN values should be maintained as NaN

In [None]:
# Global minimum / maximum of the implicit ratings, ignoring NaN. They are
# computed once here rather than being re-evaluated for every column inside
# the lambda.
irMin = np.nanmin(uiMatrix.values)
irMax = np.nanmax(uiMatrix.values)
irRange = irMax - irMin

# Min-max rescale every column to the 0-10 range; NaN entries propagate as NaN.
uiMatrixNorm = uiMatrix.apply(lambda col: ((col - irMin) / irRange) * 10)

In [None]:
# Display the normalised user-item matrix
uiMatrixNorm

# Item-based Collaborative Filtering

## Step 1: Compute Similarity between an item and the rest of the items

We want to predict the rating of item 4501244 for user 400005.

In [None]:
# Target of the prediction: (user id, item id)
currentUser, currentItem = 400005, 4501244

Find all the co-rated items

To avoid possible problems with the similarity measures, let's convert all the values in the matrix to floating point numbers (previously they were of `object` type)

In [None]:
# Cast every column of the normalised matrix to float
uiMatrixNorm = uiMatrixNorm.astype(dtype=float)

Drop the users that didn't rate the current item

In [None]:
# Keep only the rows (users) that have a value for the current item
ratedCurrentItem = uiMatrixNorm[currentItem].notna()
uiMatrixSelection = uiMatrixNorm.loc[ratedCurrentItem]
uiMatrixSelection

Drop items that are not co-rated with the actual one

In [None]:
# Drop every column (item) that has at least one missing value among the remaining users
uiMatrixSelection = uiMatrixSelection.dropna(axis='columns')
uiMatrixSelection

Compute the average rating for the current user

In [None]:
# One-row frame for the current user -> drop unrated items -> row mean.
# The result is a one-element Series indexed by the user id.
cuAvgRating = (
    uiMatrixNorm.loc[[currentUser]]
    .dropna(axis='columns')
    .mean(axis='columns')
)
cuAvgRating

Compute the average rating for the other users.

In [None]:
# Row-wise mean of the selection: one average rating per remaining user
ouAvgRating = uiMatrixSelection.mean(axis='columns')
ouAvgRating

Extract the current item ratings from the DataFrame. The other ratings to which we will compare will remain in the dataframe.

In [None]:
# Take the current item's column out of the selection: keep it as ciRatings
# and remove it from uiMatrixSelection in place.
ciRatings = uiMatrixSelection[currentItem]
del uiMatrixSelection[currentItem]
ciRatings

In [None]:
# Display the selection after the current item's column has been removed
uiMatrixSelection

Normalize (another normalization!) the ratings based on the average rating of each user

In [None]:
# Mean-centre each user's row: subtract the row mean (computed over the columns
# still present in the selection) from every value in that row.
# NOTE(review): ciRatings is not modified by this cell, so it stays on the
# un-centred scale, and ouAvgRating computed earlier is not used here --
# confirm whether that is intended before comparing ciRatings to these columns.
uiMatrixSelection = uiMatrixSelection.sub(
    uiMatrixSelection.mean(axis='columns'),
    axis='index',
)
uiMatrixSelection

### Cosine similarity


Let's define our **cosine similarity** measure. We are not going to redefine all the math calculations, but we will use the **cosine distance** in the `scipy` library to compute the similarity. Since this is a distance and given the characteristics of the cosine distance, we can convert it to a similarity by using the following simple formula:

$$cosine\_similarity = 1 - cosine\_distance$$

In [None]:
from scipy.spatial.distance import cosine

def cosine_sim(df1, df2):
    """Cosine similarity between two rating vectors, ignoring missing values.

    Only the positions where BOTH inputs are non-NaN are compared.

    Parameters
    ----------
    df1, df2 : pandas.Series
        Rating vectors sharing the same index.

    Returns
    -------
    float
        ``1 - cosine_distance`` over the co-rated positions, or NaN when the
        similarity is undefined: no co-rated position at all, or one of the
        two vectors is entirely zero on the co-rated positions.
    """
    # Single mask of the positions rated in both vectors
    both = df1.notna() & df2.notna()
    v1 = df1[both]
    v2 = df2[both]

    # The cosine is undefined for an empty or all-zero vector (it would be a
    # 0/0 division); report NaN explicitly instead of relying on how the
    # underlying distance function treats that case.
    if len(v1) == 0 or (v1 == 0).all() or (v2 == 0).all():
        return np.nan

    # scipy returns a distance; convert it to a similarity
    return 1 - cosine(v1, v2)

# Sanity checks using the column of item 3521164 from the selection:
# first against the current item's ratings, then against itself.
referenceColumn = uiMatrixSelection[3521164]
print('Similarity between current item and one of the others: ', cosine_sim(ciRatings, referenceColumn))
print('Similarity between current item and itself, if everything is ok it should be 1: ', cosine_sim(referenceColumn, referenceColumn))

Compute similarity between all the items and the current one

In [None]:
# Apply cosine_sim column by column: similarity of every item in the selection
# to the current item's ratings.
iiSimilarity = uiMatrixSelection.apply(lambda itemColumn: cosine_sim(ciRatings, itemColumn))
iiSimilarity

## Step 2 and 3: order the items by similarity and select the top-k neighborhood


Sort the items, select the top-5

In [None]:
# Order the items from most to least similar and show the first five
iiSimilarity = iiSimilarity.sort_values(ascending=False)
iiSimilarity.head()

## Step 4: Rating prediction

Now that we have the items, we can compute the predicted rating for the selected item.

First select the items from the dataset.

In [None]:
# Columns of the normalised matrix for the first five entries of iiSimilarity
itemsToCompare = uiMatrixNorm.loc[:, iiSimilarity.index[:5]]
itemsToCompare

Predict the ratings (using mean value) using those items

In [None]:
# Predicted rating: mean of the current user's values over the selected
# neighbour items (pandas skips NaN entries when averaging).
predictedRating = itemsToCompare.loc[currentUser].mean()
predictedRating

### Find the predicted ratings for the neighboring items

In [None]:
# Columns of the selection for the first five entries of iiSimilarity
itemsToCompare = uiMatrixSelection.loc[:, iiSimilarity.index[:5]]
itemsToCompare

Predict the ratings (using mean value) using those items

In [None]:
# Column-wise mean for each neighbour item, ordered from highest to lowest
predictedRatings = itemsToCompare.mean(axis=0).sort_values(ascending=False)
predictedRatings

Let's bring it back to current user rating scheme

In [None]:
# Shift the predictions by the current user's average rating
# (cuAvgRating is a one-element Series, hence .iloc[0]).
# Note: re-running this cell adds the average again.
predictedRatings = predictedRatings + cuAvgRating.iloc[0]
predictedRatings

# Precomputing similarities

## Compute overlapping ratings


Let's compute the overlapping ratings in our dataset.

We need a utility function to convert from numeric to boolean, indicating if the user rated an item.

In [None]:
def to_bool(value):
    """Return 0 when ``value`` is NaN (not rated) and 1 otherwise (rated)."""
    return 0 if np.isnan(value) else 1

Convert the user-item matrix

In [None]:
# 1 where the user has a rating for the item, 0 where the value is NaN.
# Vectorised equivalent of applying to_bool to every cell; it avoids
# DataFrame.applymap, which recent pandas versions deprecate.
uiMatrixBool = uiMatrixNorm.notna().astype(int)
uiMatrixBool

Compute the number of overlapping ratings between each pair of items

In [None]:
# Item-by-item matrix: entry (i, j) is the sum over users of
# uiMatrixBool[i] * uiMatrixBool[j].
overlappingUsersRatings = uiMatrixBool.T @ uiMatrixBool
overlappingUsersRatings

Check some statistics. The maximum and minimum number of ratings.

In [None]:
# Largest value in the overlap matrix (column maxima, then the maximum of those)
overlappingUsersRatings.max().max()

In [None]:
# Smallest value in the overlap matrix (column minima, then the minimum of those)
overlappingUsersRatings.min().min()

Maintain only the items whose overlapping ratings are over a specific threshold.

In [None]:
# Boolean mask per item: True when the item's smallest overlap count is above 3.
# Despite its name, the mask is True for the items that pass the threshold.
toDrop = overlappingUsersRatings.min(axis=0).gt(3)
toDrop.head(10)

In [None]:
# Restrict the overlap matrix to the masked items, first on the rows and then on the columns
selectedItems = overlappingUsersRatings.loc[toDrop].loc[:, toDrop]
selectedItems.head()

We just need the list of items to compare between each other

In [None]:
# Row labels of selectedItems, i.e. the ids of the items retained above
selectedIndex = selectedItems.index
selectedIndex

## Precompute similarities

In [None]:
# Columns of the normalised matrix for the selected items only
uiMatrixSelection = uiMatrixNorm.loc[:, selectedIndex]
uiMatrixSelection

Normalize the ratings per user

In [None]:
# Mean-centre each user's row over the selected items
uiMatrixSelection = uiMatrixSelection.sub(
    uiMatrixSelection.mean(axis='columns'),
    axis='index',
)
uiMatrixSelection

Compute item-item similarity

In [None]:
# Allocate the item-item similarity matrix with the same row and column labels
# as selectedItems; the cells are filled in by the following loop.
iiSimMatrix = pd.DataFrame().reindex_like(selectedItems)
iiSimMatrix.shape  # quick check of the dimensions

In [None]:
# Fill every cell of the similarity matrix with the cosine similarity of the
# corresponding pair of item columns (both orders, including the diagonal).
for rowItem in selectedIndex:
    for colItem in selectedIndex:
        iiSimMatrix.at[rowItem, colItem] = cosine_sim(
            uiMatrixSelection[rowItem],
            uiMatrixSelection[colItem],
        )

iiSimMatrix

Visualizing

In [None]:
import matplotlib.pyplot as plt

# Tick positions at the centre of each cell (0.5, 1.5, ...), labelled with the item ids
rowTicks = np.arange(0.5, len(iiSimMatrix.index), 1)
colTicks = np.arange(0.5, len(iiSimMatrix.columns), 1)

plt.pcolor(iiSimMatrix)
plt.yticks(rowTicks, iiSimMatrix.index)
plt.xticks(colTicks, iiSimMatrix.columns)
plt.show()

Visualizing 2

In [None]:
import seaborn as sns

# Green colormap built with seaborn's light_palette
cm = sns.light_palette("green", as_cmap=True)

# Styled view of the similarity matrix with a background gradient using that colormap
s = iiSimMatrix.style.background_gradient(cmap=cm)
s

# Exercise n. 1
Convert the code into the function `itemCF_prediction(df, currentUser, currentItem)` where the parameters are:

- `df` is a dataframe containing the user-item ratings
- `currentUser` is the user for which we want to predict the rating
- `currentItem` is the item for which we want to predict the rating 

Predict the rating for user id 400005 and item id 4501244

# Exercise n. 2

Convert the code into the function `itemCF_recommend(df, currentUser, currentItem, numItems)` where the parameters are:
- `df` is a dataframe containing the user-item ratings
- `currentUser` is the user for which we want to predict the rating
- `currentItem` is the item from which we start the comparison 
- `numItems` is the number of items we want to return for suggesting to the current user

Recommend the top-3 items for user id 400005 starting from item id 4501244

- Course Instructor: Dr. Paolo Mengoni (Visiting Scholar, School of Communication, Hong Kong Baptist University) 
  - pmengoni@hkbu.edu.hk

- The code in this notebook takes inspiration from various sources. All code is for educational purposes only and released under the CC1.0. 