<a href="https://colab.research.google.com/github/ihagoSantos/recommendation-systems/blob/main/content_based_rocchio_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Content-Based RS

In this step we implement the Rocchio model, a simple content-based recommender system (RS). To do so, we will:

- Read the train file extracted from the dataset
- Read the 0-1 file with the movies' features
- Create a sparse matrix for each of them
- Implement the Rocchio model and save the recommended items

In [1]:
import operator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from collections import OrderedDict
from sklearn.metrics.pairwise import cosine_similarity

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Default font size for matplotlib figures
plt.rcParams['font.size'] = 14

# Reading train and Items' feature files

In [2]:
# Mount Google Drive at /content/drive so the dataset files stored there can
# be read with regular file paths (google.colab is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [36]:
# Folder on the mounted Drive that holds the ML-1M files
base_url='/content/drive/My Drive/Pós Graduação PUC Minas/11 - Sistemas de Recomendação/Unidade 1/praticas/dataset/ML-1M'

# Train and test files share one layout: '::'-separated values whose column
# names are supplied here. A multi-character separator needs engine='python'.
rating_columns = ['userId', 'itemId', 'rating', 'timestamp']
df_train, df_test = (
    pd.read_csv(base_url + file_name, sep='::', engine='python', names=rating_columns)
    for file_name in ('/trainSet.txt', '/testSet.txt')
)

# The item-feature file is comma separated and its first line is used as header
df_features = pd.read_csv(base_url + '/features-items.txt', sep=',', engine='python')

df_train.head()
df_features.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,1,1193,5.0,978300760.0
1,1,661,3.0,978302109.0
2,1,914,3.0,978301968.0
3,1,3408,4.0,978300275.0
4,1,1197,3.0,978302268.0


Unnamed: 0,MovieId,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,4,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Creating Sparse Matrix

In [6]:
# Columns of the train set used to build the user x item rating matrix
users, items, ratings = (
    df_train[column] for column in ('userId', 'itemId', 'rating')
)

In [7]:
# Largest user and item ids found in the train set
nb_users = int(users.max())
nb_items = int(items.max())

In [9]:
# Sparse user x item matrix of ratings. Ids are used directly as row/column
# indices, hence the "+ 1" so that the largest id is still a valid index.
rating_coordinates = (users, items)
ratings_shape = (nb_users + 1, nb_items + 1)
ratings_matrix = csr_matrix((ratings, rating_coordinates), shape=ratings_shape)
ratings_matrix.shape

(6041, 3953)

In [12]:
# Dimensions of the item-feature data: the largest MovieId and the number of
# feature columns (every column except MovieId)
nb_items = int(df_features['MovieId'].max())
nb_features = df_features.shape[1] - 1

In [13]:
# Empty (all-zero) sparse matrix of items' features, indexed by item id
features_shape = (nb_items + 1, nb_features + 1)
features_matrix = csr_matrix(features_shape)
features_matrix.shape

(3953, 19)

In [18]:
# Fill the item-feature matrix from the feature file. After the header, each
# line holds the item id followed by one integer flag per feature.
#
# The cells are written into a LIL copy: assigning single cells to a CSR
# matrix rebuilds its index arrays for every new entry, which is very slow
# and raises a SparseEfficiencyWarning.
features_lil = features_matrix.tolil()

# `with` guarantees the file is closed even if a line fails to parse
with open(base_url + '/features-items.txt', 'r') as f:
  # ignore header
  next(f, None)

  for row in f:
    row = row.rstrip()
    if not row:
      # tolerate blank lines (e.g. a trailing empty line at the end of the file)
      continue
    values = row.split(',')
    itemId = int(values.pop(0))
    for featureId, v in enumerate(values):
      features_lil[itemId, featureId] = int(v)

# Back to CSR, the format expected by the matrix products that follow
features_matrix = features_lil.tocsr()

  self._set_intXint(row, col, x.flat[0])


In [19]:
# Sanity check: the shape of the feature matrix after it has been filled
features_matrix.shape

(3953, 19)

## A useful function
This function is used to save the recommendations in a file.

In [21]:
def dumpRecommendation(recommendation, users_targets, file_name):
  """Write the recommended items of each target user to a text file.

  One line is written per user, in the order given by ``users_targets``:
  ``<userId><TAB>[<itemId>:0.0,<itemId>:0.0,...]``. Every item carries the
  constant score ``0.0``; the items keep the order they have in
  ``recommendation[userId]``. A user with no recommended items is written
  as ``<userId><TAB>[]``.

  Parameters
  ----------
  recommendation : mapping
      Maps each user id to an iterable of recommended item ids.
  users_targets : iterable
      User ids to write; each must be a key of ``recommendation``.
  file_name : str
      Path of the output file; an existing file is overwritten.
  """
  # `with` closes the file even if a user id is missing from `recommendation`
  with open(file_name, 'w') as file_out:
    # for each user target
    for userId in users_targets:
      # join() puts the commas only between items, so an empty list yields
      # "[]" instead of losing the opening bracket
      issuedItems = ",".join(
        str(itemId) + ":" + str(0.0) for itemId in recommendation[userId]
      )
      # saving in file in correct format
      file_out.write(str(userId) + "\t" + "[" + issuedItems + "]" + "\n")

# Rocchio Recommendation
In the Rocchio model, the prediction is based on the similarity (e.g., cosine) between items and users:

- Each item is a vector of features (a row of `features_matrix`)
- Each user is a mix of the items he/she has consumed:

$$
\vec{U} = \frac{1}{|R_u|} \sum_{j \in R_u}{r_{uj}\vec{j}}
$$

$$
R_u: \text{items consumed by user } u \\
r_{uj}: \text{rating given by user } u \text{ to item } j
$$


In [22]:
# Empty (all-zero) sparse matrix of users' features, indexed by user id
users_shape = (nb_users + 1, nb_features + 1)
users_matrix = csr_matrix(users_shape)
users_matrix.shape

(6041, 19)

In [24]:
# Matrix multiplication of ratings and features: row u holds, for each
# feature, the sum of the ratings user u gave to the items with that feature.
# The sparse `@` product is used because NumPy functions such as np.dot do
# not support scipy sparse operands.
aux = ratings_matrix @ features_matrix
aux.shape

(6041, 19)

In [26]:
# Inspect the stored (non-normalized) entries of row 15 of aux
print(aux[15,:])

  (0, 17)	11.0
  (0, 16)	3.0
  (0, 15)	22.0
  (0, 14)	16.0
  (0, 13)	41.0
  (0, 11)	42.0
  (0, 10)	17.0
  (0, 9)	162.0
  (0, 8)	57.0
  (0, 7)	136.0
  (0, 6)	240.0
  (0, 5)	70.0
  (0, 4)	7.0
  (0, 3)	25.0
  (0, 2)	172.0
  (0, 1)	4.0
  (0, 0)	8.0


In [25]:
# Normalizing each user's feature sums by the size of the user's history:
# |R_u| is the number of non-zero ratings in row u of the rating matrix
history_sizes = np.asarray((ratings_matrix != 0).sum(axis=1)).ravel()

# 1 / |R_u|, left at 0 for users without any rating so their profile stays empty
inverse_sizes = np.zeros(history_sizes.shape[0], dtype=float)
has_history = history_sizes > 0
inverse_sizes[has_history] = 1.0 / history_sizes[has_history]

# Scale all rows of `aux` in one sparse operation. Assigning the rows of a
# CSR matrix one at a time is slow and raises a SparseEfficiencyWarning.
users_matrix = csr_matrix(aux.multiply(inverse_sizes[:, np.newaxis]))

users_matrix.shape

  self._set_arrayXarray_sparse(i, j, x)


(6041, 19)

In [27]:
# Inspect the stored entries of row 15 of the normalized users matrix
print(users_matrix[15,:])

  (0, 0)	0.049689440993788817
  (0, 1)	0.024844720496894408
  (0, 2)	1.0683229813664596
  (0, 3)	0.15527950310559005
  (0, 4)	0.043478260869565216
  (0, 5)	0.43478260869565216
  (0, 6)	1.4906832298136645
  (0, 7)	0.8447204968944099
  (0, 8)	0.35403726708074534
  (0, 9)	1.0062111801242235
  (0, 10)	0.10559006211180123
  (0, 11)	0.2608695652173913
  (0, 13)	0.2546583850931677
  (0, 14)	0.09937888198757763
  (0, 15)	0.13664596273291924
  (0, 16)	0.018633540372670808
  (0, 17)	0.06832298136645962


## Recommending Items
The recommendation is based on the cosine similarity between the user vectors and the item vectors.

In [28]:
# Shape of the item-feature matrix used on the item side of the similarity
features_matrix.shape

(3953, 19)

In [29]:
# Cosine similarity between every row of users_matrix and every row of
# features_matrix; entry [u, i] is the score of item i for user u
prediction_matrix = cosine_similarity(users_matrix, features_matrix)
prediction_matrix.shape

(6041, 3953)

In [32]:
# Scores of row 15 of the prediction matrix, sorted from highest to lowest
print(np.sort(prediction_matrix[15,:])[::-1])

[0.86900599 0.86900599 0.82964335 ... 0.         0.         0.        ]


In [30]:
# Number of items in each user's recommendation list
top_k = 10

In [34]:
# Setting the recommendations: for each user, the top_k best scored items
# among those the user has not rated
recommendation = {}

for u in range(ratings_matrix.shape[0]):
  # Densify the user's rating row once; a sparse scalar lookup per candidate
  # item inside the loop is far slower
  user_ratings = ratings_matrix[u, :].toarray().ravel()
  # sorting items by relevance (highest score first)
  order = np.argsort(prediction_matrix[u,:])[::-1]
  # keep only the items without a rating from this user, in ranked order
  unrated = order[user_ratings[order] == 0]
  # recommending the top-k items
  recommendation[u] = list(unrated[:max(top_k, 0)])

In [38]:
# Save the recommendation lists of the users that appear in the test set
users_targets = df_test['userId'].unique()
dumpRecommendation(recommendation, users_targets, "recList_Rocchio.txt")

In [39]:
# Spot-check the recommendation lists of three users
recommendation[300]
recommendation[3000]
recommendation[6010]

[390, 21, 1916, 3180, 3817, 1723, 3381, 3083, 1115, 537]

[1127, 1917, 849, 1591, 1544, 1215, 1720, 173, 316, 2641]

[1916, 390, 2924, 3275, 360, 2170, 1858, 3740, 438, 1866]