<a href="https://colab.research.google.com/github/khushalkumar/ml-unsupervised-and-recsys/blob/main/9-ml_recommender_systems_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ML: Recommender Systems-3

### Collaborative Filtering (based on rating)

In [None]:
# pip install cmfrec

In [None]:
from cmfrec import CMF
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
from sklearn.metrics import mean_squared_error as mse
warnings.filterwarnings('ignore')

In [None]:
# Movie metadata (movieId, title, genres) — labelled "Imdb" by the author.
# NOTE(review): the userId/movieId/rating/timestamp schema resembles MovieLens — confirm provenance.
movies = pd.read_csv('movies.csv') # Imdb
# One row per rating event: userId, movieId, rating, timestamp.
ratings = pd.read_csv('ratings.csv')

In [None]:
# Matrix factorization is very compute heavy, so restrict both frames to the 1000
# most-rated movies; 668 users x 1000 movies is already a large matrix to factorize.
select_movies = ratings['movieId'].value_counts().head(1000).index.to_list()
movies = movies[movies['movieId'].isin(select_movies)]
ratings = ratings[ratings['movieId'].isin(select_movies)]

In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [None]:
ratings.shape

(63250, 4)

In [None]:
# User x movie rating matrix: one row per userId, one column per movieId.
# (user, movie) pairs without a rating are filled with 0, so in the cells that follow
# a 0 stands for "not rated" rather than for a real rating value.
rm = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
rm.head()

movieId,1,2,3,5,6,7,10,11,16,17,...,88125,89745,91529,96610,99114,109374,109487,111759,112852,116797
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,3.0,0.0,3.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
rm.shape

(668, 1000)

In [None]:
# The library requires these exact column names, so build a renamed long-format copy
# (one row per rating) and leave `ratings` itself untouched.
rm_raw = ratings[['userId', 'movieId', 'rating']].rename(
    columns={'userId': 'UserId', 'movieId': 'ItemId', 'rating': 'Rating'}
)
rm_raw.head(2)

Unnamed: 0,UserId,ItemId,Rating
0,1,16,4.0
1,1,24,1.5


In [None]:
# rm_raw.shape

(63250, 3)

In [None]:
# Packages : Suprise , CMFREC

In [None]:
from cmfrec import CMF
# k         : number of latent factors (the width of the user matrix A_ and item matrix B_).
# lambda_   : regularization strength.
# user_bias : whether to fit a per-user bias term b(u) (switched off here).
# item_bias : whether to fit a per-item bias term b(i) (switched off here).

# we tried k = 5 as well and it was giving worse results in class.

# method="als" is alternating least squares (not a gradient-descent method);
# method="lbfgs" is the gradient-based L-BFGS alternative (the author notes L-BFGS is also used in SARIMAX).
model = CMF(method="als", k=2, lambda_=0.1, user_bias=False, item_bias=False, verbose=False)
model.fit(rm_raw)

Collective matrix factorization model
(explicit-feedback variant)


In [None]:
model.A_.shape, model.B_.shape   # A matrix is of rows 668 which is the number of users in my original rm data. B matrix has 1000 rows for the unique movies.

((668, 2), (1000, 2))

In [None]:
rm_raw.Rating.mean(), model.glob_mean_

(3.6659130434782607, 3.6659131050109863)

In [None]:
# R_hat = A . B^T + mu   (user factors times transposed item factors, plus the global mean)
#
# A_ and B_ are stored in the model's internal row order, given by model.user_mapping_
# and model.item_mapping_. That order need not match the sorted index/columns of `rm`.
# Label the prediction matrix with those IDs and reindex it to `rm`'s layout, so that
# rm__[i, j] and rm.values[i, j] always refer to the same (user, movie) pair.
rm__ = (
    pd.DataFrame(
        np.dot(model.A_, model.B_.T) + model.glob_mean_,
        index=model.user_mapping_,
        columns=model.item_mapping_,
    )
    .reindex(index=rm.index, columns=rm.columns)
    .to_numpy()
)

In [None]:
rm__[0]    # for every movie for a user, it has given a prediction. Whether it's good or not, we'll see.

array([3.9872193 , 2.013515  , 4.148358  , 4.275113  , 4.672877  ,
       4.1426783 , 3.7773404 , 3.5917268 , 2.847507  , 1.3983827 ,
       4.087627  , 0.87938786, 4.313746  , 3.022889  , 2.9781733 ,
       4.507187  , 4.7699313 , 3.2834868 , 4.157156  , 2.8278878 ,
       2.9945934 , 3.7143376 , 3.248501  , 4.563337  , 3.7429895 ,
       3.4365995 , 2.9112353 , 4.3936605 , 2.670127  , 4.617191  ,
       2.8227568 , 1.2970147 , 2.1665502 , 1.9971961 , 2.6830902 ,
       4.709319  , 4.535943  , 3.9966462 , 3.4580631 , 4.3363957 ,
       4.618622  , 4.3653917 , 4.2544727 , 3.8918908 , 4.6495385 ,
       3.748356  , 4.5497675 , 4.4103594 , 4.1750565 , 4.334175  ,
       4.0587635 , 4.2106333 , 3.844352  , 3.6918812 , 3.2134662 ,
       4.3433456 , 2.743507  , 3.337244  , 4.0608664 , 3.157779  ,
       4.0985017 , 2.657434  , 3.2134168 , 3.7440207 , 3.7947633 ,
       2.8721886 , 3.2996387 , 3.7121046 , 4.0234313 , 4.082932  ,
       4.37195   , 2.2741008 , 3.7850149 , 4.2203875 , 4.29151

In [None]:
rm__[0,:10] # for first user looking at the predictions of first 10 movies

array([3.9872193, 2.013515 , 4.148358 , 4.275113 , 4.672877 , 4.1426783,
       3.7773404, 3.5917268, 2.847507 , 1.3983827], dtype=float32)

In [None]:
# RMSE over the rated cells only: rm > 0 selects entries that were not filled in by fillna(0).
# Whichever value of k gives the least error is the preferred k.
# This is measured on the same ratings the model was fitted on, i.e. it is a training error.
# NOTE(review): rm__ is indexed positionally here, which assumes its rows/columns are in the
# same user/movie order as rm — confirm.
mse(rm.values[rm > 0], rm__[rm > 0])**0.5            # square root of the mean squared (actual - prediction)

1.0629517113958387

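- An RMSE of about 1 means predictions are typically off by roughly one star: if the model predicts 3, the actual rating is usually somewhere around 2 to 4 (a typical error size, not a guaranteed bound).
- This model isn't good enough; the error is slightly too high.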
- We can optimize it.

In [None]:
top_items = model.topN(user=1, n=10)   # topN returns the n recommended ItemIds (movieIds) for the given user.
# isin() only filters `movies`, so the table is shown in `movies` row order,
# not in the order in which topN returned the IDs.
movies.loc[movies.movieId.isin(top_items)]

Unnamed: 0,movieId,title,genres
279,318,"Shawshank Redemption, The (1994)",Crime|Drama
743,922,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Drama|Film-Noir|Romance
744,923,Citizen Kane (1941),Drama|Mystery
938,1172,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama
973,1212,"Third Man, The (1949)",Film-Noir|Mystery|Thriller
978,1217,Ran (1985),Drama|War
4472,5971,My Neighbor Totoro (Tonari no Totoro) (1988),Animation|Children|Drama|Fantasy
5390,7502,Band of Brothers (2001),Action|Drama|War
6958,44555,"Lives of Others, The (Das leben der Anderen) (...",Drama|Romance|Thriller
9908,109374,"Grand Budapest Hotel, The (2014)",Comedy|Drama


In [None]:
top_items # top 10 movie indexes

array([ 44555,   7502,   1217,   1172,    923,    318,   1212,    922,
       109374,   5971])

In [None]:
# Same recommendation lookup for user=500; this overwrites the previous `top_items`.
top_items = model.topN(user=500, n=10)
# The isin() filter keeps `movies` row order, so the rows are not sorted by recommendation rank.
movies.loc[movies.movieId.isin(top_items)]

Unnamed: 0,movieId,title,genres
143,168,First Knight (1995),Action|Drama|Romance
389,441,Dazed and Confused (1993),Comedy
1060,1302,Field of Dreams (1989),Children|Drama|Fantasy
1776,2243,Broadcast News (1987),Comedy|Drama|Romance
2567,3210,Fast Times at Ridgemont High (1982),Comedy|Drama|Romance
2715,3424,Do the Right Thing (1989),Drama
3054,3868,"Naked Gun: From the Files of Police Squad!, Th...",Action|Comedy|Crime|Romance
5629,8376,Napoleon Dynamite (2004),Comedy
5794,8784,Garden State (2004),Comedy|Drama|Romance
8129,69122,"Hangover, The (2009)",Comedy|Crime


### Raw Implementation

In [None]:
# rm_small = rm.copy()
# rm_small = rm_small[rm_small.columns[:100]]   # first 100 columns but rows are still 668
# rm_small = rm_small.head(100)   # first 100 rows
# rm_small

movieId,1,2,3,5,6,7,10,11,16,17,...,300,303,306,307,308,315,316,317,318,319
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,5.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,3.0,0.0,3.0,0.0,4.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,5.0,5.0,0.0,0.0,0.0,4.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,5.0,0.0,4.0,3.0,4.0,3.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,3.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [None]:
# Keep only the top-left 100 x 100 corner of the rating matrix (first 100 user rows,
# first 100 movie columns) so the pure-Python training loop below executes faster.
rm_small = rm.iloc[:100, :100].copy()

In [None]:
K = 2                   # number of latent factors
FACTOR_INIT_SEED = 42   # fixed so the random start (and everything trained from it) is reproducible
factor_rng = np.random.default_rng(FACTOR_INIT_SEED)

# P and Q start out as standard-normal noise.
# This isn't the best way to initialize P and Q. There are other ways.
P = factor_rng.normal(size=(rm_small.shape[0], K))  # user factors:  (number of users,  K)
Q = factor_rng.normal(size=(rm_small.shape[1], K))  # movie factors: (number of movies, K)

def matrix_factorization(R, P, Q, K, steps=10000, alpha=0.0002, beta=0.02):
    """Factorize a rating matrix as ``R ~= P @ Q.T`` with per-entry gradient descent.

    Only the non-zero entries of ``R`` are treated as observed ratings; zeros are
    skipped. For every observed entry the error ``e_ij = R[i, j] - P[i] . Q[j]`` is
    computed and both factor rows take one gradient step on ``e_ij ** 2``::

        P[i, k] += alpha * 2 * e_ij * Q[j, k]
        Q[j, k] += alpha * 2 * e_ij * P[i, k]   # P[i, k] as it was *before* its own update

    Parameters
    ----------
    R : array-like, shape (n_users, n_items)
        Rating matrix in which 0 marks a missing rating.
    P : array-like, shape (n_users, n_factors)
        Initial user factors. Not modified; the function works on a float copy.
    Q : array-like, shape (n_items, n_factors)
        Initial item factors. Not modified; the function works on a float copy.
    K : int
        Number of leading latent factors updated on each step (normally ``n_factors``).
    steps : int
        Number of full passes over the observed entries.
    alpha : float
        Learning rate.
    beta : float
        Regularization strength. Accepted for signature compatibility but currently
        ignored: no regularization term is applied.

    Returns
    -------
    tuple of ndarray
        ``(P, Q)`` — the trained user factors ``(n_users, n_factors)`` and item factors
        ``(n_items, n_factors)``, as new float arrays.
    """
    R = np.asarray(R)
    # Private float copies: the caller's P and Q must not change under their feet
    # (the transpose is only a view, so without the copy Q would be mutated too).
    P = np.array(P, dtype=float)
    Q = np.array(Q, dtype=float).T   # work with Q as (n_factors, n_items)

    # The set of observed cells never changes, so locate it once instead of
    # re-scanning the whole matrix on every step. argwhere yields row-major order,
    # i.e. the same visiting order as a nested "for i / for j" loop.
    observed = [(i, j, R[i, j]) for i, j in np.argwhere(R != 0)]

    for _ in range(steps):
        for i, j, r_ij in observed:
            eij = r_ij - np.dot(P[i, :], Q[:, j])   # error e_ij = R_ij - P_i . Q_j
            p_before = P[i, :K].copy()              # Q's step must use the pre-update P row
            P[i, :K] += alpha * (2 * eij * Q[:K, j])
            Q[:K, j] += alpha * (2 * eij * p_before)
    return P, Q.T

# Train on copies of the ratings and of the initial factors, passing the same K that
# sized P and Q so the factor count cannot drift out of sync with them.
P_, Q_ =  matrix_factorization(rm_small.values.copy(), P.copy(), Q.copy(), K)    # passing R,P,Q,K

In [None]:
# Predicted rating for the user in row 4 and the movie in column 36, printed next to the stored value.
print(np.dot(P_[4], Q_[36]), rm_small.values[4, 36])    # prediction from P_ and Q_ vs. the entry of R
# A stored 0.0 is the fillna(0) placeholder for "not rated" (maybe the person hadn't watched
# the movie yet), so this is a prediction for an unrated movie, not an error against a real rating.

2.315952576867274 0.0


In [None]:
# Rebuild the full prediction matrix from the learned factors, then take the RMSE
# (actual - predicted) over the cells of rm_small that hold a real rating.
rm_ = np.dot(P_, Q_.T)
rated_cells = rm_small.values > 0
mse(rm_small.values[rated_cells], rm_[rated_cells])**0.5

0.6419426994134607

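- The raw model reports a lower RMSE (≈0.64) than the CMF model (≈1.06), but the two numbers are not directly comparable: the raw model was trained and scored on a 100 × 100 subset, the CMF model on the full 668 × 1000 matrix, and both are errors on the ratings the models were trained on.
- The CMF model should improve as well once we fine-tune it by changing lambda and k.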

If you have any questions, get in touch with me [**here**](https://linktr.ee/khushalkumar31)