## Imports

In [1]:
import numpy as np
import pandas as pd
import itertools as it
import scipy
import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

## Loading data

In [2]:
# Folder holding the MovieLens "latest-small" CSV files, relative to the notebook.
PATH = "../data/ml-latest-small/"

# Read both tables in one statement; each file name is appended to PATH as-is.
movies, ratings = (pd.read_csv(PATH + file_name)
                   for file_name in ("movies.csv", "ratings.csv"))

## Building ratings and movies dataframes that both contain the same movieIds

In [3]:
# Movie ids present in BOTH tables (despite the name, this is a set, not a list).
list_movieId = set(movies["movieId"]) & set(ratings["movieId"])

In [4]:
# Keep only the movies whose id also appears in the ratings table.
# `isin` builds the row mask in a single vectorised pass and, unlike the
# previous `movies['movieId'][i]` lookup, does not rely on the index labels
# being exactly 0..n-1. The original index labels of the kept rows are preserved.
movies = movies[movies["movieId"].isin(list_movieId)]

In [5]:
# Map every shared movie id to a contiguous position 0..n-1 (ascending id order)
# so that ids can later be used directly as matrix row/column indices.
movie_id_to_index = {
    movie_id: position for position, movie_id in enumerate(sorted(list_movieId))
}

# `Series.map` performs one dictionary lookup per row, whereas
# `DataFrame.replace` with a dict of several thousand entries is much slower.
# `assign` returns a new frame, so the filtered frame is not modified in place.
movies = movies.assign(movieId=movies["movieId"].map(movie_id_to_index))

# Sanity check: number of distinct (re-indexed) movie ids in `movies`.
movies["movieId"].nunique()

9724

In [6]:
# Same id -> position mapping as used for `movies`, rebuilt here so that this
# cell does not depend on a temporary variable from another cell.
movie_id_to_index = {
    movie_id: position for position, movie_id in enumerate(sorted(list_movieId))
}

# Keep only ratings of movies present in both tables; otherwise an unmapped raw
# id would be left untouched and could collide with a re-indexed position.
ratings = ratings[ratings["movieId"].isin(list_movieId)]

# One dictionary lookup per row instead of `DataFrame.replace` with a large dict.
ratings = ratings.assign(movieId=ratings["movieId"].map(movie_id_to_index))

# Sanity check on `ratings` (the previous version re-checked `movies` here).
ratings["movieId"].nunique()

9724

## Building one hot encoded genres in movies dataframe

In [7]:
# Turn the pipe-separated genre string (e.g. "Adventure|Comedy") into a list
# of genre names per movie. The vectorised string accessor replaces the manual
# loop and the temporary list.
movies["genres"] = movies["genres"].str.split("|")

In [8]:
# One-hot encode the genre lists into one sparse 0/1 column per genre.
mlb = MultiLabelBinarizer(sparse_output=True)

# Remove the list-valued column from `movies` first; it is replaced by the
# indicator columns joined below.
genre_lists = movies.pop('genres')
genre_indicator = mlb.fit_transform(genre_lists)

# Wrap the sparse result in a frame aligned on the movies index, with one
# column per genre label learned by the binarizer.
genre_frame = pd.DataFrame.sparse.from_spmatrix(
    genre_indicator,
    index=movies.index,
    columns=mlb.classes_,
)
movies = movies.join(genre_frame)

In [9]:
# Display the movies frame after the genre columns were added
# (the rendered repr is truncated by pandas).
movies

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,3,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,4,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,9719,Black Butler: Book of the Atlantic (2017),0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9738,9720,No Game No Life: Zero (2017),0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9739,9721,Flint (2017),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9740,9722,Bungo Stray Dogs: Dead Apple (2018),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Cleaning ratings dataframe

In [10]:
# The rating timestamp is not used in the rest of the notebook; drop that column.
ratings = ratings.drop("timestamp", axis=1)

In [11]:
# Display the ratings frame, now reduced to (userId, movieId, rating).
ratings

Unnamed: 0,userId,movieId,rating
0,1,0,4.0
1,1,2,4.0
2,1,5,4.0
3,1,43,5.0
4,1,46,5.0
...,...,...,...
100831,610,9416,4.0
100832,610,9443,5.0
100833,610,9444,5.0
100834,610,9445,5.0


In [12]:
# Show every rating given to the movie with re-indexed id 0.
ratings.loc[ratings["movieId"].eq(0)]

Unnamed: 0,userId,movieId,rating
0,1,0,4.0
516,5,0,4.0
874,7,0,4.5
1434,15,0,2.5
1667,17,0,4.5
...,...,...,...
97364,606,0,2.5
98479,607,0,4.0
98666,608,0,2.5
99497,609,0,3.0


## Building a sparse matrix which contains the triple (u_k, m_i, r_ki)

In [13]:
# Store the ratings table as a CSR matrix: one row per rating, with the
# columns (userId, movieId, rating) in that order.
ratings_sparse = scipy.sparse.csr_matrix(ratings.to_numpy())
ratings_sparse.shape

(100836, 3)

In [14]:
# Print the (user, movie, score) triple stored in one row of the ratings matrix.
row = 100831
labelled_columns = (
    ("USER ID : u_k =", 0),
    ("MOVIE ID : m_i =", 1),
    ("SCORE : r_ki =", 2),
)
for label, column in labelled_columns:
    print(label, ratings_sparse[row, column])

USER ID : u_k = 610.0
MOVIE ID : m_i = 9416.0
SCORE : r_ki = 4.0


## Building a matrix M of shape (n_movies, n_movies) whose entry (i, j) is the number of users who have seen both m_i and m_j

In [52]:
# data_dict[(m_i, m_j)] = number of users who rated both m_i and m_j.
data_dict = dict()

# Group the ratings by user once instead of re-filtering the whole ratings
# frame for every single user.
for _, user_movie_ids in ratings.groupby("userId")["movieId"]:
    # Every unordered pair of movies rated by this user.
    for x, y in it.combinations(user_movie_ids, 2):
        # Record both orientations so the resulting matrix is symmetric.
        data_dict[(x, y)] = data_dict.get((x, y), 0) + 1
        data_dict[(y, x)] = data_dict.get((y, x), 0) + 1

In [53]:
# (n_pairs, 2) array: column 0 holds the first movie id of each pair, column 1 the second.
keys = np.asarray(list(data_dict))

In [54]:
# Co-occurrence counts, in the same order as the pairs in `keys`.
values = np.asarray([*data_dict.values()])

In [55]:
# Number of movies = number of re-indexed ids (0..n-1).
n_shared_movies = len(list_movieId)

# Pass the shape explicitly: without it the matrix size is inferred from the
# largest index that appears in a pair, so movies with the highest ids that
# never co-occur with another movie would silently shrink the matrix.
M_coo = scipy.sparse.coo_matrix(
    (values, (keys[:, 0], keys[:, 1])),
    shape=(n_shared_movies, n_shared_movies),
)
# CSR copy for fast element access and matrix products.
M_csr = M_coo.tocsr()

In [56]:
# Check the shape of the co-occurrence matrix.
M_coo.shape

(9724, 9724)

In [57]:
# Look up one entry of the co-occurrence matrix as a spot check.
shared_count = M_csr[0, 1]
print("The movies with ID 0 and 1 have been spotted", shared_count, "times in users watch list")

The movies with ID 0 and 1 have been spotted 68 times in users watch list


## Normalizing the M matrix

In [58]:
# Column sums of the co-occurrence counts (one total per movie).
column_totals = M_coo.sum(axis=0)
# A movie that never co-occurs with another one has a zero total; dividing by
# it would give 0/0 = NaN, so use 1 there and leave that column at all zeros.
column_totals[column_totals == 0] = 1
# Divide every column by its total. The result is a dense matrix.
M_norm = M_coo / column_totals

In [61]:
# Display the column-normalised co-occurrence matrix.
M_norm

matrix([[0.        , 0.00151111, 0.00145627, ..., 0.        , 0.        ,
         0.        ],
        [0.00106875, 0.        , 0.00118322, ..., 0.        , 0.        ,
         0.        ],
        [0.00050294, 0.00057778, 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.0075188 ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.0075188 , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

## Computing probabilities of genres P_ig

In [81]:
# Every column except the id and the title is a genre indicator column.
genre_columns = [name for name in movies.columns if name not in ("movieId", "title")]

# Number of movies tagged with each genre.
sum_ = movies[genre_columns].to_numpy().sum(axis=0).astype(int)

# Share of each genre among all genre tags: a 1-D vector with one entry per
# genre (not one row per movie).
P_ig = sum_ / sum_.sum()
P_ig

array([0.00154223, 0.08291754, 0.05724394, 0.02766942, 0.03011884,
       0.17023496, 0.0542502 , 0.01986755, 0.19726935, 0.03528985,
       0.00385557, 0.04431643, 0.00716683, 0.01510478, 0.02599111,
       0.07216729, 0.04445251, 0.08568448, 0.01728205, 0.00757507])

## Computing R_uk

In [83]:
# Weights used in the R_uk update formula of the next cell.
d = 0.15
alpha = 0.1

# `movies` has one row per movie; all of its columns are genre indicators
# except the two non-genre columns (movieId and title).
n_movies = movies.shape[0]
n_genres = movies.shape[1] - 2  # Number of genres

In [88]:
# Uniform starting score: every (movie, genre) cell gets the same value and the
# whole matrix sums to 1.
r = 1/(n_movies*n_genres)
# Fill with the scalar `r`. The previous code passed `R`, which is not defined
# before this line on a fresh kernel (NameError).
R = np.full((n_movies, n_genres), r)

# Column sums of R weighted by the genre shares: a 1-D vector of length n_genres.
F_ig = np.sum(R, axis=0) * P_ig

# FIXME: the next line raises
#   ValueError: operands could not be broadcast together with shapes (100836,3) (20,)
# `ratings.to_numpy()` is the (n_ratings, 3) table of (userId, movieId, rating)
# rows and `P_ig` is a length-n_genres vector, so they cannot be multiplied
# element-wise. The intended shapes are still open (see the questions below).
I_uk = ratings.to_numpy() * P_ig
# FIXME: `M_csr` is (n_movies, n_movies) while `F_ig` has length n_genres, so
# `M_csr*F_ig` is not shape-compatible either. The trailing `(1-d)` term
# presumably lacks a factor (I_uk?) -- to be confirmed.
R_new = d*alpha*M_csr*R + d*(1-alpha)*M_csr*F_ig + (1-d)

# Open questions on the intended shapes:
# I_uk --> (n_user, n_film) ?
# P_ig --> (n_film, n_genre) ?

(20,)


ValueError: operands could not be broadcast together with shapes (100836,3) (20,) 

## Computing TR_ki