## Importations

In [106]:
import numpy as np
import pandas as pd
import itertools as it
import scipy
import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

## Loading data

In [107]:
PATH = "../"

movies = pd.read_csv(PATH+"movies.csv")
ratings = pd.read_csv(PATH+"ratings.csv")

## Building ratings and movies dataframe which both contains the same movieId

In [108]:
list_movieId = set(movies["movieId"]).intersection(set(ratings["movieId"]))

In [109]:
l = []
for i in range(len(movies['movieId'])):
    if movies['movieId'][i] in list_movieId:
        l.append(i)
movies = movies.iloc[l,:]

In [110]:
a = sorted(list(list_movieId))
b = range(len(a))
d = dict(zip(a,b))
movies = movies.replace({'movieId' : d})
len(set(movies['movieId']))

9724

In [111]:
a = sorted(list(list_movieId))
b = range(len(a))
d = dict(zip(a,b))
ratings = ratings.replace({'movieId' : d})
len(set(movies['movieId']))

9724

## Building one hot encoded genres in movies dataframe

In [112]:
tmp = []
for elt in movies["genres"]:
    tmp.append(elt.split("|"))
movies["genres"] = tmp

In [113]:
mlb = MultiLabelBinarizer(sparse_output=True)
movies = movies.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(movies.pop('genres')),
                index=movies.index,
                columns=mlb.classes_))

In [114]:
movies

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,3,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,4,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,9719,Black Butler: Book of the Atlantic (2017),0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9738,9720,No Game No Life: Zero (2017),0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9739,9721,Flint (2017),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9740,9722,Bungo Stray Dogs: Dead Apple (2018),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Cleaning ratings datagrame

In [115]:
ratings = ratings.drop(columns=["timestamp"])

In [116]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,0,4.0
1,1,2,4.0
2,1,5,4.0
3,1,43,5.0
4,1,46,5.0
...,...,...,...
100831,610,9416,4.0
100832,610,9443,5.0
100833,610,9444,5.0
100834,610,9445,5.0


In [117]:
ratings[ratings["movieId"] == 0]

Unnamed: 0,userId,movieId,rating
0,1,0,4.0
516,5,0,4.0
874,7,0,4.5
1434,15,0,2.5
1667,17,0,4.5
...,...,...,...
97364,606,0,2.5
98479,607,0,4.0
98666,608,0,2.5
99497,609,0,3.0


## Building a sparse matrix which contains the triple (u_k, m_i, r_ki)

In [118]:
ratings_sparse = scipy.sparse.csr_matrix(ratings.values)
ratings_sparse.shape

(100836, 3)

In [119]:
print("USER ID : u_k =",  ratings_sparse[100831, 0])
print("MOVIE ID : m_i =", ratings_sparse[100831, 1])
print("SCORE : r_ki =",   ratings_sparse[100831, 2])

USER ID : u_k = 610.0
MOVIE ID : m_i = 9416.0
SCORE : r_ki = 4.0


## Building a matrix M = (n_movies, n_movies) which contains the number of users who'se seen m_i and m_j

In [120]:
data_dict = dict()

for userId in set(ratings["userId"]):
    iterator = it.combinations(ratings[ratings['userId'] == userId]["movieId"], 2)
    for x, y in iterator:
        data_dict[(x,y)] = data_dict.get((x,y), 0) + 1
        data_dict[(y,x)] = data_dict.get((y,x), 0) + 1

In [121]:
keys = np.array(list(data_dict.keys()))

In [122]:
values = np.array(list(data_dict.values()))

In [123]:
M_coo = scipy.sparse.coo_matrix((values, (keys[:,0], keys[:,1])))
M_csr = M_coo.tocsr()

In [124]:
M_coo.shape

(9724, 9724)

In [125]:
print("The movies with ID 0 and 1 have been spotted", M_csr[0,1], "times in users watch list")

The movies with ID 0 and 1 have been spotted 68 times in users watch list


In [126]:
M_csr.shape

(9724, 9724)

## Normalizing the M matrix

In [127]:
M_norm = M_coo / M_coo.sum(axis=0)

In [128]:
M_norm

matrix([[0.        , 0.00151111, 0.00145627, ..., 0.        , 0.        ,
         0.        ],
        [0.00106875, 0.        , 0.00118322, ..., 0.        , 0.        ,
         0.        ],
        [0.00050294, 0.00057778, 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.0075188 ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.0075188 , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

## Dimension et explication

In [129]:
# uk -> user k 
# ig -> movies i, genre g 
# R_uk -> movie,genre pour user uk
# P_ig -> désigne la probabilité avec laquelle l'item i appartient au genre g
# M -> matrice correlation
# I_uk -> 
# r_ki -> 


In [130]:
#R -> n_user X n_movies X n_genres
#I -> n_user X n_movies X n_genres

#M -> n_movies X n_movies
#F_ig -> n_movies X n_genres
#I_uk -> n_movies X n_genres

## Computing probabilites of genres P_ig

In [131]:
sum_ = movies[[i for i in movies.columns if i != "movieId" and i != "title"]].to_numpy().sum(axis=0).astype(int)
P_ig = sum_ / sum(sum_)
P_ig

array([0.00154223, 0.08291754, 0.05724394, 0.02766942, 0.03011884,
       0.17023496, 0.0542502 , 0.01986755, 0.19726935, 0.03528985,
       0.00385557, 0.04431643, 0.00716683, 0.01510478, 0.02599111,
       0.07216729, 0.04445251, 0.08568448, 0.01728205, 0.00757507])

In [132]:
P_ig.shape

(20,)

## Init Computing R_uk

In [133]:
d = 0.15
alpha = 0.1
n_genres = len(movies.columns) - 2 # Number of genres
n_movies = len(movies)
n_user = len(set(ratings["userId"]))

In [134]:
r = 1/(n_movies*n_genres)
R_uk = np.full((n_movies, n_genres), r)

In [135]:
# R = np.full((n_user,n_movies, n_genres), r)
R = np.array([ np.full((n_movies, n_genres), r) for i in range(n_user)])

In [136]:
R_uk.shape

(9724, 20)

In [137]:
R.shape

(610, 9724, 20)

## Computing F_ig

In [138]:
F_ig = np.sum(R_uk, axis=1).reshape(-1,1) @ P_ig.reshape(1,-1)
F = np.array([ np.sum(R[i], axis=1).reshape(-1,1) @ P_ig.reshape(1,-1) for i in range(n_user)])

In [139]:
F_ig.shape

(9724, 20)

In [140]:
F.shape

(610, 9724, 20)

# Matrix user X movie

In [141]:
values = ratings["rating"]
rows = ratings["userId"]
cols = ratings["movieId"]
M_coo2 = scipy.sparse.coo_matrix((values, (rows, cols)))
M_csr2 = M_coo2.tocsr()
M_csr2.shape

(611, 9724)

In [142]:
# Exemple user 1 ( Attention les users commence a 1 DONC pour user 0 marche pas cf after on decale de 1 les id_users)
id_user = 1

## Computing I_uk

In [143]:
I_uk = M_csr2[id_user,:].T @ P_ig.reshape(1,-1)
I_uk = I_uk / I_uk.sum(axis=0).T
I_uk.shape

(9724, 20)

In [146]:
# warning a cause de ca ( Attention les users commence a 1 DONC pour user 0 marche pas cf after on decale de 1 les id_users) 
I = []
for id_user in range(n_user) :
    I_temps = M_csr2[id_user,:].T @ P_ig.reshape(1,-1)
    I_temps = I_temps / I_temps.sum(axis=0).T
    I.append(I_temps)
I = np.array(I)

  I_temps = I_temps / I_temps.sum(axis=0).T


In [147]:
I.shape

(610, 9724, 20)

## Computing R_uk

In [148]:
# d -> 1
# alpha -> 1
# M_csr -> (9724, 9724)
# R_uk -> (9724, 20)
# F_ig -> (9724, 20)

R_uk = d*alpha*M_csr@R_uk + d*(1-alpha)*M_csr@F_ig + (1-d)*I_uk
R_uk

array([[9.62606067e-03, 8.15073859e-02, 5.88291082e-02, ...,
        8.39515112e-02, 2.35295277e-02, 1.49550552e-02],
       [4.43429109e-03, 5.52729362e-02, 3.92335420e-02, ...,
        5.70015635e-02, 1.42676299e-02, 8.20326534e-03],
       [5.52168083e-03, 3.03467561e-02, 2.25145417e-02, ...,
        3.11908640e-02, 1.03234094e-02, 7.36211284e-03],
       ...,
       [1.31057937e-05, 1.63362234e-04, 1.15956913e-04, ...,
        1.68471288e-04, 4.21687729e-05, 2.42452064e-05],
       [1.31057937e-05, 1.63362234e-04, 1.15956913e-04, ...,
        1.68471288e-04, 4.21687729e-05, 2.42452064e-05],
       [1.64561469e-05, 2.05124008e-04, 1.45600034e-04, ...,
        2.11539136e-04, 5.29487599e-05, 3.04432291e-05]])

In [163]:
R_uk.shape

(9724, 20)

In [150]:
R = np.array([d*alpha*M_csr@R[i]+ d*(1-alpha)*M_csr@F[i] + (1-d)*I[i] for i in range(n_user)])
R.shape

(610, 9724, 20)

## Computing TR_ki

In [170]:
TR_ki = (R_uk @ P_ig)
TR = np.array([(R[i] @ P_ig) for i in range(n_user)])
TR.shape

(610, 9724)

## Best movie

In [174]:
# Trie les recommendations
for i in range(len(TR)) :
    TR[i,:] = np.argsort(TR[i,:])[::-1]

In [175]:
top_10 = TR[:,:10]
top_10.shape

(610, 10)

In [179]:
top_10[1,:]

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [None]:
# brouillon

In [None]:
# ind_one_user  =ratings[ratings['userId'] ==1]['movieId'].to_numpy()