## Importations

In [1]:
import numpy as np
import pandas as pd
import itertools as it
import scipy
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

# Global variables

In [2]:
PATH = "../data/ml-latest-small/"

## Loading data

In [3]:
def load_data(path):
	movies = pd.read_csv(path+"movies.csv")
	ratings = pd.read_csv(path+"ratings.csv")
	return movies, ratings

In [4]:
movies, ratings = load_data(PATH)

# Preprocessing data

In [5]:
## Building ratings and movies dataframe which both contains the same movieId
def clear_dataset(movies, ratings):
	list_movieId = set(movies["movieId"]).intersection(set(ratings["movieId"]))

	l = []
	for i in range(len(movies['movieId'])):
		if movies['movieId'][i] in list_movieId:
			l.append(i)
	movies = movies.iloc[l,:]

	a = sorted(list(list_movieId))
	b = range(len(a))
	d = dict(zip(a,b))
	movies = movies.replace({'movieId' : d})

	a = sorted(list(list_movieId))
	b = range(len(a))
	d = dict(zip(a,b))
	ratings = ratings.replace({'movieId' : d})

	return movies, ratings	
	
## Building one hot encoded genres in movies dataframe
def one_hot_encode_genres(movies):
	tmp = []
	for elt in movies["genres"]:
		tmp.append(elt.split("|"))
	movies["genres"] = tmp

	mlb = MultiLabelBinarizer(sparse_output=True)
	movies = movies.join(
				pd.DataFrame.sparse.from_spmatrix(
												mlb.fit_transform(movies.pop('genres')),
												index=movies.index,
												columns=mlb.classes_))

	return movies

## Cleaning ratings datagrame
def preprocess_ratings(ratings):
	ratings = ratings.drop(columns=["timestamp"])
	ratings['userId'] = ratings['userId'].to_numpy() - 1 # car pas de user 0
	return ratings

def split_set(userId, train_size, ratings):
	rating_user = ratings[ratings["userId"] == userId]
	train_rating_user, test_rating_user = rating_user.to_numpy()[:int(train_size*len(rating_user))], rating_user.to_numpy()[int(train_size*len(rating_user)):]
	return train_rating_user, test_rating_user

def get_infos_user(userId):
	watched_user = set(ratings[ratings["userId"] == userId]["movieId"])
	watched_all = set(ratings['movieId'])
	unwatched_user = list(watched_all.difference(watched_user))
	return watched_user, watched_all, unwatched_user

In [6]:
movies, ratings = clear_dataset(movies, ratings)
movies = one_hot_encode_genres(movies)
ratings = preprocess_ratings(ratings)

# Building matrix

In [7]:
# Building a sparse matrix which contains the triple (u_k, m_i, r_ki)
# def build_sparse_matrix_triples(ratings):
# 	ratings_sparse = scipy.sparse.csr_matrix(ratings.values)
# 	return ratings_sparse

## Building a matrix M = (n_movies, n_movies) which contains the number of users who'se seen m_i and m_j
def build_M_matrix(ratings, train_size):
	data_dict = dict()
	test_rating_user_list = []
	for userId in tqdm(set(ratings["userId"])):
		train_rating_user, test_rating_user = split_set(userId, train_size, ratings)
		test_rating_user_list.append(test_rating_user)
		iterator = it.combinations(train_rating_user[:,1], 2)
		for x, y in iterator:
			data_dict[(x,y)] = data_dict.get((x,y), 0) + 1
			data_dict[(y,x)] = data_dict.get((y,x), 0) + 1
		iterator = it.combinations(test_rating_user[:,1], 2)
		for x, y in iterator:
			data_dict[(x,y)] = 1
			data_dict[(y,x)] = 1
	keys = np.array(list(data_dict.keys())).astype(int)
	values = np.array(list(data_dict.values())).astype(int)
	# print(keys)
	# print("====")
	# print(values)
	M_coo = scipy.sparse.coo_matrix((values, (keys[:,0], keys[:,1])))
	M_csr = M_coo.tocsr()
	M_norm = M_csr / M_csr.sum(axis=0)
	return M_norm, test_rating_user_list

## Computing probabilites of genres P_ig
def build_P_ig(movies):
	sum_ = movies[[i for i in movies.columns if i != "movieId" and i != "title"]].to_numpy().sum(axis=0).astype(int)
	P_ig = sum_ / sum(sum_)
	return P_ig.reshape(-1, 1)

## Initialisation of R_uk before iterative algorithm
def init_R_uk(movies):
	n_genres = len(movies.columns) - 2
	n_movies = len(movies)
	r = 1/(n_movies*n_genres)
	R_uk = np.full((n_movies, n_genres), r)
	return R_uk

## Computing F_ig for each user
def build_F_ig(R_uk, P_ig):
	F_ig = np.sum(R_uk, axis=1).reshape(-1,1) @ P_ig.reshape(1,-1)
	return F_ig

## Matrix user X movie
def build_ratings_matrix(ratings):
	values = ratings["rating"]
	rows = ratings["userId"]
	cols = ratings["movieId"]
	M_coo = scipy.sparse.coo_matrix((values, (rows, cols)))
	M_csr = M_coo.tocsr()
	return M_csr

## Build I_uk for each user
def build_I_uk(tmp_M, id_user, P_ig):
# 	print(tmp_M[id_user,:].T.shape)
	I_uk = tmp_M[id_user,:].T @ P_ig.reshape(1,-1)
	I_uk = I_uk / I_uk.sum(axis=0).T
	return I_uk

def compute_R_uk(id_user, R_uk, tmp_M, P_ig, M_csr, d, alpha, iter_max):
	I_uk = build_I_uk(tmp_M, id_user, P_ig)
	for _ in range(iter_max):
		F_ig = build_F_ig(R_uk, P_ig)
		R_uk = d * alpha * M_csr @ R_uk + d * (1-alpha) * M_csr @ F_ig + (1-d) * I_uk

		# This part is usefull if you want to normalize + break if converge
		# R_uk = (R_uk / R_uk.sum(axis=1)).T # Normalization isn't working
		#     print(np.abs(np.sum(R_uk - R_uk_old)))
		#     if np.abs(np.sum(R_uk - R_uk_old)) < eps :
		#         print(i)
		#         break
		# R_uk_old = R_uk.copy()
	
	TR_ki = np.array(R_uk @ P_ig) # It returns a np.mat object which can't be reduced to dimension 1
	return TR_ki.reshape(-1)

def iterative_R_uk(n_user, R_uk, tmp_M, P_ig, M_csr, d=0.15, alpha=0.1, iter_max=10):
	print("Computing TR_ki for all users...")
	TR_ki_all_user = []
	for id_user in tqdm(range(n_user)):
		TR_ki_all_user.append(compute_R_uk(id_user, R_uk, tmp_M, P_ig, M_csr, d, alpha, iter_max))
	return np.array(TR_ki_all_user)

def init(movies, ratings, train_size):
	print("Init R_uk...")
	R_uk = init_R_uk(movies)
	# print(R_uk.shape)
	print("Building P_ig...")
	tmp_M = build_ratings_matrix(ratings)
	P_ig = build_P_ig(movies)
	# print(P_ig.shape)
	print("Building M_csr...")
	M_csr, test_rating_user_list = build_M_matrix(ratings, train_size)
	# print(M_csr.shape)
	return R_uk, tmp_M, P_ig, M_csr, np.array(test_rating_user_list)

## Metrics

In [8]:
def sort_by_best_movie(TR_ki_all_user):
	sorted_movies_all_user = np.zeros_like(TR_ki_all_user)
	for i in range(len(TR_ki_all_user)):
		sorted_movies_all_user[i,:] = np.argsort(TR_ki_all_user[i,:])[::-1]
	return sorted_movies_all_user

# def compute_doa_score(TR_ki, test_rating_user, unwatched_user):
# 	score = 0
# 	for m_i in test_rating_user:
# 		for m_j in unwatched_user:
# 			if TR_ki[int(m_i)] > TR_ki[int(m_j)]:
# 				score += 1
# 	return score / (len(test_rating_user) * len(unwatched_user))

# def compute_all_doa(TR_ki_all_user):
# 	n_user = TR_ki_all_user.shape[0]
# 	score = 0
# 	for userId in range(n_user):
# 		_, test_rating_user = split_set(userId, train_size)
# 		_, _, unwatched_user = get_infos_user(userId)
# 		score += compute_doa_score(TR_ki_all_user[userId,:], test_rating_user, unwatched_user)
# 	return score / n_user

In [9]:
train_size = 0.7

R_uk, tmp_M, P_ig, M_csr, test_rating_user_list = init(movies, ratings, train_size)

Init R_uk...
Building P_ig...
Building M_csr...


100%|██████████| 610/610 [00:15<00:00, 38.17it/s]
  return R_uk, tmp_M, P_ig, M_csr, np.array(test_rating_user_list)


In [10]:
n_user = len(np.unique(ratings["userId"]))
n_user = 1

TR_ki_all_user = iterative_R_uk(n_user, R_uk, tmp_M, P_ig, M_csr, d=0.15, alpha=0.1, iter_max=10)

Computing TR_ki for all users...


100%|██████████| 1/1 [00:47<00:00, 47.77s/it]


In [11]:
test_user_id = 0

print("TR_ki_all_user shape:", TR_ki_all_user.shape)
print("test_rating_user_list shape:", test_rating_user_list.shape)
print("TR_ki for test user :\n", TR_ki_all_user[test_user_id, :10])

TR_ki_all_user shape: (1, 9724)
test_rating_user_list shape: (610,)
TR_ki for test user :
 [3.86440122e-03 3.39452951e-04 3.55532392e-03 1.21628588e-05
 1.31800830e-04 3.66160230e-03 1.32070721e-04 3.63199808e-05
 2.91618793e-05 3.35040980e-04]


In [12]:
sorted_movies_all_user = sort_by_best_movie(TR_ki_all_user)
print("sorted_movies_all_user shape:", sorted_movies_all_user.shape)
print("Sorted best movies recommandation for test user :\n", sorted_movies_all_user[test_user_id,:10])

sorted_movies_all_user shape: (1, 9724)
Sorted best movies recommandation for test user :
 [ 224.  897.   43.   46.  910.  899.  520.  968.  827. 1938.]


In [13]:
# Split train test sur les users????????
# Entrainement sur le train 
# Test a la fin comparaison NDCG entre tri du ranking et notes de l'utilisateur 

# Split train test sur les ratings par users (on ignore les ratings test pour le train)
# Entrainement sur le train (il reste le meme, on a juste caché des notes)
# Test a la fin comparaison NDCG entre tri du ranking (donné par l'algo grace au train) et notes de l'utilisateur (qu'on a garder en test)
# Test avec le DOA entre les films (qu'on a cache) et les films unwatched ?

# On prend pour un utilisateur sa liste de film + ses notes
# On montre cette liste a des nouveaux utilisateurs puis on leur demande parmis les films que notre algo sort quel est celui qui devrait etre recommandé
# On compare les resultats avec notre ranking reel apres train
# Test de student 

In [14]:
test_user_id = 0
train_rating_user0, test_rating_user0 = split_set(test_user_id, train_size, ratings)
watched_user0, watched_all0, unwatched_user0 = get_infos_user(test_user_id)
print(len(watched_user0), len(watched_all0), len(unwatched_user0))
print(compute_doa_score(TR_ki_all_user[test_user_id], test_rating_user0, unwatched_user0))

test_user_id = 1
train_rating_user0, test_rating_user0 = split_set(test_user_id, train_size, ratings)
watched_user0, watched_all0, unwatched_user0 = get_infos_user(test_user_id)
print(len(watched_user0), len(watched_all0), len(unwatched_user0))
print(compute_doa_score(TR_ki_all_user[test_user_id], test_rating_user0, unwatched_user0))

test_user_id = 2
train_rating_user0, test_rating_user0 = split_set(test_user_id, train_size, ratings)
watched_user0, watched_all0, unwatched_user0 = get_infos_user(test_user_id)
print(len(watched_user0), len(watched_all0), len(unwatched_user0))
print(compute_doa_score(TR_ki_all_user[test_user_id], test_rating_user0, unwatched_user0))

test_user_id = 3
rating_user = ratings[ratings["userId"] == 3]
# print(rating_user)
# train_rating_user0, test_rating_user0 = split_set(test_user_id, train_size, ratings)
# watched_user0, watched_all0, unwatched_user0 = get_infos_user(test_user_id)
# print(len(watched_user0), len(watched_all0), len(unwatched_user0))
# print(compute_doa_score(TR_ki_all_user[test_user_id], test_rating_user0, unwatched_user0))

# test_user_id = 4
# train_rating_user0, test_rating_user0 = split_set(test_user_id, train_size, ratings)
# watched_user0, watched_all0, unwatched_user0 = get_infos_user(test_user_id)
# print(len(watched_user0), len(watched_all0), len(unwatched_user0))
# print(compute_doa_score(TR_ki_all_user[test_user_id], test_rating_user0, unwatched_user0))

232 9724 9492


NameError: name 'compute_doa_score' is not defined

In [None]:
print(compute_all_doa(TR_ki_all_user))

IndexError: index 3 is out of bounds for axis 1 with size 3

## Dimension & explication

In [None]:
# uk -> user k 
# ig -> movies i, genre g 
# R_uk -> movie,genre pour user uk
# P_ig -> désigne la probabilité avec laquelle l'item i appartient au genre g
# M -> matrice correlation
# F -> 
# I_uk -> 
# r_ki -> 

#R -> n_user X n_movies X n_genres
#I -> n_user X n_movies X n_genres

#M -> n_movies X n_movies
#F_ig -> n_movies X n_genres
#I_uk -> n_movies X n_genres