In [1]:
import pandas as pd
import numpy as np
import random

from surprise import Dataset
from surprise import Reader
from surprise import NMF

from scipy.spatial.distance import euclidean, pdist, squareform

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

In [5]:
def train_test_split(dataDF, training_percentage):
    '''
    Función que divide el dataset en un conjunto de entrenamiento y
    otro conjunto de evaluación.
    '''
    msk = np.random.rand(len(dataDF)) < float(training_percentage / 100)
    train = dataDF[msk]
    test = dataDF[~msk]
    
    return train, test    

# Limpiar Dataset (Solo ejecutar 1 vez)

Para este experimento cogeremos los **650 usuarios más activos y las 50 películas más valoradas** partiendo del dataset **de MovieLens de 100K valoraciones**. Esta idea la cogemos del ejemplo del paper *Exploring Explanations for Matrix Factorization Recommender Systems*.

In [6]:
all_ratings_df = pd.read_csv('data/ratings.csv')
all_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# Calculamos el número de valoraciones de casa usuarios
users_num_ratings_df = all_ratings_df.groupby(by='userId').count()

# Cogemos los 650 usuarios más activos
most_active_users = users_num_ratings_df.sort_values(by='rating', ascending=False).index[:650]

In [8]:
# Calculamos las películas más activas
movies_num_ratings_df = all_ratings_df.groupby(by='movieId').count()

# Cogemos las 50 películas con más valoraciones
most_rated_movies = movies_num_ratings_df.sort_values(by='userId', ascending=False).index[:50]

In [9]:
# Creamos un nuevo dataframe con los usuarios y películas seleccionadas
final_ratings_df = all_ratings_df.loc[all_ratings_df['movieId'].isin(most_rated_movies)]
final_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
3,1,47,5.0,964983815
4,1,50,5.0,964982931
7,1,110,4.0,964982176
15,1,260,5.0,964981680


In [10]:
total_values_to_predict = len(most_active_users) * len(most_rated_movies)
total_ratings = len(final_ratings_df)
print(total_ratings / total_values_to_predict, '% ratings conocidos del total de la matriz')

0.32154098360655736 % ratings conocidos del total de la matriz


In [11]:
# Guardamos el nuevo dataframe en un fichero CSV
final_ratings_df = final_ratings_df.reset_index().drop(columns=['index'])
final_ratings_df.to_csv('data/most_rated_dataset.csv', index=False)

In [13]:
# Preparamos el dataset de entrenamiento y evaluación
trainset, testset = train_test_split(final_ratings_df, 90)
trainset.to_csv('data/trainset.csv', index=False)
testset.to_csv('data/testset.csv', index=False)

# Factorización de Matrices

Del nuevo dataset que hemos creado, calculamos las matrices P y Q usando el algoritmode NMF definido en la librería surprise.

In [2]:
# Cargamos los datasets
trainset = pd.read_csv('data/trainset.csv', usecols=[0,1,2])
testset = pd.read_csv('data/testset.csv', usecols=[0,1,2])

# Preparamos los datos de entrenamiento y evaluación

reader = Reader(rating_scale=(1,5))

train_data = Dataset.load_from_df(trainset, reader).build_full_trainset()
test_data = Dataset.load_from_df(testset, reader).build_full_trainset().build_testset()

# Seleccionamos el algoritmo
recommendation_algorithm = NMF()

# Entrenamos el algoritmo
recommendation_algorithm.fit(train_data)

# Obetenmos las predicciones
predictions = recommendation_algorithm.test(test_data)

# Mostramos las predicciones
predictions_df = pd.DataFrame(predictions, columns=['userId', 'movieId', 'realRating', 'estRating', 'details']).drop(columns='details')
predictions_df.head()

Unnamed: 0,userId,movieId,realRating,estRating
0,1,457,5.0,4.531989
1,1,592,4.0,3.91263
2,1,1210,5.0,4.764499
3,1,2959,5.0,4.372534
4,4,588,4.0,3.386799


In [3]:
# Lo usaremos más adelante
predictions_df.loc[predictions_df['estRating'] >= 4.0].head()

Unnamed: 0,userId,movieId,realRating,estRating
0,1,457,5.0,4.531989
2,1,1210,5.0,4.764499
3,1,2959,5.0,4.372534
6,6,47,4.0,4.141829
7,6,110,5.0,4.272893


# Obtenemos las matriz Qi

In [4]:
movies_index = trainset.groupby(by='movieId').count().index
qi = recommendation_algorithm.qi

In [6]:
itemsDF = pd.DataFrame(qi, columns=['i-1','i-2','i-3','i-4','i-5','i-6','i-7','i-8','i-9','i-10','i-11','i-12','i-13','i-14','i-15'])
itemsDF['movieId'] = movies_index
itemsDF.set_index('movieId', inplace=True)
itemsDF.reset_index(inplace=True)
itemsDF.head()

Unnamed: 0,movieId,i-1,i-2,i-3,i-4,i-5,i-6,i-7,i-8,i-9,i-10,i-11,i-12,i-13,i-14,i-15
0,1,0.686926,0.840011,0.599155,0.400147,0.74834,0.41568,0.00688,0.111094,0.139339,0.477982,0.735295,0.636752,0.588507,0.629047,0.597433
1,32,0.140581,0.360797,0.421243,0.811602,0.173372,0.800286,0.107544,0.723221,0.300546,0.94781,0.414476,0.833994,0.180708,0.830025,0.608125
2,47,0.476199,0.028124,0.920408,0.202781,0.718952,0.647758,0.224847,0.973001,0.754216,0.851529,0.628756,0.452164,0.207708,0.680688,0.353078
3,50,0.915665,0.055408,0.254025,0.746733,0.564421,0.222397,0.553068,0.094984,0.707748,0.5874,0.833925,0.726116,0.341055,0.510827,0.734035
4,110,0.100078,0.423575,1.046786,1.00541,0.112906,0.32496,0.670677,0.753419,0.163927,0.429131,0.535529,0.431662,1.06049,0.557137,0.409544


# Calculamos la matriz Q de un usuario

En esta sección analizamos la matriz Q a la que se ha multiplicado el vector $p_u$ del usuario. Analizaremos distintas variantes estadísticas paara saber si hay más dimensiones influyentes que otras.

In [52]:
user = 40

In [53]:
qui = recommendation_algorithm.pu[user] * qi
qui_t = qui.transpose()
index_x = np.arange(1,15)
index_y = np.arange(1, 50)

In [54]:
qui_df = pd.DataFrame(qui, index=movies_index).reset_index()
qui_df.head()

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,0.102043,0.43622,0.503253,0.067666,0.590851,0.314204,0.003309,0.037899,0.091771,0.450847,0.322221,0.328358,0.35344,0.355765,0.07806
1,32,0.020883,0.187363,0.353818,0.137244,0.136885,0.604921,0.051731,0.246723,0.197945,0.894002,0.181632,0.430071,0.108528,0.46943,0.079457
2,47,0.070739,0.014605,0.773086,0.034291,0.567647,0.489628,0.108155,0.331933,0.49674,0.803186,0.275534,0.233171,0.124743,0.384971,0.046133
3,50,0.136022,0.028773,0.213365,0.126275,0.445638,0.168106,0.266035,0.032403,0.466136,0.554052,0.365443,0.374441,0.204828,0.288904,0.095908
4,110,0.014867,0.219964,0.879235,0.170018,0.089145,0.245631,0.322607,0.257024,0.107965,0.404768,0.23468,0.222598,0.636899,0.315095,0.05351


In [55]:
qui_desc = qui_df.describe()
qui_desc

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,2584.2,0.070577,0.264848,0.436941,0.095067,0.352353,0.39646,0.241665,0.185338,0.310758,0.465732,0.225509,0.254931,0.289473,0.285631,0.072114
std,8270.080327,0.041537,0.154459,0.262255,0.040572,0.223878,0.173453,0.124673,0.092741,0.170076,0.235748,0.11477,0.138918,0.162277,0.157817,0.032095
min,1.0,0.011766,0.011921,0.005053,0.007587,0.011489,0.05255,0.000345,0.023516,0.052166,0.06206,0.004763,0.023044,0.03962,0.001151,0.010661
25%,364.75,0.033493,0.161822,0.216954,0.075364,0.148765,0.268056,0.138997,0.122178,0.177091,0.2643,0.132351,0.167647,0.167095,0.141,0.045056
50%,594.0,0.06862,0.276806,0.460129,0.093562,0.340241,0.422459,0.255586,0.178745,0.274463,0.458273,0.21535,0.240289,0.26335,0.306507,0.071469
75%,1916.0,0.096955,0.383656,0.662595,0.125825,0.552138,0.501973,0.298703,0.259502,0.44595,0.607639,0.318967,0.372906,0.426896,0.38597,0.095278
max,58559.0,0.170752,0.586214,0.879235,0.170018,0.773795,0.766804,0.48662,0.393143,0.681179,0.955371,0.447422,0.531544,0.648363,0.559767,0.156713


In [56]:
trace = go.Heatmap(z=qui,
                   x=index_x,
                   y=index_y,
                   colorscale=[[0.0, 'rgb(255,255,255)'], [1.0, 'rgb(31, 119, 180)']])

data=[trace]
iplot(data, filename='labelled-heatmap')

In [57]:
data = []
for i in range(len(qui_t)):
    trace = go.Box(
        y = qui_t[i],
        name = str(i + 1)
    )

    data.append(trace)

iplot(data)