In [16]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

from surprise import Dataset
from surprise import Reader
from surprise import NMF

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# Ponemos una semilla fija para que el experimento sea reusable
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

# Calculo Factorización de Matrices

En este notebook entrenaremos la factorización de matrices aplicando el algoritmo **NMF**. Una vez entrenado, ya tendremos la matriz $P$ (relaciona usuario con las dimensiones) y la matriz $Q$ (relaciona películas con dimensiones). Al final, en la carpeta `data/matrices_data` tendremos la matriz $Q_u$ de cada usuario. Esto es la matriz $Q$ multiplicada por el vector en $P$ de cada usuario.

# Preparar dataset

Separaremos los datos de los ratings en 2 grupos, entrenamiento (que tendrá el 90% de los datos) y evaluación (que tendrá el 10% de los datos).

In [17]:
def train_test_split(dataDF, training_percentage):
    '''
    Función que divide el dataset en un conjunto de entrenamiento y
    otro conjunto de evaluación.
    '''
    msk = np.random.rand(len(dataDF)) < float(training_percentage / 100)
    train = dataDF[msk]
    test = dataDF[~msk]
    
    return train, test    

In [18]:
# Cargamos todos los ratings del experimento
ratings_DF = pd.read_csv('data/experiment_data/ratings.csv')
ratings_DF.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,47,5.0,964983815
2,1,50,5.0,964982931
3,1,70,3.0,964982400
4,1,101,5.0,964980868


In [19]:
# Calculamos el porcentaje de la matriz completado

num_users = len(ratings_DF.groupby('userId').count().index)
num_items = len(ratings_DF.groupby('movieId').count().index)
total_ratings = len(ratings_DF)
total_values_to_predict = num_users * num_items

print(100*(total_ratings / total_values_to_predict), '% ratings conocidos del total de la matriz')

11.92192628910957 % ratings conocidos del total de la matriz


In [20]:
# Preparamos el dataset de entrenamiento y evaluación
trainset, testset = train_test_split(ratings_DF, 90)

# Guardamos ambos dataframes en CSV
trainset.to_csv('data/experiment_data/trainset.csv', index=False)
testset.to_csv('data/experiment_data/testset.csv', index=False)

## Entrenamiento del algoritmo

Del nuevo dataset que hemos creado, calculamos las matrices P y Q usando el algoritmode NMF definido en la librería surprise.

In [21]:
# Cargamos los datasets
trainset_DF = pd.read_csv('data/experiment_data/trainset.csv', usecols=[0,1,2])
testset_DF = pd.read_csv('data/experiment_data/testset.csv', usecols=[0,1,2])

# Preparamos los datos de entrenamiento y evaluación

reader = Reader(rating_scale=(1,5))

train_data = Dataset.load_from_df(trainset_DF, reader).build_full_trainset()
test_data = Dataset.load_from_df(testset_DF, reader).build_full_trainset().build_testset()

# Seleccionamos el algoritmo
recommendation_algorithm = NMF()

# Entrenamos el algoritmo
recommendation_algorithm.fit(train_data)

# Obetenmos las predicciones
predictions = recommendation_algorithm.test(test_data)

# Mostramos las predicciones
predictions_df = pd.DataFrame(predictions, columns=['userId', 'movieId', 'realRating', 'estRating', 'details']).drop(columns='details')
predictions_df.head()

Unnamed: 0,userId,movieId,realRating,estRating
0,1,223,3.0,3.611623
1,1,349,4.0,4.004865
2,1,527,5.0,4.1468
3,3,527,0.5,3.610407
4,4,357,3.0,2.097807


## Calculamos las matrices $Q_u$

A continuación, calculamos las matrices $Q_u$ de cada usuario y lo guardamos en ficheros CSV.

In [22]:
# Obtenemos las matrices P y Q
p = pd.DataFrame(recommendation_algorithm.pu)
p['userId'] = sorted(list(set(trainset_DF['userId'])))
p.set_index('userId',inplace=True)
p.reset_index(inplace=True)

q = recommendation_algorithm.qi

In [23]:
# Por cada usuario calculamos su Qu y la gaurdamos en un CSV

for u in tqdm(range(len(p))):
    user_id = int(p.loc[u]['userId'])
    user_vector = p.loc[u][1:].values

    q_u = user_vector * q
    q_u_df = pd.DataFrame(q_u)
    q_u_df['movieId'] = sorted(list(set(trainset_DF['movieId'])))
    q_u_df.set_index('movieId', inplace=True)
    q_u_df.reset_index(inplace=True)
    
    path = 'data/matrices_data/q_user_' + str(user_id) + '.csv'
    q_u_df.to_csv(path, index=False)

100%|██████████| 584/584 [00:02<00:00, 207.77it/s]


# Calcular la predicción de ratings a partir de Qu

In [24]:
def get_predicted_rating(row):
    user = int(row['userId'])
    movie = int(row['movieId'])
    
    q_u = pd.read_csv('data/matrices_data/q_user_' + str(user) + '.csv')
    values = q_u[q_u['movieId'] == movie].values[0]
    return np.sum(values[1:])

In [25]:
testset_DF = testset_DF[testset_DF['userId'].isin(set(trainset_DF['userId']))]

In [26]:
testset_DF['predicted'] = testset_DF.apply(lambda row: get_predicted_rating(row), axis=1)
testset_DF.head()

Unnamed: 0,userId,movieId,rating,predicted
0,1,223,3.0,4.042081
1,1,349,4.0,2.726556
2,1,527,5.0,2.743715
4,4,357,3.0,2.737091
5,5,110,4.0,3.492654


In [27]:
testset_DF.to_csv('data/experiment_data/predicted_values.csv', index=False)

## Visualizar matrices

En esta sección se pueden visualizar las matrices $Q_u$. Se muestra un mapa de calor de la matriz y un analisis estádistico de cada uno de los factores de la matriz $Q_u$. 

In [28]:
# Seleccionar el usuario que vamos a mostrar
user_id = 462
path = 'data/matrices_data/q_user_' + str(user_id) + '.csv'

In [29]:
q_u_DF = pd.read_csv(path)
q_u_DF.head()

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,0.136041,0.082501,0.117668,0.130931,0.280936,0.007431,0.095236,0.211363,0.173855,0.045931,0.220878,0.090647,0.076499,0.343405,0.310501
1,10,0.181177,0.067242,0.238208,1.023316,0.181374,0.002157,0.160619,0.152972,0.120773,0.012701,0.52252,0.080124,0.036222,0.425558,0.195651
2,11,0.145755,0.054689,0.050525,1.425273,0.056845,0.005208,0.153412,0.118537,0.00787,0.059453,0.481623,0.235629,0.082038,0.28828,0.337147
3,14,0.048767,0.018607,0.089322,0.352244,0.085213,0.002862,0.169502,0.074206,0.16171,0.062029,0.663884,0.059039,0.06694,0.963233,0.171767
4,15,0.115345,0.003413,0.048877,0.4739,0.399656,0.003359,0.071964,0.002044,0.109401,0.021648,0.354788,0.367428,0.078747,0.429475,0.365689


In [30]:
q_u_describe = q_u_DF.describe()
q_u_describe

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0
mean,283.335366,0.090701,0.052263,0.172284,0.509253,0.1750447,0.005625,0.094214,0.117835,0.129623,0.04546,0.32039,0.102674,0.063361,0.33058,0.178559
std,185.085398,0.065596,0.034972,0.111066,0.356692,0.1116147,0.003805,0.059737,0.075767,0.082219,0.028391,0.211369,0.073032,0.03582,0.213768,0.12627
min,1.0,0.002252,1.5e-05,0.001155,0.01782,4.130156e-07,1e-05,0.001799,0.000843,0.001946,0.000148,0.000171,6.2e-05,0.000929,0.000139,0.000513
25%,111.75,0.039366,0.027222,0.088988,0.199168,0.08065229,0.002349,0.043447,0.063226,0.063874,0.022495,0.161124,0.04254,0.03634,0.170491,0.077195
50%,263.5,0.074832,0.046299,0.167704,0.471996,0.1720126,0.005282,0.086514,0.10772,0.122406,0.044368,0.298579,0.084836,0.063639,0.297635,0.162857
75%,444.25,0.141324,0.07441,0.229385,0.691944,0.250561,0.008031,0.144501,0.175923,0.181259,0.062455,0.453982,0.160443,0.085493,0.437913,0.250577
max,610.0,0.2719,0.160832,0.68024,1.80358,0.4759954,0.017123,0.255132,0.327287,0.365244,0.148085,0.914124,0.367428,0.177224,0.995278,0.533481


In [31]:
trace = go.Heatmap(z=q_u_DF.iloc[:,1:].values,
                   x=q_u_DF.columns[1:].values,
                   y=q_u_DF.index.values,
                   colorscale=[[0.0, 'rgb(255,255,255)'], [1.0, 'rgb(31, 119, 180)']])

data=[trace]
iplot(data, filename='labelled-heatmap')

In [32]:
aux = q_u_DF.drop(columns=['movieId']).transpose()
data = []
for i in range(len(aux)):
    trace = go.Box(
        y = aux.iloc[i,:].values,
        name = str(i)
    )

    data.append(trace)

iplot(data)

### Siguiente Notebook

Una vez calculado las matrices $Q$ por cada usuario ya podemos obtener películas similares a partir de esa información y estudiar las características en común. Ejecute el notebook `recomendador_con_explicaciones.ipynb`.