In [1]:
import pandas as pd
import numpy as np
import random

from surprise import Dataset
from surprise import Reader
from surprise import NMF

from scipy.spatial.distance import euclidean, pdist, squareform

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

In [5]:
def train_test_split(dataDF, training_percentage):
    '''
    Función que divide el dataset en un conjunto de entrenamiento y
    otro conjunto de evaluación.
    '''
    msk = np.random.rand(len(dataDF)) < float(training_percentage / 100)
    train = dataDF[msk]
    test = dataDF[~msk]
    
    return train, test    

# Limpiar Dataset (Solo ejecutar 1 vez)

Para este experimento cogeremos los **650 usuarios más activos y las 50 películas más valoradas** partiendo del dataset **de MovieLens de 100K valoraciones**. Esta idea la cogemos del ejemplo del paper *Exploring Explanations for Matrix Factorization Recommender Systems*.

In [6]:
all_ratings_df = pd.read_csv('data/ratings.csv')
all_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# Calculamos el número de valoraciones de casa usuarios
users_num_ratings_df = all_ratings_df.groupby(by='userId').count()

# Cogemos los 650 usuarios más activos
most_active_users = users_num_ratings_df.sort_values(by='rating', ascending=False).index[:650]

In [8]:
# Calculamos las películas más activas
movies_num_ratings_df = all_ratings_df.groupby(by='movieId').count()

# Cogemos las 50 películas con más valoraciones
most_rated_movies = movies_num_ratings_df.sort_values(by='userId', ascending=False).index[:50]

In [9]:
# Creamos un nuevo dataframe con los usuarios y películas seleccionadas
final_ratings_df = all_ratings_df.loc[all_ratings_df['movieId'].isin(most_rated_movies)]
final_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
3,1,47,5.0,964983815
4,1,50,5.0,964982931
7,1,110,4.0,964982176
15,1,260,5.0,964981680


In [10]:
total_values_to_predict = len(most_active_users) * len(most_rated_movies)
total_ratings = len(final_ratings_df)
print(total_ratings / total_values_to_predict, '% ratings conocidos del total de la matriz')

0.32154098360655736 % ratings conocidos del total de la matriz


In [11]:
# Guardamos el nuevo dataframe en un fichero CSV
final_ratings_df = final_ratings_df.reset_index().drop(columns=['index'])
final_ratings_df.to_csv('data/most_rated_dataset.csv', index=False)

In [13]:
# Preparamos el dataset de entrenamiento y evaluación
trainset, testset = train_test_split(final_ratings_df, 90)
trainset.to_csv('data/trainset.csv', index=False)
testset.to_csv('data/testset.csv', index=False)

# Factorización de Matrices

Del nuevo dataset que hemos creado, calculamos las matrices P y Q usando el algoritmode NMF definido en la librería surprise.

In [2]:
# Cargamos los datasets
trainset = pd.read_csv('data/trainset.csv', usecols=[0,1,2])
testset = pd.read_csv('data/testset.csv', usecols=[0,1,2])

# Preparamos los datos de entrenamiento y evaluación

reader = Reader(rating_scale=(1,5))

train_data = Dataset.load_from_df(trainset, reader).build_full_trainset()
test_data = Dataset.load_from_df(testset, reader).build_full_trainset().build_testset()

# Seleccionamos el algoritmo
recommendation_algorithm = NMF()

# Entrenamos el algoritmo
recommendation_algorithm.fit(train_data)

# Obetenmos las predicciones
predictions = recommendation_algorithm.test(test_data)

# Mostramos las predicciones
predictions_df = pd.DataFrame(predictions, columns=['userId', 'movieId', 'realRating', 'estRating', 'details']).drop(columns='details')
predictions_df.head()

Unnamed: 0,userId,movieId,realRating,estRating
0,1,457,5.0,4.531989
1,1,592,4.0,3.91263
2,1,1210,5.0,4.764499
3,1,2959,5.0,4.372534
4,4,588,4.0,3.386799


In [3]:
# Lo usaremos más adelante
predictions_df.loc[predictions_df['estRating'] >= 4.0].head()

Unnamed: 0,userId,movieId,realRating,estRating
0,1,457,5.0,4.531989
2,1,1210,5.0,4.764499
3,1,2959,5.0,4.372534
6,6,47,4.0,4.141829
7,6,110,5.0,4.272893


# Obtenemos las matriz Qi

In [4]:
movies_index = trainset.groupby(by='movieId').count().index
qi = recommendation_algorithm.qi

In [6]:
itemsDF = pd.DataFrame(qi, columns=['i-1','i-2','i-3','i-4','i-5','i-6','i-7','i-8','i-9','i-10','i-11','i-12','i-13','i-14','i-15'])
itemsDF['movieId'] = movies_index
itemsDF.set_index('movieId', inplace=True)
itemsDF.reset_index(inplace=True)
itemsDF.head()

Unnamed: 0,movieId,i-1,i-2,i-3,i-4,i-5,i-6,i-7,i-8,i-9,i-10,i-11,i-12,i-13,i-14,i-15
0,1,0.686926,0.840011,0.599155,0.400147,0.74834,0.41568,0.00688,0.111094,0.139339,0.477982,0.735295,0.636752,0.588507,0.629047,0.597433
1,32,0.140581,0.360797,0.421243,0.811602,0.173372,0.800286,0.107544,0.723221,0.300546,0.94781,0.414476,0.833994,0.180708,0.830025,0.608125
2,47,0.476199,0.028124,0.920408,0.202781,0.718952,0.647758,0.224847,0.973001,0.754216,0.851529,0.628756,0.452164,0.207708,0.680688,0.353078
3,50,0.915665,0.055408,0.254025,0.746733,0.564421,0.222397,0.553068,0.094984,0.707748,0.5874,0.833925,0.726116,0.341055,0.510827,0.734035
4,110,0.100078,0.423575,1.046786,1.00541,0.112906,0.32496,0.670677,0.753419,0.163927,0.429131,0.535529,0.431662,1.06049,0.557137,0.409544


# Calculamos la matriz Q de un usuario

En esta sección analizamos la matriz Q a la que se ha multiplicado el vector $p_u$ del usuario. Analizaremos distintas variantes estadísticas paara saber si hay más dimensiones influyentes que otras.

In [58]:
user = 400

In [59]:
qui = recommendation_algorithm.pu[user] * qi
qui_t = qui.transpose()
index_x = np.arange(1,15)
index_y = np.arange(1, 50)

In [60]:
qui_df = pd.DataFrame(qui, index=movies_index).reset_index()
qui_df.head()

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,0.200012,0.598607,0.439763,0.19255,0.392775,0.275209,0.002097,0.012423,0.127835,0.430718,0.459294,0.301555,0.362501,0.450194,0.380272
1,32,0.040933,0.25711,0.30918,0.39054,0.090996,0.529846,0.032777,0.080873,0.275734,0.854089,0.258898,0.394966,0.11131,0.59403,0.387078
2,47,0.138654,0.020042,0.675553,0.097577,0.37735,0.428862,0.068529,0.108804,0.69195,0.767328,0.392746,0.214138,0.127941,0.487152,0.224738
3,50,0.266614,0.039484,0.186447,0.359325,0.296243,0.147243,0.168563,0.010621,0.649319,0.529317,0.520903,0.343877,0.210079,0.365587,0.467221
4,110,0.02914,0.301848,0.768311,0.4838,0.05926,0.215146,0.204408,0.084249,0.150394,0.386697,0.334512,0.204428,0.653228,0.39873,0.260679


In [61]:
qui_desc = qui_df.describe()
qui_desc

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,2584.2,0.138337,0.363441,0.381816,0.270521,0.234231,0.347256,0.153122,0.060751,0.43288,0.444939,0.32144,0.234122,0.296895,0.361445,0.351306
std,8270.080327,0.081416,0.211957,0.229169,0.115451,0.148826,0.151926,0.078994,0.030399,0.236913,0.225223,0.163593,0.127579,0.166437,0.199705,0.156353
min,1.0,0.023063,0.016358,0.004415,0.021589,0.007637,0.046028,0.000218,0.007708,0.072666,0.059289,0.006789,0.021163,0.040636,0.001457,0.051935
25%,364.75,0.06565,0.222061,0.189583,0.214454,0.098894,0.234789,0.08807,0.040048,0.246685,0.2525,0.188653,0.153963,0.171379,0.178425,0.219491
50%,594.0,0.134501,0.37985,0.40208,0.266238,0.226179,0.370029,0.161943,0.05859,0.382322,0.437814,0.30696,0.220675,0.270102,0.387862,0.348167
75%,1916.0,0.190039,0.526475,0.579002,0.358046,0.36704,0.439675,0.189262,0.085061,0.6212,0.580511,0.454655,0.342467,0.437841,0.488417,0.46415
max,58559.0,0.334687,0.804438,0.768311,0.4838,0.514389,0.671639,0.308329,0.128867,0.94887,0.912718,0.637755,0.488156,0.664986,0.708344,0.763437


In [62]:
trace = go.Heatmap(z=qui,
                   x=index_x,
                   y=index_y,
                   colorscale=[[0.0, 'rgb(255,255,255)'], [1.0, 'rgb(31, 119, 180)']])

data=[trace]
iplot(data, filename='labelled-heatmap')

In [63]:
data = []
for i in range(len(qui_t)):
    trace = go.Box(
        y = qui_t[i],
        name = str(i + 1)
    )

    data.append(trace)

iplot(data)