In [31]:
import pandas as pd
import numpy as np
import random

from surprise import Dataset
from surprise import Reader
from surprise import NMF

from scipy.spatial.distance import euclidean, pdist, squareform

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

In [32]:
def train_test_split(dataDF, training_percentage):
    '''
    Función que divide el dataset en un conjunto de entrenamiento y
    otro conjunto de evaluación.
    '''
    msk = np.random.rand(len(dataDF)) < float(training_percentage / 100)
    train = dataDF[msk]
    test = dataDF[~msk]
    
    return train, test    

# Limpiar Dataset (Solo ejecutar 1 vez)

Para este experimento cogeremos los **650 usuarios más activos y las 50 películas más valoradas** partiendo del dataset **de MovieLens de 100K valoraciones**. Esta idea la cogemos del ejemplo del paper *Exploring Explanations for Matrix Factorization Recommender Systems*.

In [33]:
all_ratings_df = pd.read_csv('data/ratings.csv')
all_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [34]:
# Calculamos el número de valoraciones de casa usuarios
users_num_ratings_df = all_ratings_df.groupby(by='userId').count()

# Cogemos los 650 usuarios más activos
most_active_users = users_num_ratings_df.sort_values(by='rating', ascending=False).index[:650]

In [35]:
# Calculamos las películas más activas
movies_num_ratings_df = all_ratings_df.groupby(by='movieId').count()

# Cogemos las 50 películas con más valoraciones
most_rated_movies = movies_num_ratings_df.sort_values(by='userId', ascending=False).index[:50]

In [36]:
# Creamos un nuevo dataframe con los usuarios y películas seleccionadas
final_ratings_df = all_ratings_df.loc[all_ratings_df['movieId'].isin(most_rated_movies)]
final_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
3,1,47,5.0,964983815
4,1,50,5.0,964982931
7,1,110,4.0,964982176
15,1,260,5.0,964981680


In [37]:
total_values_to_predict = len(most_active_users) * len(most_rated_movies)
total_ratings = len(final_ratings_df)
print(100*(total_ratings / total_values_to_predict), '% ratings conocidos del total de la matriz')

0.32154098360655736 % ratings conocidos del total de la matriz


In [38]:
# Guardamos el nuevo dataframe en un fichero CSV
final_ratings_df = final_ratings_df.reset_index().drop(columns=['index'])
final_ratings_df.to_csv('data/most_rated_dataset.csv', index=False)

In [39]:
# Preparamos el dataset de entrenamiento y evaluación
trainset, testset = train_test_split(final_ratings_df, 90)
trainset.to_csv('data/trainset.csv', index=False)
testset.to_csv('data/testset.csv', index=False)

# Factorización de Matrices

Del nuevo dataset que hemos creado, calculamos las matrices P y Q usando el algoritmode NMF definido en la librería surprise.

In [40]:
# Cargamos los datasets
trainset = pd.read_csv('data/trainset.csv', usecols=[0,1,2])
testset = pd.read_csv('data/testset.csv', usecols=[0,1,2])

# Preparamos los datos de entrenamiento y evaluación

reader = Reader(rating_scale=(1,5))

train_data = Dataset.load_from_df(trainset, reader).build_full_trainset()
test_data = Dataset.load_from_df(testset, reader).build_full_trainset().build_testset()

# Seleccionamos el algoritmo
recommendation_algorithm = NMF()

# Entrenamos el algoritmo
recommendation_algorithm.fit(train_data)

# Obetenmos las predicciones
predictions = recommendation_algorithm.test(test_data)

# Mostramos las predicciones
predictions_df = pd.DataFrame(predictions, columns=['userId', 'movieId', 'realRating', 'estRating', 'details']).drop(columns='details')
predictions_df.head()

Unnamed: 0,userId,movieId,realRating,estRating
0,1,457,5.0,3.582155
1,1,592,4.0,3.964446
2,1,1210,5.0,4.436599
3,1,2959,5.0,4.895204
4,4,588,4.0,2.474949


In [41]:
# Lo usaremos más adelante
predictions_df.loc[predictions_df['estRating'] >= 4.0].head()

Unnamed: 0,userId,movieId,realRating,estRating
2,1,1210,5.0,4.436599
3,1,2959,5.0,4.895204
5,5,50,4.0,4.118513
7,6,110,5.0,4.221054
14,11,457,5.0,4.390238


# Obtenemos las matriz Qi

In [42]:
movies_index = trainset.groupby(by='movieId').count().index
qi = recommendation_algorithm.qi

pu = pd.DataFrame(recommendation_algorithm.pu)
pu['userId'] = set(trainset['userId'])
pu.set_index('userId',inplace=True)
pu.reset_index(inplace=True)

In [43]:
itemsDF = pd.DataFrame(qi, columns=['i-1','i-2','i-3','i-4','i-5','i-6','i-7','i-8','i-9','i-10','i-11','i-12','i-13','i-14','i-15'])
itemsDF['movieId'] = movies_index
itemsDF.set_index('movieId', inplace=True)
itemsDF.reset_index(inplace=True)
itemsDF.head()

Unnamed: 0,movieId,i-1,i-2,i-3,i-4,i-5,i-6,i-7,i-8,i-9,i-10,i-11,i-12,i-13,i-14,i-15
0,1,0.669227,0.45029,0.518469,0.172482,0.192067,0.550246,0.730267,0.341074,0.984728,0.581804,0.660342,0.65047,0.161652,0.238133,0.662217
1,32,0.250603,0.688778,1.176327,0.439349,0.887178,0.547806,0.440404,1.066532,0.498039,0.082285,0.369065,0.183575,0.407748,0.321046,0.41099
2,47,0.456392,0.950109,0.885799,0.397693,0.725959,0.552065,0.724731,0.049411,0.329132,0.462918,0.150958,0.909266,0.923671,0.322263,0.303501
3,50,0.905288,0.517975,0.112498,0.452847,0.193739,0.681275,0.619051,0.981687,0.105481,0.768974,0.708297,0.214482,0.508149,0.194432,0.835698
4,110,0.56579,0.726175,1.0049,0.350126,0.554565,0.366892,0.718613,0.427536,0.539649,0.512424,0.884087,0.684994,0.401033,0.311651,0.194278


# Calculamos la matriz Q de un usuario

En esta sección analizamos la matriz Q a la que se ha multiplicado el vector $p_u$ del usuario. Analizaremos distintas variantes estadísticas paara saber si hay más dimensiones influyentes que otras.

In [44]:
movies_df = pd.read_csv('data/movies_data_format.csv')
movies_df.head()

Unnamed: 0,movieId,companies,director,genres,keywords,stars,title,writers,year
0,1,Pixar Animation Studios|Walt Disney Pictures,John Lasseter,Family|Fantasy|Comedy|Adventure|Animation,claw crane|toy|cgi animation|cowboy|rivalry,Don Rickles|Tim Allen|Tom Hanks,Toy Story,Pete Docter|John Lasseter,1995
1,32,Atlas Entertainment|Universal Pictures|Classico,Terry Gilliam,Mystery|Thriller|Sci-Fi,mental institution|time travel|underground|psy...,Brad Pitt|Madeleine Stowe|Bruce Willis,Doce monos,Chris Marker|David Webb Peoples,1995
2,47,New Line Cinema|Juno Pix|Cecchi Gori Pictures,David Fincher,Mystery|Thriller|Drama|Crime,human monster|serial murder|seven deadly sins|...,Morgan Freeman|Brad Pitt|Kevin Spacey,Seven,Andrew Kevin Walker,1995
3,50,Spelling Films International|PolyGram Filmed E...,Bryan Singer,Mystery|Thriller|Crime,burying a body|suspect|criminal mastermind|dir...,Gabriel Byrne|Chazz Palminteri|Kevin Spacey,Sospechosos habituales,Christopher McQuarrie,1995
4,110,The Ladd Company|B.H. Finance C.V.|Icon Entert...,Mel Gibson,War|Biography|Drama|History,legend|14th century|revolt|scotland|england,Patrick McGoohan|Sophie Marceau|Mel Gibson,Braveheart,Randall Wallace,1995


In [161]:
user = 56

In [162]:
qui = pu.loc[pu['userId'] == user].values[0,1:] * qi
qui_t = qui.transpose()
index_x = np.arange(0,14)
index_y = np.arange(1, 50)

In [163]:
qui_df = pd.DataFrame(qui, index=movies_index).reset_index()
qui_df.head()

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,0.331186,0.214649,0.24043,0.054776,0.162004,0.439977,0.466859,0.145018,0.887808,0.210599,0.321102,0.035796,0.083282,0.166855,0.58487
1,32,0.124018,0.328334,0.545499,0.139526,0.748315,0.438026,0.28155,0.453469,0.44902,0.029785,0.179464,0.010102,0.210069,0.22495,0.362986
2,47,0.225859,0.452909,0.410772,0.126297,0.612331,0.441432,0.46332,0.021008,0.296738,0.167565,0.073406,0.050037,0.47587,0.225803,0.268052
3,50,0.448008,0.246914,0.052169,0.143813,0.163415,0.544748,0.395759,0.417395,0.095099,0.278349,0.344421,0.011803,0.261795,0.136234,0.738089
4,110,0.279998,0.346162,0.466003,0.111191,0.467764,0.293367,0.459409,0.18178,0.486535,0.185485,0.429902,0.037696,0.20661,0.218367,0.171586


In [164]:
qui_desc = qui_df.describe()
qui_desc

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,2584.2,0.22896,0.218938,0.237897,0.164747,0.451953,0.476749,0.326228,0.209715,0.434529,0.198376,0.242086,0.026144,0.257163,0.331837,0.485605
std,8270.080327,0.138361,0.135529,0.132352,0.08593,0.246055,0.230035,0.175688,0.111294,0.233956,0.088128,0.147064,0.016003,0.118089,0.173723,0.238615
min,1.0,0.005672,0.012018,0.001173,0.003945,0.027177,0.013834,0.001156,0.00377,0.002826,0.009823,0.000615,0.000173,0.015849,0.028173,0.024356
25%,364.75,0.11922,0.10418,0.131544,0.114262,0.297637,0.345797,0.176233,0.151281,0.255167,0.136752,0.10966,0.011471,0.193411,0.220013,0.356567
50%,594.0,0.228632,0.206965,0.240744,0.151649,0.400648,0.467923,0.376298,0.212171,0.482116,0.204116,0.26531,0.028184,0.265192,0.293732,0.537767
75%,1916.0,0.316379,0.293097,0.318368,0.214161,0.624144,0.622622,0.465974,0.281089,0.602994,0.277772,0.341999,0.042666,0.323606,0.463706,0.601965
max,58559.0,0.477232,0.534855,0.545499,0.400789,0.90913,0.968084,0.621611,0.453469,0.962433,0.325729,0.623024,0.053092,0.54012,0.727155,1.302667


In [165]:
trace = go.Heatmap(z=qui,
                   x=index_x,
                   y=index_y,
                   colorscale=[[0.0, 'rgb(255,255,255)'], [1.0, 'rgb(31, 119, 180)']])

data=[trace]
iplot(data, filename='labelled-heatmap')

In [166]:
data = []
for i in range(len(qui_t)):
    trace = go.Box(
        y = qui_t[i],
        name = str(i)
    )

    data.append(trace)

iplot(data)

# Pruebas

In [167]:
movies_watched = trainset.loc[trainset['userId'] == user]['movieId'].values
print("El usuario ha visto", len(movies_watched), 'películas')

El usuario ha visto 14 películas


In [168]:
qui_df = pd.merge(left=qui_df, right=movies_df, on='movieId')
watched_qui_df = qui_df.loc[qui_df['movieId'].isin(movies_watched)]
watched_qui_df.head()

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,...,13,14,companies,director,genres,keywords,stars,title,writers,year
2,47,0.225859,0.452909,0.410772,0.126297,0.612331,0.441432,0.46332,0.021008,0.296738,...,0.225803,0.268052,New Line Cinema|Juno Pix|Cecchi Gori Pictures,David Fincher,Mystery|Thriller|Drama|Crime,human monster|serial murder|seven deadly sins|...,Morgan Freeman|Brad Pitt|Kevin Spacey,Seven,Andrew Kevin Walker,1995
4,110,0.279998,0.346162,0.466003,0.111191,0.467764,0.293367,0.459409,0.18178,0.486535,...,0.218367,0.171586,The Ladd Company|B.H. Finance C.V.|Icon Entert...,Mel Gibson,War|Biography|Drama|History,legend|14th century|revolt|scotland|england,Patrick McGoohan|Sophie Marceau|Mel Gibson,Braveheart,Randall Wallace,1995
6,165,0.470973,0.356695,0.039297,0.344896,0.068191,0.653878,0.020274,0.294302,0.459799,...,0.44878,0.761944,Cinergi Pictures Entertainment|Twentieth Centu...,John McTiernan,Action|Thriller|Adventure,time bomb|john mcclane character|male butt cle...,Jeremy Irons|Samuel L. Jackson|Bruce Willis,Jungla de cristal: La venganza,Roderick Thorp|Jonathan Hensleigh,1995
8,296,0.263623,0.029162,0.241057,0.137697,0.315078,0.514461,0.483905,0.173769,0.742856,...,0.444569,0.594139,Jersey Films|A Band Apart|Miramax,Quentin Tarantino,Drama|Crime,nonlinear timeline|black comedy|overdose|drug ...,John Travolta|Uma Thurman|Samuel L. Jackson,Pulp Fiction,Quentin Tarantino|Roger Avary,1994
9,318,0.127927,0.207111,0.058637,0.233897,0.335185,0.788262,0.089631,0.014423,0.477696,...,0.226062,0.510058,Castle Rock Entertainment,Frank Darabont,Drama,prison|voice over narration|escape from prison...,Morgan Freeman|Tim Robbins|Bob Gunton,Cadena perpetua,Stephen King|Frank Darabont,1994


In [169]:
# Mapa de calor de la película a buscar
movie_query = testset.loc[testset['userId'] == user]['movieId'].values
query_qui_df = qui_df.loc[qui_df['movieId'].isin(movie_query)]
query_qui_df

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,...,13,14,companies,director,genres,keywords,stars,title,writers,year
13,367,0.110251,0.417472,0.273194,0.139442,0.351268,0.014298,0.164465,0.307242,0.300791,...,0.577788,0.558901,New Line Cinema|Dark Horse Entertainment,Chuck Russell,Comedy|Fantasy,high heels|camera shot of feet|pantyhose|legs|...,Cameron Diaz|Jim Carrey|Peter Riegert,La máscara,Mark Verheiden|Michael Fallon,1994


In [170]:
trace = go.Heatmap(z=query_qui_df.values[:,1:16],
                   x=index_x,
                   y=movie_query,
                   colorscale=[[0.0, 'rgb(255,255,255)'], [1.0, 'rgb(31, 119, 180)']])

data=[trace]
iplot(data, filename='labelled-heatmap')

In [171]:
trace = go.Heatmap(z=watched_qui_df.values[:,1:16],
                   x=index_x,
                   y=index_y,
                   colorscale=[[0.0, 'rgb(255,255,255)'], [1.0, 'rgb(31, 119, 180)']])

data=[trace]
iplot(data, filename='labelled-heatmap')

In [172]:
watched_qui_df.sort_values(by=[5,8], ascending=False).head()

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,...,13,14,companies,director,genres,keywords,stars,title,writers,year
9,318,0.127927,0.207111,0.058637,0.233897,0.335185,0.788262,0.089631,0.014423,0.477696,...,0.226062,0.510058,Castle Rock Entertainment,Frank Darabont,Drama,prison|voice over narration|escape from prison...,Morgan Freeman|Tim Robbins|Bob Gunton,Cadena perpetua,Stephen King|Frank Darabont,1994
11,356,0.452335,0.074459,0.106762,0.04624,0.374865,0.751605,0.366467,0.177497,0.204259,...,0.538685,0.651367,Paramount Pictures,Robert Zemeckis,Romance|Drama,mother|vietnam|amputee|war hero|vietnam war,Tom Hanks|Robin Wright|Gary Sinise,Forrest Gump,Winston Groom|Eric Roth,1994
6,165,0.470973,0.356695,0.039297,0.344896,0.068191,0.653878,0.020274,0.294302,0.459799,...,0.44878,0.761944,Cinergi Pictures Entertainment|Twentieth Centu...,John McTiernan,Action|Thriller|Adventure,time bomb|john mcclane character|male butt cle...,Jeremy Irons|Samuel L. Jackson|Bruce Willis,Jungla de cristal: La venganza,Roderick Thorp|Jonathan Hensleigh,1995
15,380,0.363768,0.036318,0.018923,0.121483,0.054778,0.598265,0.171993,0.160076,0.632408,...,0.569366,0.69569,Lightstorm Entertainment|Twentieth Century Fox,James Cameron,Action|Thriller|Comedy,remake|secret mission|spy|tango|secret agent,Tom Arnold|Jamie Lee Curtis|Arnold Schwarzenegger,Mentiras arriesgadas,Simon Michaël|Claude Zidi,1994
17,480,0.221518,0.052874,0.175066,0.172946,0.574815,0.550416,0.165484,0.367376,0.354033,...,0.446443,0.623151,Amblin Entertainment|Universal Pictures,Steven Spielberg,Thriller|Adventure|Sci-Fi,theme park|tyrannosaurus rex|chaos theory|dino...,Sam Neill|Laura Dern|Jeff Goldblum,Jurassic Park (Parque Jurásico),Michael Crichton,1993


In [173]:
# Películas más similares
K = 5
print("Query:", query_qui_df.iloc[0]['title'])
watched_qui_df.sort_values(by=[10], ascending=False).iloc[:K]['title'].tolist()

Query: La máscara


['El silencio de los corderos',
 'El fugitivo',
 'Braveheart',
 'Mentiras arriesgadas',
 'Jurassic Park (Parque Jurásico)']

In [174]:
print("Query:", query_qui_df.iloc[0]['genres'])
watched_qui_df.sort_values(by=[10], ascending=False).iloc[:K]['genres'].tolist()

Query: Comedy|Fantasy


['Thriller|Drama|Crime',
 'Action|Mystery|Crime|Thriller|Drama',
 'War|Biography|Drama|History',
 'Action|Thriller|Comedy',
 'Thriller|Adventure|Sci-Fi']

In [175]:
print("Query", query_qui_df.iloc[0]['keywords'])
watched_qui_df.sort_values(by=[10], ascending=False).iloc[:K]['keywords'].tolist()

Query high heels|camera shot of feet|pantyhose|legs|female stockinged feet


['psychopath|bad guy wins|serial killer|stuck in a well|psycho thriller',
 'on the run|chicago illinois|one armed man|surgeon|u.s. marshal',
 'legend|14th century|revolt|scotland|england',
 'remake|secret mission|spy|tango|secret agent',
 'theme park|tyrannosaurus rex|chaos theory|dinosaur|man on a toilet']

In [176]:
print("Query", query_qui_df.iloc[0]['year'])
watched_qui_df.sort_values(by=[10], ascending=False).iloc[:K]['year'].tolist()

Query 1994


[1991, 1993, 1995, 1994, 1993]

In [177]:
print("Query", query_qui_df.iloc[0]['companies'])
watched_qui_df.sort_values(by=[10], ascending=False).iloc[:K]['companies'].tolist()

Query New Line Cinema|Dark Horse Entertainment


['Orion Pictures|Strong Heart/Demme Production',
 'Kopelson Entertainment|Warner Bros.',
 'The Ladd Company|B.H. Finance C.V.|Icon Entertainment International',
 'Lightstorm Entertainment|Twentieth Century Fox',
 'Amblin Entertainment|Universal Pictures']