In [1]:
from surprise import Dataset
from surprise import Reader
from surprise import NMF
from surprise.model_selection import train_test_split
from surprise import accuracy

import pandas as pd
import numpy as np

from scipy.spatial.distance import euclidean, pdist, squareform

from plotly.offline import iplot, init_notebook_mode
import pandas as pd
import plotly.graph_objs as go
import plotly.io as pio
import os

from collections import Counter



# Carga Dataset

Cargamos el último dataset de MovieLens con un tamaño de 100K ratings. Este dataset contiene 9000 películas y 600 usuarios.

In [2]:
ratingsDF = pd.read_csv('data/ratings.csv', usecols=[0,1,2])
indexMovies = ratingsDF.movieId.unique()

# Preparamos NMF

Entrenamos el algoritmo de recomendación NMF con el 90% de los ratings. A continuación, comprobamos el MAE para conocer el margen de error del recomendador.

In [3]:
def train_test_split(dataDF, training_percentage):
    msk = np.random.rand(len(dataDF)) < float(training_percentage / 100)
    train = dataDF[msk]
    test = dataDF[~msk]
    
    return train, test    

In [4]:
# Load Dataset
trainset, testset = train_test_split(ratingsDF, 90)

reader = Reader(rating_scale=(1,5))
train_data = Dataset.load_from_df(trainset, reader).build_full_trainset()
test_data = Dataset.load_from_df(testset, reader).build_full_trainset().build_testset()

In [5]:
# Select algorithm
recommendation_algorithm = NMF()

# Train the algorithm on the trainset and predict ratings for the testset
recommendation_algorithm.fit(train_data)
predictions = recommendation_algorithm.test(test_data)

In [6]:
# Compute error
mae = accuracy.mae(predictions)
rmse = accuracy.rmse(predictions)

MAE:  0.6994
RMSE: 0.9163


In [7]:
# Get items and users factors
qi = recommendation_algorithm.qi
pu = recommendation_algorithm.pu

In [8]:
# Create a data frame that contains movies factors
indexMovies = trainset.groupby(by='movieId').mean().index.values

itemsDF = pd.DataFrame(qi, columns=['i-1','i-2','i-3','i-4','i-5','i-6','i-7','i-8','i-9','i-10','i-11','i-12','i-13','i-14','i-15'])
itemsDF['movieId'] = indexMovies
itemsDF.set_index('movieId', inplace=True)
itemsDF.reset_index(inplace=True)
itemsDF.head()

Unnamed: 0,movieId,i-1,i-2,i-3,i-4,i-5,i-6,i-7,i-8,i-9,i-10,i-11,i-12,i-13,i-14,i-15
0,1,0.448615,0.51312,0.761016,0.558062,0.336098,0.110453,0.679988,0.641057,0.447478,0.945153,0.624808,0.673377,0.272066,0.399181,0.528857
1,2,0.2992,0.984132,0.208504,0.272572,0.58989,0.130194,0.333689,0.728396,0.651215,0.119849,0.351418,0.840982,0.470168,0.307271,0.455965
2,3,0.363726,0.547316,0.480391,0.473582,0.197262,0.054821,0.248237,0.922952,0.711271,0.598732,0.424775,0.678368,0.897396,0.60219,0.668296
3,4,0.566615,0.090566,0.269202,0.500066,0.828935,0.541791,0.273893,0.501041,0.778984,0.740797,0.731365,0.4959,0.88283,0.46573,0.500328
4,5,0.43451,0.532484,0.571569,0.398876,0.047144,0.251698,0.71568,0.409818,0.807586,0.759683,0.688394,0.568819,1.053392,0.752505,0.567576


In [13]:
itemsDF['i-1'].describe()

count    9.359000e+03
mean     3.997024e-01
std      2.943353e-01
min      4.979796e-11
25%      1.611680e-01
50%      3.743740e-01
75%      5.809290e-01
max      2.481073e+00
Name: i-1, dtype: float64

# Calcular la matrix de similitud de los items

Calculamos la matriz de similitud de los items a partir de sus factores. La similitud entre los items se basará en la distancia Euclidea.

In [11]:
def item_similarity(i1, i2):
    sim = 1 / (1 +  euclidean(i1[1:], i2[1:]))
    return sim

In [12]:
similarities = pdist(itemsDF, item_similarity)

In [13]:
item_sim_DF = pd.DataFrame(squareform(similarities), columns=indexMovies, index=indexMovies)
item_sim_DF.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
1,0.0,0.425293,0.468013,0.455097,0.418437,0.349327,0.435087,0.442851,0.37838,0.360744,...,0.522531,0.454686,0.445977,0.457831,0.492631,0.47069,0.466496,0.470079,0.444177,0.447519
2,0.425293,0.0,0.385735,0.392933,0.407978,0.361537,0.419305,0.457202,0.37965,0.354353,...,0.467757,0.467631,0.47785,0.474087,0.488375,0.503196,0.462783,0.45702,0.500006,0.441078
3,0.468013,0.385735,0.0,0.465556,0.397251,0.43094,0.391623,0.381495,0.381815,0.419397,...,0.46788,0.43799,0.453082,0.45789,0.432334,0.468935,0.42573,0.508044,0.414192,0.447409
4,0.455097,0.392933,0.465556,0.0,0.419268,0.39605,0.427184,0.407794,0.391275,0.33933,...,0.464819,0.430364,0.421176,0.408775,0.453577,0.426659,0.433074,0.428163,0.445278,0.40949
5,0.418437,0.407978,0.397251,0.419268,0.0,0.34558,0.358303,0.384124,0.322591,0.321514,...,0.439558,0.480171,0.386444,0.38922,0.422528,0.396818,0.448527,0.417929,0.461207,0.402388


# Evaluaciones

## Evaluación generos en común

En esta sección evaluamos los generos qué tienen en común la pelicula de la que se va a predecir los ratings y las K películas más similares respecto a sus factores.

In [25]:
def get_genres(movieId):
    return moviesDF[moviesDF['movieId'] == movieId]['genres'].values[0]

In [26]:
def get_k_genres(item):
    K = 50
    
    if item in item_sim_DF.index:
        kIndex = item_sim_DF.loc[item].sort_values(ascending=False)[:K].index.values
        kGenres = []

        for i in kIndex:
            kGenres.append(get_genres(i))
        
        return '|'.join(str(e) for e in kGenres)
    else:
        return ''

In [27]:
# Get Movies information
moviesDF = pd.read_csv('data/movies.csv')

# Get Test results
genres_result = pd.DataFrame(predictions)
genres_result.drop(columns=['details'], inplace=True)
genres_result.rename(columns={'uid': 'userId', 'iid': 'itemId', 'r_ui': 'rating', 'est': 'estimation'}, inplace=True)

# Add genres of each test movie
genres_result['movieGenres'] = genres_result['itemId'].apply(get_genres)

# Add genres of the K most similar movies
genres_result['kNN_Genres'] = genres_result['itemId'].apply(get_k_genres)

# Show the first rows of dataframe
genres_result[:10]

Unnamed: 0,userId,itemId,rating,estimation,movieGenres,kNN_Genres
0,1,50,5.0,4.874108,Crime|Mystery|Thriller,Action|Drama|Thriller|Animation|Children|Drama...
1,1,593,4.0,5.0,Crime|Horror|Thriller,Documentary|Action|Crime|Drama|Thriller|Comedy...
2,1,673,3.0,3.271038,Adventure|Animation|Children|Comedy|Fantasy|Sc...,Drama|Children|Comedy|Drama|Comedy|Romance|Com...
3,1,736,3.0,3.597528,Action|Adventure|Romance|Thriller,Comedy|Drama|Romance|Comedy|Drama|Romance|Come...
4,1,780,3.0,4.143473,Action|Adventure|Sci-Fi|Thriller,Drama|Romance|Western|Action|Crime|Comedy|Dram...
5,1,804,4.0,3.541026,Comedy|Romance,Drama|Comedy|Crime|Thriller|Action|Comedy|Roma...
6,1,919,5.0,4.094578,Adventure|Children|Fantasy|Musical,Drama|Thriller|Adventure|Drama|Fantasy|Romance...
7,1,1009,3.0,3.677801,Adventure|Children|Fantasy,Adventure|Children|Drama|Children|Comedy|Adven...
8,1,1089,5.0,5.0,Crime|Mystery|Thriller,Comedy|Romance|Drama|Romance|Adventure|Comedy|...
9,1,1214,4.0,4.507837,Horror|Sci-Fi,Comedy|Drama|Crime|Thriller|Action|Comedy|Horr...


In [2]:
# Select a movie
movieId = 1214

# Get similar genres
counter = Counter(genres_result.iloc[movieId]['kNN_Genres'].split('|'))

# Show genres in a Graph
init_notebook_mode()

trace1 = go.Bar(
    x=list(counter.keys()),
    y=list(counter.values()),
    name='SF Zoo'
)

data = [trace1]
layout = go.Layout(
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar')

NameError: name 'Counter' is not defined

In [33]:
print(genres_result[genres_result['itemId'] == movieId]['movieGenres'].iloc[0])

Comedy|Romance


In [92]:
def item_similarity_2(i1, i2):
    remove_dimension = 1
    item1 = i1[1:]
    item2 = i2[1:]
    sim = 1 / (1 +  euclidean(np.delete(item1, [remove_dimension]), np.delete(item2, [remove_dimension])))
    return sim

In [None]:
similarities = pdist(itemsDF, item_similarity_2)

In [None]:
#Convertir en matriz
item_sim_DF = pd.DataFrame(squareform(similarities), columns=indexMovies, index=indexMovies)
item_sim_DF.to_csv('')