In [70]:
from surprise import Dataset
from surprise import Reader
from surprise import NMF
from surprise.model_selection import train_test_split
from surprise import accuracy

import pandas as pd
import random
import numpy as np

import csv

from scipy.spatial.distance import euclidean, pdist, squareform

#from plotly.offline import iplot, init_notebook_mode
import pandas as pd
#import plotly.graph_objs as go
#import plotly.io as pio
import os

from collections import Counter

# Carga Dataset

Cargamos el último dataset de MovieLens con un tamaño de 100K ratings. Este dataset contiene 9000 películas y 600 usuarios.

In [2]:
ratingsDF = pd.read_csv('data/ratings.csv', usecols=[0,1,2])
indexMovies = ratingsDF.movieId.unique()

# Preparamos NMF

Entrenamos el algoritmo de recomendación NMF con el 90% de los ratings. A continuación, comprobamos el MAE para conocer el margen de error del recomendador.

In [3]:
def train_test_split(dataDF, training_percentage):
    msk = np.random.rand(len(dataDF)) < float(training_percentage / 100)
    train = dataDF[msk]
    test = dataDF[~msk]
    
    return train, test    

In [4]:
# Load Dataset
trainset, testset = train_test_split(ratingsDF, 90)

reader = Reader(rating_scale=(1,5))
train_data = Dataset.load_from_df(trainset, reader).build_full_trainset()
test_data = Dataset.load_from_df(testset, reader).build_full_trainset().build_testset()

In [5]:
# Select algorithm
recommendation_algorithm = NMF()

# Train the algorithm on the trainset and predict ratings for the testset
recommendation_algorithm.fit(train_data)
predictions = recommendation_algorithm.test(test_data)

In [6]:
# Compute error
mae = accuracy.mae(predictions)
rmse = accuracy.rmse(predictions)

MAE:  0.7100
RMSE: 0.9276


In [7]:
# Get items and users factors
qi = recommendation_algorithm.qi
pu = recommendation_algorithm.pu

In [8]:
# Create a data frame that contains movies factors
indexMovies = trainset.groupby(by='movieId').mean().index.values

itemsDF = pd.DataFrame(qi, columns=['i-1','i-2','i-3','i-4','i-5','i-6','i-7','i-8','i-9','i-10','i-11','i-12','i-13','i-14','i-15'])
itemsDF['movieId'] = indexMovies
itemsDF.set_index('movieId', inplace=True)
itemsDF.reset_index(inplace=True)
itemsDF.head()

Unnamed: 0,movieId,i-1,i-2,i-3,i-4,i-5,i-6,i-7,i-8,i-9,i-10,i-11,i-12,i-13,i-14,i-15
0,1,0.602058,0.381265,0.843772,0.387881,0.332114,0.720432,0.521389,0.27551,0.805035,0.246799,0.34968,0.556224,0.69197,0.413779,0.737229
1,2,0.499437,0.6466,0.390814,0.260498,0.181374,0.985156,0.771732,0.323428,0.628478,0.098665,0.387971,0.452381,0.168619,0.135168,0.678626
2,3,0.58347,0.82851,0.502004,0.624266,0.226152,0.647084,0.188174,0.569627,0.546911,0.497135,0.665215,0.513906,0.537065,0.38322,0.667611
3,4,0.626424,0.777111,0.706295,0.983156,0.796316,0.627984,0.469352,0.563454,0.202121,0.090217,0.551084,0.384929,0.440761,0.457086,0.423949
4,5,0.657285,0.743779,0.894287,0.256018,0.472446,0.553767,0.083231,0.761103,0.628213,0.730674,0.565262,0.69984,0.527954,0.773099,0.284846


# Calcular la matrix de similitud de los items

Calculamos la matriz de similitud de los items a partir de sus factores. La similitud entre los items se basará en la distancia Euclidea.

In [9]:
def item_similarity(i1, i2):
    sim = 1 / (1 +  euclidean(i1[1:], i2[1:]))
    return sim

In [10]:
similarities = pdist(itemsDF, item_similarity)

In [11]:
item_sim_DF = pd.DataFrame(squareform(similarities), columns=indexMovies, index=indexMovies)
item_sim_DF.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,191005,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,0.0,0.515914,0.520653,0.45273,0.465086,0.431921,0.454374,0.445779,0.423801,0.470334,...,0.453739,0.456077,0.415278,0.454986,0.396661,0.436503,0.432528,0.405212,0.44081,0.451228
2,0.515914,0.0,0.482996,0.429437,0.391908,0.415889,0.377629,0.420575,0.440108,0.431207,...,0.424036,0.443293,0.414316,0.458068,0.396559,0.450427,0.403765,0.400374,0.440269,0.433253
3,0.520653,0.482996,0.0,0.504091,0.524259,0.405994,0.375976,0.471757,0.45488,0.445892,...,0.461171,0.512901,0.458007,0.507942,0.459528,0.452837,0.441044,0.458732,0.496458,0.521997
4,0.45273,0.429437,0.504091,0.0,0.435321,0.402609,0.382917,0.454623,0.476377,0.408455,...,0.435783,0.433455,0.474711,0.468868,0.428796,0.421221,0.468574,0.421786,0.540285,0.529358
5,0.465086,0.391908,0.524259,0.435321,0.0,0.414502,0.356311,0.44048,0.45598,0.38924,...,0.408326,0.415265,0.421439,0.41021,0.426174,0.396193,0.398432,0.415866,0.426189,0.414596


# Evaluaciones

## Evaluación generos en común

En esta sección evaluamos los generos qué tienen en común la pelicula de la que se va a predecir los ratings y las K películas más similares respecto a sus factores.

In [12]:
def get_genres(movieId):
    return moviesDF[moviesDF['movieId'] == movieId]['genres'].values[0]

In [13]:
def get_k_genres(item):
    K = 50
    
    if item in item_sim_DF.index:
        kIndex = item_sim_DF.loc[item].sort_values(ascending=False)[:K].index.values
        kGenres = []

        for i in kIndex:
            kGenres.append(get_genres(i))
        
        return '|'.join(str(e) for e in kGenres)
    else:
        return ''

In [14]:
# Get Movies information
moviesDF = pd.read_csv('data/movies.csv')

# Get Test results
genres_result = pd.DataFrame(predictions)
genres_result.drop(columns=['details'], inplace=True)
genres_result.rename(columns={'uid': 'userId', 'iid': 'itemId', 'r_ui': 'rating', 'est': 'estimation'}, inplace=True)

# Add genres of each test movie
genres_result['movieGenres'] = genres_result['itemId'].apply(get_genres)

# Add genres of the K most similar movies
genres_result['kNN_Genres'] = genres_result['itemId'].apply(get_k_genres)

# Show the first rows of dataframe
genres_result[:10]

Unnamed: 0,userId,itemId,rating,estimation,movieGenres,kNN_Genres
0,1,235,4.0,4.61899,Comedy|Drama,Romance|Comedy|Drama|Fantasy|Comedy|Romance|Co...
1,1,296,3.0,5.0,Comedy|Crime|Drama|Thriller,Comedy|Romance|Crime|Drama|Crime|Drama|Comedy|...
2,1,362,5.0,3.323643,Adventure|Children|Romance,Drama|Crime|Drama|Thriller|Adventure|Animation...
3,1,423,3.0,3.036705,Action|Thriller,Documentary|Drama|Western|Action|Comedy|Fantas...
4,1,608,5.0,5.0,Comedy|Crime|Drama|Thriller,Crime|Drama|Thriller|Comedy|Drama|Crime|Drama|...
5,1,940,5.0,5.0,Action|Adventure|Romance,Comedy|Romance|Action|Comedy|Fantasy|Horror|Co...
6,1,1042,4.0,3.520634,Comedy|Drama,Crime|Drama|Comedy|Fantasy|Romance|Drama|Actio...
7,1,1073,5.0,4.465319,Children|Comedy|Fantasy|Musical,Action|Adventure|Comedy|Fantasy|Crime|Drama|Th...
8,1,1090,4.0,5.0,Drama|War,Comedy|Horror|Sci-Fi|Drama|Children|Comedy|Dra...
9,1,1219,2.0,5.0,Crime|Horror,Crime|Drama|Fantasy|Horror|Thriller|Drama|Chil...


# Contruccción de csv para la base de casos

In [16]:
def item_similarity_2(i1, i2):
    item1 = i1[1:]
    item2 = i2[1:]
    sim = 1 / (1 +  euclidean(np.delete(item1, [remove_dimension]), np.delete(item2, [remove_dimension])))
    return sim

In [None]:
# construir un csv de casos [valores de las columnas],[generos]

len_columns = len(itemsDF.columns.tolist()) 
with open('cases.csv', 'w') as result_file:
    print('movieId#i-values#genres', file=result_file)
    
    for i in range(1,len_columns):
        list_values_matrix = itemsDF['i-'+str(i)].tolist()
        list_values = ','.join(str(e) for e in list_values_matrix)

        genres = moviesDF[moviesDF['movieId'] == i]['genres'].tolist()[0]
        genres = genres.replace("|", ",")
        
        my_list = [i] + [list_values] + [genres]
        
        print(f"{i}#{list_values}#{genres}", file=result_file)

# Creación de archivos csv con la matriz de similitudes de las películas

In [19]:
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

In [20]:

len_columns = len(itemsDF.columns.tolist()) - 1
remove_dimension = 1
for i in range(len_columns):
    print(remove_dimension)
   
    # calcular la similitud
    similarities = pdist(itemsDF, item_similarity_2)
    
    #Convertir en matriz
    item_sim_DF = pd.DataFrame(squareform(similarities), columns=indexMovies, index=indexMovies)
    
    # convertir en csv
    item_sim_DF.to_csv('MF_remove_dim_' + str(remove_dimension) + '.csv')
    
    #print('MF_remove_dim_' + str(remove_dimension) + '.csv')
    
    remove_dimension = remove_dimension + 1


"\nlen_columns = len(itemsDF.columns.tolist()) - 1\nremove_dimension = 1\nfor i in range(len_columns):\n    print(remove_dimension)\n   \n    # calcular la similitud\n    similarities = pdist(itemsDF, item_similarity_2)\n    \n    #Convertir en matriz\n    item_sim_DF = pd.DataFrame(squareform(similarities), columns=indexMovies, index=indexMovies)\n    \n    # convertir en csv\n    item_sim_DF.to_csv('MF_remove_dim_' + str(remove_dimension) + '.csv')\n    \n    #print('MF_remove_dim_' + str(remove_dimension) + '.csv')\n    \n    remove_dimension = remove_dimension + 1\n"