In [1]:
import pandas as pd
import numpy as np
import similarities as sim
import utils

from scipy.spatial.distance import pdist, squareform

In [2]:
# Inputs
user = 6
k = 15

In [3]:
# Get train data by user
trainset_DF = pd.read_csv('data/experiment_data/trainset.csv')
train_user_data = trainset_DF[trainset_DF.userId == user]
train_user_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,6,10,3.0,845553253
1,6,32,4.0,845553426
2,6,34,4.0,845553354
3,6,47,4.0,845553317
4,6,50,1.0,845553381


In [4]:
# Get test data by user
testset_DF = pd.read_csv('data/experiment_data/testset.csv')
test_user_data = testset_DF[testset_DF.userId == user]
test_user_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,6,165,3.0,845553146
1,6,296,2.0,845553110
2,6,377,5.0,845553317
3,6,588,5.0,845553146


In [5]:
# Get Q matrix from user
q_path = 'data/matrices_data/q_user_{}.csv'.format(user)
q_u = pd.read_csv(q_path)
q_u.head()

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,0.301498,0.467121,0.141795,0.017201,0.217677,0.1137,0.105127,0.015963,0.374883,0.636164,0.163629,0.055172,0.024918,0.106678,0.021532
1,10,0.173557,0.777252,0.221952,0.006801,0.022617,0.175456,0.173607,0.254761,0.234166,0.00961,0.020329,0.475115,0.17428,0.267083,0.498224
2,32,0.09446,2.032427,0.044375,0.004282,0.14466,0.0231,0.397514,0.178309,0.225353,0.30596,0.051185,0.187642,0.137801,0.25653,0.458292
3,34,0.204817,0.158587,0.258,0.000268,0.444853,0.010501,0.450876,0.064775,0.292796,0.506253,0.049599,0.980147,0.090821,0.343645,0.363801
4,47,0.15417,0.365045,0.202591,0.015639,0.054762,0.212742,0.359866,0.177833,0.133097,0.48272,0.119245,0.362057,0.142503,0.259136,0.253063


In [6]:
# Get binary properties of movies
binary_prop = pd.read_csv('data/experiment_data/movies_binary.csv')
binary_prop.head()

Unnamed: 0,id,Quentin Tarantino,Jonathan Demme,Steven Spielberg,Tim Burton,Mel Stuart,Tom Shadyac,Andrew Davis,Jerry Zucker,Chris Noonan,...,psychiatrist,pig,alfred pennyworth character,19th century,love,soldier,electromagnetic pulse,friendship,dc comics,delorean
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,32,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,34,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
4,47,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Similarity matrix by binary properties (using equal similarity function)
movies_prop = binary_prop.drop(columns=['id']).values
movies_sim_values = pdist(movies_prop, sim.equal_sim)
movies_similarity = pd.DataFrame(squareform(movies_sim_values))
movies_similarity.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.030303,0.0,0.0625,0.0,0.0,0.0,0.0625,0.0625,0.029412,...,0.027778,0.206897,0.096774,0.0,0.060606,0.0,0.172414,0.030303,0.0,0.030303
1,0.030303,0.0,0.03125,0.0,0.028571,0.033333,0.0,0.032258,0.066667,0.1,...,0.0,0.03125,0.066667,0.068966,0.03125,0.0,0.066667,0.103448,0.0625,0.103448
2,0.0,0.03125,0.0,0.03125,0.088235,0.066667,0.0,0.03125,0.0,0.0625,...,0.0,0.0,0.0,0.032258,0.0,0.03125,0.0,0.03125,0.029412,0.064516
3,0.0625,0.0,0.03125,0.0,0.028571,0.0,0.032258,0.066667,0.0,0.0,...,0.029412,0.064516,0.0,0.0,0.03125,0.032258,0.032258,0.0,0.030303,0.0
4,0.0,0.028571,0.088235,0.028571,0.0,0.129032,0.028571,0.028571,0.0,0.027778,...,0.054054,0.0,0.0,0.060606,0.057143,0.028571,0.0,0.028571,0.085714,0.028571


In [8]:
# Filter movies in matrix
test_movies = test_user_data.movieId.values
train_movies = train_user_data.movieId.values
all_movies = q_u.movieId.values
binary_sim_DF = utils.filter_similarity_matrix(movies_similarity, test_movies, train_movies, all_movies)
binary_sim_DF.head()

Unnamed: 0,10,32,34,47,50,110,150,153,185,208,...,589,590,592,593,595,597,608,736,780,1073
9,0.1,0.0625,0.0,0.027778,0.032258,0.0,0.03125,0.064516,0.060606,0.121212,...,0.032258,0.03125,0.0625,0.030303,0.0,0.0,0.032258,0.090909,0.096774,0.0
15,0.0,0.0,0.033333,0.060606,0.034483,0.033333,0.033333,0.0,0.064516,0.0,...,0.0,0.033333,0.0,0.066667,0.0,0.0,0.111111,0.030303,0.0,0.0
22,0.1,0.030303,0.0,0.057143,0.066667,0.0,0.03125,0.064516,0.129032,0.15625,...,0.032258,0.03125,0.0625,0.0625,0.0,0.0,0.066667,0.16129,0.096774,0.0
31,0.028571,0.0,0.058824,0.0,0.0,0.0,0.028571,0.058824,0.0,0.025641,...,0.0,0.028571,0.027778,0.0,0.25,0.096774,0.0,0.026316,0.027778,0.096774


In [9]:
# De cada película del test obtenemos los K más similares y nos quedamos con sus ratings
predicted_ratings = {}
for i in range(len(test_movies)):
    most_similar_movies = binary_sim_DF.iloc[i].sort_values(ascending=False).index.values[:k]
    rating_predicted = train_user_data[train_user_data.movieId.isin(most_similar_movies)].rating.values
    predicted_ratings[test_movies[i]] = rating_predicted

In [10]:
predicted_ratings_DF = pd.DataFrame.from_dict(predicted_ratings, orient='index')
predicted_ratings_DF['real_rating'] = test_user_data.rating.values
predicted_ratings_DF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,real_rating
165,3.0,4.0,3.0,3.0,3.0,5.0,5.0,4.0,5.0,5.0,3.0,3.0,3.0,5.0,5.0,3.0
296,4.0,4.0,1.0,5.0,4.0,3.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,3.0,2.0
377,3.0,1.0,3.0,3.0,3.0,3.0,5.0,4.0,5.0,5.0,3.0,4.0,3.0,5.0,5.0,5.0
588,4.0,4.0,3.0,3.0,5.0,3.0,5.0,5.0,4.0,4.0,5.0,5.0,5.0,4.0,3.0,5.0
