In [1]:
import pandas as pd
import numpy as np
import src.similarities as sim
import src.utils as utils
from tqdm import tqdm

from scipy.spatial.distance import pdist, squareform

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [2]:
# Load the ratings file and the predictions file from the experiment data directory.
# trainset_df is not referenced by any other cell visible in this notebook.
trainset_df = pd.read_csv('data/experiment_data/ratings.csv')
predicted_df = pd.read_csv('data/experiment_data/predicted_values.csv')

# Distinct user ids that appear in the predictions file.
users = list(set(predicted_df['userId']))
# Distance names; error_rating interpolates them into the similarity-matrix file names.
functions = ['euclidean', 'cosine', 'manhattan']
# Values passed as `k` to error_rating.
# NOTE(review): 4 is absent from this list — confirm that this is intentional.
k_list =[1, 2, 3, 5, 6, 7, 8, 9, 10]

In [3]:
# Percentage of rows in predicted_df whose 'rating' is at least 4.
len(predicted_df[predicted_df['rating'] >= 4])/len(predicted_df) * 100

51.48601398601399

In [4]:
def error_rating(user, algorithm, k, matrix_type=''):
    """Evaluate one user with one similarity matrix and one value of ``k``.

    Parameters
    ----------
    user
        User id; forwarded to the ``utils`` helpers and, for the per-user
        matrices, interpolated into the CSV file name.
    algorithm
        Name of the distance function; interpolated into the CSV file name.
    k
        Forwarded unchanged to ``utils.calculate_mean_ratings``
        (presumably the number of neighbours — confirm in ``src.utils``).
    matrix_type
        ``''`` (default) loads the per-user matrix
        ``sim_<algorithm>_user_<user>.csv``; any other value loads the shared
        ``sim_<algorithm>_binary.csv``.

    Returns
    -------
    Whatever ``utils.calculate_mean_ratings`` returns. The notebook averages
    these values over users, so it is presumably a numeric error — confirm.
    """
    # Get the movie data for this user.
    train_movies, test_movies = utils.get_train_test_movies(user)
    all_movies = utils.get_all_movies(user)

    # Pick the similarity matrix file, then load it.
    use_user_matrix = matrix_type == ''
    matrix_path = (
        'data/similarity_data/sim_{}_user_{}.csv'.format(algorithm, user)
        if use_user_matrix
        else 'data/similarity_data/sim_{}_binary.csv'.format(algorithm)
    )
    matrix = pd.read_csv(matrix_path)

    # Compute the filtered similarity matrix and score it.
    filtered_matrix = utils.filter_similarity_matrix(
        matrix, test_movies, train_movies, all_movies
    )
    return utils.calculate_mean_ratings(filtered_matrix, test_movies, user, k)
    

In [5]:
def average(errors_list):
    """Return the arithmetic mean of ``errors_list``.

    ``errors_list`` must support both ``sum`` and ``len``. An empty
    sequence raises ``ZeroDivisionError``.
    """
    total = sum(errors_list)
    count = len(errors_list)
    return total / count

In [6]:
# Mean of error_rating over all users, for every distance function and every k,
# using the per-user similarity matrices (error_rating's default matrix_type='').
results = {}
for similarity_name in tqdm(functions):
    # Build the Series directly from the computed values so its dtype is
    # inferred from the data, rather than pre-allocating a data-less
    # pd.Series(index=k_list) whose default dtype emits a warning on pandas 1.x.
    results['q_u-{}'.format(similarity_name)] = pd.Series({
        k: average([error_rating(user, similarity_name, k) for user in users])
        for k in k_list
    })

100%|██████████| 3/3 [07:50<00:00, 156.71s/it]


In [7]:
# One column per 'q_u-<function>' Series; displayed transposed so that each
# distance function is a row and each k is a column.
results_user_df = pd.DataFrame(results)
results_user_df.T

Unnamed: 0,1,2,3,5,6,7,8,9,10
q_u-euclidean,0.969043,1.461889,1.812597,2.334263,2.526051,2.704142,2.872672,3.017123,3.150115
q_u-cosine,0.924196,1.430314,1.764929,2.30136,2.532121,2.700838,2.859426,3.001359,3.133987
q_u-manhattan,0.949045,1.471061,1.821666,2.323286,2.520178,2.698023,2.862007,2.996976,3.12498


In [8]:
# Mean of error_rating over all users, for every distance function and every k,
# using the shared 'binary' similarity matrices (matrix_type='binary').
results_binary = {}
for similarity_name in tqdm(functions):
    # Build the Series directly from the computed values so its dtype is
    # inferred from the data, rather than pre-allocating a data-less
    # pd.Series(index=k_list) whose default dtype emits a warning on pandas 1.x.
    results_binary['binary-{}'.format(similarity_name)] = pd.Series({
        k: average([
            error_rating(user, similarity_name, k, matrix_type='binary')
            for user in users
        ])
        for k in k_list
    })

100%|██████████| 3/3 [07:50<00:00, 156.62s/it]


In [9]:
# One column per 'binary-<function>' Series; displayed transposed so that each
# distance function is a row and each k is a column.
results_binary_df = pd.DataFrame(results_binary)
results_binary_df.T

Unnamed: 0,1,2,3,5,6,7,8,9,10
binary-euclidean,0.965921,1.456385,1.821432,2.350559,2.541398,2.725855,2.880429,3.020033,3.152933
binary-cosine,1.005034,1.491472,1.84261,2.356184,2.552742,2.737454,2.898822,3.042421,3.180156
binary-manhattan,0.965921,1.456385,1.821432,2.350559,2.541398,2.725855,2.880429,3.020033,3.152933


In [10]:
# Fold the binary-matrix results into `results` (mutates `results` in place;
# keys already present would be overwritten).
results.update(results_binary)

In [11]:
# Table of every Series currently held in `results` (one column per key);
# displayed transposed so that each key is a row and each k is a column.
results_df = pd.DataFrame(results)
results_df.T

Unnamed: 0,1,2,3,5,6,7,8,9,10
q_u-euclidean,0.969043,1.461889,1.812597,2.334263,2.526051,2.704142,2.872672,3.017123,3.150115
q_u-cosine,0.924196,1.430314,1.764929,2.30136,2.532121,2.700838,2.859426,3.001359,3.133987
q_u-manhattan,0.949045,1.471061,1.821666,2.323286,2.520178,2.698023,2.862007,2.996976,3.12498
binary-euclidean,0.965921,1.456385,1.821432,2.350559,2.541398,2.725855,2.880429,3.020033,3.152933
binary-cosine,1.005034,1.491472,1.84261,2.356184,2.552742,2.737454,2.898822,3.042421,3.180156
binary-manhattan,0.965921,1.456385,1.821432,2.350559,2.541398,2.725855,2.880429,3.020033,3.152933


In [12]:
# Absolute difference between the 'rating' and 'predicted' columns of the predictions file.
predicted_ratings_DF = pd.read_csv('data/experiment_data/predicted_values.csv')
# Vectorised column arithmetic: same values as a row-wise apply(axis=1),
# without one Python-level call per row.
predicted_ratings_DF['diff'] = (
    predicted_ratings_DF['rating'] - predicted_ratings_DF['predicted']
).abs()
predicted_ratings_DF.head()

Unnamed: 0,userId,movieId,rating,predicted,diff
0,1,223,3.0,4.042081,1.042081
1,1,349,4.0,2.726556,1.273444
2,1,527,5.0,2.743715,2.256285
3,4,357,3.0,2.737091,0.262909
4,5,110,4.0,3.492654,0.507346


In [13]:
# Display the combined table untransposed: one row per k, one column per key of `results`.
results_df

Unnamed: 0,q_u-euclidean,q_u-cosine,q_u-manhattan,binary-euclidean,binary-cosine,binary-manhattan
1,0.969043,0.924196,0.949045,0.965921,1.005034,0.965921
2,1.461889,1.430314,1.471061,1.456385,1.491472,1.456385
3,1.812597,1.764929,1.821666,1.821432,1.84261,1.821432
5,2.334263,2.30136,2.323286,2.350559,2.356184,2.350559
6,2.526051,2.532121,2.520178,2.541398,2.552742,2.541398
7,2.704142,2.700838,2.698023,2.725855,2.737454,2.725855
8,2.872672,2.859426,2.862007,2.880429,2.898822,2.880429
9,3.017123,3.001359,2.996976,3.020033,3.042421,3.020033
10,3.150115,3.133987,3.12498,3.152933,3.180156,3.152933


In [14]:
def improvement(row):
    """Return how much lower ``row['Q']`` is than ``row['Binary']``, in percent.

    The value is ``(Binary - Q) / Binary * 100``: positive when ``Q`` is
    smaller than ``Binary``, negative when it is larger.
    """
    binary_value = row['Binary']
    q_value = row['Q']
    relative_gain = (binary_value - q_value) / binary_value
    return relative_gain * 100

In [15]:
# Euclidean distance: binary-matrix vs per-user-matrix results side by side,
# plus the percentage computed by improvement() for each k.
euclidean_df = (
    results_df[['binary-euclidean', 'q_u-euclidean']]
    .rename(columns={'binary-euclidean': 'Binary', 'q_u-euclidean': 'Q'})
    .assign(Improvement=lambda frame: frame.apply(improvement, axis=1))
)
euclidean_df

Unnamed: 0,Binary,Q,Improvement
1,0.965921,0.969043,-0.323168
2,1.456385,1.461889,-0.377951
3,1.821432,1.812597,0.485027
5,2.350559,2.334263,0.693274
6,2.541398,2.526051,0.603874
7,2.725855,2.704142,0.796548
8,2.880429,2.872672,0.269269
9,3.020033,3.017123,0.096347
10,3.152933,3.150115,0.089373


In [16]:
# Cosine distance: binary-matrix vs per-user-matrix results side by side,
# plus the percentage computed by improvement() for each k.
cosine_df = (
    results_df[['binary-cosine', 'q_u-cosine']]
    .rename(columns={'binary-cosine': 'Binary', 'q_u-cosine': 'Q'})
    .assign(Improvement=lambda frame: frame.apply(improvement, axis=1))
)
cosine_df

Unnamed: 0,Binary,Q,Improvement
1,1.005034,0.924196,8.043293
2,1.491472,1.430314,4.100496
3,1.84261,1.764929,4.215802
5,2.356184,2.30136,2.326789
6,2.552742,2.532121,0.807805
7,2.737454,2.700838,1.33757
8,2.898822,2.859426,1.359042
9,3.042421,3.001359,1.349652
10,3.180156,3.133987,1.451798


In [17]:
# Manhattan distance: binary-matrix vs per-user-matrix results side by side,
# plus the percentage computed by improvement() for each k.
manhattan_df = (
    results_df[['binary-manhattan', 'q_u-manhattan']]
    .rename(columns={'binary-manhattan': 'Binary', 'q_u-manhattan': 'Q'})
    .assign(Improvement=lambda frame: frame.apply(improvement, axis=1))
)
manhattan_df

Unnamed: 0,Binary,Q,Improvement
1,0.965921,0.949045,1.74712
2,1.456385,1.471061,-1.00773
3,1.821432,1.821666,-0.012863
5,2.350559,2.323286,1.160253
6,2.541398,2.520178,0.834974
7,2.725855,2.698023,1.021029
8,2.880429,2.862007,0.639544
9,3.020033,2.996976,0.763455
10,3.152933,3.12498,0.886548
