In [2]:
import pandas as pd
import numpy as np
import src.similarities as sim
import src.utils as utils
from tqdm import tqdm

from scipy.spatial.distance import pdist, squareform

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [3]:
# Experiment grid: similarity functions (used in the similarity file names)
# and the values of k passed to error_rating.
functions = ['euclidean', 'cosine', 'manhattan']
k_list = [1, 2, 3, 5, 10]

# Load the ratings file and the file with the predicted values.
trainset_df = pd.read_csv('data/experiment_data/ratings.csv')
predicted_df = pd.read_csv('data/experiment_data/predicted_values.csv')

# Distinct user ids that appear in the predictions.
users = list(set(predicted_df['userId']))

In [9]:
# Percentage of rows in predicted_df whose rating is 4 or higher.
int((predicted_df['rating'] >= 4).sum()) / len(predicted_df) * 100

51.48601398601399

In [66]:
def error_rating(user, algorithm, k, matrix_type=''):
    """Evaluate one user against a stored similarity matrix.

    Parameters
    ----------
    user
        User identifier; forwarded to the ``utils`` helpers and, for the
        per-user variant, embedded in the CSV file name.
    algorithm
        Name of the similarity function; embedded in the CSV file name.
    k
        Forwarded unchanged to ``utils.calculate_mean_ratings``
        (presumably the number of neighbours -- confirm against utils).
    matrix_type
        ``''`` (default) loads the per-user matrix
        ``sim_<algorithm>_user_<user>.csv``; any other value loads the
        shared ``sim_<algorithm>_binary.csv``.

    Returns
    -------
    The value returned by ``utils.calculate_mean_ratings`` for the user's
    test movies; the notebook averages it across users.
    """
    # Movie data for this user.
    train_movies, test_movies = utils.get_train_test_movies(user)
    all_movies = utils.get_all_movies(user)

    # Pick the similarity file: per-user by default, shared binary otherwise.
    if matrix_type == '':
        matrix_path = 'data/similarity_data/sim_{}_user_{}.csv'.format(algorithm, user)
    else:
        matrix_path = 'data/similarity_data/sim_{}_binary.csv'.format(algorithm)
    similarity = pd.read_csv(matrix_path)

    # Filter the similarity matrix with the user's movie lists, then score it.
    filtered = utils.filter_similarity_matrix(similarity, test_movies, train_movies, all_movies)
    return utils.calculate_mean_ratings(filtered, test_movies, user, k)

In [67]:
def average(errors_list):
    """Return the arithmetic mean of ``errors_list``.

    ``errors_list`` must support both ``sum`` and ``len``. An empty
    collection raises ``ZeroDivisionError``.
    """
    total = sum(errors_list)
    return total / len(errors_list)

In [68]:
# Average of error_rating over all users, for every similarity function and
# every k, using the per-user ("q_u") similarity matrices.
# Each entry is a Series indexed by k. The dtype is given explicitly so the
# result does not depend on pandas' default dtype for a data-less Series.
results = {}
for function in tqdm(functions):
    errors_by_k = {
        k: average([error_rating(user, function, k) for user in users])
        for k in k_list
    }
    results['q_u-{}'.format(function)] = pd.Series(errors_by_k, dtype=float)

100%|██████████| 3/3 [01:21<00:00, 27.10s/it]


In [69]:
# Per-user results as a table; shown transposed so each function is a row.
results_user_df = pd.DataFrame(results)
results_user_df.T

Unnamed: 0,1,2,3,5,10
q_u-euclidean,0.926845,1.399975,1.78831,2.351203,3.316588
q_u-cosine,0.927355,1.410969,1.760868,2.344392,3.298902
q_u-manhattan,0.957529,1.462547,1.817776,2.364417,3.304225


In [70]:
# Average of error_rating over all users, for every similarity function and
# every k, using the shared binary similarity matrices.
# Each entry is a Series indexed by k. The dtype is given explicitly so the
# result does not depend on pandas' default dtype for a data-less Series.
results_binary = {}
for function in tqdm(functions):
    errors_by_k = {
        k: average([
            error_rating(user, function, k, matrix_type='binary')
            for user in users
        ])
        for k in k_list
    }
    results_binary['binary-{}'.format(function)] = pd.Series(errors_by_k, dtype=float)

100%|██████████| 3/3 [01:20<00:00, 26.86s/it]


In [71]:
# Binary-matrix results as a table; shown transposed so each function is a row.
results_binary_df = pd.DataFrame(results_binary)
results_binary_df.T

Unnamed: 0,1,2,3,5,10
binary-euclidean,0.91303,1.461876,1.84689,2.409337,3.326028
binary-cosine,1.047798,1.534817,1.910374,2.462659,3.386887
binary-manhattan,0.91303,1.461876,1.84689,2.409337,3.326028


In [72]:
# Add the binary-matrix series to the per-user results (mutates `results`).
results.update(results_binary)

In [73]:
# Combined table of all series in `results`; shown transposed, one row per series.
results_df = pd.DataFrame(results)
results_df.T

Unnamed: 0,1,2,3,5,10
q_u-euclidean,0.926845,1.399975,1.78831,2.351203,3.316588
q_u-cosine,0.927355,1.410969,1.760868,2.344392,3.298902
q_u-manhattan,0.957529,1.462547,1.817776,2.364417,3.304225
binary-euclidean,0.91303,1.461876,1.84689,2.409337,3.326028
binary-cosine,1.047798,1.534817,1.910374,2.462659,3.386887
binary-manhattan,0.91303,1.461876,1.84689,2.409337,3.326028


In [74]:
# Absolute difference between the 'rating' and 'predicted' columns, computed
# on whole columns rather than row by row with apply.
predicted_ratings_DF = pd.read_csv('data/experiment_data/predicted_values.csv').assign(
    diff=lambda frame: (frame['rating'] - frame['predicted']).abs()
)
predicted_ratings_DF.head()

Unnamed: 0,userId,movieId,rating,predicted,diff
0,1,223,3.0,4.042081,1.042081
1,1,349,4.0,2.726556,1.273444
2,1,527,5.0,2.743715,2.256285
3,4,357,3.0,2.737091,0.262909
4,5,110,4.0,3.492654,0.507346


In [75]:
# Show the combined results: one row per k, one column per series in `results`.
results_df

Unnamed: 0,q_u-euclidean,q_u-cosine,q_u-manhattan,binary-euclidean,binary-cosine,binary-manhattan
1,0.926845,0.927355,0.957529,0.91303,1.047798,0.91303
2,1.399975,1.410969,1.462547,1.461876,1.534817,1.461876
3,1.78831,1.760868,1.817776,1.84689,1.910374,1.84689
5,2.351203,2.344392,2.364417,2.409337,2.462659,2.409337
10,3.316588,3.298902,3.304225,3.326028,3.386887,3.326028


In [76]:
def improvement(row):
    """Relative difference between 'Binary' and 'Q', in percent of 'Binary'.

    ``row`` is anything indexable by the keys 'Binary' and 'Q'. The result
    is positive when 'Q' is smaller than 'Binary' and negative when it is
    larger.
    """
    baseline = row['Binary']
    gain = baseline - row['Q']
    return gain / baseline * 100

In [77]:
# Euclidean similarity: binary matrix vs. per-user ("q_u") matrix, plus the
# relative improvement of Q over Binary in percent. `improvement` only does
# column lookups and arithmetic, so assign() can run it on whole columns
# instead of row by row.
euclidean_df = (
    results_df[['binary-euclidean', 'q_u-euclidean']]
    .rename(columns={'binary-euclidean': 'Binary', 'q_u-euclidean': 'Q'})
    .assign(Improvement=improvement)
)
euclidean_df

Unnamed: 0,Binary,Q,Improvement
1,0.91303,0.926845,-1.513089
2,1.461876,1.399975,4.234359
3,1.84689,1.78831,3.171774
5,2.409337,2.351203,2.412853
10,3.326028,3.316588,0.283809


In [78]:
# Cosine similarity: binary matrix vs. per-user ("q_u") matrix, plus the
# relative improvement of Q over Binary in percent. `improvement` only does
# column lookups and arithmetic, so assign() can run it on whole columns
# instead of row by row.
cosine_df = (
    results_df[['binary-cosine', 'q_u-cosine']]
    .rename(columns={'binary-cosine': 'Binary', 'q_u-cosine': 'Q'})
    .assign(Improvement=improvement)
)
cosine_df

Unnamed: 0,Binary,Q,Improvement
1,1.047798,0.927355,11.494913
2,1.534817,1.410969,8.069247
3,1.910374,1.760868,7.826009
5,2.462659,2.344392,4.802418
10,3.386887,3.298902,2.597824


In [79]:
# Manhattan similarity: binary matrix vs. per-user ("q_u") matrix, plus the
# relative improvement of Q over Binary in percent. `improvement` only does
# column lookups and arithmetic, so assign() can run it on whole columns
# instead of row by row.
manhattan_df = (
    results_df[['binary-manhattan', 'q_u-manhattan']]
    .rename(columns={'binary-manhattan': 'Binary', 'q_u-manhattan': 'Q'})
    .assign(Improvement=improvement)
)
manhattan_df

Unnamed: 0,Binary,Q,Improvement
1,0.91303,0.957529,-4.873776
2,1.461876,1.462547,-0.045865
3,1.84689,1.817776,1.576335
5,2.409337,2.364417,1.86441
10,3.326028,3.304225,0.655518
