In [63]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from scipy.spatial.distance import pdist, squareform

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's offline notebook mode (used by the iplot call that renders the final chart).
init_notebook_mode(connected=True)

# Experimento 0 



In [64]:
# Experiment parameters.
k = 10     # number of highest similarity values kept per evaluation movie
user = 18  # userId whose ratings are analysed

In [65]:
# Load the training ratings (movies already seen).
trainset_path = 'data/experiment_data/trainset.csv'
trainset_DF = pd.read_csv(trainset_path)
trainset_DF.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,6,10,3.0,845553253
1,6,32,4.0,845553426
2,6,34,4.0,845553354
3,6,47,4.0,845553317
4,6,50,1.0,845553381


In [66]:
# Keep only the training rows that belong to the selected user.
is_selected_user = trainset_DF['userId'] == user
rated_movies_by_user = trainset_DF[is_selected_user]
rated_movies_by_user.head()

Unnamed: 0,userId,movieId,rating,timestamp
34,18,32,4.0,1455209840
35,18,34,2.5,1455617533
36,18,47,4.5,1455050013
37,18,50,5.0,1455049343
38,18,110,4.5,1455050170


In [67]:
# Load the evaluation ratings (movies treated as not yet seen).
testset_path = 'data/experiment_data/testset.csv'
testset_DF = pd.read_csv(testset_path)
testset_DF.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,6,165,3.0,845553146
1,6,296,2.0,845553110
2,6,377,5.0,845553317
3,6,588,5.0,845553146
4,18,1,3.5,1455209816


In [68]:
# Keep only the evaluation rows of the selected user, i.e. the movies held out
# from training and treated as not yet seen.
# The filter uses `user` (not a literal id) so that the training and evaluation
# subsets always refer to the same user.
no_rated_movies_by_user_DF = testset_DF[testset_DF.userId == user]
no_rated_movies_by_user_DF

Unnamed: 0,userId,movieId,rating,timestamp
4,18,1,3.5,1455209816
5,18,356,4.5,1455050112
6,18,780,4.0,1455209824
7,18,1036,4.0,1455618103
8,18,1704,4.0,1455050003
9,18,4973,4.0,1455209683
10,18,5989,4.0,1455209729


## Calculamos la similitud entre las películas de evaluación y de entrenamiento

In [69]:
def equal_sim(item1, item2):
    """Return the similarity between two property vectors.

    Only "active" positions are taken into account: a position is active when
    at least one of the two items has the value 1 there. Positions where both
    items are 0 are ignored, so the large number of shared zeros does not
    inflate the score. The result is ``(active - differing) / active``, where
    ``differing`` counts the positions whose values are not equal.

    Parameters
    ----------
    item1, item2 : sequence
        Property vectors, expected to have the same length.

    Returns
    -------
    float
        The similarity score. When no position is active (for example two
        all-zero vectors) there is nothing to compare and 0.0 is returned;
        this case used to raise ZeroDivisionError.
    """
    active = 0
    differing = 0
    for value1, value2 in zip(item1, item2):
        if value1 != value2:
            differing += 1
        if value1 == 1 or value2 == 1:
            active += 1
    if active == 0:
        # Neither item has any property set: avoid dividing by zero.
        return 0.0
    return float(active - differing) / float(active)

In [70]:
# Compute the pairwise similarity between all movies from their property
# vectors (every column of binary_properties except 'id').
# NOTE(review): `binary_properties` is not defined in any cell shown here; it
# must already exist in the kernel. TODO: load it explicitly so that
# Restart & Run All works.
# NOTE(review): pdist only evaluates pairs of distinct rows and squareform
# leaves the diagonal at 0 (see the displayed output), so a movie's similarity
# with itself is stored as 0.0 rather than 1.0.
movies_prop = binary_properties.drop(columns=['id']).values
movies_sim = pdist(movies_prop, equal_sim)
movies_similarity = pd.DataFrame(squareform(movies_sim))

In [71]:
# Preview the first rows of the movie-to-movie similarity matrix.
movies_similarity.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.111111,0.0,0.25,0.0,0.0,0.0,0.111111,0.25,0.111111,...,0.0,0.571429,0.25,0.0,0.25,0.0,0.375,0.111111,0.0,0.1
1,0.111111,0.0,0.142857,0.0,0.125,0.142857,0.0,0.142857,0.333333,0.6,...,0.0,0.125,0.333333,0.333333,0.142857,0.0,0.285714,0.6,0.285714,0.5
2,0.0,0.142857,0.0,0.0,0.285714,0.333333,0.0,0.0,0.0,0.142857,...,0.0,0.0,0.0,0.142857,0.0,0.142857,0.0,0.142857,0.125,0.285714
3,0.25,0.0,0.0,0.0,0.125,0.0,0.125,0.142857,0.0,0.0,...,0.142857,0.285714,0.0,0.0,0.142857,0.142857,0.125,0.0,0.125,0.0
4,0.0,0.125,0.285714,0.125,0.0,0.5,0.111111,0.125,0.0,0.125,...,0.285714,0.0,0.0,0.285714,0.125,0.125,0.0,0.125,0.428571,0.111111


In [72]:
# Movie ids of the selected user's training movies and evaluation movies.
train_movies_user = rated_movies_by_user.movieId.values
test_movies_user = no_rated_movies_by_user_DF.movieId.values

In [73]:
# Attach the movie id of each row to the similarity matrix as a 'movieId' column.
movies_index = binary_properties['id'].values
movies_similarity = movies_similarity.assign(movieId=movies_index)

In [74]:
# Restrict the similarity matrix so that the rows are the user's evaluation
# movies and the columns are the user's training movies.

# Map every positional column label to the movie id stored at that position.
names = dict(enumerate(movies_index))

# Movies that the user did not rate in the training set: their columns are removed.
movies_to_drop = np.setdiff1d(movies_index, rated_movies_by_user['movieId'].values)

# Select the evaluation rows, relabel the columns with movie ids (row labels
# become strings), then drop the unwanted columns and the helper 'movieId' column.
is_test_movie = movies_similarity['movieId'].isin(test_movies_user)
final_items_sim_DF = (
    movies_similarity[is_test_movie]
    .rename(index=str, columns=names)
    .drop(columns=movies_to_drop)
    .drop(columns=['movieId'])
)
final_items_sim_DF

Unnamed: 0,32,34,47,50,110,165,208,231,260,293,...,5952,6377,6539,6874,7153,7361,8961,33794,58559,79132
0,0.0,0.25,0.0,0.0,0.0,0.111111,0.1,0.142857,0.222222,0.0,...,0.25,0.571429,0.25,0.0,0.25,0.0,0.375,0.111111,0.0,0.1
19,0.0,0.166667,0.142857,0.0,0.142857,0.0,0.0,0.0,0.0,0.166667,...,0.166667,0.0,0.0,0.0,0.166667,0.4,0.0,0.0,0.142857,0.0
42,0.142857,0.0,0.0,0.0,0.0,0.333333,0.5,0.0,0.5,0.0,...,0.142857,0.125,0.333333,0.142857,0.142857,0.142857,0.285714,0.333333,0.125,0.5
44,0.166667,0.0,0.142857,0.166667,0.0,0.75,0.333333,0.0,0.142857,0.166667,...,0.0,0.0,0.166667,0.4,0.0,0.0,0.142857,0.4,0.333333,0.333333
66,0.0,0.166667,0.142857,0.0,0.142857,0.0,0.0,0.0,0.0,0.166667,...,0.166667,0.0,0.0,0.0,0.166667,0.4,0.0,0.0,0.142857,0.0
84,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.142857,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0
90,0.0,0.142857,0.285714,0.142857,0.285714,0.0,0.0,0.0,0.0,0.333333,...,0.142857,0.0,0.0,0.142857,0.142857,0.142857,0.0,0.0,0.285714,0.0


## Gráfica con resultados estadísticos

In [77]:
# For each evaluation movie (one row of final_items_sim_DF) keep its k highest
# similarity values. np.sort is ascending, so the last k entries are the largest.
# The dictionary is keyed by the row position.
# NOTE: the loop variable `i` remains bound after the loop finishes; do not
# rely on its leftover value in later cells.
eval_values = {}
for i in range(len(final_items_sim_DF)):
    eval_values[i] = np.sort(final_items_sim_DF.iloc[i])[-k:]

In [78]:
# Series for the chart: x positions plus the average and the maximum of the kept values.
x_series, avg_series, max_series = [], [], []

In [79]:
# Build the chart series from the values kept for each evaluation movie.
# The lists are (re)created here so that re-running the cell does not append
# duplicate points, and the x value is the dictionary key (the row position)
# instead of a counter that this cell never initialised.
x_series = []
avg_series = []
max_series = []
for key, val in eval_values.items():
    x_series.append(key)
    avg_series.append(np.average(val))
    max_series.append(val.max())

In [80]:
# Line chart with the average and the maximum of the kept similarity values
# for each evaluation movie; the y axis is fixed to the range [0, 1].
trace = go.Scatter(x=x_series, y=avg_series, name='Average')
trace2 = go.Scatter(x=x_series, y=max_series, name='Max. Value')
data = [trace, trace2]

layout = go.Layout(yaxis={'range': [0, 1]})
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-line')