In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_all_movies(ratings_path='data/experiment_data/ratings.csv'):
    """Return every distinct movie id found in the ratings file, in ascending order.

    Parameters
    ----------
    ratings_path : str, path object or file-like, optional
        CSV source handed to ``pandas.read_csv``; it must contain a
        ``movieId`` column. Defaults to the experiment ratings file.

    Returns
    -------
    list
        De-duplicated ``movieId`` values sorted in ascending order.
    """
    # Only the movieId column is used, so the other columns are not parsed.
    ratings_df = pd.read_csv(ratings_path, usecols=['movieId'])
    return sorted(set(ratings_df['movieId']))

In [3]:
def get_movies_index_dict(ratings_path='data/experiment_data/ratings.csv'):
    """Map each movie's position in the sorted movie list to its movie id.

    The movies are the distinct ``movieId`` values of the ratings file, sorted
    in ascending order (the same ordering ``get_all_movies`` produces).

    Parameters
    ----------
    ratings_path : str, path object or file-like, optional
        CSV source handed to ``pandas.read_csv``; it must contain a
        ``movieId`` column. Defaults to the experiment ratings file.

    Returns
    -------
    dict
        ``{'0': first_id, '1': second_id, ...}``. Keys are the positions as
        strings, values are the movie ids.
    """
    # Only the movieId column is used, so the other columns are not parsed.
    ratings_df = pd.read_csv(ratings_path, usecols=['movieId'])
    movies = sorted(set(ratings_df['movieId']))

    # Keys are strings so the dict can be used directly as a rename mapping
    # for column labels read back from a CSV file.
    return {str(position): movie_id for position, movie_id in enumerate(movies)}

In [4]:
def get_all_users(trainset_path='data/experiment_data/trainset.csv'):
    """Return the distinct user ids present in the training set.

    Parameters
    ----------
    trainset_path : str, path object or file-like, optional
        CSV source handed to ``pandas.read_csv``; it must contain a
        ``userId`` column. Defaults to the experiment training set.

    Returns
    -------
    list
        De-duplicated ``userId`` values. The order is whatever the
        intermediate ``set`` yields and is not guaranteed to be sorted.
    """
    # Only the userId column is used, so the other columns are not parsed.
    trainset_df = pd.read_csv(trainset_path, usecols=['userId'])
    return list(set(trainset_df['userId']))

In [5]:
def get_movies_watched_by_user(trainset_df, user):
    """Return the ids of the movies ``user`` appears with in ``trainset_df``.

    The ids are returned as a NumPy array of strings, in the order the rows
    appear in ``trainset_df``.
    """
    belongs_to_user = trainset_df['userId'] == user
    user_movie_ids = trainset_df[belongs_to_user]['movieId']
    return user_movie_ids.values.astype(str)

In [6]:
def remove_not_watched_movies(matrix_df, watched_movies):
    """Return ``matrix_df`` restricted to the columns listed in ``watched_movies``.

    Columns whose label is not in ``watched_movies`` are left out; the rows
    and the input frame itself are not modified.
    """
    is_watched = matrix_df.columns.isin(watched_movies)
    return matrix_df.loc[:, is_watched]

In [7]:
def get_k_most_similar_ratings(all_movies, clean_matrix_df, trainset_df, user, movie, k):
    """Return ``user``'s ratings for the ``k`` columns most similar to ``movie``.

    Parameters
    ----------
    all_movies : list
        Movie ids; the position of ``movie`` in this list selects the row of
        ``clean_matrix_df`` to read.
    clean_matrix_df : pandas.DataFrame
        Similarity matrix whose column labels can be converted with ``int``
        to movie ids found in the user's rows of ``trainset_df``.
    trainset_df : pandas.DataFrame
        Frame with ``userId``, ``movieId`` and ``rating`` columns.
    user, movie
        Ids of the user and of the target movie.
    k : int
        Number of most similar columns to keep.

    Returns
    -------
    list
        Ratings ordered from the most to the least similar movie; shorter
        than ``k`` when the matrix has fewer than ``k`` columns.

    Raises
    ------
    ValueError
        If ``movie`` is not in ``all_movies``.
    IndexError
        If the user has no rating for one of the selected movies.
    """
    # The target movie's row is located by its position in all_movies.
    row_position = all_movies.index(movie)
    similarities = clean_matrix_df.iloc[row_position]

    # Labels of the k highest similarities, converted to integer movie ids.
    top_labels = similarities.sort_values(ascending=False).index.values[:k]
    top_movie_ids = [int(label) for label in top_labels]

    # Look up the user's rating for each selected movie (first match wins).
    user_rows = trainset_df[trainset_df['userId'] == user]
    return [
        user_rows[user_rows['movieId'] == movie_id]['rating'].values[0]
        for movie_id in top_movie_ids
    ]

# Preparar datos

Cambiamos el nombre de las columnas de todos los dataframes de similitud (de la posición de la película a su `movieId`) y los guardamos de nuevo. **SOLO EJECUTAR CUANDO SE HAN GENERADO LAS MATRICES CON EL NOTEBOOK 4**

In [8]:
# Mapping from positional label ('0', '1', ...) to movie id, used below as
# the column rename mapping for the similarity matrix.
movies_dict = get_movies_index_dict()

In [9]:
# Relabel the columns of sim_cosine_binary.csv from positional labels
# ('0', '1', ...) to movie ids and write the result back to the same file.
binary_df = pd.read_csv('data/similarity_data/sim_cosine_binary.csv')

# Guard against re-running this cell. Once the file has been relabelled its
# columns are movie ids; renaming again would push those id-looking labels
# through movies_dict a second time and silently corrupt the file. The rename
# is therefore applied only while every positional label is still present.
if set(movies_dict).issubset(binary_df.columns):
    # index=str is kept from the original call; the index is not written to
    # disk because of index=False.
    binary_df = binary_df.rename(index=str, columns=movies_dict)
    binary_df.to_csv('data/similarity_data/sim_cosine_binary.csv', index=False)
else:
    print('sim_cosine_binary.csv does not carry the positional column labels; '
          'assuming it was already relabelled and leaving it untouched.')

# Cargamos los datasets de test y de entrenamiento, y la matriz de similitud

In [10]:
# Test and training rating sets of the experiment, plus the similarity matrix
# stored in sim_cosine_binary.csv.
testset_df = pd.read_csv('data/experiment_data/testset.csv')
trainset_df = pd.read_csv('data/experiment_data/trainset.csv')
binary_df = pd.read_csv('data/similarity_data/sim_cosine_binary.csv')

In [11]:
# Sorted list of every movie id; get_k_most_similar_ratings uses a movie's
# position in this list as its row position in the similarity matrix.
all_movies = get_all_movies()

In [12]:
def get_binary_ratings(user, movie, k=10):
    """Return ``user``'s ratings for the ``k`` watched movies most similar to ``movie``.

    Similarities are taken from the notebook-level ``binary_df``
    (sim_cosine_binary.csv) and ratings from the notebook-level
    ``trainset_df``. Both frames, and ``all_movies``, must already be loaded
    by the preceding cells. The function is called once per prediction row,
    so the CSV files are deliberately not re-read on every call.

    Parameters
    ----------
    user, movie
        Ids of the user and of the target movie.
    k : int, optional
        Maximum number of ratings to return (default 10).

    Returns
    -------
    list
        Ratings ordered from the most to the least similar watched movie.
    """
    watched_movies = get_movies_watched_by_user(trainset_df, user)
    clean_matrix_df = remove_not_watched_movies(binary_df, watched_movies)
    return get_k_most_similar_ratings(all_movies, clean_matrix_df, trainset_df, user, movie, k)

In [13]:
def get_q_ratings(user, movie, k=10):
    """Return ``user``'s ratings for the ``k`` watched movies most similar to ``movie``.

    Similarities are read from the per-user file
    ``data/similarity_data/sim_cosine_user_<user>.csv``; ratings come from the
    notebook-level ``trainset_df``, which (like ``all_movies``) must already
    be loaded by the preceding cells.

    Parameters
    ----------
    user, movie
        Ids of the user and of the target movie.
    k : int, optional
        Maximum number of ratings to return (default 10).

    Returns
    -------
    list
        Ratings ordered from the most to the least similar watched movie.
    """
    # Row-wise DataFrame.apply can hand ids over as floats (1.0). Normalise
    # integral floats so the file name is '..._1.csv' rather than '..._1.0.csv'.
    # NOTE(review): assumes the per-user files are named with integer user
    # ids; confirm against the notebook that writes them.
    if isinstance(user, float) and user.is_integer():
        user = int(user)
    user_sim_df = pd.read_csv('data/similarity_data/sim_cosine_user_{}.csv'.format(user))

    watched_movies = get_movies_watched_by_user(trainset_df, user)
    clean_matrix_df = remove_not_watched_movies(user_sim_df, watched_movies)
    return get_k_most_similar_ratings(all_movies, clean_matrix_df, trainset_df, user, movie, k)

In [14]:
# Load the predictions and attach, for every (user, movie) pair, the user's
# ratings of the most similar watched movies under both similarity sources.
predicted_df = pd.read_csv('data/experiment_data/predicted_values.csv')

# Iterate over the two id columns directly instead of DataFrame.apply(axis=1).
# Row-wise apply upcasts an all-numeric row to float, so the ids would reach
# the helpers as 1.0 / 223.0; previously only the order of the two assignments
# kept get_q_ratings from formatting a float into its per-user file name.
user_movie_pairs = list(zip(predicted_df['userId'], predicted_df['movieId']))
predicted_df['k_ratings_binary'] = pd.Series(
    [get_binary_ratings(user, movie) for user, movie in user_movie_pairs],
    index=predicted_df.index,
)
predicted_df['k_ratings_q'] = pd.Series(
    [get_q_ratings(user, movie) for user, movie in user_movie_pairs],
    index=predicted_df.index,
)
predicted_df.head()

Unnamed: 0,userId,movieId,rating,predicted,k_ratings_binary,k_ratings_q
0,1,223,3.0,4.042081,"[5.0, 5.0, 4.0, 4.0, 4.0, 3.0, 5.0, 3.0, 5.0, ...","[5.0, 3.0, 5.0, 5.0, 5.0, 4.0, 5.0, 4.0]"
1,1,349,4.0,2.726556,"[5.0, 5.0, 3.0, 5.0, 5.0, 4.0, 4.0, 5.0, 3.0, ...","[5.0, 3.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0]"
2,1,527,5.0,2.743715,"[5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, ...","[5.0, 3.0, 5.0, 5.0, 4.0, 5.0, 4.0, 5.0]"
3,4,357,3.0,2.737091,"[5.0, 3.0, 3.0, 5.0, 5.0, 2.0, 5.0, 3.0, 1.0, ...","[3.0, 2.0, 3.0, 5.0]"
4,5,110,4.0,3.492654,"[4.0, 3.0, 4.0, 5.0, 4.0, 4.0, 3.0, 3.0, 1.0, ...","[4.0, 4.0, 3.0, 4.0, 4.0, 4.0, 3.0, 3.0]"


In [15]:
def get_value(n, values):
    """Return ``values[n]``, or NaN when ``n`` is at or past the end of ``values``."""
    return values[n] if n < len(values) else float('NaN')

# Spread each list of neighbour ratings over ten scalar columns, one per rank,
# with NaN where a list is shorter. Ten matches the default k=10 the rating
# helpers are called with.
for k in range(1, 11):
    # Series.map transforms the single list column directly, avoiding a
    # row-wise DataFrame.apply. The rank is bound through a default argument
    # so each lambda carries its own value.
    predicted_df['rating_binary_k_' + str(k)] = predicted_df['k_ratings_binary'].map(
        lambda ratings, n=k - 1: get_value(n, ratings)
    )
    predicted_df['rating_q_k_' + str(k)] = predicted_df['k_ratings_q'].map(
        lambda ratings, n=k - 1: get_value(n, ratings)
    )

# The list columns are no longer needed once expanded.
predicted_df = predicted_df.drop(columns=['k_ratings_binary', 'k_ratings_q'])

In [16]:
# Write the final table to disk without the DataFrame index.
predicted_df.to_csv('data/experiment_data/result_all_values.csv', index=False)