# 3er Entregable

Integrantes:
- Araoz, Tania
- Bajo, Pablo
- Barrera, Manuel

### Carga de librerias a utilizar 

In [1]:
import pandas as pd
from datetime import datetime
from scipy.sparse import csr_matrix
from lightfm import LightFM
import numpy as np
from sklearn.model_selection import GridSearchCV



### Carga de datasets

In [2]:
movies = pd.read_csv("../data/ml-latest/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings = pd.read_csv("../data/ml-latest/ratings.csv").sample(300000, random_state=42).sort_values(by='timestamp', ascending=True).reset_index(drop=True)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,86847,34,5.0,822873600
1,268919,23,2.0,823185233
2,25445,52,4.0,823255313
3,80271,70,3.0,823264587
4,311037,74,4.0,823867612
...,...,...,...,...
299995,254054,208108,3.0,1689806642
299996,279030,231701,4.0,1689807358
299997,310092,553,2.5,1689817643
299998,310092,1193,2.0,1689817990


> Se usa el dataset de ratings para trabajar, tiene las interacciones entre usuarios y películas

In [4]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     300000 non-null  int64  
 1   movieId    300000 non-null  int64  
 2   rating     300000 non-null  float64
 3   timestamp  300000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 9.2 MB


> El dataset contiene 100836 interacciones. <span style="color:red">ACTUALIZAR CON DATASET GRANDE</span>

> El timestamp está en formato int64, se debe convertir a formato fecha para poder trabajar.

In [5]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

> No hay valores nulos

In [6]:
ratings['userId'].nunique()

119409

> El dataset tiene 610 ususarios. <span style="color:red">Cambiar con dataset grande</span> 

In [7]:
ratings['movieId'].nunique()

17006

> el dataset contiene ratings de 9724 peliculas. <span style="color:red">Actualizar con dataset grande</span> 

In [8]:
ratings['rating'].sort_values(ascending=True).unique()

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

> Los valores posibles de ratings van del 0.5 al 5, con un incremento de 0.5. 

#### Preprocesado

Convertimos el timestamp numerico en formato fecha

In [9]:
ratings["timestamp"] = ratings["timestamp"].apply(lambda x: datetime.utcfromtimestamp(x).strftime('%Y/%m/%d'))

In [10]:
ratings["timestamp"]

0         1996/01/29
1         1996/02/01
2         1996/02/02
3         1996/02/02
4         1996/02/09
             ...    
299995    2023/07/19
299996    2023/07/19
299997    2023/07/20
299998    2023/07/20
299999    2023/07/20
Name: timestamp, Length: 300000, dtype: object

> Vemos que la fecha tiene un formato de fecha, pero la columna es de tipo object

Utilizando pandas convertimos a un formato de fechas que permita el filtrado

In [11]:
ratings["timestamp"] = pd.to_datetime(ratings['timestamp'], format='%Y/%m/%d')

In [12]:
ratings["timestamp"]

0        1996-01-29
1        1996-02-01
2        1996-02-02
3        1996-02-02
4        1996-02-09
            ...    
299995   2023-07-19
299996   2023-07-19
299997   2023-07-20
299998   2023-07-20
299999   2023-07-20
Name: timestamp, Length: 300000, dtype: datetime64[ns]

> Vemos que la columna tiene el formato datetime64

In [13]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,86847,34,5.0,1996-01-29
1,268919,23,2.0,1996-02-01
2,25445,52,4.0,1996-02-02
3,80271,70,3.0,1996-02-02
4,311037,74,4.0,1996-02-09


Vemos el rango de fechas del dataset

In [14]:
ratings.timestamp.min()

Timestamp('1996-01-29 00:00:00')

In [15]:
ratings.timestamp.max()

Timestamp('2023-07-20 00:00:00')

> Vemos que el rango de fechas va desde el 29/03/1996 al 24/09/24

#### Dividimos dataset en train, test y validation
Vemos la catidad de ratings por año

In [16]:
plot_df = ratings.copy()
plot_df["year"] = ratings.timestamp.dt.year
plot_df = plot_df.groupby("year", as_index=False).count()[["year", "userId"]]
plot_df.columns = ["year", "reviews_count"]
plot_df.head(25)

Unnamed: 0,year,reviews_count
0,1996,15406
1,1997,6824
2,1998,2927
3,1999,10944
4,2000,18405
5,2001,10987
6,2002,8038
7,2003,9622
8,2004,10693
9,2005,16230


> Tomamos una proporción 80/20 para dividir el dataset en train - test

In [17]:
train = ratings[(ratings.timestamp < datetime(year=2017, month=1, day=1))]
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,86847,34,5.0,1996-01-29
1,268919,23,2.0,1996-02-01
2,25445,52,4.0,1996-02-02
3,80271,70,3.0,1996-02-02
4,311037,74,4.0,1996-02-09


In [18]:
train.shape

(217239, 4)

In [19]:
train.userId.nunique()

92023

In [20]:
train.movieId.nunique()

11343

In [21]:
test = ratings[ratings.timestamp >= datetime(year=2017, month=1, day=1)]
test.head()

Unnamed: 0,userId,movieId,rating,timestamp
217239,291055,5218,4.0,2017-01-01
217240,291055,135887,4.5,2017-01-01
217241,291055,7451,2.0,2017-01-01
217242,291055,5444,2.5,2017-01-01
217243,178800,40339,2.5,2017-01-01


In [22]:
test.shape

(82761, 4)

In [23]:
test.userId.nunique()

30230

In [24]:
test.movieId.nunique()

11954

In [25]:
plot_df = train.copy()
plot_df["year"] = train.timestamp.dt.year
plot_df = plot_df.groupby("year", as_index=False).count()[["year", "userId"]]
plot_df.columns = ["year", "reviews_count"]
plot_df.head(25)

Unnamed: 0,year,reviews_count
0,1996,15406
1,1997,6824
2,1998,2927
3,1999,10944
4,2000,18405
5,2001,10987
6,2002,8038
7,2003,9622
8,2004,10693
9,2005,16230


> Definimos el conjunto de validación, en función de nuestro conjunto de entrenamiento. <span style="color:red">Actualizar con dataset grande</span>

In [26]:
validation = train[train.timestamp >= datetime(year=2015, month=1, day=1)]
validation.head()

Unnamed: 0,userId,movieId,rating,timestamp
182910,201998,4701,3.5,2015-01-01
182911,61766,63859,3.0,2015-01-01
182912,61766,74685,0.5,2015-01-01
182913,215460,5218,2.5,2015-01-01
182914,188012,112552,5.0,2015-01-01


In [27]:
validation.shape

(34329, 4)

In [28]:
validation.userId.nunique()

14731

In [29]:
validation.movieId.nunique()

6010

In [30]:
plot_df = validation.copy()
plot_df["year"] = validation.timestamp.dt.year
plot_df = plot_df.groupby("year", as_index=False).count()[["year", "userId"]]
plot_df.columns = ["year", "reviews_count"]
plot_df.head(25)

Unnamed: 0,year,reviews_count
0,2015,16411
1,2016,17918


> Redefinimos el conjunto de entrenamiento. <span style="color:red">Actualizar con dataset grande</span>

In [31]:
train = train[(train.timestamp < datetime(year=2015, month=1, day=1))]
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,86847,34,5.0,1996-01-29
1,268919,23,2.0,1996-02-01
2,25445,52,4.0,1996-02-02
3,80271,70,3.0,1996-02-02
4,311037,74,4.0,1996-02-09


In [32]:
plot_df = train.copy()
plot_df["year"] = train.timestamp.dt.year
plot_df = plot_df.groupby("year", as_index=False).count()[["year", "userId"]]
plot_df.columns = ["year", "reviews_count"]
plot_df.head(25)

Unnamed: 0,year,reviews_count
0,1996,15406
1,1997,6824
2,1998,2927
3,1999,10944
4,2000,18405
5,2001,10987
6,2002,8038
7,2003,9622
8,2004,10693
9,2005,16230


In [33]:
train.shape

(182910, 4)

¿Tenemos COLDSTAR? 

In [34]:
test[~test.userId.isin(train.userId.unique())].userId.nunique()

28749

> Tenemos ### Usuarios que se encuentra en el dataset de test y no en el de train. <span style="color:red">Actualizar con dataset grande</span>

In [35]:
validation[~validation.userId.isin(train.userId.unique())].userId.nunique()

13275

> Tenemos ### Usuarios que se encuentra en el dataset de validation y no en el de train. <span style="color:red">Actualizar con dataset grande</span>

#### Matriz de Interacciones

In [36]:
interactions_train = train[["userId", "movieId", "rating"]].copy()
interactions_train.head()

Unnamed: 0,userId,movieId,rating
0,86847,34,5.0
1,268919,23,2.0
2,25445,52,4.0
3,80271,70,3.0
4,311037,74,4.0


In [37]:
interactions_matrix = interactions_train.pivot(index="userId", columns="movieId", values="rating")

In [38]:
interactions_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,117865,118101,118198,118344,118492,118696,118866,119141,119804,120128
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,
21,,,,,,,,,,,...,,,,,,,,,,
24,,,,,,,,,,,...,,,,,,,,,,


In [39]:
interactions_matrix = interactions_matrix.fillna(0)

In [40]:
interactions_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,117865,118101,118198,118344,118492,118696,118866,119141,119804,120128
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
interactions_matrix.shape

(78748, 9542)

In [42]:
interactions_matrix_csr = csr_matrix(interactions_matrix.values)

In [43]:
user_ids = list(interactions_matrix.index)
user_map = {}
counter = 0
for i in user_ids:
    user_map[i] = counter
    counter += 1

In [44]:
user_map

{6: 0,
 7: 1,
 14: 2,
 21: 3,
 24: 4,
 25: 5,
 31: 6,
 35: 7,
 39: 8,
 41: 9,
 43: 10,
 45: 11,
 48: 12,
 51: 13,
 53: 14,
 62: 15,
 63: 16,
 64: 17,
 69: 18,
 72: 19,
 73: 20,
 79: 21,
 82: 22,
 88: 23,
 101: 24,
 111: 25,
 113: 26,
 117: 27,
 118: 28,
 119: 29,
 123: 30,
 124: 31,
 126: 32,
 128: 33,
 135: 34,
 137: 35,
 141: 36,
 142: 37,
 144: 38,
 148: 39,
 149: 40,
 153: 41,
 155: 42,
 157: 43,
 160: 44,
 164: 45,
 174: 46,
 176: 47,
 177: 48,
 181: 49,
 183: 50,
 187: 51,
 189: 52,
 190: 53,
 193: 54,
 198: 55,
 202: 56,
 207: 57,
 212: 58,
 214: 59,
 222: 60,
 223: 61,
 224: 62,
 227: 63,
 229: 64,
 231: 65,
 233: 66,
 240: 67,
 241: 68,
 246: 69,
 249: 70,
 255: 71,
 260: 72,
 262: 73,
 263: 74,
 265: 75,
 267: 76,
 270: 77,
 283: 78,
 284: 79,
 289: 80,
 311: 81,
 314: 82,
 315: 83,
 318: 84,
 322: 85,
 326: 86,
 328: 87,
 330: 88,
 332: 89,
 335: 90,
 339: 91,
 347: 92,
 349: 93,
 367: 94,
 381: 95,
 384: 96,
 385: 97,
 392: 98,
 408: 99,
 412: 100,
 417: 101,
 423: 102,
 42

#### Modelo

In [48]:

# Definir el modelo LightFM
model = LightFM()

# Definir el espacio de búsqueda de hiperparámetros
param_grid = {
    'loss': ['logistic'],
    'no_components': [30, 50, 70, 90],
    'learning_rate': [0.03, 0.05, 0.1, 0.2],
}

# Inicializar GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='precision', n_jobs=-1)

# Ejecutar la búsqueda de hiperparámetros
%%time
grid_search.fit(interactions_matrix_csr, epochs=10)

# Obtener los mejores hiperparámetros y el mejor score
best_params = grid_search.best_params_

UsageError: Line magic function `%%time` not found.


In [46]:
best_params

{'learning_rate': 0.03, 'loss': 'logistic', 'no_components': 30}

In [47]:
%%time
model = LightFM(no_components=30, random_state=100, learning_rate=0.03, loss='logistic')

Wall time: 3.99 ms


In [49]:
%%time
model = model.fit(interactions_matrix_csr, epochs=50)

Wall time: 39.7 s


In [50]:
model

<lightfm.lightfm.LightFM at 0x21483964a90>

#### Metodos auxiliares
Metodo para obtener las recomendaciones en caso de ColdStart

In [51]:
def getColdStarRecomm(dataset, no_recom): 
    '''
    Esta funcion recibe el dataset y retonar una lista de tamaño no_recom con recomendaciones mas populares 
    '''
    recomm = dataset.groupby("movieId", as_index=False).agg({"userId":"nunique"}).sort_values(by="userId", ascending=False)
    recomm.columns=["movieId", "popularity"]
    return recomm.movieId.values[:no_recom]

In [52]:
def getColdStarRandomRecomm(dataset, no_recom):
    '''
    Esta funcion recibe el dataset y retonar una lista de tamaño no_recom con recomendaciones al azahar 
    '''
    recomm = dataset[[ 'movieId', 'userId']].sample(20, random_state=42)
    recomm.columns=["movieId", "popularity"]
    return recomm.movieId.values[:no_recom]

Metodo para obtener los scores a partir del modelo

In [53]:
def predict(user):
    '''
    Esta funcion recibe un usuario y retorna lista de scores
    '''
    user_predic = user_map[user]
    n_users, n_items = interactions_matrix.shape
    item_ids = np.arange(n_items)
    return model.predict(user_ids=user_predic, item_ids = item_ids)

Metodo para obtener las recomendaciones ordenadas

In [54]:
def getOrderedMoviesId(preds, no_recom):
    '''
    Esta funcion recibe los scores predecidos y retorna lista ordenada de moviesId
    '''
    recomm = pd.Series(preds)
    recomm.index = interactions_matrix.columns
    return list(pd.Series(recomm.sort_values(ascending=False).index))[:no_recom]

Metodo para eliminar de las recomendaciones las peliculas ya vistas

In [55]:
def getNotWatchedMovieId(user, recomm):
    '''
    Esta funcion recibe las recomendaciones y retorna lista de recomendaciones sin las peliculas vistas por el usuario
    '''
    watched = train[train.userId == user].movieId.unique()
    return [x for x in recomm if x not in watched][:20]

In [56]:
def recomm(user, no_recom, strategy):
    result = None
    if user in list(interactions_matrix.index):
        score = predict(user)
        recomm = getOrderedMoviesId(score, no_recom)
        result = getNotWatchedMovieId(user, recomm)
    else:
        if strategy == 1:
            result = getColdStarRecomm(train, 20)
        if strategy == 2:
            result = getColdStarRandomRecomm(train, 20)
    return result

### Recomendaciones
Generamos recomendaciones para todos los ususarios de validation

Primero usamos recomendaciones con coldstar de mas populares 

In [62]:
def recommAll_Validation(no_recom, strategy):
    recomms_dict = {
        'user_id': [],
        'recomms': []
    }
    for user in validation.userId.unique():
        recomms_dict['user_id'].append(user)
        recomms_dict['recomms'].append(recomm(user, no_recom, strategy))
    return pd.DataFrame(recomms_dict)


In [63]:
%%time
pd.options.display.max_colwidth = None
df_popular = recommAll_Validation(100, 1)#Populares
df_popular.head(3)

Recomendaciones con coldstar random

In [None]:
df_random = recommAll_Validation(100, 2)#Random
df_random.head(3)

> Como de los ## usuarios del dataset validation, ## son usuarios nuevos, les asigna las recomendaciones coldstart <span style="color:red">Actualizar con dataset grande</span>

#### Comparación

> Primero generamos la recomendaciones ideales del conjunto de validación.

In [None]:
interactions_validation = validation[["userId", "movieId", "rating"]].copy()
interactions_validation.head()

Unnamed: 0,userId,movieId,rating
232,2,318,3.0
233,2,333,4.0
234,2,1704,4.5
235,2,3578,4.0
236,2,6874,4.0


In [None]:
ideal_recomms = interactions_validation.sort_values(by=["userId", "rating"], ascending=False)\
                  .groupby("userId", as_index=False)\
                  .agg({"movieId": "unique"})
ideal_recomms.head(5)

Unnamed: 0,userId,movieId
0,2,"[60756, 80906, 89774, 106782, 122882, 131724, 1704, 58559, 68157, 80489, 333, 3578, 6874, 46970, 48516, 74458, 79132, 86345, 112552, 8798, 91529, 99114, 115713, 318, 71535, 77455, 109487, 91658, 114060]"
1,21,"[10, 1270, 2011, 2012, 7573, 260, 356, 648, 1196, 1210, 1544, 1580, 2947, 2948, 2949, 2989, 2990, 2991, 2993, 3633, 3635, 3638, 3639, 3984, 4489, 4963, 5445, 7569, 7570, 8529, 8984, 33004, 33493, 53121, 53322, 58998, 68954, 78637, 101864, 111759, 111781, 135887, 364, 480, 588, 597, 743, 1198, 1291, 2115, 2424, 2529, 2571, 2628, 2671, 2763, 2916, 3022, 3253, 3868, 3869, 4005, 4306, 4545, 4896, 5218, 5378, 5418, 5574, 5816, 6539, 6934, 6942, 7143, 8360, 8368, 8644, 8665, 8798, 8972, 33615, 40815, 47566, 49272, 53125, 54286, 59315, 59615, 69644, 69844, 72998, 77561, 78499, 79185, 82202, 85259, 88125, 89745, 91630, 94677, ...]"
2,29,"[1408, 5464, 6502, 111362, 4223, 5010, 104841, 111759]"
3,60,"[527, 858, 58559, 318, 362, 783, 805, 1242, 2150, 2739, 3386, 3424, 6016, 48, 50, 60, 455, 832, 1203, 1562, 2067, 2724]"
4,63,"[1, 50, 260, 296, 318, 344, 745, 1080, 1136, 1148, 1196, 1198, 1208, 1220, 1223, 1270, 1288, 2078, 2716, 2788, 2858, 2959, 3949, 5669, 6104, 7361, 8874, 32587, 33779, 38038, 48774, 51255, 57669, 58559, 77455, 77800, 79702, 86290, 89753, 89904, 91529, 92494, 92535, 97913, 98491, 102217, 106696, 108932, 111781, 115617, 134853, 608, 1210, 1965, 2542, 2692, 6016, 6350, 7980, 48516, 64285, 81845, 96610, 102445, 47, 165, 357, 364, 367, 527, 588, 589, 592, 648, 778, 858, 912, 913, 953, 1036, 1084, 1193, 1201, 1202, 1203, 1213, 1221, 1262, 1282, 1291, 1617, 1968, 2115, 2329, 2791, 3088, 3435, 3481, 3535, 3751, ...]"
...,...,...
64,573,"[858, 109487, 111362, 112852]"
65,581,"[318, 527, 3147, 4896, 5816, 5995, 7361, 356, 2324, 2762, 2959, 3949, 4022, 4226, 4306, 4886, 4993, 4995, 5349, 5952, 6377, 7153, 8368, 44191, 79132, 81845, 92259, 109487, 112552, 1704, 59315, 60069, 68954, 116797, 134130, 2571, 4973, 5989, 7147, 48394]"
66,582,"[2571, 79091, 79132, 81834, 88125, 89745, 92259, 49272, 58559, 69844, 76093, 91529, 96079, 99114, 102125, 104841, 109487, 260, 1196, 48516, 54001, 60069, 68157, 74458, 81229, 81564, 87232, 91500, 91630, 94864, 109374, 134130, 4993, 5618, 44191, 48780, 68954, 73321, 115617, 84954, 97752, 97913, 76251, 77561, 85414, 96610]"
67,598,"[5816, 7361, 46578, 54001, 56367, 79132, 101577, 103543, 113829, 114265, 124851, 130490, 4963, 5418, 63082, 1197, 110771, 4226, 4306, 8360, 593]"


Proximo paso sería hacer nuevos dataframe, en donde combinariamos lo recomendado con la lista ideal para poder hacer la comparación. Uno para Popula y otro random

In [None]:
merged_df_popular = pd.merge(df_popular, ideal_recomms, left_on='user_id', right_on='userId', how='inner')
merged_df_popular = merged_df_popular.rename(columns={'recomms': 'recomms_df', 'movieId': 'recomms_ideal'})
merged_df_popular = merged_df_popular.drop(columns=['userId'])
merged_df_popular.head()

In [None]:
merged_df_random = pd.merge(df_random, ideal_recomms, left_on='user_id', right_on='userId', how='inner')
merged_df_random = merged_df_random.rename(columns={'recomms': 'recomms_df', 'movieId': 'recomms_ideal'})
merged_df_random = merged_df_random.drop(columns=['userId'])
merged_df_random.head()

### MAP

Tenemos dos algoritmos de calculos de metrica

In [214]:
def map(dataframe):
  aps = []
  for pred, label in dataframe[["recomms_ideal", "recomms_df"]].values:
    n = len(pred)
    arange = np.arange(n, dtype=np.int32) + 1.
    rel_k = np.in1d(pred[:n], label)
    tp = np.ones(rel_k.sum(), dtype=np.int32).cumsum()
    denom = arange[rel_k]
    ap = (tp / denom).sum() / len(label)
    aps.append(ap)
  return aps

Algoritmo investigado

In [213]:
def calculate_map(recommendations, test):
    average_precisions = []
    for i in range(len(test)):
        actual = test[i]
        predicted = recommendations[i]
        relevant_indices = np.nonzero(actual)[0]
        if len(relevant_indices) == 0:
            continue
        precision_sum = 0.0
        num_hits = 0
        for j, item in enumerate(predicted):
            if item in relevant_indices:
                num_hits += 1
                precision_sum += num_hits / (j + 1)
        average_precision = precision_sum / len(relevant_indices)
        average_precisions.append(average_precision)
    return np.mean(average_precisions)

Calculamos para popular

In [215]:
MAP = np.mean(map(merged_df_popular))
print(f'mean average precision = {round(MAP, 5)}')

mean average precision = 0.08595


In [216]:
MAP_2 = calculate_map(merged_df_popular["recomms_df"], merged_df_popular["recomms_ideal"])
MAP_2

0.006479675813013669

Calculamos para random

In [None]:
MAP = np.mean(map(merged_df_popular))
print(f'mean average precision = {round(MAP, 5)}')

In [None]:
MAP_2 = calculate_map(merged_df_popular["recomms_df"], merged_df_popular["recomms_ideal"])
MAP_2

#### Metricas en test

In [217]:
def recommAll_test(no_recom):
    recomms_dict = {
        'user_id': [],
        'recomms': []
    }
    for user in test.userId.unique():
        recomms_dict['user_id'].append(user)
        recomms_dict['recomms'].append(recomm(user, no_recom,2))
    return pd.DataFrame(recomms_dict)

In [218]:
pd.options.display.max_colwidth = None
df_test = recommAll_test(100)
df_test

Unnamed: 0,user_id,recomms
0,10,"[31696, 1288, 6332, 1089, 1370, 1198, 1097, 4040, 4671, 6643, 1064, 5630, 2167, 586, 3052, 2712, 260, 276, 4322, 1272]"
1,15,"[296, 356, 318, 593, 110, 480, 150, 589, 592, 380, 457, 590, 780, 527, 1, 344, 588, 377, 32, 260]"
2,18,"[31696, 1288, 6332, 1089, 1370, 1198, 1097, 4040, 4671, 6643, 1064, 5630, 2167, 586, 3052, 2712, 260, 276, 4322, 1272]"
3,21,"[296, 356, 318, 593, 110, 480, 150, 589, 592, 380, 457, 590, 780, 527, 1, 344, 588, 260, 32, 377]"
4,24,"[31696, 1288, 6332, 1089, 1370, 1198, 1097, 4040, 4671, 6643, 1064, 5630, 2167, 586, 3052, 2712, 260, 276, 4322, 1272]"
...,...,...
115,586,"[31696, 1288, 6332, 1089, 1370, 1198, 1097, 4040, 4671, 6643, 1064, 5630, 2167, 586, 3052, 2712, 260, 276, 4322, 1272]"
116,596,"[31696, 1288, 6332, 1089, 1370, 1198, 1097, 4040, 4671, 6643, 1064, 5630, 2167, 586, 3052, 2712, 260, 276, 4322, 1272]"
117,599,"[31696, 1288, 6332, 1089, 1370, 1198, 1097, 4040, 4671, 6643, 1064, 5630, 2167, 586, 3052, 2712, 260, 276, 4322, 1272]"
118,601,"[31696, 1288, 6332, 1089, 1370, 1198, 1097, 4040, 4671, 6643, 1064, 5630, 2167, 586, 3052, 2712, 260, 276, 4322, 1272]"


In [219]:
interactions_test = test[["userId", "movieId", "rating"]].copy()
interactions_test.head()

Unnamed: 0,userId,movieId,rating
1119,10,296,1.0
1120,10,356,3.5
1121,10,588,4.0
1122,10,597,3.5
1123,10,912,4.0


In [220]:
ideal_recomms2 = interactions_test.sort_values(by=["userId", "rating"], ascending=False)\
                  .groupby("userId", as_index=False)\
                  .agg({"movieId": "unique"})
ideal_recomms2

Unnamed: 0,userId,movieId
0,10,"[7458, 8533, 8869, 33794, 49272, 49286, 71579, 79091, 81845, 91529, 92259, 96079, 136020, 140110, 4306, 4447, 7169, 31685, 51705, 58559, 63992, 69406, 94070, 106696, 113275, 588, 912, 1907, 3578, 4993, 4995, 5952, 6535, 6942, 7149, 7153, 7154, 7375, 40819, 68954, 88163, 95167, 95449, 103335, 103339, 104374, 109853, 112006, 113394, 137595, 356, 597, 1784, 2671, 4246, 5377, 6377, 7293, 7451, 8529, 8636, 8665, 8969, 30749, 54286, 56367, 58047, 63113, 66203, 72330, 72720, 72737, 80549, 81847, 82167, 84374, 87222, 95543, 106489, 129428, 1088, 1247, 1307, 3882, 5066, 5620, 5943, 5957, 6155, 6266, 7151, 8808, 33145, 33679, 40629, 47099, 51662, 56949, 60397, 69844, ...]"
1,15,"[260, 318, 356, 527, 589, 1196, 1200, 1210, 1214, 1270, 2011, 3147, 3156, 3578, 4720, 4995, 5989, 33493, 84152, 122886, 152077, 166528, 1653, 2329, 2916, 48304, 48780, 84954, 104841, 111759, 112556, 134853, 296, 858, 1198, 1240, 2012, 2571, 2858, 3499, 3949, 4370, 5445, 64614, 71057, 97938, 101864, 105504, 109487, 134130, 47, 780, 1265, 2028, 3535, 4022, 4886, 4993, 5952, 6502, 7254, 8644, 56174, 60069, 68954, 70286, 79132, 85414, 103249, 158872, 160980, 166635, 293, 364, 588, 1527, 2081, 2762, 3753, 3994, 4306, 5618, 6377, 6874, 7438, 48774, 50872, 63859, 68237, 72998, 91500, 94864, 96610, 143385, 152081, 1, 2959, 8360, 8961, 71264, ...]"
2,18,"[50, 318, 923, 1201, 1203, 1209, 1221, 16, 47, 110, 235, 293, 356, 527, 589, 593, 608, 778, 904, 1080, 1136, 1148, 1193, 1206, 1207, 1210, 1212, 1213, 1219, 1222, 1223, 1227, 1234, 1247, 1356, 1374, 1732, 2324, 2542, 2571, 2762, 2951, 2959, 3052, 3275, 3578, 3681, 3949, 4011, 4226, 4993, 4995, 5008, 5120, 5995, 6300, 6440, 6539, 6807, 7147, 7153, 7254, 7843, 27716, 27878, 33794, 44191, 44199, 44665, 48516, 48780, 51540, 52604, 55118, 55290, 55765, 64197, 67255, 68073, 71108, 71899, 73017, 73323, 74458, 74510, 76251, 79132, 81788, 84392, 109487, 112334, 112552, 112852, 115713, 116797, 134130, 142488, 157108, 157110, 177593, ...]"
3,21,"[47997, 2717, 33679, 117529, 119145, 122886, 122896, 122922, 136020, 143385, 152081, 164179, 167036, 296, 541, 780, 2617, 6155, 30793, 32296, 34048, 36519, 50872, 51662, 53996, 60397, 63082, 68791, 69122, 69526, 72378, 73321, 78469, 79293, 79592, 87520, 90249, 91535, 95167, 96588, 108190, 112138, 114180, 115149, 116823, 122900, 122904, 135133, 135536, 138036, 164909, 166492, 168248, 902, 2052, 2953, 4700, 5219, 5254, 7373, 32587, 34150, 41566, 72641, 93510, 110553, 126548, 130450, 136016, 168252, 1573, 3697, 142536, 58025, 97913, 148675, 6874, 7438, 38038, 143245, 149380, 2174, 5266, 61160, 108932, 160565, 1391, 160872, 173307]"
4,24,"[6, 318, 356, 593, 1198, 1265, 3147, 5064, 6350, 27773, 50, 296, 608, 1197, 1246, 1396, 1527, 1580, 1704, 1784, 2028, 2115, 2424, 2571, 2686, 3578, 4027, 4262, 4489, 4855, 5418, 5673, 5791, 7143, 35836, 38061, 44191, 46976, 51662, 54286, 58559, 58998, 64957, 68358, 70286, 72011, 79132, 86882, 91529, 94777, 119145, 132660, 134130, 134853, 32, 165, 253, 316, 457, 552, 780, 1220, 1370, 1663, 1682, 2273, 2421, 2617, 2916, 4299, 4701, 4973, 4995, 7293, 31685, 33679, 34437, 49272, 52973, 57368, 59615, 61024, 91542, 96079, 102407, 111759, 122886, 733, 1297, 1639, 1653, 2134, 5445, 5903, 8784, 31696, 45672, 47610, 61132, 63113, ...]"
...,...,...
115,586,"[110, 318, 589, 1198, 1200, 1374, 1580, 1704, 2011, 2353, 2490, 2571, 3175, 3578, 3753, 3793, 4886, 4993, 5952, 6333, 6539, 7153, 8368, 8665, 33615, 45431, 45499, 45517, 47610, 50872, 59315, 59369, 59784, 60069, 62999, 63859, 68954, 76093, 77561, 79091, 86298, 86880, 87222, 93272, 95167, 96861, 98243, 101142, 103141, 106489, 106696, 110102, 112852, 117851, 118696, 120635, 122886, 122896, 122906, 122918, 122920, 122922, 122926, 134853, 135133, 136556, 149406, 152081, 160438, 166461, 168252, 168418, 179819, 187595, 161, 260, 380, 457, 553, 588, 1073, 1196, 1210, 1265, 1270, 1376, 1610, 3114, 5459, 27619, 33493, 41566, 42738, 52287, 54001, 54286, 54648, 58559, 65682, 78499, ...]"
116,596,"[2288, 3000, 4878, 5971, 31658, 33649, 57669, 110102, 122882, 122906, 122916, 166528, 167746, 168252, 904, 1192, 1356, 1688, 1748, 4226, 4342, 4720, 5444, 5618, 6350, 7615, 8874, 38061, 51255, 60069, 70286, 76093, 107406, 111913, 122886, 134853, 135569, 138036, 143355, 171917, 1, 34, 39, 260, 364, 527, 541, 581, 593, 595, 616, 919, 924, 1028, 1035, 1036, 1046, 1097, 1136, 1210, 1214, 1270, 1374, 1375, 1376, 1704, 1907, 2081, 2393, 2407, 2571, 2687, 2692, 2762, 2959, 3052, 3094, 3213, 3275, 3535, 3786, 3793, 3967, 3996, 4306, 4366, 4973, 4993, 5903, 6333, 6377, 6502, 6934, 8961, 27246, 27611, 33794, 34405, 37729, 39183, ...]"
117,599,"[112, 260, 293, 296, 741, 750, 924, 951, 1089, 1178, 1196, 1200, 1208, 1210, 1214, 1274, 1283, 1732, 1967, 2395, 2427, 2571, 2858, 2959, 3030, 3160, 3435, 3703, 3949, 4973, 6711, 6874, 6, 215, 541, 589, 720, 1080, 1129, 1136, 1148, 1206, 1215, 1223, 1249, 1377, 1704, 1945, 2692, 2716, 3087, 3334, 3379, 3468, 3503, 3741, 4226, 4467, 4848, 5669, 6440, 7387, 7748, 27156, 68945, 80463, 96004, 170355, 21, 47, 318, 329, 431, 480, 745, 858, 861, 912, 923, 928, 930, 1036, 1050, 1095, 1097, 1183, 1204, 1213, 1221, 1225, 1230, 1232, 1236, 1237, 1242, 1244, 1246, 1265, 1282, 1291, ...]"
118,601,"[904, 1197, 1203, 2324, 5618, 31658, 48516, 50872, 59315, 60069, 68954, 76093, 134853, 166024, 170705, 908, 953, 1704, 2355, 2762, 3578, 5971, 33794, 48780, 67255, 70286, 74458, 80463, 81834, 88810, 89745, 91529, 106782, 115617, 152081, 172591, 177765, 1, 47, 912, 1136, 1193, 1198, 1207, 1527, 1721, 1917, 3000, 4306, 6016, 44191, 48394, 49272, 54286, 63082, 68157, 72378, 72998, 74946, 78499, 80549, 81845, 95167, 99114, 112852, 122918, 168326, 170697, 174055, 176371, 1584, 3114, 46578, 122916]"


In [221]:
merged_df_test = pd.merge(df, ideal_recomms2, left_on='user_id', right_on='userId', how='inner')
merged_df_test = merged_df_test.rename(columns={'recomms': 'recomms_df', 'movieId': 'recomms_ideal'})
merged_df_test = merged_df_test.drop(columns=['userId'])

merged_df_test.head()

Unnamed: 0,user_id,recomms_df,recomms_ideal
0,21,"[296, 356, 318, 593, 110, 480, 150, 589, 592, 380, 457, 590, 780, 527, 1, 344, 588, 260, 32, 377]","[47997, 2717, 33679, 117529, 119145, 122886, 122896, 122922, 136020, 143385, 152081, 164179, 167036, 296, 541, 780, 2617, 6155, 30793, 32296, 34048, 36519, 50872, 51662, 53996, 60397, 63082, 68791, 69122, 69526, 72378, 73321, 78469, 79293, 79592, 87520, 90249, 91535, 95167, 96588, 108190, 112138, 114180, 115149, 116823, 122900, 122904, 135133, 135536, 138036, 164909, 166492, 168248, 902, 2052, 2953, 4700, 5219, 5254, 7373, 32587, 34150, 41566, 72641, 93510, 110553, 126548, 130450, 136016, 168252, 1573, 3697, 142536, 58025, 97913, 148675, 6874, 7438, 38038, 143245, 149380, 2174, 5266, 61160, 108932, 160565, 1391, 160872, 173307]"
1,103,"[296, 356, 318, 593, 480, 110, 589, 780, 260, 150, 2571, 592, 1, 457, 380, 527, 590, 50, 47, 2858]","[1206, 1222, 2997, 3160, 64614, 81932, 148626, 48516, 56782, 86882, 923, 1945, 4677, 6873, 61323, 69140, 96728, 116799, 367, 2018, 2421, 4254, 7367, 50912, 106766, 168250, 122886]"
2,105,"[296, 356, 318, 593, 480, 110, 589, 780, 260, 150, 2571, 592, 1, 457, 380, 527, 590, 50, 47, 2858]","[2131, 4788, 5889, 37731, 57183, 80124, 104780, 116897, 120138, 120478, 129514, 130970, 134095, 136445, 136447, 138835, 140265, 141816, 142020, 143511, 145994, 147196, 147250, 147286, 147300, 147326, 147328, 147330, 150548, 151769, 159811, 159817, 163072, 163112, 163386, 163925, 165959, 166183, 170597, 170705, 170777, 171011, 171749, 172577, 172583, 172585, 172587, 172589, 172637, 172793, 172909, 173351, 173355, 173619, 173963, 174551, 175293, 175387, 175397, 175431, 179133, 56060, 86345, 86347, 92535, 98604, 104069, 107408, 120625, 127098, 133712, 133716, 134326, 140038, 141818, 141830, 152081, 163134, 163645, 172825, 173535, 175435, 935, 75341, 101088, 118888, 118896, 122904, 141810, 141820, 147282, 152591, 170411, 175401, 178613, 8533, 54997, 158402]"
3,112,"[296, 356, 318, 593, 480, 110, 589, 780, 260, 150, 2571, 592, 1, 457, 380, 527, 590, 50, 47, 2858]","[91529, 6, 16, 253, 508, 68157, 79132, 19, 62, 104, 141, 288, 292, 454, 1356, 1393, 3717, 68954, 72998, 99114, 551, 778, 70286, 25, 208, 329, 21, 185, 2, 95, 161, 223, 339, 788, 17, 36, 39, 300, 357, 434]"
4,119,"[296, 356, 318, 593, 480, 110, 589, 780, 260, 150, 2571, 592, 1, 457, 380, 527, 590, 50, 47, 2858]","[67255, 103984, 136598, 138210, 148626, 318, 7458, 56174, 89087, 112552, 122904, 131023, 135532, 141004, 142488, 157699, 162350, 165551, 168252, 48774, 50872, 108689, 110718, 110730, 114795, 117533, 129354, 134853, 139385, 139642, 166461, 7454, 8464, 8622, 51077, 64839, 136562, 140711]"


In [222]:
aps = []

for pred, label in merged_df_test[["recomms_ideal","recomms_df"]].values:
  n = len(pred)
  arange = np.arange(n, dtype=np.int32) + 1.
  rel_k = np.in1d(pred[:n], label)
  tp = np.ones(rel_k.sum(), dtype=np.int32).cumsum()
  denom = arange[rel_k]
  ap = (tp / denom).sum() / len(label)
  aps.append(ap)

In [223]:
MAP = np.mean(aps)
print(f'mean average precision = {round(MAP, 5)}')

mean average precision = 0.02005


In [224]:
MAP_2 = calculate_map(merged_df["recomms_df"], merged_df["recomms_ideal"])
MAP_2

0.006479675813013669