# Modelos Baseline

En este notebook se implementarán los modelos baseline del proyecto y se guardarán sus métricas, calculadas sobre el mismo dataset que se utilizará para el modelo principal, para poder hacer benchmarking.

## Lectura de los datos

In [1]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import pyreclab
from sklearn.model_selection import train_test_split

from utils.recomender_metrics import prepare_ground_truth, evaluate_recommendations, print_evaluation_results

In [2]:
# Base directory (relative to this notebook) holding the processed CSV files
# read below; the trailing slash is required because paths are built by
# plain string interpolation.
data_folder = '../data/processed/'

In [3]:
# Load the train / test rating files.
train_df = pd.read_csv(f'{data_folder}processed_train.csv', encoding='latin-1', header=0)
test_df = pd.read_csv(f'{data_folder}processed_test.csv', encoding='latin-1', header=0)

# Convert ratings once, to pandas' nullable integer dtype. Values that cannot
# be parsed become <NA> instead of raising; a plain `astype(int)` beforehand
# would fail on exactly those values and make the coercion pointless.
train_df['rating'] = pd.to_numeric(train_df['rating'], errors='coerce').astype('Int64')
test_df['rating'] = pd.to_numeric(test_df['rating'], errors='coerce').astype('Int64')

print("Training Data Head:")
print(train_df.head())

Training Data Head:
   movieId  item_id  imdbId  tmdbId  user_id  rating  timestamp
0        1        1  114709   862.0        1       5  874965758
1        1        1  114709   862.0        2       4  888550871
2        1        1  114709   862.0        6       4  883599478
3        1        1  114709   862.0       13       3  882140487
4        1        1  114709   862.0       16       5  877717833


In [4]:
# Movie metadata: ids, title, release date and one indicator column per genre.
movies_path = f'{data_folder}processed_movies.csv'
movies_df = pd.read_csv(movies_path, header=0, encoding='latin-1')

# Quick look at the first rows to confirm the file was parsed as expected.
movies_preview = movies_df.head()
print("\nMovie Data Head:")
print(movies_preview)


Movie Data Head:
   movieId                               title  item_id release_date  unknown  \
0        1                    Toy Story (1995)        1  01-Jan-1995        0   
1        2                      Jumanji (1995)      755  01-Jan-1995        0   
2        3             Grumpier Old Men (1995)     1028  01-Jan-1995        0   
3        4            Waiting to Exhale (1995)     1311  15-Jan-1996        0   
4        5  Father of the Bride Part II (1995)      756  01-Jan-1995        0   

   Action  Adventure  Animation  Children's  Comedy  ...  Horror  Musical  \
0       0          0          1           1       1  ...       0        0   
1       1          1          0           1       0  ...       0        0   
2       0          0          0           0       1  ...       0        0   
3       0          0          0           0       1  ...       0        0   
4       0          0          0           0       1  ...       0        0   

   Mystery  Romance  Sci-Fi  Thr

In [None]:
# User table: demographic columns plus the per-user `mood_*` columns.
users_path = f'{data_folder}users_with_moods.csv'
users_df = pd.read_csv(users_path, header=0, encoding='latin-1')

# Quick look at the first rows to confirm the file was parsed as expected.
users_preview = users_df.head()
print("\nUser Data Head:")
print(users_preview)


User Data Head:
   user_id  age gender  occupation zip_code  mood_dark  mood_emotional  \
0        1   24      M  technician    85711   0.034632        0.212121   
1        2   53      F       other    94043   0.014706        0.308824   
2        3   23      M      writer    32067   0.000000        0.288462   
3        4   24      M  technician    43537   0.041667        0.125000   
4        5   33      F       other    15213   0.028986        0.123188   

   mood_exciting  mood_family-friendly  mood_intense  mood_lighthearted  \
0       0.121212              0.073593      0.099567           0.259740   
1       0.102941              0.044118      0.147059           0.161765   
2       0.192308              0.019231      0.269231           0.153846   
3       0.375000              0.000000      0.375000           0.083333   
4       0.181159              0.123188      0.137681           0.275362   

   mood_neutral  mood_relaxing  mood_romantic  mood_suspenseful  \
0      0.012987     

## Estructuras para métricas

Para calcular las métricas necesitamos algunas estructuras auxiliares: por ejemplo, el `ground truth` de cada usuario, la popularidad de los ítems y los géneros de cada película.

In [6]:
# Genre indicator columns of `movies_df` (a value of 1 means the movie has that genre).
i_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

# Older binary ground truth: user_id -> set of test item_ids rated above 2.
# NOTE(review): this uses `> 2` while `ground_truth` below uses `>= 2`;
# confirm which relevance threshold is the intended one.
ground_truth_old = test_df[test_df['rating'] > 2].groupby('user_id')['item_id'].apply(set).to_dict()

# Graded ground truth: str(user_id) -> {str(item_id): rating} for test ratings >= 2.
# The value columns are selected explicitly so `apply` does not operate on the
# grouping column, which recent pandas versions deprecate (with a warning).
ground_truth = (test_df[test_df['rating'] >= 2]
                        .groupby('user_id')[['item_id', 'rating']]
                        .apply(lambda x: dict(zip(x['item_id'].astype(str), x['rating'])))
                        .to_dict())
ground_truth = {str(k): v for k, v in ground_truth.items()}

# Cut-off points at which every metric is reported.
k_values = [10, 20, 50]

# Popularity is the SUM of training ratings (not the count), so that
# better-rated items weigh more.
item_popularity = train_df.groupby('item_id')['rating'].sum().to_dict()

# item_id -> set of genre names whose indicator equals 1. Built from column
# arrays rather than `iterrows`, which creates a Series object per row.
item_features = {
    item_id: {genre for genre, flag in zip(i_cols, flags) if flag == 1}
    for item_id, flags in zip(movies_df['item_id'], movies_df[i_cols].to_numpy())
}

# Full catalogue of item ids.
all_items = set(movies_df['item_id'])

  .apply(lambda x: dict(zip(x['item_id'].astype(str), x['rating'])))


## Random model

Generaremos aleatoriamente, para cada usuario, una lista de hasta 50 recomendaciones, de modo que podamos evaluar las métricas en distintos puntos de corte (k = 10, 20 y 50).

In [7]:
# Users we will generate recommendations for: everyone present in the training data.
users_in_train = train_df['user_id'].unique().tolist()

# Items each user has already interacted with, taken from the TRAINING data only.
train_items_by_user = train_df.groupby('user_id')['item_id']
user_seen_items = train_items_by_user.apply(set).to_dict()

# Unique item ids of the movie catalogue.
all_movie_ids = movies_df['item_id'].unique().tolist()

In [8]:
# Sanity check of the structures built above: catalogue size, number of
# training users, and the training items recorded for user 1 (raises KeyError
# if user 1 is not present in the training data).
print(f"Total de películas: {len(all_movie_ids)}")
print(f"Total de usuarios en el set de entrenamiento: {len(users_in_train)}")
print(f"Películas vistas por el usuario 1: {user_seen_items[1]}") 

Total de películas: 1195
Total de usuarios en el set de entrenamiento: 943
Películas vistas por el usuario 1: {1, 2, 3, 4, 5, 8, 9, 13, 15, 16, 21, 22, 25, 26, 28, 29, 32, 34, 35, 37, 38, 40, 41, 42, 43, 46, 48, 52, 57, 58, 63, 66, 68, 71, 77, 79, 83, 87, 88, 89, 93, 94, 95, 99, 101, 105, 106, 109, 110, 111, 116, 119, 122, 123, 124, 126, 127, 131, 133, 135, 136, 137, 138, 139, 141, 142, 144, 147, 149, 152, 153, 156, 158, 162, 165, 166, 167, 173, 176, 178, 179, 187, 191, 192, 194, 195, 197, 199, 203, 204, 205, 207, 216, 220, 223, 231, 234, 237, 238, 239, 244, 245, 246, 247, 249, 261, 263, 268, 269, 270, 271}


In [9]:
# Number of rating rows in the train and test frames, respectively.
print(len(train_df), len(test_df))

70151 17611


In [10]:
def generate_random_recommendations(users_to_recommend, all_movie_ids, user_seen_items, max_recommendations=50, random_state=42):
    """
    Build a random recommendation list for each user, skipping seen items.

    Parameters
    ----------
    users_to_recommend : iterable
        User ids to generate recommendations for.
    all_movie_ids : iterable
        Item ids that make up the candidate catalogue.
    user_seen_items : dict
        Maps a user id to the set of item ids to exclude for that user.
        Users missing from the mapping are treated as having seen nothing.
    max_recommendations : int, default 50
        Upper bound on the number of items returned per user.
    random_state : int or None, default 42
        Seed of the sampler.

    Returns
    -------
    dict
        Maps each user id to a list of sampled item ids without repetition.
        The list is empty when the user has no unseen candidates left.
    """
    # Private generator: a RandomState seeded with `random_state` produces the
    # same stream as `np.random.seed(random_state)`, but leaves the caller's
    # global NumPy random state untouched.
    rng = np.random.RandomState(random_state)

    # Loop-invariant: build the catalogue set once instead of once per user.
    catalog = set(all_movie_ids)

    recommendations = {}
    for user_id in users_to_recommend:
        # Candidates are every catalogue item the user has not seen yet.
        seen_items = user_seen_items.get(user_id, set())
        candidate_items = list(catalog - seen_items)

        # Never ask for more items than there are candidates.
        n_recommendations = min(max_recommendations, len(candidate_items))

        if n_recommendations > 0:
            recommendations[user_id] = rng.choice(
                candidate_items, size=n_recommendations, replace=False
            ).tolist()
        else:
            # Unlikely case: the user has already seen the whole catalogue.
            recommendations[user_id] = []

    return recommendations

In [11]:
# Random baseline: up to 50 unseen items per training user, with a fixed seed.
random_recs = generate_random_recommendations(
    users_in_train,
    all_movie_ids,
    user_seen_items,
    max_recommendations=50,
    random_state=42,
)

In [12]:
# Inputs that do not depend on the model being evaluated.
evaluation_inputs = dict(
    ground_truth=ground_truth,
    k_values=k_values,
    item_popularity=item_popularity,
    all_items=all_items,
    item_features=item_features,
)

# Score the random baseline and print the metric report.
results = evaluate_recommendations(recommendations=random_recs, **evaluation_inputs)
print_evaluation_results(results)


RECOMMENDATION EVALUATION RESULTS

CATALOG COVERAGE:
  @10: 0.9992
  @20: 1.0000
  @50: 1.0000

F1:
  @10: 0.0467
  @20: 0.0472
  @50: 0.0504

INTRA LIST SIMILARITY:
  @10: 0.1671
  @20: 0.1691
  @50: 0.1680

MAP:
  @10: 0.0860
  @20: 0.0861
  @50: 0.0773

MRR:
  @10: 0.0935
  @20: 0.1023
  @50: 0.1095

NDCG:
  @10: 0.0371
  @20: 0.0365
  @50: 0.0440

NOVELTY:
  @10: 11.4614
  @20: 11.4797
  @50: 11.4904

PRECISION:
  @10: 0.0355
  @20: 0.0330
  @50: 0.0335

RECALL:
  @10: 0.0090
  @20: 0.0172
  @50: 0.0459



## Most popular items

En este caso, las recomendaciones serán las mismas para todos los usuarios: recomendaremos las 50 películas más populares.

In [13]:
# Rank every item by its popularity score, highest first, and keep the top 50.
ranked_items = sorted(item_popularity, key=lambda item: item_popularity[item], reverse=True)
most_popular_items = ranked_items[:50]

# Every training user receives the very same list (one shared list object).
pop_recs = dict.fromkeys(users_in_train, most_popular_items)

In [14]:
# Inputs that do not depend on the model being evaluated.
evaluation_inputs = dict(
    ground_truth=ground_truth,
    k_values=k_values,
    item_popularity=item_popularity,
    all_items=all_items,
    item_features=item_features,
)

# Score the most-popular baseline and print the metric report.
results = evaluate_recommendations(recommendations=pop_recs, **evaluation_inputs)
print_evaluation_results(results)


RECOMMENDATION EVALUATION RESULTS

CATALOG COVERAGE:
  @10: 0.0084
  @20: 0.0167
  @50: 0.0418

F1:
  @10: 0.1162
  @20: 0.1291
  @50: 0.1526

INTRA LIST SIMILARITY:
  @10: 0.1663
  @20: 0.1748
  @50: 0.1717

MAP:
  @10: 0.3470
  @20: 0.3099
  @50: 0.2492

MRR:
  @10: 0.4030
  @20: 0.4093
  @50: 0.4112

NDCG:
  @10: 0.1986
  @20: 0.1958
  @50: 0.2249

NOVELTY:
  @10: 7.4730
  @20: 7.6388
  @50: 7.9237

PRECISION:
  @10: 0.1845
  @20: 0.1610
  @50: 0.1376

RECALL:
  @10: 0.0801
  @20: 0.1263
  @50: 0.2447



## SVD

En concreto, implementaremos una FunkSVD

In [15]:
# Export the ratings as plain (user_id, item_id, rating) CSV files so they can
# be read by pyreclab's SVD object.
pyreclab_dir = f'{data_folder}pyreclab_format/'

# `to_csv` does not create missing directories, so make sure the target exists
# (a no-op when it is already there).
os.makedirs(pyreclab_dir, exist_ok=True)

pyreclab_test = test_df[['user_id', 'item_id', 'rating']].drop_duplicates()
pyreclab_train = train_df[['user_id', 'item_id', 'rating']].drop_duplicates()

pyreclab_test.to_csv(f'{pyreclab_dir}test.csv', index=False)
pyreclab_train.to_csv(f'{pyreclab_dir}train.csv', index=False)

In [16]:
# Layout of the exported training CSV: comma separated, with a header row and
# the user / item / rating values in columns 0 / 1 / 2.
train_csv_format = dict(dlmchar=b',', header=True, usercol=0, itemcol=1, ratingcol=2)

# Build the SVD model on top of the exported training file.
svd = pyreclab.SVD(dataset=f'{data_folder}pyreclab_format/train.csv', **train_csv_format)

# Fit the model with the chosen hyper-parameters.
svd.train(factors=100, maxiter=100, lr=0.01, lamb=0.1)



In [17]:
# Layout of the exported test CSV (same as the training file).
test_csv_format = dict(dlmchar=b',', header=True, usercol=0, itemcol=1, ratingcol=2)

# Rating-prediction evaluation on the test file; the call returns the
# prediction list together with the MAE and RMSE values printed below.
predlist, mae, rmse = svd.test(input_file=f'{data_folder}pyreclab_format/test.csv', **test_csv_format)

print('MAE: {}\nRMSE: {}'.format(mae, rmse))

MAE: 0.7294387105186461
RMSE: 0.9231182101385945




In [18]:
# Length of the recommendation list requested for every user.
top_n = 50

# Options for the top-N recommendation test.
# NOTE(review): `relevance_threshold=2` is presumably the rating cut-off for an
# item to count as relevant, and `includeRated=False` presumably skips items
# the user rated in training — confirm both against the pyreclab documentation.
testrec_options = dict(
    dlmchar=b',',
    header=True,
    usercol=0,
    itemcol=1,
    ratingcol=2,
    topn=top_n,
    relevance_threshold=2,
    includeRated=False,
)

recommendList, maprec, ndcg = svd.testrec(input_file=f'{data_folder}pyreclab_format/test.csv', **testrec_options)

print('MAP: {}\nNDCG@{}: {}'.format(maprec, top_n, ndcg))

MAP: 0.1604530144034825
NDCG@50: 0.1251875103473805


In [19]:
# Cast the user and item ids returned by pyreclab to int, matching the integer
# ids used by the other baselines' recommendation dictionaries.
recommendList_int = {}
for raw_user_id, raw_items in recommendList.items():
    recommendList_int[int(raw_user_id)] = list(map(int, raw_items))

In [20]:
# Inputs that do not depend on the model being evaluated.
evaluation_inputs = dict(
    ground_truth=ground_truth,
    k_values=k_values,
    item_popularity=item_popularity,
    all_items=all_items,
    item_features=item_features,
)

# Score the SVD recommendations and print the metric report.
results = evaluate_recommendations(recommendations=recommendList_int, **evaluation_inputs)
print_evaluation_results(results)


RECOMMENDATION EVALUATION RESULTS

CATALOG COVERAGE:
  @10: 0.2117
  @20: 0.3063
  @50: 0.4519

F1:
  @10: 0.0777
  @20: 0.0985
  @50: 0.1125

INTRA LIST SIMILARITY:
  @10: 0.2555
  @20: 0.2313
  @50: 0.2094

MAP:
  @10: 0.1995
  @20: 0.1843
  @50: 0.1605

MRR:
  @10: 0.2156
  @20: 0.2252
  @50: 0.2302

NDCG:
  @10: 0.1070
  @20: 0.1125
  @50: 0.1252

NOVELTY:
  @10: 10.1562
  @20: 10.0792
  @50: 10.1439

PRECISION:
  @10: 0.1035
  @20: 0.1050
  @50: 0.0918

RECALL:
  @10: 0.0286
  @20: 0.0582
  @50: 0.1274

