In [1]:
import pandas as pd

from src.algorithms.alternating_least_squares import AlternatingLeastSquares
from src.helpers.dataset_indexer import DatasetIndexer
from src.helpers.checkpoint_manager import CheckpointManager
from src.recommenders import CollaborativeFilteringRecommenderBuilder
from src.backends import Backend
from src.helpers._logging import logger  # noqa
from src.settings import settings
from src.utils import vocabulary_based_one_hot_encode, load_pickle, save_pickle

from src.helpers.graphing import (
    plot_als_train_test_loss_evolution,
    plot_als_train_test_rmse_evolution,
    # plot_error_evolution,
    plot_power_low_distribution,
    plot_data_item_distribution_as_hist,
)

2025-01-17 20:23:18,206 [DEBUG] matplotlib data path: /home/hjisaac/.cache/pypoetry/virtualenvs/recommender-system-uSlwvUxw-py3.10/lib/python3.10/site-packages/matplotlib/mpl-data
2025-01-17 20:23:18,214 [DEBUG] CONFIGDIR=/home/hjisaac/.config/matplotlib
2025-01-17 20:23:18,238 [DEBUG] interactive is False
2025-01-17 20:23:18,238 [DEBUG] platform is linux
2025-01-17 20:23:18,310 [DEBUG] CACHEDIR=/home/hjisaac/.cache/matplotlib
2025-01-17 20:23:18,323 [DEBUG] Using fontManager instance from /home/hjisaac/.cache/matplotlib/fontlist-v390.json


In [2]:
# Column names shared by the ratings and movies CSV files.
USER_HEADER = "userId"
ITEM_HEADER = "movieId"
RATING_HEADER = "rating"

# Column of the movie database that gets split and one-hot encoded.
FEATURE_TO_ENCODE = "genres"

# Vocabulary handed to the one-hot encoder. Keep the order stable: presumably
# each genre's position here is its index in the encoded vector.
ITEM_FEATURE_LIST = [
    "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
    "Horror", "IMAX", "Musical", "Mystery", "Romance",
    "Sci-Fi", "Thriller", "War", "Western",
]  # fmt: skip

In [3]:
# Index the ratings CSV using the column names defined above. `limit` is taken
# from settings.general.LINES_COUNT_TO_READ (presumably a cap on the number of
# lines read — confirm in DatasetIndexer).
dataset_indexer = DatasetIndexer(
    file_path="./ml-32m/ratings.csv",
    user_header=USER_HEADER,
    item_header=ITEM_HEADER,
    rating_header=RATING_HEADER,
    limit=settings.general.LINES_COUNT_TO_READ,
)

# NOTE(review): presumably splits the indexed ratings into train/test parts
# using roughly this ratio — confirm against DatasetIndexer.index_simple.
indexed_data = dataset_indexer.index_simple(
    approximate_train_ratio=settings.general.APPROXIMATE_TRAIN_RATIO
)

2025-01-11 05:33:51,924 [INFO] Successfully indexed 1000 lines from ./ml-32m/ratings.csv


In [4]:

# Build the movie database: movies.csv left-joined with links.csv on the movie
# id, then turned into a dictionary keyed by movie id. The backend needs this
# database to query the movies.
item_database = (
    # Movie ids are read as strings, so the resulting dictionary keys are strings.
    pd.read_csv("./ml-32m/movies.csv", dtype={ITEM_HEADER: str})
    .merge(
        pd.read_csv("./ml-32m/links.csv", dtype={ITEM_HEADER: str}),
        on=ITEM_HEADER,
        how="left",  # keep every movie, even one without a row in links.csv
    )
    .assign(
        # Replace the pipe-separated string with a list of values. The column
        # name is taken from FEATURE_TO_ENCODE (instead of a hard-coded
        # `genres=` keyword) so that the column written here is always the one
        # read by the encoding step below.
        **{
            FEATURE_TO_ENCODE: lambda df: df[FEATURE_TO_ENCODE].apply(
                lambda raw_value: raw_value.split("|")
            )
        }
    )
    .assign(
        # One-hot encode the split values against the fixed vocabulary.
        features_hot_encoded=lambda df: df[FEATURE_TO_ENCODE].apply(
            lambda words: vocabulary_based_one_hot_encode(
                words=words, vocabulary=ITEM_FEATURE_LIST
            )
        ),
        # Sum of each encoded vector, computed element by element.
        features_count=lambda df: df["features_hot_encoded"].apply(sum),
    )
    .set_index(ITEM_HEADER)  # Set the movieId as the index
    .to_dict(orient="index")  # {movie id: {column name: value}}
)

In [5]:
# plot_data_item_distribution_as_hist(indexed_data)

In [6]:
# plot_power_low_distribution(indexed_data,)

In [7]:
# The algorithm itself; every hyper-parameter is read from the project settings.
als_instance = AlternatingLeastSquares(
    hyper_lambda=settings.als.HYPER_LAMBDA,
    hyper_gamma=settings.als.HYPER_GAMMA,
    hyper_tau=settings.als.HYPER_TAU,
    hyper_n_epochs=settings.als.HYPER_N_EPOCH,
    hyper_n_factors=settings.als.HYPER_N_FACTOR,
)

# Checkpoints live under the configured folder, in a sub-folder named after
# the number of dataset lines read.
als_checkpoint_manager = CheckpointManager(
    checkpoint_folder=settings.als.CHECKPOINT_FOLDER,
    sub_folder=str(settings.general.LINES_COUNT_TO_READ),
)

als_backend = Backend(
    algorithm=als_instance,
    checkpoint_manager=als_checkpoint_manager,
    # The predictor needs the database to render the names of the items.
    item_database=item_database,
    # Do not resume from the last algorithm state stored in the checkpoint
    # manager's folder.
    resume=False,
    save_checkpoint=True,
)

In [8]:
# Wrap the ALS backend in a collaborative-filtering recommender builder.
recommender_builder = CollaborativeFilteringRecommenderBuilder(
    backend=als_backend,
)

# Build the recommender from the indexed ratings and the movie database, with
# item features included. This may take a while to finish.
recommender = recommender_builder.build(
    data=indexed_data, item_database=item_database, include_features=True
)

2025-01-11 05:33:53,140 [INFO] Starting the build of the recommender using AlternatingLeastSquares with the state {'hyper_lambda': 0.1, 'hyper_tau': 0.1, 'hyper_gamma': 0.1, 'hyper_n_epochs': 2, 'hyper_n_factors': 10, 'user_factors': None, 'item_factors': None, 'user_biases': None, 'item_biases': None, 'feature_factors': None, 'loss_train': [], 'loss_test': [], 'rmse_train': [], 'rmse_test': []}
2025-01-11 05:33:53,142 [INFO] Starting a model fitting using the backend AlternatingLeastSquares...
2025-01-11 05:33:53,143 [INFO] Initializing user and item's factors and biases, as none of them is provided.
2025-01-11 05:33:53,145 [INFO] About to start training with the `include_features` parameter set to True.
2025-01-11 05:33:53,147 [INFO] Epochs count to train for 2, entering the training loop now...


Epochs:  50%|█████     | 1/2 [00:00<00:00,  2.85epoch/s]

Epoch 1/2 Loss (Train/Test) : -175.4973 / -181.6884, RMSE (Train/Test) : 0.4574 / 1.2136


Epochs: 100%|██████████| 2/2 [00:00<00:00,  3.44epoch/s]

Epoch 2/2 Loss (Train/Test) : -41.2564 / -51.5987, RMSE (Train/Test) : 0.2485 / 1.1379
2025-01-11 05:33:53,736 [INFO] Successfully run AlternatingLeastSquares algorithm running till the end
2025-01-11 05:33:53,737 [DEBUG] Cleaning the AlternatingLeastSquares algorithm self maintained cache, and exiting...
2025-01-11 05:33:53,739 [INFO] Successfully built the recommender using AlternatingLeastSquares





In [9]:
# plot_als_train_test_rmse_evolution(als_backend.algorithm)

In [10]:
# plot_als_train_test_loss_evolution(als_backend.algorithm)

In [11]:
# Recommend from a single (item id, rating) pair. Item ids are strings,
# matching the string keys of `item_database`.
prediction_input = [("17", 4)]
recommender.recommend(prediction_input)

[{'title': 'Braveheart (1995)',
  'genres': ['Action', 'Drama', 'War'],
  'features_hot_encoded': array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
  'features_count': 3},
 {'title': 'Patton (1970)',
  'genres': ['Drama', 'War'],
  'features_hot_encoded': array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
  'features_count': 2},
 {'title': 'Shawshank Redemption, The (1994)',
  'genres': ['Crime', 'Drama'],
  'features_hot_encoded': array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  'features_count': 2},
 {'title': 'Misérables, Les (1998)',
  'genres': ['Crime', 'Drama', 'Romance', 'War'],
  'features_hot_encoded': array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0]),
  'features_count': 4},
 {'title': "The Emperor's Club (2002)",
  'genres': ['Drama'],
  'features_hot_encoded': array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  'features_count': 1},
 {'title': "Mr. Holland's Opus (1995)",
  'genres': ['Drama']

In [12]:
# Same call with an item id that may be absent from the indexed sample.
# NOTE(review): the recorded run logged this id as unknown and skipped it.
prediction_input = [("267654", 4)]  # Harry Potter
recommender.recommend(prediction_input)

2025-01-11 05:33:54,133 [ERROR] The provided user ratings data contains the following unknown item rating(s), skipping unknown items' ratings [('267654', 4)]


[{'title': 'Star Wars: Episode V - The Empire Strikes Back (1980)',
  'genres': ['Action', 'Adventure', 'Sci-Fi'],
  'features_hot_encoded': array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
  'features_count': 3},
 {'title': 'Patton (1970)',
  'genres': ['Drama', 'War'],
  'features_hot_encoded': array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
  'features_count': 2},
 {'title': 'Lion King, The (1994)',
  'genres': ['Adventure', 'Animation', 'Children', 'Drama', 'Musical', 'IMAX'],
  'features_hot_encoded': array([0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]),
  'features_count': 6},
 {'title': 'Serenity (2005)',
  'genres': ['Action', 'Adventure', 'Sci-Fi'],
  'features_hot_encoded': array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
  'features_count': 3},
 {'title': 'Ice Age (2002)',
  'genres': ['Adventure', 'Animation', 'Children', 'Comedy'],
  'features_hot_encoded': array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [13]:
# Recommend without passing any rating.
# NOTE(review): presumably a fallback for a user with no history — confirm in
# the recommender implementation.
recommender.recommend()

[{'title': 'Star Wars: Episode V - The Empire Strikes Back (1980)',
  'genres': ['Action', 'Adventure', 'Sci-Fi'],
  'features_hot_encoded': array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
  'features_count': 3},
 {'title': 'Patton (1970)',
  'genres': ['Drama', 'War'],
  'features_hot_encoded': array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
  'features_count': 2},
 {'title': 'Lion King, The (1994)',
  'genres': ['Adventure', 'Animation', 'Children', 'Drama', 'Musical', 'IMAX'],
  'features_hot_encoded': array([0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]),
  'features_count': 6},
 {'title': 'Serenity (2005)',
  'genres': ['Action', 'Adventure', 'Sci-Fi'],
  'features_hot_encoded': array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
  'features_count': 3},
 {'title': 'Ice Age (2002)',
  'genres': ['Adventure', 'Animation', 'Children', 'Comedy'],
  'features_hot_encoded': array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0