In [12]:
import pandas as pd

from src.algorithms.alternating_least_squares import AlternatingLeastSquares
from src.helpers.dataset_indexer import DatasetIndexer
from src.helpers.checkpoint_manager import CheckpointManager
from src.recommenders import CollaborativeFilteringRecommenderBuilder
from src.backends import Backend
from src.helpers._logging import logger  # noqa
from src.settings import settings
from src.utils import vocabulary_based_one_hot_encode

from src.helpers.graphing import (
    plot_als_train_test_loss_evolution,
    plot_als_train_test_rmse_evolution,
    # plot_error_evolution,
    plot_power_low_distribution,
    plot_data_item_distribution_as_hist,
)

In [2]:
# Column names used when reading the CSV files under ./ml-32m.
USER_HEADER = "userId"
ITEM_HEADER = "movieId"
RATING_HEADER = "rating"

# Column of the movies file whose values are one-hot encoded.
FEATURE_TO_ENCODE = "genres"

# Genre vocabulary used for the one-hot encoding. The order is significant:
# in the recommendation outputs, a genre's position in this list matches the
# index that is set in the item's feature vector (e.g. "Comedy" -> index 4).
ITEM_FEATURE_LIST = [
    "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
    "Horror", "IMAX", "Musical", "Mystery", "Romance",
    "Sci-Fi", "Thriller", "War", "Western",
]

In [3]:
# Shared "general" settings: how many lines to read and the train/test ratio.
general_settings = settings.general

# Indexer over the raw ratings file. `limit` is fed from LINES_COUNT_TO_READ,
# so presumably only that many lines are read (TODO confirm in DatasetIndexer).
dataset_indexer = DatasetIndexer(
    file_path="./ml-32m/ratings.csv",
    limit=general_settings.LINES_COUNT_TO_READ,
    rating_header=RATING_HEADER,
    item_header=ITEM_HEADER,
    user_header=USER_HEADER,
)

# Index the ratings. The train ratio is only approximate, as the parameter
# name states; the resulting object is what the recommender builder consumes.
indexed_data = dataset_indexer.index_simple(
    approximate_train_ratio=general_settings.APPROXIMATE_TRAIN_RATIO,
)

2025-01-11 00:34:37,165 [INFO] Successfully indexed 1000000 lines from ./ml-32m/ratings.csv


In [6]:
# Load the movies CSV file that acts as our item (movie) database.
# The backend needs this database to look up the movies it recommends.
item_database = (
    # Item ids are read as strings, so lookups must use string ids as well.
    pd.read_csv("./ml-32m/movies.csv", dtype={ITEM_HEADER: str})
    .assign(
        # Replace the raw pipe-separated value with a list of labels. The
        # target column is keyed by FEATURE_TO_ENCODE instead of a hard-coded
        # `genres=` keyword, so the column written here is always the one the
        # encoding step below reads from.
        **{
            FEATURE_TO_ENCODE: lambda df: df[FEATURE_TO_ENCODE].apply(
                lambda raw_value: raw_value.split("|")
            )
        },
        # One-hot encode each label list against the fixed vocabulary.
        features_hot_encoded=lambda df: df[FEATURE_TO_ENCODE].apply(
            lambda labels: vocabulary_based_one_hot_encode(
                words=labels, vocabulary=ITEM_FEATURE_LIST
            )
        ),
        # Sum of the encoded vector for each item.
        features_count=lambda df: df["features_hot_encoded"].apply(
            lambda encoded: sum(encoded)
        ),
    )
    .set_index(ITEM_HEADER)  # Use the item id as the index
    .to_dict(orient="index")  # {item_id: {column: value, ...}}
)

In [23]:
# plot_data_item_distribution_as_hist(indexed_data)

In [7]:
# plot_power_low_distribution(indexed_data,)

In [8]:
# The ALS algorithm instance; every hyper-parameter comes from the settings.
als_instance = AlternatingLeastSquares(
    hyper_n_factors=settings.als.HYPER_N_FACTOR,
    hyper_n_epochs=settings.als.HYPER_N_EPOCH,
    hyper_tau=settings.als.HYPER_TAU,
    hyper_gamma=settings.als.HYPER_GAMMA,
    hyper_lambda=settings.als.HYPER_LAMBDA,
)

# Checkpoints are grouped in a sub-folder named after the number of lines read.
checkpoint_manager = CheckpointManager(
    sub_folder=str(settings.general.LINES_COUNT_TO_READ),
    checkpoint_folder=settings.als.CHECKPOINT_FOLDER,
)

als_backend = Backend(
    # The algorithm the backend runs.
    algorithm=als_instance,
    checkpoint_manager=checkpoint_manager,
    # The predictor needs the item database to render the names of the items.
    item_database=item_database,
    # Resume from the last algorithm state found in the checkpoint manager's
    # folder instead of starting from scratch.
    resume=True,
    # NOTE(review): presumably disables writing new checkpoints - confirm.
    save_checkpoint=False,
)

In [9]:
# Wrap the ALS backend in a collaborative-filtering recommender builder.
recommender_builder = CollaborativeFilteringRecommenderBuilder(backend=als_backend)

# Building the recommender may take a while.
# It requires `indexed_data` and `item_database` from the cells above, so
# those cells must have been run first in the current kernel.
recommender = recommender_builder.build(
    include_features=True,
    item_database=item_database,
    data=indexed_data,
)

NameError: name 'indexed_data' is not defined

In [10]:
# plot_als_train_test_rmse_evolution(als_backend.algorithm)

In [11]:
# plot_als_train_test_loss_evolution(als_backend.algorithm)

In [12]:
# Ask for recommendations based on a single rated item.
# NOTE(review): each tuple looks like (item id as a string, rating) - confirm
# against the recommender API.
prediction_input = [("17", 4)]
recommender.recommend(prediction_input)

Hé predictio iter
Hé predictio iter
Hé predictio iter
Hé predictio iter


[{'title': 'Aziz Ansari: Live at Madison Square Garden (2015)',
  'genres': ['Comedy'],
  'feature_vector': array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])},
 {'title': 'Zeitgeist: Moving Forward (2011)',
  'genres': ['Documentary'],
  'feature_vector': array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])},
 {'title': 'Legend, The (Legend of Fong Sai-Yuk, The) (Fong Sai Yuk) (1993)',
  'genres': ['Action', 'Comedy'],
  'feature_vector': array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])},
 {'title': 'Woman of the Year (1942)',
  'genres': ['Comedy', 'Romance'],
  'feature_vector': array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])},
 {'title': 'Zeitgeist: The Movie (2007)',
  'genres': ['Documentary', 'War'],
  'feature_vector': array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])},
 {'title': 'Limbo (1999)',
  'genres': ['Drama'],
  'feature_vector': array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])},


In [13]:
# Same query shape as the previous cell, with a different rated item.
# NOTE(review): the id-to-title mapping in the trailing comment is not
# verifiable from this notebook - confirm against movies.csv.
prediction_input = [("267654", 4)]  # Harry Potter
recommender.recommend(prediction_input)

Hé predictio iter
Hé predictio iter
Hé predictio iter
Hé predictio iter


[{'title': 'Aziz Ansari: Live at Madison Square Garden (2015)',
  'genres': ['Comedy'],
  'feature_vector': array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])},
 {'title': 'The Yakuza Papers, Vol. 4: Police Tactics (1974)',
  'genres': ['Crime', 'Drama'],
  'feature_vector': array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])},
 {'title': 'The Last House on the Beach (1978)',
  'genres': ['Crime', 'Drama', 'Horror', 'Thriller'],
  'feature_vector': array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0])},
 {'title': 'Evilenko (2004)',
  'genres': ['Crime', 'Horror', 'Thriller'],
  'feature_vector': array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0])},
 {'title': 'Afflicted, The (2010)',
  'genres': ['Horror', 'Thriller'],
  'feature_vector': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0])},
 {'title': 'Paradise (1982)',
  'genres': ['Adventure', 'Romance'],
  'feature_vector': array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [14]:
# Ask for recommendations without providing any rated item.
# NOTE(review): what the recommender falls back to in this case is not visible
# here - confirm in the recommender implementation.
recommender.recommend()

[{'title': 'Aziz Ansari: Live at Madison Square Garden (2015)',
  'genres': ['Comedy'],
  'feature_vector': array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])},
 {'title': 'Facing the Giants (2006)',
  'genres': ['Action', 'Drama'],
  'feature_vector': array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])},
 {'title': 'The Last House on the Beach (1978)',
  'genres': ['Crime', 'Drama', 'Horror', 'Thriller'],
  'feature_vector': array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0])},
 {'title': 'The Yakuza Papers, Vol. 4: Police Tactics (1974)',
  'genres': ['Crime', 'Drama'],
  'feature_vector': array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])},
 {'title': 'Evilenko (2004)',
  'genres': ['Crime', 'Horror', 'Thriller'],
  'feature_vector': array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0])},
 {'title': 'Afflicted, The (2010)',
  'genres': ['Horror', 'Thriller'],
  'feature_vector': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 