# MovieLens Recommender System Using Alternating Least Squares

In [1]:
import pandas as pd
from pprint import pprint

from src.algorithms.alternating_least_squares import AlternatingLeastSquares
from src.helpers.dataset_indexer import DatasetIndexer
from src.helpers.checkpoint_manager import CheckpointManager
from src.recommenders import CollaborativeFilteringRecommenderBuilder
from src.backends import Backend
from src.helpers._logging import logger  # noqa
from src.settings import settings
from src.utils import vocabulary_based_one_hot_encode


In [None]:
# Column names of the MovieLens CSV files that the rest of the notebook refers to.
USER_HEADER, ITEM_HEADER, RATING_HEADER = "userId", "movieId", "rating"
# Item column whose values are one-hot encoded against ITEM_FEATURE_LIST.
FEATURE_TO_ENCODE = "genres"

# Genre vocabulary used for the one-hot encoding. Dataset description:
# https://files.grouplens.org/datasets/movielens/ml-32m-README.html
ITEM_FEATURE_LIST = [
    "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
    "Horror", "IMAX", "Musical", "Mystery", "Romance",
    "Sci-Fi", "Thriller", "War", "Western",
]


In [2]:
# Directory containing the MovieLens CSV files; ratings.csv is read from here.
CSV_FILES_DIR = "./ml-32m" # The dataset subfolder


In [None]:

# Build the movie database: movies.csv left-joined with links.csv on the movie
# id, then converted to a dict keyed by movie id. The backend needs this
# database to query the movies.
item_database = (
    # Paths are derived from CSV_FILES_DIR (as for ratings.csv) so that the
    # dataset location is configured in a single place.
    pd.read_csv(f"{CSV_FILES_DIR}/movies.csv", dtype={ITEM_HEADER: str})
    .merge(
        pd.read_csv(f"{CSV_FILES_DIR}/links.csv", dtype={ITEM_HEADER: str}),
        on=ITEM_HEADER,
        how="left",
    )
    .assign(
        # Split the pipe-separated genre string into a list of genre names.
        genres=lambda df: df[FEATURE_TO_ENCODE].apply(
            lambda genres: genres.split("|")
        ),
        # `assign` evaluates keyword arguments in order, so the genre column
        # read here already holds the lists produced just above.
        features_hot_encoded=lambda df: df[FEATURE_TO_ENCODE].apply(
            lambda g: vocabulary_based_one_hot_encode(
                words=g, vocabulary=ITEM_FEATURE_LIST
            )
        ),
        # Sum of each movie's encoded feature vector.
        features_count=lambda df: df["features_hot_encoded"].apply(lambda x: sum(x)),
    )
    .set_index(ITEM_HEADER)  # Set the movieId as the index
    .to_dict(orient="index")  # Convert the DataFrame to a dictionary
)

In [None]:
# Indexer over the ratings file; column names and the read limit come from
# the constants above and from the project settings.
dataset_indexer = DatasetIndexer(
    limit=settings.general.LINES_COUNT_TO_READ,
    rating_header=RATING_HEADER,
    item_header=ITEM_HEADER,
    user_header=USER_HEADER,
    file_path=CSV_FILES_DIR + "/ratings.csv",  # path to the ratings.csv file
)

# Produce the indexed dataset object that the recommender is built from.
indexed_data = dataset_indexer.index_simple(
    approximate_train_ratio=settings.general.APPROXIMATE_TRAIN_RATIO,
)

In [None]:
# The ALS algorithm, with every hyperparameter taken from the project settings.
als_instance = AlternatingLeastSquares(
    hyper_n_factors=settings.als.HYPER_N_FACTOR,
    hyper_n_epochs=settings.als.HYPER_N_EPOCH,
    hyper_tau=settings.als.HYPER_TAU,
    hyper_gamma=settings.als.HYPER_GAMMA,
    hyper_lambda=settings.als.HYPER_LAMBDA,
)

# Backend wiring the algorithm to its checkpoint storage and the movie database.
als_backend = Backend(
    algorithm=als_instance,  # the algorithm to run
    checkpoint_manager=CheckpointManager(
        checkpoint_folder=settings.als.CHECKPOINT_FOLDER,
        sub_folder=str(settings.general.LINES_COUNT_TO_READ),
    ),
    # Needed by the predictor to render the names of the items.
    item_database=item_database,
    # Whether to resume from the last algorithm state found in the
    # checkpoint manager folder.
    resume=False,
    save_checkpoint=True,
)


In [None]:
# Builder that produces the recommender on top of the ALS backend.
recommender_builder = CollaborativeFilteringRecommenderBuilder(backend=als_backend)


In [None]:
# The model is actually trained inside build(), so this cell may take a
# while to finish.
recommender = recommender_builder.build(
    include_features=True,  # whether to include the feature functionality
    item_database=item_database,
    data=indexed_data,
)

In [None]:
SENSE_AND_SENSIBILITY = "17"  # Sense and Sensibility (1995)
# Format: [(MOVIE, RATING), ...]. Uses the named constant rather than
# repeating the raw movie id.
prediction_input = [(SENSE_AND_SENSIBILITY, 4)]

###  Recommendations


In [3]:
# Movie ids (strings, like the item ids loaded from the CSV files).
# NOTE(review): presumably a "Harry Potter" and a "Lord of the Rings" title;
# verify both ids against movies.csv.
HARRY_POTTER = "267654"
LORD_OF_THE_RINGS = "279178"

# Misspelled original names, kept as aliases for code that still uses them.
HARRY_POTER = HARRY_POTTER
LORD_OF_THE_RING = LORD_OF_THE_RINGS

In [4]:
# Movie id, kept as a string, of "Sense and Sensibility (1995)".
SENSE_AND_SENSIBILITY = "17"

In [None]:
# Input format: [(MOVIE, RATING), ...]
recommendations = recommender.recommend([(HARRY_POTER, 5)])
# Use print for the heading: pprint would emit the string's quoted repr.
print("Recommendations for Harry Potter (5-starred):")
pprint(recommendations)

In [None]:
# Input format: [(MOVIE, RATING), ...]
# The original cell repeated this identical request and printout twice;
# a single run is kept.
recommendations = recommender.recommend([(HARRY_POTER, 4)])
# Use print for the heading: pprint would emit the string's quoted repr.
print("Recommendations for Harry Potter (4-starred):")
pprint(recommendations)


In [None]:
# Input format: [(MOVIE, RATING), ...]
recommendations = recommender.recommend([(LORD_OF_THE_RING, 5)])
# Use print for the heading: pprint would emit the string's quoted repr.
print("Recommendations for Lord of the Rings (5-starred):")
pprint(recommendations)

In [None]:
# Input format: [(MOVIE, RATING), ...]
recommendations = recommender.recommend([(LORD_OF_THE_RING, 4)])
# Use print for the heading: pprint would emit the string's quoted repr.
print("Recommendations for Lord of the Rings (4-starred):")
pprint(recommendations)

In [None]:
# Ideally, the recommendations should not contain many "Lord of the Rings"
# movies for this middling rating.
# Input format: [(MOVIE, RATING), ...]
recommendations = recommender.recommend([(LORD_OF_THE_RING, 3)])
# Use print for the heading: pprint would emit the string's quoted repr.
print("Recommendations for Lord of the Rings (3-starred):")
pprint(recommendations)


In [None]:
# With such a low rating, "Lord of the Rings" movies should not be recommended.
# Input format: [(MOVIE, RATING), ...]
recommendations = recommender.recommend([(LORD_OF_THE_RING, 2)])
# Use print for the heading: pprint would emit the string's quoted repr.
print("Recommendations for Lord of the Rings (2-starred):")
pprint(recommendations)

Summary of the hyperparameter experiments and the resulting recommendation quality:

| Sample size             | $\beta$   | $\lambda$ | $\gamma$ | $\tau$ | $k$  | Epochs | RMSE Train   | RMSE Test    | Loss Train       | Loss Test        | Recommendation |
|--------------------------|-----------|-----------|----------|--------|------|--------|--------------|--------------|------------------|------------------|----------------|
| 1,000,000               | None      | 5         | 0.2      | 0.5    | 10   | 10     | 0.6398357382 | 0.9573722035 | -859437.8237     | -497705.9132     | Not good       |
| 1,000,000               | None      | 0.5       | 0.01     | 0.4    | 10   | 20     | 0.6345867557 | 0.8798720925 | -87902.6796      | -45921.5971      | Not good       |
| 1,000,000               | None      | 1         | 0.04     | 0.4    | 10   | 20     | 0.6301793968 | 0.9039692663 | -171394.7980     | -94303.5198      | Not good       |
| 1,000,000               | None      | 0.5       | 0.1      | 0.1    | 10   | 20     | 0.6281751607 | 0.921288     | -90251.1060      | -53596.0488      | Not good       |
| 1,000,000               | None      | 0.1       | 0.1      | 0.1    | 10   | 20     | 0.6387279301 | 0.8667931235 | -23797.7348      | -15022.7668      | Not good       |
| 100,000,000,000,000,000 | None      | 5         | 0.2      | 0.5    | 10   | 10     | 0.7002276159 | 0.8106347909 | -32134423.5451   | -11279037.9697   | Not good       |
| 100,000,000,000,000,000 | None      | 0.1       | 0.01     | 0.1    | 10   | 20     | 0.6974530613 | 0.7876710025 | -662039.0551     | -237845.6851     | Got some same-genre movies |
| 100,000,000,000,000,000 | None      | 0.1       | 0.1      | 0.1    | 10   | 20     | 0.7005592936 | 0.791084577  | -805758.7356     | -377689.7136     | Got some same-genre movies |
| 100,000,000,000,000,000 | 10        | 0.1       | 0.1      | 0.1    | 30   | 20     | 0.6001383912 | 0.8438440669 | -636944.0774     | -403691.8737     | Good           |
| 100,000,000,000,000,000 | 10        | 0.5       | 0.01     | 0.5    | 10   | 20     | 0.6975553770 | 0.7890316677 | -3210830.2328    | -1092998.8915    | Got some same-genre movies |
| 100,000,000,000,000,000 | 0.1       | 0.5       | 0.01     | 2      | 10   | 20     | 0.7040819727 | 0.781400124  | -3332430.7169    | -1137663.3828    | Got some same-genre movies |