In [1]:
import time

# Capture the wall-clock time (seconds since the epoch) at which the notebook started executing.
notebook_start_time = time.time()

# Setup Environment

In [2]:
import sys
from pathlib import Path
from IPython import get_ipython

In [3]:
def is_google_colab() -> bool:
    """Report whether the notebook is executing inside Google Colab.

    The check looks for the substring ``google.colab`` in the string form of
    the active IPython shell object returned by ``get_ipython()``.

    Returns:
        ``True`` when the substring is present, ``False`` otherwise.
    """
    shell_description = str(get_ipython())
    return "google.colab" in shell_description

In [4]:
def clone_repository() -> None:
    """Clone the project repository and change into the checkout.

    Both steps use IPython syntax (the ``!`` shell escape and the ``%cd``
    magic), so this function only works when executed by an IPython kernel.
    The clone target and the ``%cd`` path are relative to the current
    working directory.
    """
    !git clone https://github.com/haturusinghe/mlops-datascience-anime-recsys.git
    %cd mlops-datascience-anime-recsys/


def install_dependencies() -> None:
    """Install the project's Python dependencies with ``uv``.

    First installs/upgrades ``uv`` through pip, then asks ``uv`` to install
    what ``pyproject.toml`` declares (all extras, pre-releases disallowed)
    with the ``--system`` flag. ``pyproject.toml`` is given as a relative
    path, so the working directory must contain it. Uses IPython shell
    escapes and therefore only works inside an IPython kernel.
    """
    !pip install --upgrade uv
    # NOTE(review): the double `!!` is IPython's capturing form, so the command output is
    # returned as a value (discarded here) instead of being streamed to the cell output.
    # Confirm that hiding the install log is intended.
    !!uv pip install --all-extras --system --prerelease disallow --requirement pyproject.toml


# Work out the project root. Locally it is taken to be the parent of the working
# directory; on Colab the repository is cloned and entered first, so the working
# directory itself is the root.
if not is_google_colab():
    root_dir = str(Path.cwd().parent)
    print("⛳️ Local environment")
else:
    clone_repository()
    install_dependencies()

    root_dir = str(Path.cwd())
    print("⛳️ Google Colab environment")

# Make the `recsys` package importable from the notebook by putting the project
# root on the module search path (only once).
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")

⛳️ Local environment
Adding the following directory to the PYTHONPATH: /home/haturusinghe/recsys-decoding_ml/my-anime-recsys


In [None]:
import os

if is_google_colab() and os.getcwd() != '/content/mlops-datascience-anime-recsys':
    %cd mlops-datascience-anime-recsys

In [None]:
# Recompute the project root for the current working directory (same rule as the
# setup cell, but without cloning or installing anything): the parent directory
# when running locally, the working directory itself on Colab.
if not is_google_colab():
    root_dir = str(Path.cwd().parent)
    print("⛳️ Local environment")
else:
    root_dir = str(Path.cwd())
    print("⛳️ Google Colab environment")

# Put the project root on the module search path so `recsys` can be imported;
# skipped when the entry is already present.
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")

# Feature Pipeline

## Imports

In [5]:
%load_ext autoreload
%autoreload 2

import warnings
from pprint import pprint

# Configure how Polars renders DataFrames in cell outputs.
import polars as pl

# Allow rendered tables to be up to 1000 characters wide so wide frames are not cut off.
pl.Config.set_tbl_width_chars(1000)

# Show up to 100 characters of each string value before it is truncated with an ellipsis.
# NOTE(review): this setting governs string length, not the number of list elements shown;
# list columns still appear abbreviated in the recorded outputs of this notebook.
pl.Config.set_fmt_str_lengths(100)

import torch
from loguru import logger
from sentence_transformers import SentenceTransformer

# Suppress every Python warning from this point on.
warnings.filterwarnings("ignore")

from recsys.raw_data_sources import myanimelist_dataset
from recsys.features.anime import (compute_features_of_anime, generate_embeddings_for_dataframe)
from recsys.features.user import (compute_features_of_user)
from recsys.features.ratings import (compute_features_of_ratings)
from recsys.config import settings

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Pick the compute device by preference: CUDA GPU first, then Apple's MPS backend,
# and finally the CPU when neither accelerator is available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [7]:
# Print the active settings as a plain dict to sanity-check the configuration; in the
# recorded output the API key is rendered masked (SecretStr), not in clear text.
print(dict(settings))

{'HOPSWORKS_API_KEY': SecretStr('**********'), 'USER_DATASET_SIZE': <UserDatasetSize.LARGE: 'LARGE'>, 'FEATURES_EMBEDDING_MODEL_ID': 'all-MiniLM-L6-v2', 'SEED': 42}


## MyAnimeList Dataset

### Anime Data

- `MAL_ID` - MyAnimeList ID of the anime.

In [8]:
# Extract the anime table from the MyAnimeList raw data source; the trailing expression
# displays its (rows, columns) shape.
anime_df = myanimelist_dataset.extract_anime_data()
anime_df.shape

(16214, 36)

In [9]:
# Peek at the first three rows to check the extracted columns and their dtypes.
anime_df.head(3)

MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,Synopsis
i64,str,f64,str,str,str,str,i64,str,str,str,str,str,str,str,str,f64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
1,"""Cowboy Bebop""",8.78,"""Action, Adventure, Comedy, Drama, Sci-Fi, Space""","""Cowboy Bebop""","""カウボーイビバップ""","""TV""",26,"""Apr 3, 1998 to Apr 24, 1999""","""Spring 1998""","""Bandai Visual""","""Funimation, Bandai Entertainment""","""Sunrise""","""Original""","""24 min. per ep.""","""R - 17+ (violence & profanity)""",28.0,39,1251960,61971,105808,718161,71513,26678,329800,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,"""In the year 2071, humanity has colonized several of the planets and moons of the solar system leavin…"
5,"""Cowboy Bebop: Tengoku no Tobira""",8.39,"""Action, Drama, Mystery, Sci-Fi, Space""","""Cowboy Bebop:The Movie""","""カウボーイビバップ 天国の扉""","""Movie""",1,"""Sep 1, 2001""",,"""Sunrise, Bandai Visual""","""Sony Pictures Entertainment""","""Bones""","""Original""","""1 hr. 55 min.""","""R - 17+ (violence & profanity)""",159.0,518,273145,1174,4143,208333,1935,770,57964,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,"""other day, another bounty—such is the life of the often unlucky crew of the Bebop. However, this rou…"
6,"""Trigun""",8.24,"""Action, Sci-Fi, Adventure, Comedy, Drama, Shounen""","""Trigun""","""トライガン""","""TV""",26,"""Apr 1, 1998 to Sep 30, 1998""","""Spring 1998""","""Victor Entertainment""","""Funimation, Geneon Entertainment USA""","""Madhouse""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",266.0,201,558913,12944,29113,343492,25465,13925,146918,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,"""Vash the Stampede is the man with a $$60,000,000,000 bounty on his head. The reason: he's a merciles…"


In [10]:
# Run the project's anime feature computation. Per the recorded outputs this reduces the
# row count (16214 -> 10899) and adds two columns (36 -> 38).
# NOTE: `anime_df` is rebound to the result, so re-running this cell on its own feeds the
# already-processed frame back into `compute_features_of_anime`.
anime_df = compute_features_of_anime(anime_df)
anime_df.shape


(10899, 38)

In [11]:
# Inspect the processed frame; the two columns appended at the end are `anime_id` and `description`.
anime_df.head(2)

MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,Synopsis,anime_id,description
i64,str,f64,str,str,str,str,i64,str,str,str,str,str,str,str,str,f64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str
1,"""Cowboy Bebop""",8.78,"""Action, Adventure, Comedy, Drama, Sci-Fi, Space""","""Cowboy Bebop""","""カウボーイビバップ""","""TV""",26,"""Apr 3, 1998 to Apr 24, 1999""","""Spring 1998""","""Bandai Visual""","""Funimation, Bandai Entertainment""","""Sunrise""","""Original""","""24 min. per ep.""","""R - 17+ (violence & profanity)""",28.0,39,1251960,61971,105808,718161,71513,26678,329800,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,"""In the year 2071, humanity has colonized several of the planets and moons of the solar system leavin…","""1""","""This is a TV anime. It belongs to the Action, Adventure, Comedy, Drama, Sci-Fi, Space genres. It has…"
5,"""Cowboy Bebop: Tengoku no Tobira""",8.39,"""Action, Drama, Mystery, Sci-Fi, Space""","""Cowboy Bebop:The Movie""","""カウボーイビバップ 天国の扉""","""Movie""",1,"""Sep 1, 2001""",,"""Sunrise, Bandai Visual""","""Sony Pictures Entertainment""","""Bones""","""Original""","""1 hr. 55 min.""","""R - 17+ (violence & profanity)""",159.0,518,273145,1174,4143,208333,1935,770,57964,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,"""other day, another bounty—such is the life of the often unlucky crew of the Bebop. However, this rou…","""5""","""This is a Movie anime. It belongs to the Action, Drama, Mystery, Sci-Fi, Space genres. It has a syno…"


#### Create an embedding for each anime description

In [12]:
# Log the first three generated descriptions so the text to be embedded can be checked by eye.
for i, desc in enumerate(anime_df["description"].head(n=3)):
    # `i` is zero-based, so it is shifted by one for a human-friendly item number.
    logger.info(f"Item {i+1}:\n{desc}")

[32m2025-04-14 11:24:13.072[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 1:
This is a TV anime.
It belongs to the Action, Adventure, Comedy, Drama, Sci-Fi, Space genres.
It has a synopsis : In the year 2071, humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of planet Earth behind. The Inter Solar System Police attempts to keep peace in the galaxy, aided in part by outlaw bounty hunters, referred to as "Cowboys." The ragtag team aboard the spaceship Bebop are two such individuals. Mellow and carefree Spike Spiegel is balanced by his boisterous, pragmatic partner Jet Black as the pair makes a living chasing bounties and collecting rewards. Thrown off course by the addition of new members that they meet in their travels—Ein, a genetically engineered, highly intelligent Welsh Corgi; femme fatale Faye Valentine, an enigmatic trickster with memory loss; and the strange computer whiz kid Edw

In [13]:
# Announce which embedding model is configured and which device was selected; the
# `{device=}` specifier logs both the variable name and its value.
logger.info(
    f"Loading '{settings.FEATURES_EMBEDDING_MODEL_ID}' embedding model to {device=}"
)

[32m2025-04-14 11:24:15.099[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mLoading 'all-MiniLM-L6-v2' embedding model to device='cpu'[0m


In [None]:
# Instantiate the SentenceTransformer identified by `settings.FEATURES_EMBEDDING_MODEL_ID`
# and place it on the previously selected `device`.
model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID, device=device)

In [None]:
# Generate embeddings with the loaded model, passing "description" as the column name
# and a batch size of 128.
# NOTE(review): presumably this adds the `embeddings` column that the following cell
# selects — confirm in `recsys.features.anime`.
# NOTE: `anime_df` is rebound to the returned frame.
anime_df = generate_embeddings_for_dataframe(
    anime_df,
    model=model,
    embedding_column_name="description",
    batch_size=128,
)

In [None]:
# Show each description next to its `embeddings` value for the first three rows.
anime_df[["description", "embeddings"]].head(3)

## User Data

In [14]:
# Extract the user data from the MyAnimeList raw data source and display its shape
# (the recorded run shows roughly 57.6 million rows and 3 columns).
users_df = myanimelist_dataset.extract_user_data()
users_df.shape

(57633278, 3)

In [15]:
# Peek at the first three rows of the extracted user data.
users_df.head(3)

user_id,anime_id,rating
i64,i64,i64
0,430,9
0,1004,5
0,3010,7


In [16]:
# Count the null values in each column as a quick data-quality check.
users_df.null_count()

user_id,anime_id,rating
u32,u32,u32
0,0,0


In [17]:
# Run the project's user feature computation.
# NOTE: `users_df` is rebound to the result, so re-running this cell on its own feeds the
# already-processed frame back into `compute_features_of_user`.
users_df = compute_features_of_user(users_df)

In [18]:
# Preview the computed user features; the recorded output shows `top_anime` and
# `top_ratings` list columns alongside `user_id`.
users_df.head()

user_id,top_anime,top_ratings
i64,list[i64],list[i64]
0,"[578, 1571, … 169]","[10, 10, … 7]"
1,"[35760, 20, … 11061]","[10, 10, … 9]"
2,"[11061, 263, … 28171]","[10, 10, … 9]"
3,"[5205, 17895, … 5781]","[10, 10, … 9]"
4,"[1535, 1689, … 372]","[10, 10, … 9]"


## Ratings

In [19]:
# Extract the ratings with `limit_rows=True` and display the shape (10000 rows, 5 columns
# in the recorded run).
# NOTE(review): presumably `limit_rows` caps the number of rows read to keep iteration
# fast — confirm in `myanimelist_dataset`.
ratings_df = myanimelist_dataset.extract_ratings_data(limit_rows=True)
ratings_df.shape

(10000, 5)

In [20]:
# Peek at the first three rows of the extracted ratings.
ratings_df.head(3)

user_id,anime_id,rating,watching_status,watched_episodes
i64,i64,i64,i64,i64
0,67,9,1,1
0,6702,7,1,4
0,242,10,1,4


In [21]:
# Run the project's ratings feature computation, supplying the anime frame as `anime_df`.
# In the recorded output this adds `total_episodes` and `watched_episodes_ratio` and
# turns `anime_id` into a string column.
# NOTE: `ratings_df` is rebound to the result.
ratings_df = compute_features_of_ratings(ratings_df, anime_df=anime_df)
ratings_df.head(3)

user_id,anime_id,rating,watching_status,watched_episodes,total_episodes,watched_episodes_ratio
i64,str,i64,i64,i64,i32,f32
0,"""67""",9,1,1,24,0.041667
0,"""6702""",7,1,4,175,0.022857
0,"""242""",10,1,4,13,0.307692


# Data Sampling 

In [22]:
from recsys.features.helpers.dataset_sampler import DatasetSampler

In [23]:
# Build a sampler from the configured dataset size and seed, then draw the subset.
sampler = DatasetSampler(
    size=settings.USER_DATASET_SIZE,
    seed=settings.SEED,
)
dataset_subset = sampler.sample_dataset(ratings_df=ratings_df, users_df=users_df)

# From here on, `users_df` and `ratings_df` refer to the sampled frames.
users_df, ratings_df = dataset_subset["users_df"], dataset_subset["ratings_df"]

[32m2025-04-14 11:25:15.401[0m | [1mINFO    [0m | [36mrecsys.features.helpers.dataset_sampler[0m:[36msample_dataset[0m:[36m26[0m - [1mSampling 10000 users from the dataset[0m
[32m2025-04-14 11:25:15.422[0m | [1mINFO    [0m | [36mrecsys.features.helpers.dataset_sampler[0m:[36msample_dataset[0m:[36m29[0m - [1mTotal Ratings for the sampled users: 10000[0m
[32m2025-04-14 11:25:15.440[0m | [1mINFO    [0m | [36mrecsys.features.helpers.dataset_sampler[0m:[36msample_dataset[0m:[36m32[0m - [1mTotal Ratings for the sampled users: 472[0m


## Interaction Data

In [None]:
from recsys.features.interactions import generate_interactions_data

In [None]:
# Build the interactions frame from the current ratings and anime frames, then display
# its shape.
interactions_df = generate_interactions_data(ratings_df=ratings_df, anime_df=anime_df)
interactions_df.shape

In [None]:
# Preview the first ten interaction rows.
interactions_df.head(10)

In [None]:
# Tally how many interactions carry each recommendation score (non-null values per group).
interactions_df.group_by("recommendation_score").agg(
    pl.col("recommendation_score").count().alias("count")
)

Here is what each score means:
- `0` : Not Recommended
- `1` : Recommend to User
- `2` : Best