In [1]:
import time

notebook_start_time = time.time()

# Setup Environment

In [5]:
import sys
from pathlib import Path
from IPython import get_ipython

In [3]:
def is_google_colab() -> bool:
    if "google.colab" in str(get_ipython()):
        return True
    return False

In [2]:
def clone_repository() -> None:
    !git clone https://github.com/haturusinghe/mlops-datascience-anime-recsys.git
    %cd mlops-datascience-anime-recsys/


def install_dependencies() -> None:
    !pip install --upgrade uv
    !!uv pip install --all-extras --system --prerelease disallow --requirement pyproject.toml


if is_google_colab():
    clone_repository()
    install_dependencies()

    root_dir = str(Path().absolute())
    print("⛳️ Google Colab environment")
else:
    root_dir = str(Path().absolute().parent)
    print("⛳️ Local environment")

# Add the root directory to the `PYTHONPATH` to use the `recsys` Python module from the notebook.
if root_dir not in sys.path:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

Cloning into 'mlops-datascience-anime-recsys'...
remote: Enumerating objects: 132, done.[K
remote: Counting objects: 100% (132/132), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 132 (delta 73), reused 119 (delta 60), pack-reused 0 (from 0)[K
Receiving objects: 100% (132/132), 186.83 KiB | 1.97 MiB/s, done.
Resolving deltas: 100% (73/73), done.
/content/mlops-datascience-anime-recsys
Collecting uv
  Downloading uv-0.6.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.6.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.6.14
⛳️ Google Colab environment
Adding the following directory to the PYTHONPATH: /content/mlops-datascience-anime-recsys


In [21]:
import os

if os.getcwd() != '/content/mlops-datascience-anime-recsys':
    %cd mlops-datascience-anime-recsys

In [6]:
if is_google_colab():
    root_dir = str(Path().absolute())
    print("⛳️ Google Colab environment")
else:
    root_dir = str(Path().absolute().parent)
    print("⛳️ Local environment")

# Add the root directory to the `PYTHONPATH` to use the `recsys` Python module from the notebook.
if root_dir not in sys.path:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

⛳️ Google Colab environment


# Feature Pipeline

## Imports

In [10]:
# %load_ext autoreload
# %autoreload 2

import warnings
from pprint import pprint

import polars as pl
import torch
from loguru import logger
from sentence_transformers import SentenceTransformer

warnings.filterwarnings("ignore")

from recsys.raw_data_sources import myanimelist_dataset
from recsys.features.anime import (compute_features_of_anime, generate_embeddings_for_dataframe)
from recsys.features.user import (compute_features_of_user)
from recsys.config import settings

In [11]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

In [12]:
print(dict(settings))

{'HOPSWORKS_API_KEY': None, 'USER_DATASET_SIZE': <UserDatasetSize.LARGE: 'LARGE'>, 'FEATURES_EMBEDDING_MODEL_ID': 'all-MiniLM-L6-v2'}


## MyAnimeList Dataset

### Anime Data

- MAL_ID - MyAnimelist ID of the anime.

In [13]:
anime_df = myanimelist_dataset.extract_anime_data()
anime_df.shape

[32m2025-04-13 03:19:12.061[0m | [1mINFO    [0m | [36mrecsys.raw_data_sources.myanimelist_dataset[0m:[36mextract_anime_data[0m:[36m130[0m - [1mFiles do not exist. Downloading from Kaggle...[0m


Downloading from https://www.kaggle.com/api/v1/datasets/download/hernan4444/anime-recommendation-database-2020?dataset_version_number=7...


100%|██████████| 661M/661M [00:06<00:00, 110MB/s]

Extracting files...



[32m2025-04-13 03:20:07.558[0m | [1mINFO    [0m | [36mrecsys.raw_data_sources.myanimelist_dataset[0m:[36mdownload_and_extract_from_kaggle[0m:[36m52[0m - [1mDownloaded dataset to /root/.cache/kagglehub/datasets/hernan4444/anime-recommendation-database-2020/versions/7[0m
[32m2025-04-13 03:20:07.566[0m | [1mINFO    [0m | [36mrecsys.raw_data_sources.myanimelist_dataset[0m:[36mdownload_and_extract_from_kaggle[0m:[36m63[0m - [1mCopied anime.csv to /content/mlops-datascience-anime-recsys/kaggle/anime.csv[0m
[32m2025-04-13 03:20:07.574[0m | [1mINFO    [0m | [36mrecsys.raw_data_sources.myanimelist_dataset[0m:[36mdownload_and_extract_from_kaggle[0m:[36m63[0m - [1mCopied anime_with_synopsis.csv to /content/mlops-datascience-anime-recsys/kaggle/anime_with_synopsis.csv[0m
[32m2025-04-13 03:20:52.539[0m | [1mINFO    [0m | [36mrecsys.raw_data_sources.myanimelist_dataset[0m:[36mdownload_and_extract_from_kaggle[0m:[36m63[0m - [1mCopied rating_complete.csv

(16214, 36)

In [14]:
anime_df.head(3)

MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,Synopsis
i64,str,f64,str,str,str,str,i64,str,str,str,str,str,str,str,str,f64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
1,"""Cowboy Bebop""",8.78,"""Action, Adventure, Comedy, Dra…","""Cowboy Bebop""","""カウボーイビバップ""","""TV""",26,"""Apr 3, 1998 to Apr 24, 1999""","""Spring 1998""","""Bandai Visual""","""Funimation, Bandai Entertainme…","""Sunrise""","""Original""","""24 min. per ep.""","""R - 17+ (violence & profanity)""",28.0,39,1251960,61971,105808,718161,71513,26678,329800,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,"""In the year 2071, humanity has…"
5,"""Cowboy Bebop: Tengoku no Tobir…",8.39,"""Action, Drama, Mystery, Sci-Fi…","""Cowboy Bebop:The Movie""","""カウボーイビバップ 天国の扉""","""Movie""",1,"""Sep 1, 2001""",,"""Sunrise, Bandai Visual""","""Sony Pictures Entertainment""","""Bones""","""Original""","""1 hr. 55 min.""","""R - 17+ (violence & profanity)""",159.0,518,273145,1174,4143,208333,1935,770,57964,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,"""other day, another bounty—such…"
6,"""Trigun""",8.24,"""Action, Sci-Fi, Adventure, Com…","""Trigun""","""トライガン""","""TV""",26,"""Apr 1, 1998 to Sep 30, 1998""","""Spring 1998""","""Victor Entertainment""","""Funimation, Geneon Entertainme…","""Madhouse""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",266.0,201,558913,12944,29113,343492,25465,13925,146918,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,"""Vash the Stampede is the man w…"


In [15]:
anime_df = compute_features_of_anime(anime_df)
anime_df.shape


(10899, 38)

#### Create an embedding for each anime description

In [16]:
for i, desc in enumerate(anime_df["description"].head(n=3)):
    logger.info(f"Item {i+1}:\n{desc}")

[32m2025-04-13 03:24:47.427[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m2[0m - [1mItem 1:
This is a TV anime.
It belongs to the Action, Adventure, Comedy, Drama, Sci-Fi, Space genres.
It has a synopsis : In the year 2071, humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of planet Earth behind. The Inter Solar System Police attempts to keep peace in the galaxy, aided in part by outlaw bounty hunters, referred to as "Cowboys." The ragtag team aboard the spaceship Bebop are two such individuals. Mellow and carefree Spike Spiegel is balanced by his boisterous, pragmatic partner Jet Black as the pair makes a living chasing bounties and collecting rewards. Thrown off course by the addition of new members that they meet in their travels—Ein, a genetically engineered, highly intelligent Welsh Corgi; femme fatale Faye Valentine, an enigmatic trickster with memory loss; and the strange computer whiz k

In [17]:
logger.info(
    f"Loading '{settings.FEATURES_EMBEDDING_MODEL_ID}' embedding model to {device=}"
)

[32m2025-04-13 03:24:50.441[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m1[0m - [1mLoading 'all-MiniLM-L6-v2' embedding model to device='cpu'[0m


In [18]:
# Load the embedding model from SentenceTransformer's model registry.
model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID, device=device)

.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

model_O1.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

model_O2.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

model_O3.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

model_O4.onnx:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

model_qint8_arm64.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_qint8_arm64.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_qint8_arm64.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_quint8_avx2.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

openvino_model.bin:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

openvino_model.xml:   0%|          | 0.00/211k [00:00<?, ?B/s]

openvino_model_qint8_quantized.bin:   0%|          | 0.00/22.9M [00:00<?, ?B/s]

openvino_model_qint8_quantized.xml:   0%|          | 0.00/368k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [19]:
anime_df = generate_embeddings_for_dataframe(
    anime_df,
    model=model,
    embedding_column_name="description",
    batch_size=128,
)

TypeError: generate_embeddings_for_dataframe() got an unexpected keyword argument 'embedding_column_name'

In [None]:
anime_df[["article_description", "embeddings"]].head(3)

## User Data

In [None]:
users_df = myanimelist_dataset.extract_user_data()
users_df.shape

In [None]:
users_df.head(3)

In [None]:
users_df.null_count()

In [None]:
users_df = compute_features_of_user(users_df)