In [2]:
import time

# Wall-clock timestamp (seconds since the epoch) taken when this first cell runs.
# NOTE(review): presumably read later to report total notebook runtime - confirm.
notebook_start_time = time.time()

# Setup Environment

In [3]:
import sys
from pathlib import Path

def is_google_colab():
    """Tell whether the `google.colab` package can be imported.

    Returns:
        bool: True when `import google.colab` succeeds, False when it
        raises ImportError.
    """
    try:
        import google.colab  # noqa: F401 - imported only to probe availability
    except ImportError:
        running_in_colab = False
    else:
        running_in_colab = True
    return running_in_colab
    
# Work out which directory must be on sys.path so project-local imports resolve.
if is_google_colab():
    print("Running in Google Colab")
    # No project root is derived for Colab in this cell. Leave it unset so the
    # sys.path update below is skipped instead of failing with a NameError.
    root_dir = None
else:
    print("Not running in Google Colab")
    # Use the parent of the current working directory.
    # NOTE(review): assumes the notebook runs one level below the project root - confirm.
    root_dir = str(Path().absolute().parent)

# Append only when a root was determined and it is not already registered,
# so re-running the cell does not add duplicate entries.
if root_dir is not None and root_dir not in sys.path:
    print(f"Adding {root_dir} to sys.path")
    sys.path.append(root_dir)

Not running in Google Colab
Adding /home/haturusinghe/recsys-decoding_ml/my-anime-recsys to sys.path


# Feature Pipeline

## Imports

In [4]:
%load_ext autoreload
%autoreload 2

import warnings
from pprint import pprint

import polars as pl
import torch
from loguru import logger
from sentence_transformers import SentenceTransformer

# Suppress every warning raised from this point on. Warnings emitted while the
# libraries above were being imported are unaffected, because the filter is
# installed only after those imports have already run.
warnings.filterwarnings("ignore")

from recsys.raw_data_sources import myanimelist_dataset
from recsys.features.anime import (compute_features_of_anime, generate_embeddings_for_dataframe)
from recsys.config import settings

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Choose the torch device by preference: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [6]:
# Dump the active settings as a plain dict for a quick sanity check.
# In the recorded output HOPSWORKS_API_KEY is a masked SecretStr; the other
# fields are printed verbatim, so review the output before sharing.
print(dict(settings))

{'HOPSWORKS_API_KEY': SecretStr('**********'), 'USER_DATASET_SIZE': <UserDatasetSize.LARGE: 'LARGE'>, 'FEATURES_EMBEDDING_MODEL_ID': 'all-MiniLM-L6-v2'}


## MyAnimeList Dataset

### Anime Data

- MAL_ID - MyAnimeList ID of the anime.

In [7]:
# Load the raw anime table from the MyAnimeList data source and display its shape.
anime_df = myanimelist_dataset.extract_anime_data()
anime_df.shape

(16214, 36)

In [8]:
# Preview the first three rows of the raw anime table.
anime_df.head(3)

MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,Synopsis
i64,str,f64,str,str,str,str,i64,str,str,str,str,str,str,str,str,f64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
1,"""Cowboy Bebop""",8.78,"""Action, Adventure, Comedy, Dra…","""Cowboy Bebop""","""カウボーイビバップ""","""TV""",26,"""Apr 3, 1998 to Apr 24, 1999""","""Spring 1998""","""Bandai Visual""","""Funimation, Bandai Entertainme…","""Sunrise""","""Original""","""24 min. per ep.""","""R - 17+ (violence & profanity)""",28.0,39,1251960,61971,105808,718161,71513,26678,329800,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,"""In the year 2071, humanity has…"
5,"""Cowboy Bebop: Tengoku no Tobir…",8.39,"""Action, Drama, Mystery, Sci-Fi…","""Cowboy Bebop:The Movie""","""カウボーイビバップ 天国の扉""","""Movie""",1,"""Sep 1, 2001""",,"""Sunrise, Bandai Visual""","""Sony Pictures Entertainment""","""Bones""","""Original""","""1 hr. 55 min.""","""R - 17+ (violence & profanity)""",159.0,518,273145,1174,4143,208333,1935,770,57964,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,"""other day, another bounty—such…"
6,"""Trigun""",8.24,"""Action, Sci-Fi, Adventure, Com…","""Trigun""","""トライガン""","""TV""",26,"""Apr 1, 1998 to Sep 30, 1998""","""Spring 1998""","""Victor Entertainment""","""Funimation, Geneon Entertainme…","""Madhouse""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",266.0,201,558913,12944,29113,343492,25465,13925,146918,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,"""Vash the Stampede is the man w…"


In [9]:
# Replace the raw frame with its feature-engineered version and display the new shape.
# NOTE(review): `anime_df` is rebound to the result of transforming itself, so
# re-running this cell feeds already-processed data back into
# compute_features_of_anime. Re-run the extraction cell first to start from raw data.
anime_df = compute_features_of_anime(anime_df)
anime_df.shape


(10899, 38)

#### Create an embedding for each anime description

In [10]:
for position, text in enumerate(anime_df["description"].head(3), start=1):  # first three descriptions
    logger.info(f"Item {position}:\n{text}")  # numbered from 1

[32m2025-04-13 05:39:14.567[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 1:
This is a TV anime.
It belongs to the Action, Adventure, Comedy, Drama, Sci-Fi, Space genres.
It has a synopsis : In the year 2071, humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of planet Earth behind. The Inter Solar System Police attempts to keep peace in the galaxy, aided in part by outlaw bounty hunters, referred to as "Cowboys." The ragtag team aboard the spaceship Bebop are two such individuals. Mellow and carefree Spike Spiegel is balanced by his boisterous, pragmatic partner Jet Black as the pair makes a living chasing bounties and collecting rewards. Thrown off course by the addition of new members that they meet in their travels—Ein, a genetically engineered, highly intelligent Welsh Corgi; femme fatale Faye Valentine, an enigmatic trickster with memory loss; and the strange computer whiz kid Edw

In [11]:
# Log which embedding model id and device are configured; this cell only logs
# and does not load the model itself.
logger.info(
    f"Loading '{settings.FEATURES_EMBEDDING_MODEL_ID}' embedding model to {device=}"
)

[32m2025-04-13 05:39:17.646[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mLoading 'all-MiniLM-L6-v2' embedding model to device='cpu'[0m


In [66]:
# Instantiate the SentenceTransformer named by settings.FEATURES_EMBEDDING_MODEL_ID
# and place it on the device selected earlier in the notebook.
model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID, device=device)

2025-04-12 19:57:47.996776: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-12 19:57:48.168792: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-12 19:57:48.776915: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-12 19:57:48.777003: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-12 19:57:48.780335: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [None]:
# Run the embedding helper over the anime frame with the loaded model, in
# batches of 128, and rebind `anime_df` to its result.
# NOTE(review): `embedding_column_name` is given the text column "description";
# confirm the parameter names the input text column rather than the output column.
anime_df = generate_embeddings_for_dataframe(
    anime_df,
    model=model,
    embedding_column_name="description",
    batch_size=128,
)

In [None]:
# Preview each description next to its embedding for the first three rows.
# The text column in this notebook is "description"; the previous
# "article_description" name is not used anywhere else here.
# NOTE(review): assumes the helper writes its output to an "embeddings" column - confirm.
anime_df[["description", "embeddings"]].head(3)

## User Data

In [14]:
# Load the user data from the MyAnimeList data source and display its shape.
users_df = myanimelist_dataset.extract_user_data()
users_df.shape

: 

In [13]:
# Preview the first three rows of the user data.
users_df.head(3)

user_id,anime_id,rating
i64,i64,i64
0,430,9
0,1004,5
0,3010,7
