In [4]:
import pandas as pd
import numpy as np

df_animes = pd.read_parquet('../../data/processed/df_animes_filled.parquet')

In [3]:
df_animes.head(10)

Unnamed: 0,uid,title,synopsis,aired,episodes,members,popularity,score,genres_list,genre_action,...,genre_slice_of_life,genre_space,genre_sports,genre_super_power,genre_supernatural,genre_thriller,genre_vampire,genre_yaoi,genre_yuri,source
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"Oct 4, 2015 to Mar 27, 2016",25,489888,141,8.82,"[Comedy, Sports, Drama, School, Shounen]",0,...,0,0,1,0,0,0,0,0,0,head
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Oct 10, 2014 to Mar 20, 2015",22,995473,28,8.83,"[Drama, Music, Romance, School, Shounen]",0,...,0,0,0,0,0,0,0,0,0,head
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Jul 7, 2017 to Sep 29, 2017",13,581663,98,8.83,"[Sci-Fi, Adventure, Mystery, Drama, Fantasy]",0,...,0,0,0,0,0,0,0,0,0,head
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","Apr 5, 2009 to Jul 4, 2010",64,1615084,4,9.23,"[Action, Military, Adventure, Comedy, Drama, M...",1,...,0,0,0,0,0,0,0,0,0,head
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"Jan 6, 2017",1,214621,502,8.83,"[Action, Mystery, Supernatural, Vampire]",1,...,0,0,0,0,1,0,1,0,0,head
5,37510,Mob Psycho 100 II,"Shigeo ""Mob"" Kageyama is now maturing and unde...","Jan 7, 2019 to Apr 1, 2019",13,442310,176,8.89,"[Action, Slice of Life, Comedy, Supernatural]",1,...,1,0,0,0,1,0,0,0,0,head
6,199,Sen to Chihiro no Kamikakushi,"Stubborn, spoiled, and naïve, 10-year-old Chih...","Jul 20, 2001",1,913212,40,8.9,"[Adventure, Supernatural, Drama]",0,...,0,0,0,0,1,0,0,0,0,head
7,38000,Kimetsu no Yaiba,"Ever since the death of his father, the burden...","Apr 6, 2019 to Sep 28, 2019",26,575037,106,8.92,"[Action, Demons, Historical, Shounen, Supernat...",1,...,0,0,0,0,1,0,0,0,0,head
8,35247,Owarimonogatari 2nd Season,Following an encounter with oddity specialist ...,"Aug 12, 2017 to Aug 13, 2017",7,189944,573,8.93,"[Mystery, Comedy, Supernatural, Vampire]",0,...,0,0,0,0,1,0,1,0,0,head
9,2904,Code Geass: Hangyaku no Lelouch R2,"One year has passed since the Black Rebellion,...","Apr 6, 2008 to Sep 28, 2008",25,992196,27,8.93,"[Action, Military, Sci-Fi, Super Power, Drama,...",1,...,0,0,0,1,0,0,0,0,0,head


#### Already built one-hot features for genres, but lets work on some more features

In [5]:
genre_cols = [c for c in df_animes.columns if isinstance(c, str) and c.startswith('genre_')]

# n_genres: number of genres per anime (use one-hot sum for consistency)
if genre_cols:
    df_animes['n_genres'] = df_animes[genre_cols].sum(axis=1).astype(int)
else:
    # fallback to genres_list length if no one-hot columns
    df_animes['n_genres'] = df_animes['genres_list'].apply(lambda x: len(x) if isinstance(x, (list, tuple)) else 0)

# rare_genre_flag: 1 if anime has at least one rare genre (<1% frequency)
rare_threshold = 0.01
if genre_cols:
    genre_freqs = df_animes[genre_cols].sum(axis=0) / len(df_animes)
    rare_cols = genre_freqs[genre_freqs < rare_threshold].index.tolist()
    if rare_cols:
        df_animes['rare_genre_flag'] = df_animes[rare_cols].any(axis=1).astype(int)
    else:
        df_animes['rare_genre_flag'] = 0
else:
    df_animes['rare_genre_flag'] = 0


# genre_entropy: Shannon entropy using global genre frequencies as weights over present genres
# For an anime, take the global frequencies of the genres present, normalize them and compute -sum p*log2(p).

def _compute_entropy(row, genre_freqs_series):
    present = [g for g in genre_cols if row.get(g) == 1]
    if not present:
        return 0.0
    freqs = genre_freqs_series.loc[present].values.astype(float)
    # if all freqs are zero (unlikely), return 0
    s = freqs.sum()
    if s <= 0:
        return 0.0
    p = freqs / s
    # numerical stability
    p = p[p > 0]
    return float(-(p * np.log2(p)).sum())


if genre_cols:
    # reuse genre_freqs computed above (if not present compute)
    try:
        genre_freqs
    except NameError:
        genre_freqs = df_animes[genre_cols].sum(axis=0) / len(df_animes)
    df_animes['genre_entropy'] = df_animes.apply(lambda r: _compute_entropy(r, genre_freqs), axis=1)
else:
    df_animes['genre_entropy'] = df_animes['n_genres'].apply(lambda n: float(np.log2(n)) if n > 0 else 0.0)

print('Computed features: n_genres, rare_genre_flag, genre_entropy')
print(df_animes[['n_genres', 'rare_genre_flag', 'genre_entropy']].describe())


Computed features: n_genres, rare_genre_flag, genre_entropy
           n_genres  rare_genre_flag  genre_entropy
count  16368.000000     16368.000000   16368.000000
mean       2.877077         0.042766       1.090682
std        1.658707         0.202336       0.780876
min        0.000000         0.000000      -0.000000
25%        2.000000         0.000000       0.268157
50%        3.000000         0.000000       1.084415
75%        4.000000         0.000000       1.713356
max       13.000000         1.000000       3.342649


In [6]:
df_animes.head(10)

Unnamed: 0,uid,title,synopsis,aired,episodes,members,popularity,score,genres_list,genre_action,...,genre_super_power,genre_supernatural,genre_thriller,genre_vampire,genre_yaoi,genre_yuri,source,n_genres,rare_genre_flag,genre_entropy
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"Oct 4, 2015 to Mar 27, 2016",25,489888,141,8.82,"[Comedy, Sports, Drama, School, Shounen]",0,...,0,0,0,0,0,0,head,5,0,2.010101
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Oct 10, 2014 to Mar 20, 2015",22,995473,28,8.83,"[Drama, Music, Romance, School, Shounen]",0,...,0,0,0,0,0,0,head,5,0,2.304928
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Jul 7, 2017 to Sep 29, 2017",13,581663,98,8.83,"[Sci-Fi, Adventure, Mystery, Drama, Fantasy]",0,...,0,0,0,0,0,0,head,5,0,2.202561
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","Apr 5, 2009 to Jul 4, 2010",64,1615084,4,9.23,"[Action, Military, Adventure, Comedy, Drama, M...",1,...,0,0,0,0,0,0,head,8,0,2.761946
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"Jan 6, 2017",1,214621,502,8.83,"[Action, Mystery, Supernatural, Vampire]",1,...,0,1,0,1,0,0,head,4,1,1.410939
5,37510,Mob Psycho 100 II,"Shigeo ""Mob"" Kageyama is now maturing and unde...","Jan 7, 2019 to Apr 1, 2019",13,442310,176,8.89,"[Action, Slice of Life, Comedy, Supernatural]",1,...,0,1,0,0,0,0,head,4,0,1.78676
6,199,Sen to Chihiro no Kamikakushi,"Stubborn, spoiled, and naïve, 10-year-old Chih...","Jul 20, 2001",1,913212,40,8.9,"[Adventure, Supernatural, Drama]",0,...,0,1,0,0,0,0,head,3,0,1.528657
7,38000,Kimetsu no Yaiba,"Ever since the death of his father, the burden...","Apr 6, 2019 to Sep 28, 2019",26,575037,106,8.92,"[Action, Demons, Historical, Shounen, Supernat...",1,...,0,1,0,0,0,0,head,5,0,2.043683
8,35247,Owarimonogatari 2nd Season,Following an encounter with oddity specialist ...,"Aug 12, 2017 to Aug 13, 2017",7,189944,573,8.93,"[Mystery, Comedy, Supernatural, Vampire]",0,...,0,1,0,1,0,0,head,4,1,1.190452
9,2904,Code Geass: Hangyaku no Lelouch R2,"One year has passed since the Black Rebellion,...","Apr 6, 2008 to Sep 28, 2008",25,992196,27,8.93,"[Action, Military, Sci-Fi, Super Power, Drama,...",1,...,1,0,0,0,0,0,head,6,0,2.285079


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
import re
import joblib

# basic cleaner: lowercase, strip HTML tags, keep only alphabetic chars, remove stopwords
stopwords = set(ENGLISH_STOP_WORDS)


def clean_synopsis(text):
    if not isinstance(text, str):
        return ''
    s = text.lower()
    # remove HTML tags
    s = re.sub(r'<[^>]+>', ' ', s)
    # replace non-alphabetic characters with space
    s = re.sub(r'[^a-z\s]', ' ', s)
    # tokenize and remove stopwords
    toks = [t for t in s.split() if t and t not in stopwords]
    return ' '.join(toks)


# create cleaned column
df_animes['synopsis_clean'] = df_animes['synopsis'].fillna('').apply(clean_synopsis)

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_tfidf = vectorizer.fit_transform(df_animes['synopsis_clean'])

# Save vectorizer for future inference
joblib.dump(vectorizer, '../../data/processed/tfidf_vectorizer.joblib')

print('TF-IDF matrix shape:', X_tfidf.shape)
print('Saved TF-IDF vectorizer to ../../data/processed/tfidf_vectorizer.joblib')

# X_tfidf is the sparse matrix output and df_animes updated with 'synopsis_clean'


TF-IDF matrix shape: (16368, 5000)
Saved TF-IDF vectorizer to ../../data/processed/tfidf_vectorizer.joblib


In [8]:
df_animes.head(10)

Unnamed: 0,uid,title,synopsis,aired,episodes,members,popularity,score,genres_list,genre_action,...,genre_supernatural,genre_thriller,genre_vampire,genre_yaoi,genre_yuri,source,n_genres,rare_genre_flag,genre_entropy,synopsis_clean
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"Oct 4, 2015 to Mar 27, 2016",25,489888,141,8.82,"[Comedy, Sports, Drama, School, Shounen]",0,...,0,0,0,0,0,head,5,0,2.010101,following participation inter high karasuno hi...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Oct 10, 2014 to Mar 20, 2015",22,995473,28,8.83,"[Drama, Music, Romance, School, Shounen]",0,...,0,0,0,0,0,head,5,0,2.304928,music accompanies path human metronome prodigi...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Jul 7, 2017 to Sep 29, 2017",13,581663,98,8.83,"[Sci-Fi, Adventure, Mystery, Drama, Fantasy]",0,...,0,0,0,0,0,head,5,0,2.202561,abyss gaping chasm stretching depths earth fil...
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","Apr 5, 2009 to Jul 4, 2010",64,1615084,4,9.23,"[Action, Military, Adventure, Comedy, Drama, M...",1,...,0,0,0,0,0,head,8,0,2.761946,order obtained equal value lost alchemy bound ...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"Jan 6, 2017",1,214621,502,8.83,"[Action, Mystery, Supernatural, Vampire]",1,...,1,0,1,0,0,head,4,1,1.410939,helping revive legendary vampire kiss shot ace...
5,37510,Mob Psycho 100 II,"Shigeo ""Mob"" Kageyama is now maturing and unde...","Jan 7, 2019 to Apr 1, 2019",13,442310,176,8.89,"[Action, Slice of Life, Comedy, Supernatural]",1,...,1,0,0,0,0,head,4,0,1.78676,shigeo mob kageyama maturing understanding rol...
6,199,Sen to Chihiro no Kamikakushi,"Stubborn, spoiled, and naïve, 10-year-old Chih...","Jul 20, 2001",1,913212,40,8.9,"[Adventure, Supernatural, Drama]",0,...,1,0,0,0,0,head,3,0,1.528657,stubborn spoiled na ve year old chihiro ogino ...
7,38000,Kimetsu no Yaiba,"Ever since the death of his father, the burden...","Apr 6, 2019 to Sep 28, 2019",26,575037,106,8.92,"[Action, Demons, Historical, Shounen, Supernat...",1,...,1,0,0,0,0,head,5,0,2.043683,death father burden supporting family fallen t...
8,35247,Owarimonogatari 2nd Season,Following an encounter with oddity specialist ...,"Aug 12, 2017 to Aug 13, 2017",7,189944,573,8.93,"[Mystery, Comedy, Supernatural, Vampire]",0,...,1,0,1,0,0,head,4,1,1.190452,following encounter oddity specialist izuko ga...
9,2904,Code Geass: Hangyaku no Lelouch R2,"One year has passed since the Black Rebellion,...","Apr 6, 2008 to Sep 28, 2008",25,992196,27,8.93,"[Action, Military, Sci-Fi, Super Power, Drama,...",1,...,0,0,0,0,0,head,6,0,2.285079,year passed black rebellion failed uprising ho...


In [9]:
from sklearn.preprocessing import StandardScaler

# Ensure numeric types
for c in ['score', 'episodes', 'members', 'popularity']:
    if c in df_animes.columns:
        df_animes[c] = pd.to_numeric(df_animes[c], errors='coerce')

# Log transforms (use log1p to handle zeros and large values)
# Fill negative or missing members/popularity with 0 before log1p
if 'members' in df_animes.columns:
    members_nonneg = df_animes['members'].clip(lower=0).fillna(0)
    df_animes['members_log'] = np.log1p(members_nonneg)
else:
    df_animes['members_log'] = 0.0

if 'popularity' in df_animes.columns:
    pop_nonneg = df_animes['popularity'].clip(lower=0).fillna(0)
    df_animes['popularity_log'] = np.log1p(pop_nonneg)
else:
    df_animes['popularity_log'] = 0.0

# Standardize numeric features
numeric_to_scale = [c for c in ['score', 'episodes', 'members_log', 'popularity_log'] if c in df_animes.columns]

# Fill missing values with median before scaling
for c in numeric_to_scale:
    median_val = df_animes[c].median()
    df_animes[c] = df_animes[c].fillna(median_val)

# Fit scaler and add scaled columns with suffix '_scaled'
scaler = StandardScaler()
if numeric_to_scale:
    scaled_vals = scaler.fit_transform(df_animes[numeric_to_scale])
    for i, c in enumerate(numeric_to_scale):
        df_animes[c + '_scaled'] = scaled_vals[:, i]

# Save the scaler for future inference
joblib.dump(scaler, '../../data/processed/standard_scaler.joblib')

print('Added members_log, popularity_log and scaled numeric features:', numeric_to_scale)
print('Saved scaler to ../../data/processed/standard_scaler.joblib')
print(df_animes[[*numeric_to_scale, *(c + '_scaled' for c in numeric_to_scale)]].head())


Added members_log, popularity_log and scaled numeric features: ['score', 'episodes', 'members_log', 'popularity_log']
Saved scaler to ../../data/processed/standard_scaler.joblib
   score  episodes  members_log  popularity_log  score_scaled  \
0   8.82        25    13.101934        4.955827      2.552177   
1   8.83        22    13.810974        3.367296      2.562526   
2   8.83        13    13.273648        4.595120      2.562526   
3   9.23        64    14.294898        1.609438      2.976486   
4   8.83         1    12.276634        6.220590      2.562526   

   episodes_scaled  members_log_scaled  popularity_log_scaled  
0         0.283506            2.329375              -3.751149  
1         0.221088            2.636419              -5.344280  
2         0.033836            2.403734              -4.112900  
3         1.094933            2.845979              -7.107228  
4        -0.215834            1.971984              -2.482723  


In [10]:
df_animes.head(10)

Unnamed: 0,uid,title,synopsis,aired,episodes,members,popularity,score,genres_list,genre_action,...,n_genres,rare_genre_flag,genre_entropy,synopsis_clean,members_log,popularity_log,score_scaled,episodes_scaled,members_log_scaled,popularity_log_scaled
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"Oct 4, 2015 to Mar 27, 2016",25,489888,141,8.82,"[Comedy, Sports, Drama, School, Shounen]",0,...,5,0,2.010101,following participation inter high karasuno hi...,13.101934,4.955827,2.552177,0.283506,2.329375,-3.751149
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Oct 10, 2014 to Mar 20, 2015",22,995473,28,8.83,"[Drama, Music, Romance, School, Shounen]",0,...,5,0,2.304928,music accompanies path human metronome prodigi...,13.810974,3.367296,2.562526,0.221088,2.636419,-5.34428
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Jul 7, 2017 to Sep 29, 2017",13,581663,98,8.83,"[Sci-Fi, Adventure, Mystery, Drama, Fantasy]",0,...,5,0,2.202561,abyss gaping chasm stretching depths earth fil...,13.273648,4.59512,2.562526,0.033836,2.403734,-4.1129
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","Apr 5, 2009 to Jul 4, 2010",64,1615084,4,9.23,"[Action, Military, Adventure, Comedy, Drama, M...",1,...,8,0,2.761946,order obtained equal value lost alchemy bound ...,14.294898,1.609438,2.976486,1.094933,2.845979,-7.107228
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"Jan 6, 2017",1,214621,502,8.83,"[Action, Mystery, Supernatural, Vampire]",1,...,4,1,1.410939,helping revive legendary vampire kiss shot ace...,12.276634,6.22059,2.562526,-0.215834,1.971984,-2.482723
5,37510,Mob Psycho 100 II,"Shigeo ""Mob"" Kageyama is now maturing and unde...","Jan 7, 2019 to Apr 1, 2019",13,442310,176,8.89,"[Action, Slice of Life, Comedy, Supernatural]",1,...,4,0,1.78676,shigeo mob kageyama maturing understanding rol...,12.999769,5.17615,2.62462,0.033836,2.285133,-3.530188
6,199,Sen to Chihiro no Kamikakushi,"Stubborn, spoiled, and naïve, 10-year-old Chih...","Jul 20, 2001",1,913212,40,8.9,"[Adventure, Supernatural, Drama]",0,...,3,0,1.528657,stubborn spoiled na ve year old chihiro ogino ...,13.724724,3.713572,2.634969,-0.215834,2.599069,-4.997001
7,38000,Kimetsu no Yaiba,"Ever since the death of his father, the burden...","Apr 6, 2019 to Sep 28, 2019",26,575037,106,8.92,"[Action, Demons, Historical, Shounen, Supernat...",1,...,5,0,2.043683,death father burden supporting family fallen t...,13.262191,4.672829,2.655667,0.304312,2.398773,-4.034966
8,35247,Owarimonogatari 2nd Season,Following an encounter with oddity specialist ...,"Aug 12, 2017 to Aug 13, 2017",7,189944,573,8.93,"[Mystery, Comedy, Supernatural, Vampire]",0,...,4,1,1.190452,following encounter oddity specialist izuko ga...,12.15449,6.352629,2.666016,-0.090999,1.919091,-2.350302
9,2904,Code Geass: Hangyaku no Lelouch R2,"One year has passed since the Black Rebellion,...","Apr 6, 2008 to Sep 28, 2008",25,992196,27,8.93,"[Action, Military, Sci-Fi, Super Power, Drama,...",1,...,6,0,2.285079,year passed black rebellion failed uprising ho...,13.807677,3.332205,2.666016,0.283506,2.634991,-5.379473


In [11]:
# Train Word2Vec on genres_list (each anime's genres_list is a sentence)
from gensim.models import Word2Vec

# Prepare sentences: normalize genre tokens (lowercase, replace spaces with _)
sentences = []
for gl in df_animes.get('genres_list', []):
    if isinstance(gl, (list, tuple)) and gl:
        toks = [g.strip().lower().replace(' ', '_') for g in gl if isinstance(g, str) and g.strip()]
        if toks:
            sentences.append(toks)

# Train Word2Vec (small vector size since vocab is small)
if sentences:
    w2v_size = 50
    w2v_window = 2
    w2v_min_count = 1
    w2v_epochs = 100
    w2v_model = Word2Vec(sentences=sentences, vector_size=w2v_size, window=w2v_window, min_count=w2v_min_count,
                         epochs=w2v_epochs, seed=42)
    # save model
    try:
        w2v_model.save('../../data/processed/genre_w2v.model')
        print('Saved Word2Vec model to ../../data/processed/genre_w2v.model')
    except Exception as e:
        print('Could not save Word2Vec model:', e)
else:
    print('No genre sentences found to train Word2Vec.')


Saved Word2Vec model to ../../data/processed/genre_w2v.model


In [12]:
# Create anime-level embedding by averaging genre embeddings

# load model if not in memory
try:
    w2v_model
except NameError:
    from gensim.models import Word2Vec

    try:
        w2v_model = Word2Vec.load('../../data/processed/genre_w2v.model')
    except Exception:
        w2v_model = None

vec_size = w2v_model.vector_size if w2v_model is not None else 50


def _anime_genre_embedding(genres_list):
    if not isinstance(genres_list, (list, tuple)) or not genres_list:
        return np.zeros(vec_size, dtype=float)
    toks = [g.strip().lower().replace(' ', '_') for g in genres_list if isinstance(g, str) and g.strip()]
    vecs = []
    for t in toks:
        try:
            vecs.append(w2v_model.wv[t])
        except Exception:
            continue
    if not vecs:
        return np.zeros(vec_size, dtype=float)
    arr = np.vstack(vecs)
    return arr.mean(axis=0)


# Apply to dataframe and store as numpy arrays
df_animes['genre_embedding'] = df_animes['genres_list'].apply(_anime_genre_embedding)

print('Added genre_embedding column. Example embedding shape:',
      df_animes['genre_embedding'].iloc[0].shape if len(df_animes) else (vec_size,))


Added genre_embedding column. Example embedding shape: (50,)


In [13]:
df_animes.head(10)

Unnamed: 0,uid,title,synopsis,aired,episodes,members,popularity,score,genres_list,genre_action,...,rare_genre_flag,genre_entropy,synopsis_clean,members_log,popularity_log,score_scaled,episodes_scaled,members_log_scaled,popularity_log_scaled,genre_embedding
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"Oct 4, 2015 to Mar 27, 2016",25,489888,141,8.82,"[Comedy, Sports, Drama, School, Shounen]",0,...,0,2.010101,following participation inter high karasuno hi...,13.101934,4.955827,2.552177,0.283506,2.329375,-3.751149,"[-0.054394413, -0.09505967, 0.09854341, -0.000..."
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Oct 10, 2014 to Mar 20, 2015",22,995473,28,8.83,"[Drama, Music, Romance, School, Shounen]",0,...,0,2.304928,music accompanies path human metronome prodigi...,13.810974,3.367296,2.562526,0.221088,2.636419,-5.34428,"[0.21533957, 0.05241852, 0.08044383, -0.023834..."
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Jul 7, 2017 to Sep 29, 2017",13,581663,98,8.83,"[Sci-Fi, Adventure, Mystery, Drama, Fantasy]",0,...,0,2.202561,abyss gaping chasm stretching depths earth fil...,13.273648,4.59512,2.562526,0.033836,2.403734,-4.1129,"[0.22121415, 0.22084251, 0.31991, -0.060335767..."
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","Apr 5, 2009 to Jul 4, 2010",64,1615084,4,9.23,"[Action, Military, Adventure, Comedy, Drama, M...",1,...,0,2.761946,order obtained equal value lost alchemy bound ...,14.294898,1.609438,2.976486,1.094933,2.845979,-7.107228,"[0.117327034, -0.21923709, 0.21861446, 0.05094..."
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"Jan 6, 2017",1,214621,502,8.83,"[Action, Mystery, Supernatural, Vampire]",1,...,1,1.410939,helping revive legendary vampire kiss shot ace...,12.276634,6.22059,2.562526,-0.215834,1.971984,-2.482723,"[0.06913921, 0.51261604, 0.3248057, -0.0787698..."
5,37510,Mob Psycho 100 II,"Shigeo ""Mob"" Kageyama is now maturing and unde...","Jan 7, 2019 to Apr 1, 2019",13,442310,176,8.89,"[Action, Slice of Life, Comedy, Supernatural]",1,...,0,1.78676,shigeo mob kageyama maturing understanding rol...,12.999769,5.17615,2.62462,0.033836,2.285133,-3.530188,"[-0.2661997, 0.011713609, 0.21685001, -0.37757..."
6,199,Sen to Chihiro no Kamikakushi,"Stubborn, spoiled, and naïve, 10-year-old Chih...","Jul 20, 2001",1,913212,40,8.9,"[Adventure, Supernatural, Drama]",0,...,0,1.528657,stubborn spoiled na ve year old chihiro ogino ...,13.724724,3.713572,2.634969,-0.215834,2.599069,-4.997001,"[0.044922095, 0.20092791, 0.54717976, -0.44525..."
7,38000,Kimetsu no Yaiba,"Ever since the death of his father, the burden...","Apr 6, 2019 to Sep 28, 2019",26,575037,106,8.92,"[Action, Demons, Historical, Shounen, Supernat...",1,...,0,2.043683,death father burden supporting family fallen t...,13.262191,4.672829,2.655667,0.304312,2.398773,-4.034966,"[-0.021042729, -0.21700208, -0.062876604, -0.0..."
8,35247,Owarimonogatari 2nd Season,Following an encounter with oddity specialist ...,"Aug 12, 2017 to Aug 13, 2017",7,189944,573,8.93,"[Mystery, Comedy, Supernatural, Vampire]",0,...,1,1.190452,following encounter oddity specialist izuko ga...,12.15449,6.352629,2.666016,-0.090999,1.919091,-2.350302,"[0.010393366, 0.57123375, 0.5379373, -0.119873..."
9,2904,Code Geass: Hangyaku no Lelouch R2,"One year has passed since the Black Rebellion,...","Apr 6, 2008 to Sep 28, 2008",25,992196,27,8.93,"[Action, Military, Sci-Fi, Super Power, Drama,...",1,...,0,2.285079,year passed black rebellion failed uprising ho...,13.807677,3.332205,2.666016,0.283506,2.634991,-5.379473,"[0.20689298, -0.013847676, -0.1215972, 0.13768..."


In [15]:
# Parse 'aired' for multi-episode anime (episodes > 1)
from datetime import datetime

now = pd.Timestamp.now()


# helper to split aired field into (start_str, end_str_or_None)
def _split_aired_str(aired_str):
    if not isinstance(aired_str, str) or not aired_str.strip():
        return (None, None)
    s = aired_str.strip()
    # normalize dashes to ' to '
    s = s.replace('–', ' to ').replace('—', ' to ')
    # split on ' to ' (case-insensitive, allow spaces)
    parts = re.split(r'\s+to\s+', s, flags=re.IGNORECASE)
    parts = [p.strip() for p in parts if p is not None]
    if len(parts) == 1:
        start = parts[0]
        end = None
    else:
        start = parts[0] if parts[0] and parts[0] != '?' else None
        end_raw = parts[1] if len(parts) > 1 else None
        end = None if (not end_raw or '?' in end_raw or end_raw.strip().lower() in ('unknown', 'ongoing')) else end_raw
    return (start, end)


# operate only on anime with episodes > 1
mask_multi = df_animes['episodes'].fillna(0).astype(int) > 1
if mask_multi.any():
    split_series = df_animes.loc[mask_multi, 'aired'].fillna('').apply(_split_aired_str)
    starts = split_series.apply(lambda x: x[0])
    ends = split_series.apply(lambda x: x[1])

    # parse to datetimes (coerce errors)
    starts_dt = pd.to_datetime(starts, errors='coerce')
    ends_dt = pd.to_datetime(ends, errors='coerce')

    # if end is missing or NaT set it equal to start
    ends_dt = ends_dt.where(ends_dt.notna(), starts_dt)

    # write back to dataframe
    df_animes.loc[mask_multi, 'start_date'] = starts_dt.values
    df_animes.loc[mask_multi, 'end_date'] = ends_dt.values

    # ongoing flag for multi-episode: 1 if end_date is in the future
    df_animes.loc[mask_multi, 'ongoing'] = ((df_animes.loc[mask_multi, 'end_date'] > now)).astype(int)
else:
    # ensure columns exist
    df_animes['start_date'] = pd.NaT
    df_animes['end_date'] = pd.NaT
    df_animes['ongoing'] = 0


In [16]:
# Handle films (episodes == 1) - parse start_date only, end_date = start_date, ongoing = 0
mask_films = df_animes['episodes'].fillna(0).astype(int) == 1
if mask_films.any():
    # reuse _split_aired_str; take the start part only
    starts_f = df_animes.loc[mask_films, 'aired'].fillna('').apply(lambda x: _split_aired_str(x)[0])
    starts_dt_f = pd.to_datetime(starts_f, errors='coerce')

    df_animes.loc[mask_films, 'start_date'] = starts_dt_f.values
    df_animes.loc[mask_films, 'end_date'] = starts_dt_f.values
    df_animes.loc[mask_films, 'ongoing'] = 0

In [17]:
# Extract year, month, day from start_date
# Ensure start_date column exists
if 'start_date' not in df_animes.columns:
    df_animes['start_date'] = pd.NaT

df_animes['start_year'] = df_animes['start_date'].dt.year
df_animes['start_month'] = df_animes['start_date'].dt.month
df_animes['start_day'] = df_animes['start_date'].dt.day

In [18]:
# Compute anime_age in years relative to current date
now = pd.Timestamp.now()


# anime_age: number of years since start_date; NaN if start_date missing
def _compute_age(start_ts):
    if pd.isna(start_ts):
        return np.nan
    # use 365.25 days per year to account for leap years
    days = (now - start_ts).days
    return round(days / 365.25, 2)


df_animes['anime_age'] = df_animes['start_date'].apply(_compute_age)

In [19]:
# Final cleanup and graceful handling of invalid/missing dates
# If start_date is NaT, ensure derived columns are NaN and ongoing is 0
missing_start = df_animes['start_date'].isna()
if missing_start.any():
    df_animes.loc[missing_start, ['start_year', 'start_month', 'start_day', 'anime_age']] = np.nan
    df_animes.loc[missing_start, 'ongoing'] = 0

# Ensure types: start_date/end_date as datetime, ongoing as int, anime_age as float
df_animes['start_date'] = pd.to_datetime(df_animes['start_date'], errors='coerce')
df_animes['end_date'] = pd.to_datetime(df_animes['end_date'], errors='coerce')
df_animes['ongoing'] = df_animes['ongoing'].fillna(0).astype(int)
df_animes['anime_age'] = pd.to_numeric(df_animes['anime_age'], errors='coerce')

print('Step 5 complete: cleanup done. Summary:')
print(df_animes[['start_date', 'end_date', 'start_year', 'start_month', 'start_day', 'anime_age', 'ongoing']].head())

Step 5 complete: cleanup done. Summary:
  start_date   end_date  start_year  start_month  start_day  anime_age  \
0 2015-10-04 2016-03-27      2015.0         10.0        4.0       9.97   
1 2014-10-10 2015-03-20      2014.0         10.0       10.0      10.95   
2 2017-07-07 2017-09-29      2017.0          7.0        7.0       8.21   
3 2009-04-05 2010-07-04      2009.0          4.0        5.0      16.47   
4 2017-01-06 2017-01-06      2017.0          1.0        6.0       8.71   

   ongoing  
0        0  
1        0  
2        0  
3        0  
4        0  


In [20]:
df_animes.head(10)

Unnamed: 0,uid,title,synopsis,aired,episodes,members,popularity,score,genres_list,genre_action,...,members_log_scaled,popularity_log_scaled,genre_embedding,start_date,end_date,ongoing,start_year,start_month,start_day,anime_age
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"Oct 4, 2015 to Mar 27, 2016",25,489888,141,8.82,"[Comedy, Sports, Drama, School, Shounen]",0,...,2.329375,-3.751149,"[-0.054394413, -0.09505967, 0.09854341, -0.000...",2015-10-04,2016-03-27,0,2015.0,10.0,4.0,9.97
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Oct 10, 2014 to Mar 20, 2015",22,995473,28,8.83,"[Drama, Music, Romance, School, Shounen]",0,...,2.636419,-5.34428,"[0.21533957, 0.05241852, 0.08044383, -0.023834...",2014-10-10,2015-03-20,0,2014.0,10.0,10.0,10.95
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Jul 7, 2017 to Sep 29, 2017",13,581663,98,8.83,"[Sci-Fi, Adventure, Mystery, Drama, Fantasy]",0,...,2.403734,-4.1129,"[0.22121415, 0.22084251, 0.31991, -0.060335767...",2017-07-07,2017-09-29,0,2017.0,7.0,7.0,8.21
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","Apr 5, 2009 to Jul 4, 2010",64,1615084,4,9.23,"[Action, Military, Adventure, Comedy, Drama, M...",1,...,2.845979,-7.107228,"[0.117327034, -0.21923709, 0.21861446, 0.05094...",2009-04-05,2010-07-04,0,2009.0,4.0,5.0,16.47
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"Jan 6, 2017",1,214621,502,8.83,"[Action, Mystery, Supernatural, Vampire]",1,...,1.971984,-2.482723,"[0.06913921, 0.51261604, 0.3248057, -0.0787698...",2017-01-06,2017-01-06,0,2017.0,1.0,6.0,8.71
5,37510,Mob Psycho 100 II,"Shigeo ""Mob"" Kageyama is now maturing and unde...","Jan 7, 2019 to Apr 1, 2019",13,442310,176,8.89,"[Action, Slice of Life, Comedy, Supernatural]",1,...,2.285133,-3.530188,"[-0.2661997, 0.011713609, 0.21685001, -0.37757...",2019-01-07,2019-04-01,0,2019.0,1.0,7.0,6.71
6,199,Sen to Chihiro no Kamikakushi,"Stubborn, spoiled, and naïve, 10-year-old Chih...","Jul 20, 2001",1,913212,40,8.9,"[Adventure, Supernatural, Drama]",0,...,2.599069,-4.997001,"[0.044922095, 0.20092791, 0.54717976, -0.44525...",2001-07-20,2001-07-20,0,2001.0,7.0,20.0,24.18
7,38000,Kimetsu no Yaiba,"Ever since the death of his father, the burden...","Apr 6, 2019 to Sep 28, 2019",26,575037,106,8.92,"[Action, Demons, Historical, Shounen, Supernat...",1,...,2.398773,-4.034966,"[-0.021042729, -0.21700208, -0.062876604, -0.0...",2019-04-06,2019-09-28,0,2019.0,4.0,6.0,6.47
8,35247,Owarimonogatari 2nd Season,Following an encounter with oddity specialist ...,"Aug 12, 2017 to Aug 13, 2017",7,189944,573,8.93,"[Mystery, Comedy, Supernatural, Vampire]",0,...,1.919091,-2.350302,"[0.010393366, 0.57123375, 0.5379373, -0.119873...",2017-08-12,2017-08-13,0,2017.0,8.0,12.0,8.11
9,2904,Code Geass: Hangyaku no Lelouch R2,"One year has passed since the Black Rebellion,...","Apr 6, 2008 to Sep 28, 2008",25,992196,27,8.93,"[Action, Military, Sci-Fi, Super Power, Drama,...",1,...,2.634991,-5.379473,"[0.20689298, -0.013847676, -0.1215972, 0.13768...",2008-04-06,2008-09-28,0,2008.0,4.0,6.0,17.46


In [23]:
df_animes['genre_embedding'] = df_animes['genre_embedding'].apply(lambda x: np.array(x, dtype=np.float32))

In [28]:
type(df_animes.iloc[0, 63])

numpy.ndarray

In [31]:
if 'genre_embedding' not in df_animes.columns:
    raise KeyError('genre_embedding column not found in df_animes')

# find first non-null embedding to detect dimension
first_emb = None
for v in df_animes['genre_embedding'].values:
    if v is None:
        continue
    # handle lists/tuples as well as numpy arrays
    try:
        arr = np.asarray(v)
    except Exception:
        continue
    if arr.size > 0:
        first_emb = arr
        break
if first_emb is None:
    raise ValueError('No valid embeddings found in genre_embedding column')

emb_dim = int(first_emb.shape[0])
print(f'Detected embedding dimension: {emb_dim}')


# Build embedding ndarray for all rows, filling missing with zeros
def _to_emb_array(v):
    try:
        a = np.asarray(v, dtype=np.float32)
        if a.size == 0:
            return np.zeros(emb_dim, dtype=np.float32)
        if a.shape[0] != emb_dim:
            # if shape mismatch, pad or truncate
            out = np.zeros(emb_dim, dtype=np.float32)
            out[:min(a.shape[0], emb_dim)] = a.ravel()[:emb_dim]
            return out
        return a.astype(np.float32)
    except Exception:
        return np.zeros(emb_dim, dtype=np.float32)


emb_array = np.stack([_to_emb_array(v) for v in df_animes['genre_embedding'].values])
emb_cols = [f'genre_emb_{i}' for i in range(emb_dim)]
emb_df = pd.DataFrame(emb_array, columns=emb_cols, index=df_animes.index)

# Concatenate and drop original column
df_animes_flat = pd.concat([df_animes.drop(columns=['genre_embedding']), emb_df], axis=1)

df_animes_flat.to_parquet('../../data/processed/animes_processed.parquet', index=False)

print('Final dataframe shape:', df_animes_flat.shape)

# assign back to df_animes variable for downstream cells
df_animes = df_animes_flat


Detected embedding dimension: 50
Final dataframe shape: (16368, 120)


In [32]:
df_animes_flat.head(10)

Unnamed: 0,uid,title,synopsis,aired,episodes,members,popularity,score,genres_list,genre_action,...,genre_emb_40,genre_emb_41,genre_emb_42,genre_emb_43,genre_emb_44,genre_emb_45,genre_emb_46,genre_emb_47,genre_emb_48,genre_emb_49
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"Oct 4, 2015 to Mar 27, 2016",25,489888,141,8.82,"[Comedy, Sports, Drama, School, Shounen]",0,...,-0.325108,0.372583,-0.150133,0.063334,-0.037451,-0.023057,-0.015965,-0.079849,0.069605,0.041172
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"Oct 10, 2014 to Mar 20, 2015",22,995473,28,8.83,"[Drama, Music, Romance, School, Shounen]",0,...,0.039412,0.307161,0.010229,0.095036,0.020471,-0.112495,-0.233364,-0.22307,-0.094668,0.010424
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"Jul 7, 2017 to Sep 29, 2017",13,581663,98,8.83,"[Sci-Fi, Adventure, Mystery, Drama, Fantasy]",0,...,0.241236,-0.047506,0.017787,0.183756,0.471296,0.002649,0.130329,-0.239705,-0.322175,-0.608473
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","Apr 5, 2009 to Jul 4, 2010",64,1615084,4,9.23,"[Action, Military, Adventure, Comedy, Drama, M...",1,...,-0.031031,0.094117,0.08401,-0.000744,0.322104,-0.150647,-0.06114,-0.264919,-0.225668,-0.001621
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"Jan 6, 2017",1,214621,502,8.83,"[Action, Mystery, Supernatural, Vampire]",1,...,0.085453,-0.016847,-0.015368,0.250852,0.117892,-0.276332,0.251657,-0.217527,-0.120232,-0.557234
5,37510,Mob Psycho 100 II,"Shigeo ""Mob"" Kageyama is now maturing and unde...","Jan 7, 2019 to Apr 1, 2019",13,442310,176,8.89,"[Action, Slice of Life, Comedy, Supernatural]",1,...,-0.037127,0.159679,0.233275,-0.244771,0.012024,-0.248264,-0.020235,-0.000815,0.01038,0.002741
6,199,Sen to Chihiro no Kamikakushi,"Stubborn, spoiled, and naïve, 10-year-old Chih...","Jul 20, 2001",1,913212,40,8.9,"[Adventure, Supernatural, Drama]",0,...,0.10731,0.132151,0.278707,0.159719,0.402134,-0.485785,0.263917,-0.151865,-0.420217,0.223778
7,38000,Kimetsu no Yaiba,"Ever since the death of his father, the burden...","Apr 6, 2019 to Sep 28, 2019",26,575037,106,8.92,"[Action, Demons, Historical, Shounen, Supernat...",1,...,0.113221,-0.099797,0.01898,0.003247,0.450094,-0.02724,-0.082962,-0.213881,-0.170457,-0.145498
8,35247,Owarimonogatari 2nd Season,Following an encounter with oddity specialist ...,"Aug 12, 2017 to Aug 13, 2017",7,189944,573,8.93,"[Mystery, Comedy, Supernatural, Vampire]",0,...,-0.078951,0.052401,0.011987,0.156864,-0.289549,-0.147272,0.334582,-0.114548,-0.035374,-0.252579
9,2904,Code Geass: Hangyaku no Lelouch R2,"One year has passed since the Black Rebellion,...","Apr 6, 2008 to Sep 28, 2008",25,992196,27,8.93,"[Action, Military, Sci-Fi, Super Power, Drama,...",1,...,-0.105253,0.141387,0.301835,0.080078,0.368225,-0.211279,0.091863,-0.211304,-0.288166,-0.18847


# Notebook summary

This notebook performs feature engineering and preprocessing for the anime dataset. Main steps completed:
___
- Genre features
  - Added n_genres, rare_genre_flag (genres with <1% frequency), and genre_entropy (Shannon entropy)
  - Trained and saved a Word2Vec model on genres and added per-anime genre_embedding (average of genre vectors)
___
- Text features
  - Cleaned `synopsis` into `synopsis_clean` (lowercased, HTML removed, non-alphabetic removed, stopwords removed)
  - Fitted TF-IDF vectorizer (max_features=5000, ngram_range=(1,2)) and saved it
  - Produced sparse TF-IDF matrix X_tfidf for downstream models
___
- Numeric features
  - Created log-transformed features: members_log, popularity_log
  - Standardized numeric features (score, episodes, members_log, popularity_log) and saved StandardScaler
___
- Dates and temporal features
  - Parsed `aired` into start_date and end_date (handled open-ended ranges and films)
  - Extracted start_year, start_month, start_day
  - Computed anime_age (years) and ongoing flag
___