In [None]:
import pandas as pd
import requests
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, KFold, RepeatedKFold, StratifiedKFold
import time 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import classification_report, confusion_matrix
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
import seaborn as sb

In [None]:
def get_user_animelist(username: str, client_id: str, timeout: float = 30.0):
    """Download every entry of a user's anime list from the MyAnimeList API.

    Pages are requested 500 entries at a time and followed through the
    ``paging.next`` link of each response until no further link is returned.

    Args:
        username: MAL user whose list is fetched.
        client_id: value sent in the ``X-MAL-CLIENT-ID`` header.
        timeout: seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list with the raw ``data`` items of all pages, in the order received.

    Raises:
        ValueError: if any response has a status code other than 200.
        requests.exceptions.Timeout: if a request exceeds ``timeout``.
    """
    url = f'https://api.myanimelist.net/v2/users/{username}/animelist?limit=500'
    headers = {
        'X-MAL-CLIENT-ID': client_id
    }
    params = {
        'fields': 'id, title, list_status{score,status}, start_season{year}, mean, genres, popularity, media_type, rating, num_episodes, studios, num_list_users,favorites'
    }

    all_data = []
    next_page = url

    while next_page:
        # `params` is only attached to the first request; follow-up pages use
        # the "next" URL as returned (presumably it already carries the query).
        response = requests.get(
            next_page,
            headers=headers,
            params=params if next_page == url else None,
            timeout=timeout,
        )
        if response.status_code != 200:
            raise ValueError(f"Error with API request: {response.status_code} - {response.text}")

        data = response.json()
        all_data.extend(data['data'])
        next_page = data.get("paging", {}).get("next")

        print(f"Loaded {len(all_data)} anime...")

    return all_data

In [None]:
def to_dataframe(all_data):
    """Flatten raw anime-list items into a DataFrame with one row per anime.

    Args:
        all_data: iterable of dicts, each with a ``node`` dict (anime metadata)
            and optionally a ``list_status`` dict (the user's score/status).

    Returns:
        DataFrame with columns id, title, mean, genres, studios, rating, year,
        type, popularity, score, status, members, num_episodes. ``genres`` and
        ``studios`` are ", "-joined name strings ("" when none are listed).
        An empty input yields an empty frame that still has all columns.

    Raises:
        KeyError: if an item lacks ``node`` or the node lacks ``id``/``title``.
    """
    columns = [
        "id", "title", "mean", "genres", "studios", "rating", "year",
        "type", "popularity", "score", "status", "members", "num_episodes",
    ]

    rows = []
    for item in all_data:
        anime = item['node']
        # `or {}` also covers an explicit null instead of a missing key.
        list_status = item.get('list_status') or {}
        start_season = anime.get('start_season') or {}
        rows.append({
            "id": anime['id'],
            "title": anime['title'],
            "mean": anime.get('mean'),
            "genres": ", ".join(g['name'] for g in anime.get('genres') or []),
            "studios": ", ".join(s['name'] for s in anime.get('studios') or []),
            "rating": anime.get('rating'),
            "year": start_season.get('year'),
            "type": anime.get('media_type'),
            "popularity": anime.get('popularity'),
            "score": list_status.get('score'),
            "status": list_status.get('status'),
            # Optional like every other metadata field: missing -> None/NaN.
            "members": anime.get('num_list_users'),
            "num_episodes": anime.get('num_episodes'),
        })

    # Passing `columns` keeps the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:
def calculate_genre_affinity_simple(df_train, min_count: int = 5):
    """Compute the mean score per genre from a training frame.

    Args:
        df_train: DataFrame with a ``score`` column and 0/1 indicator columns
            named ``Genre_<name>``.
        min_count: minimum number of rows tagged with a genre for its own mean
            to be used; genres with fewer rows fall back to the overall mean.

    Returns:
        Dict mapping each genre name (column name without the leading
        ``Genre_``) to its mean score, or to the overall mean score when the
        genre has fewer than ``min_count`` rows.
    """
    prefix = 'Genre_'
    overall_mean = df_train['score'].mean()

    genre_affinity = {}
    for genre_col in df_train.columns:
        if not genre_col.startswith(prefix):
            continue
        # Strip only the leading prefix; str.replace would also remove any
        # later "Genre_" occurrence inside the name.
        genre_name = genre_col[len(prefix):]
        genre_scores = df_train.loc[df_train[genre_col] == 1, 'score']

        if len(genre_scores) >= min_count:
            genre_affinity[genre_name] = genre_scores.mean()
        else:
            genre_affinity[genre_name] = overall_mean

    return genre_affinity

In [None]:
def calculate_studio_mean(df_train):
    """Return the mean score per studio for a training frame.

    Every column whose name starts with ``Studios_`` is treated as a 0/1
    indicator. A studio with at least 3 flagged rows gets the mean ``score``
    of those rows; any other studio gets the overall mean ``score``.
    Keys are the column names with ``Studios_`` removed.
    """
    fallback_mean = df_train['score'].mean()
    means_by_studio = {}

    for column in df_train.columns:
        if not column.startswith('Studios_'):
            continue

        has_studio = df_train[column] == 1
        enough_rows = has_studio.sum() >= 3
        means_by_studio[column.replace('Studios_', '')] = (
            df_train[has_studio]['score'].mean() if enough_rows else fallback_mean
        )

    return means_by_studio

In [None]:
def classify_3_classes(score):
    """Bucket a score into 0 (score <= 5), 1 (score <= 7) or 2 (anything else).

    The comparisons are evaluated in this order, so a value for which both
    comparisons are false (including NaN) ends up in class 2.
    """
    return 0 if score <= 5 else (1 if score <= 7 else 2)

In [None]:
import configparser

CONFIG_PATH = '../config.ini'

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the config is missing.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f"Could not read configuration file: {CONFIG_PATH}")

# Credentials live in the [USER] section of the ini file, not in the notebook.
USER_NAME = config.get('USER', 'USER_NAME').strip()
CLIENT_ID = config.get('USER', 'CLIENT_ID').strip()

In [None]:
# Fetch the configured user's complete anime list (network call, prints progress per page).
data = get_user_animelist(USER_NAME, CLIENT_ID)

In [None]:
# Flatten the raw API items into one row per anime.
df = to_dataframe(data)

In [None]:
# Target label: 0 for score <= 5, 1 for score <= 7, 2 otherwise (see classify_3_classes).
df['score_class'] = df['score'].map(classify_3_classes)

In [None]:
# Entries without any studio/genre carry an empty string; label them explicitly.
for text_column in ('studios', 'genres'):
    df[text_column] = df[text_column].replace('', 'Unknown')

# Rows without a score cannot be used as training targets.
df = df.dropna(subset=['score'])

In [None]:
# Show how often each score occurs (the 11 most frequent values).
display(df["score"].value_counts().head(11))

In [None]:
# Identifier columns are not used as model features.
df = df.drop(columns=['title', 'id'])

In [None]:
# Keep only rows with a positive score (presumably 0 marks an unscored entry - verify).
df = df.loc[df['score'] > 0].copy()

# Missing community mean -> median community mean of the same media type.
median_mean_by_type = df.groupby('type')['mean'].transform('median')
df['mean'] = df['mean'].fillna(median_mean_by_type)

# Missing categorical values become their own 'Unknown' category.
categorical_cols = ['type', 'rating', 'status']
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

In [None]:
# Missing year / episode count -> most frequent value of the column.
most_common_year = df['year'].mode().iloc[0]
most_common_episodes = df['num_episodes'].mode().iloc[0]
df['year'] = df['year'].fillna(most_common_year)
df['num_episodes'] = df['num_episodes'].fillna(most_common_episodes)

In [None]:
# Missing popularity -> column median.
popularity_median = df['popularity'].median()
df['popularity'] = df['popularity'].fillna(popularity_median)

In [None]:
# Year the ages are measured against; fixed so re-runs produce the same feature values.
REFERENCE_YEAR = 2025

# Replace the start year by the anime's age in whole years.
df['anime_age'] = (REFERENCE_YEAR - df['year']).astype(int)
df = df.drop(columns=['year'])

In [None]:
# Swap the raw member count for log(1 + members); pop() removes the original column.
df['log_members'] = np.log1p(df.pop('members'))

In [None]:
# Get dummy variables for each unique genre
# One 0/1 indicator column per genre. `genres` holds ", "-joined names, so
# str.get_dummies can split and encode in a single step; unlike summing
# per-token dummies, the result can never exceed 1 for a repeated name.
genre_dummies = df['genres'].str.get_dummies(sep=', ').add_prefix('Genre_')

In [None]:
# Append the genre indicator columns to the feature frame (aligned on the row index).
df = pd.concat([df, genre_dummies], axis=1)

In [None]:
# Number of anime per genre, most frequent first.
genre_columns = list(filter(lambda name: name.startswith('Genre_'), df.columns))

genre_counts = df[genre_columns].sum().sort_values(ascending=False)
print(genre_counts)

In [None]:
genre_columns = [col for col in df.columns if col.startswith('Genre_')]

genre_counts = df[genre_columns].sum().sort_values(ascending=False)
total_anime = len(df)

# A genre keeps its own column only if it tags at least 1% of the list,
# and never fewer than 5 titles.
min_count = max(5, int(total_anime * 0.01))
frequent_genres = genre_counts[genre_counts >= min_count].index.tolist()

print(f"genre_counts: {len(genre_counts)}")
print(f"Genres with >= {min_count} anime: {len(frequent_genres)}")

frequent_genre_set = set(frequent_genres)
rare_genre_columns = [col for col in genre_columns if col not in frequent_genre_set]

# Collapse all rare genres into one indicator. With no rare columns,
# max(axis=1) over an empty selection would give NaN, so write 0 instead.
if rare_genre_columns:
    df['Genre_Other'] = df[rare_genre_columns].max(axis=1)
else:
    df['Genre_Other'] = 0

df.drop(columns=rare_genre_columns, inplace=True)

In [None]:
# The raw genre string is now fully represented by the Genre_* indicators.
df = df.drop(columns=['genres'])

In [None]:
# One-hot encode the media type and swap it in for the original column.
type_dummies = pd.get_dummies(df['type'], prefix='Type')
df = pd.concat([df.drop(columns=['type']), type_dummies], axis=1)

In [None]:
# One 0/1 indicator column per studio. `studios` holds ", "-joined names, so
# str.get_dummies can split and encode in a single step; unlike summing
# per-token dummies, the result can never exceed 1 for a repeated name.
studio_dummies = df['studios'].str.get_dummies(sep=', ').add_prefix('Studios_')
df = pd.concat([df, studio_dummies], axis=1)

In [None]:
# Number of anime per studio, most frequent first.
studio_columns = list(filter(lambda name: name.startswith('Studios_'), df.columns))

studio_counts = df[studio_columns].sum().sort_values(ascending=False)
print(studio_counts)

In [None]:
studio_columns = [col for col in df.columns if col.startswith('Studios_')]

studio_counts = df[studio_columns].sum().sort_values(ascending=False)
total_anime = len(df)

# A studio keeps its own column only if it produced at least 1% of the list,
# and never fewer than 10 titles.
min_count = max(10, int(total_anime * 0.01))
frequent_studios = studio_counts[studio_counts >= min_count].index.tolist()

print(f"studio_counts: {len(studio_counts)}")
print(f"Studios with >= {min_count} anime: {len(frequent_studios)}")

frequent_studio_set = set(frequent_studios)
rare_studio_columns = [col for col in studio_columns if col not in frequent_studio_set]

# NOTE(review): the bucket is called 'Studio_Other' (no "s"), so it does not
# match the 'Studios_' prefix that calculate_studio_mean looks for. The name
# is kept because it is a model feature name; confirm before renaming.
# With no rare columns, max(axis=1) over an empty selection would give NaN,
# so write 0 instead.
if rare_studio_columns:
    df['Studio_Other'] = df[rare_studio_columns].max(axis=1)
else:
    df['Studio_Other'] = 0

df.drop(columns=rare_studio_columns, inplace=True)

In [None]:
# The raw studio string is now represented by the studio indicator columns.
df = df.drop(columns=['studios'])

In [None]:
# One-hot encode the age rating and swap it in for the original column.
rating_dummies = pd.get_dummies(df['rating'], prefix='Rating')
df = pd.concat([df.drop(columns=['rating']), rating_dummies], axis=1)

In [None]:
# One-hot encode the user's watch status and swap it in for the original column.
status_dummies = pd.get_dummies(df['status'], prefix='Status')
df = pd.concat([df.drop(columns=['status']), status_dummies], axis=1)

In [None]:
# Drop one level per one-hot group so it acts as the reference category.
baseline_dummies = ['Status_completed', 'Type_tv', 'Rating_pg_13']
df = df.drop(columns=baseline_dummies)

In [None]:
# Convert boolean columns to integer 0/1.
bool_cols = df.select_dtypes('bool').columns
df = df.astype({name: int for name in bool_cols})

In [None]:
# df.drop(columns=['Genre_Unknown'], inplace=True)

In [None]:
# Features whose absolute correlation with the raw score exceeds 0.65 are
# collected here for removal.
corr_with_score = df.corr()['score']
high_corr_cols = corr_with_score[corr_with_score.abs() > 0.65].index
# The targets themselves must stay. errors='ignore' because 'score_class' is
# only present in this index when its own correlation passes the threshold.
high_corr_cols = high_corr_cols.drop(['score', 'score_class'], errors='ignore')
print(high_corr_cols)

In [None]:
# Remove the highly correlated features identified above.
df = df.drop(columns=high_corr_cols)

In [None]:
# Replace the episode count by log(1 + n). The column is overwritten in place,
# so running this cell twice applies the transform twice.
df['num_episodes'] = np.log1p(df['num_episodes'])

In [None]:
# Popularity is excluded from the feature set.
df = df.drop(columns=['popularity'])
# df.drop(columns=['log_members'], inplace=True)

In [None]:
# Separate the 3-class target from the features; the raw score is not a feature.
target_columns = ['score', 'score_class']
y = df['score_class']
X = df.drop(columns=target_columns)

In [None]:
# 80/20 hold-out split, stratified on the class label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Per-genre mean scores are computed from the training rows only.
genre_affinity = calculate_genre_affinity_simple(X_train.join(df['score']))


def _build_affinity_features(features):
    """Scale each Genre_<name> indicator column by that genre's affinity."""
    scaled = {}
    for genre, affinity in genre_affinity.items():
        scaled[f"affinity_{genre}"] = features[f"Genre_{genre}"] * affinity
    return pd.DataFrame(scaled, index=features.index)


affinity_features_train = _build_affinity_features(X_train)
affinity_features_test = _build_affinity_features(X_test)

X_train = pd.concat([X_train, affinity_features_train], axis=1)
X_test = pd.concat([X_test, affinity_features_test], axis=1)

In [None]:
# Remove every column whose name starts with "Genre_" from both splits.
X_train = X_train.drop(columns=list(filter(lambda name: name.startswith("Genre_"), X_train.columns)))
X_test = X_test.drop(columns=list(filter(lambda name: name.startswith("Genre_"), X_test.columns)))

In [None]:
# studios_affinity = calculate_studio_mean(X_train.join(df['score']))
# 
# affinity_features_train = pd.DataFrame({
#     f"affinity_{studio}": X_train[f"Studios_{studio}"] * affinity
#     for studio, affinity in studios_affinity.items()
# }, index=X_train.index)
# 
# affinity_features_test = pd.DataFrame({
#     f"affinity_{studio}": X_test[f"Studios_{studio}"] * affinity
#     for studio, affinity in studios_affinity.items()
# }, index=X_test.index)
# 
# X_train = pd.concat([X_train, affinity_features_train], axis=1)
# X_test = pd.concat([X_test, affinity_features_test], axis=1)

In [None]:
# X_train = X_train.drop(columns=[col for col in X_train.columns if col.startswith("Studios_")])
# X_test = X_test.drop(columns=[col for col in X_test.columns if col.startswith("Studios_")])

In [None]:
# Show 10 random training rows (no seed is passed, so the selection differs per run).
X_train.sample(10)

In [None]:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights for the classes present in the training labels.
classes = np.unique(y_train)
class_weights = compute_class_weight(
    'balanced',
    classes=classes,
    y=y_train
)
# Key each weight by its actual class label. enumerate() would number the
# weights 0..k-1 and mis-assign them whenever a class is absent from y_train.
class_weight_dict = {
    int(label): float(weight) for label, weight in zip(classes, class_weights)
}

In [None]:
# Show the weight assigned to each class.
print(class_weight_dict)

In [None]:
# Base estimator for the grid search. It uses the same class weights as the
# final model so the cross-validated macro-F1 reflects the weighted setup.
# NOTE(review): early_stopping_rounds presumably has no effect during the grid
# search because grid.fit() passes no eval_set - confirm.
model = CatBoostClassifier(
    class_weights=class_weight_dict,
    early_stopping_rounds=50,
    random_state=42,
    verbose=0,
)

In [None]:
# 5-fold stratified CV with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# 3 * 3 * 3 * 3 * 2 = 162 parameter combinations, i.e. 810 fits over 5 folds.
params = dict(
    iterations=[200, 300, 400],
    depth=[3, 4, 6],
    learning_rate=[0.01, 0.03, 0.05],
    l2_leaf_reg=[3, 5, 7],
    random_strength=[1.0, 1.5],
)

# Exhaustive search scored by macro-averaged F1; the best model is refit at the end.
grid = GridSearchCV(
    model,
    params,
    scoring='f1_macro',
    cv=cv,
    refit=True,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)

In [None]:
# Run the full grid search on the training split.
grid.fit(X_train, y_train)

In [None]:
# Ten best parameter combinations by mean cross-validated score.
summary_columns = [
    "param_depth",
    "param_iterations",
    "param_learning_rate",
    "param_l2_leaf_reg",
    "param_random_strength",
    "mean_test_score",
    "std_test_score",
]
results = pd.DataFrame(grid.cv_results_).sort_values(by="mean_test_score", ascending=False)
print(results[summary_columns].head(10))

In [None]:
# Final classifier.
# NOTE(review): depth and l2_leaf_reg are hard-coded here rather than taken
# from grid.best_params_ - confirm this is intentional.
final_model_params = dict(
    depth=4,
    l2_leaf_reg=3,
    eval_metric='MultiClass',
    class_weights=class_weight_dict,
    early_stopping_rounds=50,
    random_state=42,
    verbose=0,
)
model = CatBoostClassifier(**final_model_params)

In [None]:
# Hold out 20% of the training split as the validation set for early stopping.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

validation_set = (X_val, y_val)
model.fit(X_tr, y_tr, eval_set=validation_set, use_best_model=True)

In [None]:
# Flatten the predictions to a 1-D label array: CatBoost presumably returns an
# (n, 1) column for multiclass models (confirm), and ravel is a no-op on 1-D.
y_pred = np.ravel(model.predict(X_test))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Pass the label set explicitly so the matrix is always 3x3 and the names line
# up with the classes even when a class is missing from y_test and y_pred.
class_labels = [0, 1, 2]
class_names = ['Dont Watch', 'Okay', 'Good']

# Confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

print(classification_report(y_test, y_pred,
                            labels=class_labels,
                            target_names=class_names))

In [None]:
from pathlib import Path

# Create the target directory first so the save cannot fail on a missing
# folder after the long training run.
MODEL_PATH = Path("../models/catboost_model.cbm")
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
model.save_model(str(MODEL_PATH))