In [1]:
import pandas as pd
import requests
import configparser
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

## LOADING DATA FROM API


In [2]:
def get_user_animelist(username: str, client_id: str, timeout: float = 30.0):
    """Download a user's anime list from the MyAnimeList v2 API, following pagination.

    Args:
        username: MAL user name whose list is requested.
        client_id: API client id, sent in the ``X-MAL-CLIENT-ID`` header.
        timeout: Seconds to wait for each HTTP request. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        list: The concatenated ``data`` entries of every page that was fetched.

    Raises:
        ValueError: If a response comes back with a status code other than 200.
        requests.exceptions.RequestException: On network failures, including timeouts.
    """
    url = f'https://api.myanimelist.net/v2/users/{username}/animelist?limit=500'
    headers = {
        'X-MAL-CLIENT-ID': client_id
    }
    params = {
        'fields': 'id, title, list_status{score,status}, start_season{year}, mean, genres, popularity, media_type, rating, num_episodes, studios, num_list_users,favorites'
    }

    all_data = []
    next_page = url

    while next_page:
        # `params` is only sent with the first request; later URLs are taken verbatim
        # from the API's "paging.next" link (presumably already carrying the query).
        response = requests.get(
            next_page,
            headers=headers,
            params=params if next_page == url else None,
            timeout=timeout,
        )
        if response.status_code != 200:
            raise ValueError(f"Error with API request: {response.status_code} - {response.text}")

        data = response.json()
        all_data.extend(data['data'])
        # A missing "paging"/"next" entry ends the loop.
        next_page = data.get("paging", {}).get("next")

        print(f"Loaded {len(all_data)} anime...")

    return all_data

In [3]:
def to_dataframe(all_data):
    """Flatten raw anime-list entries into a DataFrame with one row per anime.

    Args:
        all_data: Iterable of dicts, each with a ``node`` dict (anime fields) and
            optionally a ``list_status`` dict (the user's score/status).

    Returns:
        pandas.DataFrame with columns id, title, mean, genres, studios, rating,
        year, type, popularity, score, status, members, num_episodes.
        ``genres`` and ``studios`` are comma-separated strings. Optional fields
        that are absent become missing values; an empty input yields an empty
        frame that still has all columns.

    Raises:
        KeyError: If an entry has no ``node``, or the node lacks ``id``/``title``.
    """
    columns = [
        "id", "title", "mean", "genres", "studios", "rating", "year",
        "type", "popularity", "score", "status", "members", "num_episodes",
    ]

    rows = []
    for item in all_data:
        anime = item['node']
        # `or {}` / `or []` also covers fields that are present but null.
        list_status = item.get('list_status') or {}
        start_season = anime.get('start_season') or {}
        # num_episodes_watched = list_status.get('num_episodes_watched')
        rows.append({
            "id": anime['id'],
            "title": anime['title'],
            "mean": anime.get('mean'),
            "genres": [g['name'] for g in anime.get('genres') or []],
            "studios": [s['name'] for s in anime.get('studios') or []],
            "rating": anime.get('rating'),
            "year": start_season.get('year'),
            "type": anime.get('media_type'),
            "popularity": anime.get('popularity'),
            "score": list_status.get('score'),
            "status": list_status.get('status'),
            # Read with .get like every other optional field so one incomplete
            # entry does not abort the whole conversion with a KeyError.
            "members": anime.get('num_list_users'),
            "num_episodes": anime.get('num_episodes'),
        })

    if not rows:
        # Nothing to join; still hand back the full schema for downstream code.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows, columns=columns)
    df['studios'] = df['studios'].str.join(", ")
    df['genres'] = df['genres'].str.join(", ")

    return df

In [4]:
# Credentials are kept outside the notebook, in the [USER] section of ../config.ini.
CONFIG_PATH = '../config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it parsed, so an empty result means the config was not found. Fail here
# with a clear message instead of a later NoSectionError.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f"Config file not found or unreadable: {CONFIG_PATH}")
USER_NAME = config.get('USER','USER_NAME').strip()
CLIENT_ID = config.get('USER','CLIENT_ID').strip()

In [5]:
# Fetch the configured user's anime list from the API (network call; prints progress per page).
data = get_user_animelist(USER_NAME, CLIENT_ID)

## WORKING WITH DATAFRAME

In [6]:
# Flatten the raw API entries into a DataFrame with one row per anime.
df = to_dataframe(data)

#### FUNCTIONS FOR DATA PREPROCESSING

In [7]:
def preprocess(df):
    """Keep scored rows and fill missing values; the input frame is not modified.

    Args:
        df: DataFrame with a ``score`` column plus the categorical columns
            studios, genres, rating, status, type and the numerical columns
            year, mean, popularity, members, num_episodes.

    Returns:
        A new DataFrame holding only rows with ``score > 0``, where missing or
        empty categorical values are replaced by ``'Unknown'`` and missing
        numerical values by the column median of the remaining rows.
    """
    # Filter first and copy afterwards: assigning columns on a boolean-filtered
    # slice is what triggers pandas' SettingWithCopyWarning.
    df = df[df['score'] > 0].copy()

    categorical_cols = ['studios', 'genres', 'rating', 'status', 'type']
    for col in categorical_cols:
        df[col] = df[col].fillna('Unknown').replace('', 'Unknown')

    numerical_cols = ['year', 'mean', 'popularity', 'members', 'num_episodes']
    for col in numerical_cols:
        # Coerce so a column that arrived as object dtype still has a usable median.
        values = pd.to_numeric(df[col], errors='coerce')
        df[col] = values.fillna(values.median())
    return df

In [8]:
def one_hot_encoding(df):
    """One-hot encode the categorical columns and return a new DataFrame.

    ``status``, ``type`` and ``rating`` are single-valued and become
    ``Status_*``, ``Type_*`` and ``Rating_*`` columns. ``genres`` and
    ``studios`` hold comma-separated lists and become one 0/1 indicator column
    per distinct entry (``Genre_*``, ``Studios_*``).

    The source columns are removed, together with the reference categories
    ``Type_tv``, ``Rating_pg_13`` and ``Status_completed`` when they exist. A
    frame in which one of those categories never occurs (for example a small
    test split) is encoded without raising a KeyError.

    Args:
        df: DataFrame with string columns status, type, rating, genres, studios.

    Returns:
        A new DataFrame: the remaining original columns followed by the
        Status, Type, Rating, Genre and Studios indicator columns, in that order.
    """
    def _multi_label_dummies(values, prefix):
        # Series.str.get_dummies splits each string on the separator and returns
        # one indicator column per distinct token.
        return values.str.get_dummies(sep=', ').add_prefix(prefix)

    encoded = pd.concat(
        [
            df,
            pd.get_dummies(df['status'], prefix='Status'),
            pd.get_dummies(df['type'], prefix='Type'),
            pd.get_dummies(df['rating'], prefix='Rating'),
            _multi_label_dummies(df['genres'], 'Genre_'),
            _multi_label_dummies(df['studios'], 'Studios_'),
        ],
        axis=1,
    )

    to_drop = [
        'genres', 'studios',
        'type', 'Type_tv',
        'rating', 'Rating_pg_13',
        'status', 'Status_completed',
    ]
    # errors='ignore': a reference category may be absent from this particular frame.
    return encoded.drop(columns=to_drop, errors='ignore')

In [9]:
def prepare_for_rules(df):
    """
    Build the feature frame used by the rule-based system.

    Applies `preprocess` (row filter and null filling) followed by
    `one_hot_encoding`; no further columns are removed or transformed here.
    """
    cleaned = preprocess(df)
    return one_hot_encoding(cleaned)

In [10]:
def classify_3_classes(score, q33,q66):
    """Bucket a score into class 0, 1 or 2 using two thresholds.

    Returns 0 when ``score <= q33``, 1 when ``score <= q66``, otherwise 2.
    """
    return 0 if score <= q33 else (1 if score <= q66 else 2)

In [11]:
def prepare_for_ml(df, q33=None, q66=None, drop_mean=True, reference_year=2025):
    """Build the feature frame for the ML classifier.

    Runs `preprocess` and `one_hot_encoding`, adds the 3-class target
    ``score_class`` and a ``num_genres`` count, log-transforms ``popularity``,
    drops ``members`` and replaces ``year`` / ``num_episodes`` with one-hot
    encoded age and episode-count buckets.

    Args:
        df: Raw frame as produced by `to_dataframe`.
        q33: Upper score bound of class 0. Computed from this frame's scores
            (0.33 quantile) when omitted.
        q66: Upper score bound of class 1. Computed from this frame's scores
            (0.66 quantile) when omitted.
        drop_mean: If True, drop the ``mean`` column when present.
        reference_year: Year that anime age is measured from.

    Returns:
        A new DataFrame with the engineered features; ``score`` and
        ``score_class`` are kept so the caller can choose the target.
    """
    df = preprocess(df)
    df = one_hot_encoding(df)

    # classify_3_classes needs both thresholds, so derive any that were not given.
    if q33 is None:
        q33 = df['score'].quantile(0.33)
    if q66 is None:
        q66 = df['score'].quantile(0.66)
    df['score_class'] = df['score'].apply(lambda s: classify_3_classes(s, q33, q66))

    genre_cols = [col for col in df.columns if col.startswith('Genre_')]
    df['num_genres'] = df[genre_cols].sum(axis=1)

    if drop_mean and 'mean' in df.columns:
        df = df.drop(columns=['mean'])

    # Log transformation
    if 'popularity' in df.columns:
        df['popularity'] = np.log1p(df['popularity'])
    if 'members' in df.columns:
        df = df.drop(columns=['members'])

    # Create anime_age if year exists
    if 'year' in df.columns:
        df['anime_age'] = reference_year - df['year']
        df = df.drop(columns=['year'])

    if 'anime_age' in df.columns:
        df['age_category'] = pd.cut(
            df['anime_age'],
            bins=[-1, 2, 5, 10, 20, np.inf],
            labels=['new', 'recent', 'modern', 'old', 'classic']
        )
        df = pd.get_dummies(df, columns=['age_category'], drop_first=True)
        df = df.drop(columns=['anime_age'])

    if 'num_episodes' in df.columns:
        # The bin edges are raw episode counts (1, 2-10, 11-18, 19-26, 27-57, 58+),
        # so bin the untransformed column. Binning log1p(num_episodes) against
        # these edges would put almost every title into 'short'.
        # pd.cut intervals are right-closed, so a count of 0 falls outside every
        # bin and ends up with all episode_cat_* indicators equal to 0.
        df['episode_cat'] = pd.cut(
            df['num_episodes'],
            bins=[0,1,10,18,26,57,np.inf],
            labels=['single', 'short', 'one_season','two_season', 'long', 'very_long']
        )
        df = pd.get_dummies(df, columns=['episode_cat'], drop_first=True)
        df = df.drop(columns=['num_episodes'])

    return df

In [12]:
def calculate_genre_affinity(df_train):
    """Return the mean ``score`` per genre as a ``{genre_name: mean_score}`` dict.

    Every column whose name starts with ``Genre_`` is treated as an indicator.
    A genre flagged (== 1) on fewer than 5 rows gets the overall mean score of
    ``df_train`` instead of its own mean.
    """
    prefix = 'Genre_'
    fallback = df_train['score'].mean()

    def _affinity(column):
        flagged = df_train[column] == 1
        if flagged.sum() < 5:
            return fallback
        return df_train[flagged]['score'].mean()

    return {
        column.replace(prefix, ''): _affinity(column)
        for column in df_train.columns
        if column.startswith(prefix)
    }

In [31]:
def use_affinity(X_train, X_test, y_train):
    """Replace the ``Genre_*`` indicator columns by affinity-weighted columns.

    The per-genre affinity comes from `calculate_genre_affinity`, computed on
    ``X_train`` joined with ``y_train``. For each genre an ``affinity_<genre>``
    column (indicator multiplied by that affinity) is appended to both frames,
    and all ``Genre_*`` columns are then dropped.

    Returns:
        Tuple ``(X_train, X_test)`` of the transformed frames.
    """
    genre_affinity = calculate_genre_affinity(X_train.join(y_train))

    def _swap_genres_for_affinity(features):
        weighted = pd.DataFrame(
            {
                f"affinity_{genre}": features[f"Genre_{genre}"] * affinity
                for genre, affinity in genre_affinity.items()
            },
            index=features.index,
        )
        combined = pd.concat([features, weighted], axis=1)
        indicator_cols = [name for name in combined.columns if name.startswith("Genre_")]
        return combined.drop(columns=indicator_cols)

    train_out = _swap_genres_for_affinity(X_train)
    test_out = _swap_genres_for_affinity(X_test)
    return train_out, test_out

In [32]:
def _collapse_rare_columns(df_train, df_test, prefix, other_col, min_count):
    """Fold ``prefix`` indicator columns seen fewer than ``min_count`` times in train into ``other_col``."""
    candidates = [col for col in df_train.columns if col.startswith(prefix)]
    counts = df_train[candidates].sum()
    frequent = set(counts[counts >= min_count].index)
    rare = [col for col in candidates if col not in frequent]
    if not rare:
        return df_train, df_test

    df_train[other_col] = df_train[rare].max(axis=1)
    df_train = df_train.drop(columns=rare)

    # Give the test frame the same "other" signal. Everything with this prefix
    # that is not frequent in train counts as rare, including values that only
    # occur in the test frame.
    test_rare = [
        col for col in df_test.columns
        if col.startswith(prefix) and col not in frequent
    ]
    df_test[other_col] = df_test[test_rare].max(axis=1) if test_rare else 0
    return df_train, df_test


def group_rare_features(df_train, df_test):
    """Collapse rare genre/studio indicators into "other" columns and align test to train.

    Frequencies are measured on ``df_train`` only. A ``Genre_*`` column is rare
    when set on fewer than ``max(5, 1% of train rows)`` rows, a ``Studios_*``
    column when set on fewer than ``max(10, 1% of train rows)`` rows. Rare
    columns are replaced by ``Genre_Other`` / ``Studio_Other`` (row-wise max of
    the rare indicators) in BOTH frames. Previously the test frame always got
    0 in these columns, so train and test disagreed on the feature.

    Afterwards ``df_test`` is reindexed to the columns of ``df_train``: missing
    columns are added as 0, extra columns dropped, order matched.

    Args:
        df_train: Training feature frame.
        df_test: Test feature frame.

    Returns:
        Tuple ``(df_train, df_test)`` of new frames; the inputs are not modified.
    """
    df_train = df_train.copy()
    df_test = df_test.copy()

    total_anime = len(df_train)
    one_percent = int(total_anime * 0.01)

    # === GENRES ===
    df_train, df_test = _collapse_rare_columns(
        df_train, df_test, 'Genre_', 'Genre_Other', max(5, one_percent)
    )

    # === STUDIOS ===
    df_train, df_test = _collapse_rare_columns(
        df_train, df_test, 'Studios_', 'Studio_Other', max(10, one_percent)
    )

    # === ALIGN ALL COLUMNS ===
    df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

    return df_train, df_test

#### RULE-BASED CLASSIFICATION FUNCTIONS

In [33]:
def analyze_user_preferences(df):
    """Summarise the user's average score per genre, studio, type, rating and year.

    Only rows with ``score > 0`` are considered. For each one-hot column family
    the mean score over rows where the indicator equals 1 is recorded, provided
    the indicator is set on enough rows (5 for genres and studios, 3 for types
    and ratings). Years are grouped after casting ``year`` to int; a year is
    recorded when it has at least 5 scored rows.

    Returns:
        dict with keys ``overall_mean``, ``genre_scores``, ``studios_scores``,
        ``type_scores``, ``rating_scores`` and ``year_scores``.
    """
    watched = df[df['score']>0].copy()

    def _mean_scores(prefix, min_rows, strip_name):
        averages = {}
        for column in watched.columns:
            if not column.startswith(prefix):
                continue
            selected = (watched[column] == 1)
            if selected.sum() >= min_rows:
                label = column.replace(prefix, '')
                if strip_name:
                    label = label.strip()
                averages[label] = watched[selected]['score'].mean()
        return averages

    preferences = {
        'overall_mean': watched['score'].mean(),
        'genre_scores': _mean_scores('Genre_', 5, True),
        'studios_scores': _mean_scores('Studios_', 5, True),
        # Type names are used as-is (not stripped), unlike the other families.
        'type_scores': _mean_scores('Type_', 3, False),
        'rating_scores': _mean_scores('Rating_', 3, True),
        'year_scores': {}
    }

    watched['year'] = watched['year'].astype(int)
    scores_by_year = watched.groupby('year')['score'].apply(list)
    for year, scores in scores_by_year.items():
        if len(scores) >= 5:
            preferences['year_scores'][year] = np.mean(scores)

    return preferences

In [34]:
def predict_personal_score(anime_row, user_prefs, weights=None):
    """
    Predict the user's score for one anime with additive preference rules.

    Starts from the user's overall mean score and, for every active feature of
    the anime that has a learned preference, adds
    ``weight * (preference_mean - overall_mean)``.

    :param anime_row: row (pandas Series) with one-hot ``Genre_*``, ``Studios_*``,
        ``Type_*`` and ``Rating_*`` indicators
    :param user_prefs: user preferences as returned by ``analyze_user_preferences``
    :param weights: optional dict overriding the rule weights; recognised keys
        are ``genre``, ``studio``, ``type`` and ``rating``
    :return: expected score for the anime, clipped to the range [0, 10]
    """
    # NOTE(review): the rating rule used to look up the preference without
    # applying it. The 0.2 weight mirrors the type rule and should be confirmed
    # or tuned.
    rule_weights = {'genre': 0.5, 'studio': 0.3, 'type': 0.2, 'rating': 0.2}
    if weights:
        rule_weights.update(weights)

    base_score = user_prefs['overall_mean']

    def _active(prefix):
        # Names of the one-hot columns with this prefix that are set on the row.
        return [
            col.replace(prefix, '')
            for col in anime_row.index
            if col.startswith(prefix) and anime_row[col] == 1
        ]

    adjustments = []

    # RULE 1: Genres - every matching genre contributes.
    for genre in _active('Genre_'):
        if genre in user_prefs['genre_scores']:
            adjustments.append(
                (user_prefs['genre_scores'][genre] - base_score) * rule_weights['genre']
            )

    # RULE 2: Studios - every matching studio contributes.
    for studio in _active('Studios_'):
        if studio in user_prefs['studios_scores']:
            adjustments.append(
                (user_prefs['studios_scores'][studio] - base_score) * rule_weights['studio']
            )

    # RULE 3 / RULE 4: Type and Rating - only the first active indicator is used.
    single_valued_rules = (
        ('Type_', 'type_scores', 'type'),
        ('Rating_', 'rating_scores', 'rating'),
    )
    for prefix, prefs_key, weight_key in single_valued_rules:
        active = _active(prefix)
        if active and active[0] in user_prefs[prefs_key]:
            adjustments.append(
                (user_prefs[prefs_key][active[0]] - base_score) * rule_weights[weight_key]
            )

    return np.clip(base_score + sum(adjustments), 0, 10)

#### PREPARING DATAFRAMES

In [35]:
# Keep only rows with a positive score; both pipelines start from the same rows.
df = df[df['score'] > 0].copy()
df_ml = df.copy()
df_rb = df.copy()

# df_ml is not encoded here: prepare_for_ml is applied per split below, with
# q33/q66 taken from the training rows.
# NOTE(review): prepare_for_rules runs on the full frame before the split, so the
# median fill inside preprocess also sees the rows that later become the test set.
df_rb = prepare_for_rules(df_rb) # This keeps 'score' as continuous

# Split on index labels so the ML and the rule-based frames share the same rows.
train_idx, test_idx = train_test_split(
    df.index, 
    test_size=0.2, 
    random_state=42
)

# For ML model (classification)
df_train_ml = df_ml.loc[train_idx].copy()
df_test_ml = df_ml.loc[test_idx].copy()

# Class thresholds are computed on the training scores only and reused for test.
q33 = df_train_ml['score'].quantile(0.33)
q66 = df_train_ml['score'].quantile(0.66)

df_train_ml = prepare_for_ml(df_train_ml, q33=q33, q66=q66)
df_test_ml = prepare_for_ml(df_test_ml, q33=q33, q66=q66)

# Collapse rare genre/studio columns and give test the same columns as train.
df_train_ml, df_test_ml = group_rare_features(df_train_ml, df_test_ml)

# For rule-based model (regression)
df_train_rb = df_rb.loc[train_idx].copy()
df_test_rb = df_rb.loc[test_idx].copy()

# For ML model
X_train_ml = df_train_ml.drop(columns=['score', 'title', 'id', 'score_class'])
y_train_ml = df_train_ml['score_class']
X_test_ml = df_test_ml.drop(columns=['score', 'title', 'id', 'score_class'])
y_test_ml = df_test_ml['score_class']

# For rule-based model
X_train_rb = df_train_rb.drop(columns=['score', 'title', 'id'])
y_train_rb = df_train_rb['score']
X_test_rb = df_test_rb.drop(columns=['score', 'title', 'id'])
y_test_rb = df_test_rb['score']

# Genre affinity needs the continuous score, hence y_train_rb; it is joined to
# X_train_ml on the index, and both come from train_idx.
X_train_ml, X_test_ml = use_affinity(X_train_ml, X_test_ml, y_train_rb) # rb contains 'score' while y_train_ml 'score_class'

# Preferences for the rule-based predictor are learned from the training rows only.
user_prefs = analyze_user_preferences(df_train_rb)

In [18]:
# Score every held-out anime with the rule-based predictor, in row order.
y_pred_rules = np.array([
    predict_personal_score(X_test_rb.iloc[row_pos], user_prefs)
    for row_pos in range(len(X_test_rb))
])

#### TRAINING MODELS

In [19]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights: inversely proportional to class frequency in the training labels.
train_classes = np.unique(y_train_ml)
class_weights = compute_class_weight(
    'balanced',
    classes=train_classes,
    y=y_train_ml
)
# Key the weights by the actual class labels rather than by position, so the
# mapping stays correct even when a class is missing from the training split.
class_weight_dict = dict(zip(train_classes.tolist(), class_weights))

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest: 3 * 2 * 2 * 3 * 2 * 1 = 72
# combinations, each evaluated with 5-fold cross-validation below.
param_grid = {
    'n_estimators': [ 200, 300, 400],
    'max_depth': [15, 20 ],
    'min_samples_split': [ 10, 20],
    'min_samples_leaf': [ 4, 8, 10],
    'max_features': ['sqrt', 0.3],
    'class_weight': ['balanced_subsample']
}

rf = RandomForestClassifier(random_state=42)

# Candidates are ranked by balanced accuracy; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring='balanced_accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_ml, y_train_ml)

print("Best params:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)

# Use best model
ml_model = grid_search.best_estimator_

In [21]:
from scipy.stats import mode

# Majority-class baseline: always predict the most frequent training class.
most_common = mode(y_train_ml, keepdims=True).mode[0]
baseline_acc = (y_test_ml == most_common).mean()
# Measure the fitted model instead of quoting a number copied from an earlier run.
# For a scikit-learn classifier, score() returns the mean accuracy.
model_acc = ml_model.score(X_test_ml, y_test_ml)

print(f"Baseline (always predict {most_common}): {baseline_acc:.3f}")
print(f"Model: {model_acc:.3f}")
print(f"Improvement over baseline: {model_acc - baseline_acc:.3f}")

In [22]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Candidate classifiers, all fitted on the same training features.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=300,min_samples_leaf=2,min_samples_split=10, class_weight='balanced', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=300, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=300, random_state=42),
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
}

# Maps model name -> accuracy on the test split.
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_ml, y_train_ml)
    y_pred = model.predict(X_test_ml)
    acc = accuracy_score(y_test_ml, y_pred)
    results[name] = acc
    print(f"{name} Accuracy: {acc:.3f}")

# Show best
# NOTE(review): the winner is picked by test-set accuracy, so that accuracy is an
# optimistic estimate for the selected model. `best_model` holds the model's
# name (a dict key), not the estimator.
best_model = max(results, key=results.get)
print(f"\n🏆 Best: {best_model} ({results[best_model]:.3f})")

In [30]:
# Final classifier used in the comparison below.
# NOTE(review): this overwrites the `ml_model` taken from grid_search.best_estimator_
# with fixed hyper-parameters, so the grid-search result is not used further.
ml_model = RandomForestClassifier(n_estimators=300,min_samples_leaf=2,min_samples_split=10, class_weight='balanced', random_state=42)
ml_model.fit(X_train_ml, y_train_ml)

#### COMPARING 2 APPROACHES

In [31]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predictions
y_pred_ml = ml_model.predict(X_test_ml)  # ML model predictions
y_pred_rules = []
for idx in range(len(X_test_rb)):
    anime = X_test_rb.iloc[idx]
    pred = predict_personal_score(anime, user_prefs)
    y_pred_rules.append(pred)

# Convert rule-based continuous scores to classes
y_pred_rules_class = [classify_3_classes(score, q33,q66) for score in y_pred_rules]

# Compare accuracies
ml_accuracy = accuracy_score(y_test_ml, y_pred_ml)
rules_accuracy = accuracy_score(y_test_ml, y_pred_rules_class)

print(f"ML Model Accuracy: {ml_accuracy:.3f}")
print(f"Rule-Based Accuracy: {rules_accuracy:.3f}")

In [32]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

print("="*80)
print("BASELINE MODELS")
print("="*80)

from scipy.stats import mode
# Baseline 1: always predict the most frequent training class.
most_common = mode(y_train_ml, keepdims=True).mode[0]
y_pred_baseline = np.full(len(y_test_ml), most_common)
baseline_acc = accuracy_score(y_test_ml, y_pred_baseline)

print(f"Always predict class {most_common}: {baseline_acc:.3f}")

# Baseline 2: Random predictions
# A private RandomState seeded with 42 yields the same draw as
# np.random.seed(42) + np.random.choice, without reseeding the global RNG.
baseline_rng = np.random.RandomState(42)
y_pred_random = baseline_rng.choice([0, 1, 2], size=len(y_test_ml))
random_acc = accuracy_score(y_test_ml, y_pred_random)

print(f"Random predictions: {random_acc:.3f}")

# Same 3-decimal formatting as the baselines above, so the numbers line up.
print(f"\nYour ML model: {ml_accuracy:.3f}")
print(f"Your Rule-based: {rules_accuracy:.3f}")

if ml_accuracy < baseline_acc:
    print("\nML model is WORSE than baseline!")



print("\n" + "="*80)
print("CLASS DISTRIBUTION ANALYSIS")
print("="*80)

def analyze_distribution(y, name):
    """Print, for each distinct label in ``y``, its count and its share of all samples.

    Args:
        y: Sequence of class labels.
        name: Heading printed above the table.
    """
    labels, tallies = np.unique(y, return_counts=True)
    n_samples = len(y)
    report = [f"\n{name}:"]
    for label, tally in zip(labels, tallies):
        report.append(f"  Class {label}: {tally:4d} ({tally/n_samples*100:.1f}%)")
    print("\n".join(report))

# Compare the class balance of the training and test labels.
analyze_distribution(y_train_ml, "Training set")
analyze_distribution(y_test_ml, "Test set")


# Section banner for the confusion-matrix output that follows.
print("\n" + "="*80)
print("CONFUSION MATRIX ANALYSIS")
print("="*80)

def plot_and_analyze_cm(y_true, y_pred, model_name):
    """Print the 3-class confusion matrix and a per-class breakdown of the errors.

    Despite its name this function only prints; it draws no figure.

    Args:
        y_true: True class labels (0 = Bad, 1 = Average, 2 = Good).
        y_pred: Predicted class labels, same length as ``y_true``.
        model_name: Heading printed above the report.
    """
    class_ids = [0, 1, 2]
    class_names = ['Bad', 'Average', 'Good']
    row_labels = ['Bad   ', 'Average', 'Good  ']

    # Fix the label set so the matrix is always 3x3, even when a class occurs in
    # neither y_true nor y_pred. Otherwise the indexing below raises IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    print(f"\n{model_name}:")
    print("Confusion Matrix (rows=true, cols=predicted):")
    print("        Bad  Avg  Good")
    for i, row in enumerate(cm):
        print(f"{row_labels[i]}: {row}")

    for i in class_ids:
        total = cm[i].sum()
        correct = cm[i][i]
        print(f"\nClass {i} ({class_names[i]}):")
        if total == 0:
            # No true samples of this class: percentages are undefined.
            print("  No true samples of this class")
            continue
        print(f"  Correct: {correct}/{total} ({correct/total*100:.1f}%)")

        for j in class_ids:
            if i != j and cm[i][j] > 0:
                print(f"  Misclassified as {class_names[j]}: {cm[i][j]} ({cm[i][j]/total*100:.1f}%)")

# Report both approaches against the same test labels.
plot_and_analyze_cm(y_test_ml, y_pred_ml, "ML Model")
plot_and_analyze_cm(y_test_ml, y_pred_rules_class, "Rule-Based Model")


### Regressor model

In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Regression variant: predict the raw score instead of the 3-class label.
# Features are taken from df_train_ml / df_test_ml, i.e. the frames returned by
# group_rare_features.
df_train_reg = df_train_ml.copy()
X_train_reg = df_train_reg.drop(columns=['score', 'score_class', 'title', 'id'])
y_train_reg = df_train_reg['score']

# Train regressor
rf_reg = RandomForestRegressor(
    n_estimators=200,
    min_samples_leaf=2,
    min_samples_split=10,
    random_state=42
)
rf_reg.fit(X_train_reg, y_train_reg)

X_test_reg = df_test_ml.drop(columns=['score', 'score_class', 'title', 'id'])
y_test_reg = df_test_ml['score']

y_pred_reg = rf_reg.predict(X_test_reg)
mae = mean_absolute_error(y_test_reg, y_pred_reg)

# Compare to baseline
# Baseline: predict the training-set mean score for every test row.
baseline_mae = mean_absolute_error(y_test_reg, np.full(len(y_test_reg), y_train_reg.mean()))

print(f"Baseline MAE: {baseline_mae:.3f}")
print(f"Model MAE: {mae:.3f}")
# Relative MAE reduction versus the baseline, in percent.
print(f"Improvement: {(baseline_mae - mae)/baseline_mae * 100:.1f}%")