In [1]:
import optuna

import pandas as pd
import catboost as cb

from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report

from typing import List, Tuple

# Seed shared by the stratified train/test split and the CatBoost models.
random_state = 42

## Data processing

In [2]:
# Column roles used throughout the notebook.

# Prediction targets.
target_gender = "sex"
target_age = "age_class"

# Columns passed to CatBoost as categorical features.
cat_features = [
    "region",
    "ua_device_type",
    "ua_client_type",
    "ua_os",
    "ua_client_name",
    "category",
]
# Column passed to CatBoost as a text feature.
text_feature = "title"
# Name of the event timestamp column.
date_feature = "event_timestamp"

# Column removed right after the CSV is loaded.
drop_feature = "age"

# Identifier columns; they are excluded from the model features.
id_columns = [
    "viewer_uid",
    "rutube_video_id",
    "author_id",
]

In [3]:
# Build the modelling frame in a single pipeline:
#   1. read the prepared CSV and remove the `drop_feature` column,
#   2. replace every missing value with the literal string 'none',
#   3. add `videos_per_day` = number of rows sharing the same (viewer_uid, day),
#   4. remove the time components that are not used as features.
dataset = (
    pd.read_csv('/kaggle/input/catboost-dataset/catboost_dataset.csv')
    .drop(columns=drop_feature)
    .fillna('none')
    .assign(
        videos_per_day=lambda frame: frame.groupby(['viewer_uid', 'day']).transform('size')
    )
    .drop(columns=["second", "minute", "month", "year"])
)

## Utils

In [4]:
def split_data(data: pd.DataFrame, target: str, test_size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Produce a stratified train/test partition of `data`.

    The rows are divided with scikit-learn's `train_test_split`, stratified on
    `data[target]` and seeded with the notebook-level `random_state`.
    `test_size` is forwarded unchanged.

    Returns the `(train, test)` pair of DataFrames.
    """
    split_options = dict(
        test_size=test_size,
        stratify=data[target],
        random_state=random_state,
    )
    train_part, test_part = train_test_split(data, **split_options)
    return train_part, test_part


def set_pool(data: pd.DataFrame, target: str, id_columns: List[str]) -> Pool:
    """
    Wrap a DataFrame into a CatBoost `Pool`.

    The feature matrix is `data` without the identifier columns and without
    the target column; the label is `data[target]`. Categorical and text
    columns are taken from the notebook-level `cat_features` and
    `text_feature` settings.
    """
    non_feature_columns = id_columns + [target]
    features = data.drop(columns=non_feature_columns)
    labels = data[target]

    return Pool(
        data=features,
        label=labels,
        cat_features=cat_features,
        text_features=[text_feature],
    )


def fit_model(
    train_pool: cb.core.Pool, 
    test_pool: cb.core.Pool,
    verbose: int, 
    multiclass: bool = True, 
    **kwargs
) -> cb.core.CatBoostClassifier:
    """
    Trains a CatBoost classifier on `train_pool`, using `test_pool` as the evaluation set.

    Args:
        train_pool: Pool the model is fitted on.
        test_pool: Pool passed as `eval_set`; it drives the overfitting
            detector and the best-iteration selection.
        verbose: Forwarded to `CatBoostClassifier.fit` (logging period).
        multiclass: When True (default) the model uses the `MultiClass` loss
            with the `TotalF1` metric; otherwise `Logloss` with `Accuracy`.
        **kwargs: Extra `CatBoostClassifier` constructor parameters. They take
            precedence over the defaults set here, so a caller may override
            e.g. `task_type='CPU'` or `od_wait=50`.

    Returns:
        The fitted classifier (the value returned by `fit`).
    """
    if multiclass:
        loss_function, eval_metric = 'MultiClass', 'TotalF1'
    else:
        loss_function, eval_metric = 'Logloss', 'Accuracy'

    # Defaults first, caller-supplied values last: passing the defaults as
    # explicit keywords next to **kwargs would raise TypeError on any overlap.
    params = {
        'task_type': 'GPU',
        'loss_function': loss_function,
        'eval_metric': eval_metric,
        'od_type': 'Iter',
        'od_wait': 100,
        'random_seed': random_state,
    }
    params.update(kwargs)

    model = CatBoostClassifier(**params)
    return model.fit(
        train_pool,
        eval_set=test_pool,
        verbose=verbose,
        use_best_model=True,
    )


def print_classification_result(real: pd.Series, pred: pd.Series, multiclass: bool = True):
    """
    Print a headline metric followed by scikit-learn's classification report.

    The headline is the weighted F1-score when `multiclass` is True (default)
    and the accuracy otherwise. Both lines are computed from `real` (true
    labels) and `pred` (predicted labels).
    """
    if multiclass:
        headline = f'F1: {f1_score(real, pred, average="weighted")}'
    else:
        headline = f'Accuracy: {accuracy_score(real, pred)}'

    report = classification_report(real, pred)
    print(headline, report, sep='\n')
    
    
def aggregate_score_by_user(test: pd.DataFrame, preds: pd.DataFrame):
    """
    Aggregates predictions by user ID by taking the mode of the predicted values.

    Every row of `test` receives the most frequent prediction among all rows
    that share its `viewer_uid`. When several values are equally frequent the
    first entry of `Series.mode()` is used.

    Args:
        test: Frame containing a `viewer_uid` column. It is not modified.
        preds: Row-level predictions for `test`; attached with the regular
            pandas column-assignment rules.

    Returns:
        A Series with the same index (and name) as `test['viewer_uid']`
        holding the per-user prediction for each row.
    """
    # Work on a narrow copy: assigning `preds` onto `test` itself would add a
    # column to the caller's frame as a hidden side effect.
    scored = test[['viewer_uid']].assign(preds=preds)

    mode_by_user = scored.groupby('viewer_uid')['preds'].agg(lambda s: s.mode().iloc[0])

    # Vectorised lookup of each row's user-level prediction.
    return test['viewer_uid'].map(mode_by_user)


def final_score(
    gender_real: pd.DataFrame,
    gender_pred: pd.DataFrame, 
    age_real: pd.DataFrame, 
    age_pred: pd.DataFrame
):
    """
    Combine the gender and age metrics into a single number.

    The score is 0.3 * (gender accuracy) + 0.7 * (weighted age F1-score).
    The value is printed and also returned.
    """
    gender_weight, age_weight = 0.3, 0.7

    gender_accuracy = accuracy_score(gender_real, gender_pred)
    age_f1 = f1_score(age_real, age_pred, average="weighted")

    result = gender_weight * gender_accuracy + age_weight * age_f1
    print(f'Final score: {result}')
    return result

## Optuna

### Age model

In [5]:
# Hold out 20% of the rows, stratified on the age class.
age_train, age_test = split_data(data=dataset, target=target_age, test_size=0.2)

# The gender target is removed before building the pools so that it is not
# used as a feature of the age model.
age_train_pool, age_test_pool = (
    set_pool(part.drop(target_gender, axis=1), target_age, id_columns)
    for part in (age_train, age_test)
)

def age_objective(trial):
    """
    Optuna objective for the age model.

    Samples `bootstrap_type`, `auto_class_weights` and `learning_rate` (plus
    `bagging_temperature` for the Bayesian bootstrap or `subsample` for the
    Bernoulli bootstrap), trains a model for up to 1500 iterations on
    `age_train_pool` with `age_test_pool` as the evaluation set, and returns
    the weighted F1-score of the user-aggregated predictions on `age_test`.
    """
    bootstrap = trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "Poisson", None])
    class_weights = trial.suggest_categorical("auto_class_weights", ["Balanced", "SqrtBalanced", None])
    rate = trial.suggest_float("learning_rate", 0.03, 1)

    age_params = {
        "bootstrap_type": bootstrap,
        "auto_class_weights": class_weights,
        "learning_rate": rate,
    }

    # Each bootstrap flavour gets its own extra knob.
    if bootstrap == "Bayesian":
        age_params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif bootstrap == "Bernoulli":
        age_params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    # Fixed text-processing configuration for the text feature.
    text_processing = {
        'tokenizers': [
            {
                'tokenizer_id': 'Sense',
                'separator_type': 'BySense',
                'lowercasing': 'True',
                'token_types': ['Word', 'Number', 'SentenceBreak'],
                'sub_tokens_policy': 'SeveralTokens',
            }
        ],
        'dictionaries': [
            {
                'dictionary_id': 'Word',
                'dictionary_type': 'Bpe',
                'max_dictionary_size': '5000',
                'num_bpe_units': 1,
            }
        ],
        'feature_calcers': ['BoW:top_tokens_count=5000'],
    }

    age_model = fit_model(
        age_train_pool,
        age_test_pool,
        verbose=500,
        iterations=1500,
        **text_processing,
        **age_params,
    )

    row_level_preds = age_model.predict(age_test_pool)
    user_level_preds = aggregate_score_by_user(age_test, row_level_preds)
    return f1_score(age_test[target_age], user_level_preds, average="weighted")

In [6]:
if __name__ == "__main__":
    # Seed the sampler with the notebook-wide seed so that the sequence of
    # suggested hyper-parameters is repeatable between runs. TPESampler is
    # the sampler Optuna uses by default for single-objective studies, so
    # only the seeding changes.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    study.optimize(age_objective, n_trials=50, show_progress_bar=True)

    # Summary of the search: trial count, best value and best parameters.
    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2024-09-28 14:05:22,742] A new study created in memory with name: no-name-f24adb1e-f7ea-449f-a45f-870703c1af9c


  0%|          | 0/50 [00:00<?, ?it/s]

0:	learn: 0.3458689	test: 0.3451817	best: 0.3451817 (0)	total: 305ms	remaining: 7m 36s
500:	learn: 0.5138562	test: 0.5012122	best: 0.5014804 (498)	total: 21.8s	remaining: 43.4s
1000:	learn: 0.5485370	test: 0.5253016	best: 0.5253016 (1000)	total: 44.6s	remaining: 22.2s
1499:	learn: 0.5707331	test: 0.5365499	best: 0.5366102 (1498)	total: 1m 7s	remaining: 0us
bestTest = 0.5366102259
bestIteration = 1498
Shrink model to first 1499 iterations.
[I 2024-09-28 14:07:17,137] Trial 0 finished with value: 0.582901934479547 and parameters: {'bootstrap_type': 'Poisson', 'auto_class_weights': 'SqrtBalanced', 'learning_rate': 0.84600719756453}. Best is trial 0 with value: 0.582901934479547.
0:	learn: 0.3888688	test: 0.3875615	best: 0.3875615 (0)	total: 114ms	remaining: 2m 51s
500:	learn: 0.5309650	test: 0.5207257	best: 0.5210048 (498)	total: 21.7s	remaining: 43.2s
1000:	learn: 0.5610653	test: 0.5414834	best: 0.5414953 (999)	total: 44.4s	remaining: 22.1s
1499:	learn: 0.5796354	test: 0.5531859	best: 0.