In [1]:
import optuna

import pandas as pd
import numpy as np
import catboost as cb

from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report

from typing import List, Tuple

# Single seed reused by the train/test split and by the CatBoost models below.
random_state = 42

In [2]:
import os
# Show which files are present in the input directory before loading anything.
print(os.listdir("../input"))

['catboost_dataset.csv']


## Data processing

In [3]:
# --- column configuration -------------------------------------------------

# Label columns: one per model.
target_gender = "sex"
target_age = "age_class"

# Feature columns grouped by how they are handed to CatBoost.
cat_features = [
    "region",
    "ua_device_type",
    "ua_client_type",
    "ua_os",
    "ua_client_name",
    "category",
]
text_feature = "title"
date_feature = "event_timestamp"  # NOTE(review): not referenced in the visible cells

# Column removed right after the CSV is read.
drop_feature = "age"

# Identifier columns; excluded from the feature matrix when a Pool is built.
id_columns = ["viewer_uid", "rutube_video_id", "author_id"]

In [4]:
# Load the raw CSV and derive the modelling frame in a single chain, so that
# re-running this cell always starts again from the file on disk.
dataset = (
    pd.read_csv('/kaggle/input/catboost_dataset.csv')
    .drop(columns=[drop_feature])
    # every missing value, in every column, becomes the literal string 'none'
    .fillna('none')
    # new feature: number of rows sharing the same (viewer_uid, day) pair
    .assign(
        videos_per_day=lambda frame: frame.groupby(['viewer_uid', 'day']).transform('size')
    )
    # drop unimportant old features
    .drop(columns=["second", "minute", "month", "year"])
)

## Utils

In [5]:
def split_data(data: pd.DataFrame, target: str, test_size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Produce a stratified train/test partition of ``data``.

    Rows are split with scikit-learn's ``train_test_split``; class proportions
    of the ``target`` column are kept in both parts, and the module-level
    ``random_state`` seeds the shuffle.

    Args:
        data: Frame to partition; must contain the ``target`` column.
        target: Name of the column used for stratification.
        test_size: Share (or count) of rows assigned to the test part, passed
            straight to ``train_test_split``.

    Returns:
        ``(train, test)`` frames.
    """
    train_part, test_part = train_test_split(
        data,
        test_size=test_size,
        random_state=random_state,
        stratify=data[target],
    )
    return train_part, test_part


def set_pool(data: pd.DataFrame, target: str, id_columns: List[str]) -> Pool:
    """
    Wrap a frame into a CatBoost ``Pool``.

    The label is taken from the ``target`` column; the feature matrix is
    ``data`` without the identifier columns and without the label. Categorical
    and text columns come from the module-level ``cat_features`` and
    ``text_feature`` settings.

    Args:
        data: Frame holding features, identifiers and the label.
        target: Name of the label column.
        id_columns: Identifier columns to exclude from the features.

    Returns:
        A ``Pool`` ready to be passed to a CatBoost model.
    """
    labels = data[target]
    features = data.drop(columns=id_columns + [target])
    return Pool(
        data=features,
        label=labels,
        cat_features=cat_features,
        text_features=[text_feature],
    )


def fit_model(
    train_pool: cb.core.Pool, 
    test_pool: cb.core.Pool,
    verbose: int, 
    **kwargs
) -> cb.core.CatBoostClassifier:
    """
    Build a GPU CatBoost classifier and fit it with ``test_pool`` as the eval set.

    Fixed settings: ``Logloss`` objective, ``Accuracy`` eval metric, iteration-based
    overfitting detector with a patience of 100 rounds, and the module-level
    ``random_state`` as seed. With ``use_best_model=True`` the returned model is
    cut back to its best iteration on the eval set.

    Args:
        train_pool: Training data.
        test_pool: Data used as ``eval_set`` during fitting.
        verbose: Logging period forwarded to ``fit``.
        **kwargs: Any further ``CatBoostClassifier`` constructor arguments.

    Returns:
        The fitted classifier.
    """
    classifier = CatBoostClassifier(
        task_type='GPU',
        eval_metric='Accuracy',
        loss_function='Logloss',
        od_type='Iter',
        od_wait=100,
        random_seed=random_state,
        **kwargs
    )
    fitted = classifier.fit(
        train_pool,
        eval_set=test_pool,
        verbose=verbose,
        use_best_model=True,
    )
    return fitted


def print_classification_result(real: pd.Series, pred: pd.Series, multiclass: bool = True):
    """
    Print a headline metric followed by scikit-learn's classification report.

    Args:
        real: Ground-truth labels.
        pred: Predicted labels.
        multiclass: When truthy the headline is the weighted F1 score,
            otherwise it is plain accuracy.
    """
    if multiclass:
        headline = f'F1: {f1_score(real, pred, average="weighted")}'
    else:
        headline = f'Accuracy: {accuracy_score(real, pred)}'

    report = classification_report(real, pred)
    print(headline, report, sep='\n')
    
    
def aggregate_score_by_user(test: pd.DataFrame, preds: pd.DataFrame) -> pd.Series:
    """
    Aggregates row-level predictions into one prediction per user.

    For every ``viewer_uid`` the most common predicted value (mode) is taken.
    When several values tie, ``Series.mode`` returns them sorted and the first
    one is used.

    Args:
        test: Frame the predictions were made for; must contain ``viewer_uid``.
            It is NOT modified.
        preds: Row-level predictions, aligned with the rows of ``test``
            (anything that can be assigned as a DataFrame column).

    Returns:
        Series named ``preds`` indexed by ``viewer_uid``.
    """
    # Build a separate two-column frame instead of writing a 'preds' column
    # into `test`: the caller's DataFrame is reused across calls and must not
    # be changed as a side effect.
    user_preds = test[['viewer_uid']].assign(preds=preds)

    return user_preds.groupby('viewer_uid')['preds'].apply(lambda x: x.mode()[0])


def final_score(
    gender_real: pd.DataFrame,
    gender_pred: pd.DataFrame, 
    age_real: pd.DataFrame, 
    age_pred: pd.DataFrame
):
    """
    Blend the two task metrics into one number, print it and return it.

    The result is ``0.3 * accuracy(gender) + 0.7 * weighted_F1(age)``.

    Args:
        gender_real: True gender labels.
        gender_pred: Predicted gender labels.
        age_real: True age-class labels.
        age_pred: Predicted age-class labels.

    Returns:
        The blended score.
    """
    gender_accuracy = accuracy_score(gender_real, gender_pred)
    age_f1 = f1_score(age_real, age_pred, average="weighted")

    blended = 0.3 * gender_accuracy + 0.7 * age_f1
    print(f'Final score: {blended}')
    return blended

## Optuna

### Gender model

In [6]:
# Hold out 20% of the rows, stratified on the gender label.
gender_train, gender_test = split_data(data=dataset, target=target_gender, test_size=0.2)

# The age label is removed first so it is not handed to the gender model as a feature.
gender_train_pool = set_pool(gender_train.drop(columns=[target_age]), target_gender, id_columns)
gender_test_pool = set_pool(gender_test.drop(columns=[target_age]), target_gender, id_columns)

def gender_objective(trial):
    """
    Optuna objective for the gender model.

    Samples the bootstrap type, class-weighting mode and learning rate (plus
    ``bagging_temperature`` for Bayesian bootstrap or ``subsample`` for
    Bernoulli bootstrap), fits a 1500-iteration CatBoost model with a fixed
    text-processing setup, and scores it on the hold-out split.

    Args:
        trial: Optuna trial used to draw the hyper-parameters.

    Returns:
        Accuracy computed per user: both the predictions and the true labels of
        ``gender_test`` are reduced to their mode for each ``viewer_uid``.
    """
    # The suggest_* calls are made in a fixed order: bootstrap type,
    # class weights, learning rate, then the bootstrap-specific parameter.
    sampled = {
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "Poisson", None]
        ),
        "auto_class_weights": trial.suggest_categorical(
            "auto_class_weights", ["Balanced", "SqrtBalanced", None]
        ),
        "learning_rate": trial.suggest_float("learning_rate", 0.03, 0.5),
    }

    bootstrap = sampled["bootstrap_type"]
    if bootstrap == "Bayesian":
        sampled["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif bootstrap == "Bernoulli":
        sampled["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    # Fixed text-processing configuration for the text feature.
    sense_tokenizer = {
        'tokenizer_id': 'Sense',
        'separator_type': 'BySense',
        'lowercasing': 'True',
        'token_types': ['Word', 'Number', 'SentenceBreak'],
        'sub_tokens_policy': 'SeveralTokens',
    }
    bpe_dictionary = {
        'dictionary_id': 'Word',
        'dictionary_type': 'Bpe',
        'max_dictionary_size': '5000',
        'num_bpe_units': 1,
    }

    model = fit_model(
        gender_train_pool,
        gender_test_pool,
        verbose=500,
        iterations=1500,
        tokenizers=[sense_tokenizer],
        dictionaries=[bpe_dictionary],
        feature_calcers=['BoW:top_tokens_count=5000'],
        **sampled
    )

    # Row-level predictions -> one prediction per user.
    row_preds = model.predict(gender_test_pool)
    user_preds = aggregate_score_by_user(gender_test, row_preds)

    # Same per-user reduction for the ground truth.
    user_truth = gender_test.groupby('viewer_uid')[target_gender].apply(lambda x: x.mode()[0])

    return accuracy_score(user_truth, user_preds)

In [7]:
if __name__ == "__main__":
    # Seed the sampler with the notebook-wide seed so the sequence of suggested
    # hyper-parameters is repeatable across runs. TPESampler is the sampler
    # Optuna would pick by default here; only the seed is new.
    # NOTE(review): GPU training itself may still introduce small run-to-run
    # differences in the objective values.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    study.optimize(gender_objective, n_trials=60, show_progress_bar=True)

    print(f"Number of finished trials: {len(study.trials)}")
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

[I 2024-09-28 21:39:21,567] A new study created in memory with name: no-name-8cb9c8d5-11c8-4571-8fee-c1e019e6ce1f


  0%|          | 0/60 [00:00<?, ?it/s]

0:	learn: 0.7109818	test: 0.7113572	best: 0.7113572 (0)	total: 7.26s	remaining: 3h 1m 24s
500:	learn: 0.7843659	test: 0.7823443	best: 0.7823965 (496)	total: 28s	remaining: 55.7s
1000:	learn: 0.7904797	test: 0.7874476	best: 0.7875439 (978)	total: 48.3s	remaining: 24.1s
1499:	learn: 0.7940982	test: 0.7896017	best: 0.7896044 (1498)	total: 1m 8s	remaining: 0us
bestTest = 0.7896044163
bestIteration = 1498
Shrink model to first 1499 iterations.
[I 2024-09-28 21:41:41,469] Trial 0 finished with value: 0.7658953474584773 and parameters: {'bootstrap_type': 'Poisson', 'auto_class_weights': 'Balanced', 'learning_rate': 0.4583935426469351}. Best is trial 0 with value: 0.7658953474584773.
0:	learn: 0.7098725	test: 0.7105277	best: 0.7105277 (0)	total: 57.3ms	remaining: 1m 25s
500:	learn: 0.7864027	test: 0.7837476	best: 0.7837476 (500)	total: 20.9s	remaining: 41.6s
1000:	learn: 0.7935589	test: 0.7895440	best: 0.7895556 (998)	total: 42.1s	remaining: 21s
1499:	learn: 0.7977633	test: 0.7921570	best: 0.7