In [1]:
import optuna

import pandas as pd
import catboost as cb

from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report

from typing import List, Tuple

# Shared seed: passed to train_test_split in split_data and to CatBoost's
# random_seed in fit_model.
random_state = 42

In [2]:
import os
# Sanity check: show which files are present in the input directory.
print(os.listdir("../input"))

['catboost_dataset.csv']


## Data processing

In [3]:
# --- Column configuration -------------------------------------------------

# Label columns.
target_gender = "sex"
target_age = "age_class"

# Columns handed to the CatBoost Pool as categorical features.
cat_features = [
    "region",
    "ua_device_type",
    "ua_client_type",
    "ua_os",
    "ua_client_name",
    "category",
]

# Column handed to the CatBoost Pool as a text feature.
text_feature = "title"

# Timestamp column name.
date_feature = "event_timestamp"

# Column removed right after the CSV is loaded.
drop_feature = "age"

# Identifier columns; excluded from the feature matrix when building a Pool.
id_columns = [
    "viewer_uid",
    "rutube_video_id",
    "author_id",
]

In [4]:
# Load the raw events, engineer one extra feature and prune unused columns in
# a single chain so re-running the cell always starts from the CSV.
dataset = (
    pd.read_csv('/kaggle/input/catboost_dataset.csv')
    # The raw age column is removed up front.
    .drop(columns=drop_feature)
    # Every missing value (in any column) becomes the literal string 'none'.
    .fillna('none')
    # New feature: number of rows the viewer has on the same `day` value.
    .assign(
        videos_per_day=lambda frame: frame.groupby(['viewer_uid', 'day']).transform('size')
    )
    # Drop old time-part features considered unimportant.
    .drop(columns=["second", "minute", "month", "year"])
)

## Utils

In [9]:
def split_data(data: pd.DataFrame, target: str, test_size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Perform a stratified train/test split of ``data``.

    Args:
        data: Full dataset; must contain the ``target`` column.
        target: Name of the column whose class distribution is preserved
            in both parts (used for stratification).
        test_size: Test-set size, forwarded to ``train_test_split``.

    Returns:
        A ``(train, test)`` pair of DataFrames. The split is seeded with the
        module-level ``random_state``.
    """
    stratify_labels = data[target]
    train_part, test_part = train_test_split(
        data,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify_labels,
    )
    return train_part, test_part


def set_pool(data: pd.DataFrame, target: str, id_columns: List[str]) -> Pool:
    """
    Wrap a DataFrame into a CatBoost ``Pool``.

    Args:
        data: Rows to wrap; must contain ``target`` and every id column.
        target: Name of the label column.
        id_columns: Identifier columns that must not be used as features.

    Returns:
        A ``Pool`` whose features are all columns except the ids and the
        target, with the module-level ``cat_features`` and ``text_feature``
        declared as categorical and text features respectively.
    """
    excluded_columns = id_columns + [target]
    feature_frame = data.drop(columns=excluded_columns)
    labels = data[target]

    return Pool(
        data=feature_frame,
        label=labels,
        cat_features=cat_features,
        text_features=[text_feature],
    )


def fit_model(
    train_pool: cb.core.Pool, 
    test_pool: cb.core.Pool,
    verbose: int, 
    **kwargs
) -> cb.core.CatBoostClassifier:
    """
    Fit a GPU CatBoost classifier, using ``test_pool`` as the evaluation set.

    Args:
        train_pool: Training data.
        test_pool: Evaluation data passed as ``eval_set``.
        verbose: Logging setting forwarded to ``fit``.
        **kwargs: Extra ``CatBoostClassifier`` constructor arguments. Passing
            one of the arguments fixed here (e.g. ``eval_metric``) raises
            ``TypeError`` because of the duplicate keyword.

    Returns:
        The value returned by ``CatBoostClassifier.fit``.
    """
    classifier = CatBoostClassifier(
        task_type='GPU',
        loss_function='Logloss',
        eval_metric='Accuracy',
        # CatBoost overfitting-detector settings.
        od_type='Iter',
        od_wait=100,
        random_seed=random_state,
        **kwargs
    )
    fitted = classifier.fit(
        train_pool,
        eval_set=test_pool,
        use_best_model=True,
        verbose=verbose,
    )
    return fitted


def print_classification_result(real: pd.Series, pred: pd.Series, multiclass: bool = True):
    """
    Print a headline metric followed by sklearn's classification report.

    Args:
        real: Ground-truth labels.
        pred: Predicted labels.
        multiclass: When True (default) the headline is the weighted F1-score;
            when False it is the accuracy.
    """
    if multiclass:
        headline = f'F1: {f1_score(real, pred, average="weighted")}'
    else:
        headline = f'Accuracy: {accuracy_score(real, pred)}'

    report = classification_report(real, pred)
    print(headline, report, sep='\n')
    
    
def aggregate_score_by_user(test: pd.DataFrame, preds: pd.DataFrame):
    """
    Replace each row-level prediction with the most common prediction of its user.

    The input frame is left untouched: predictions are attached to a private
    copy, so repeated calls (e.g. once per Optuna trial) never accumulate a
    'preds' column on the caller's data.

    Args:
        test: Rows that were scored; must contain a 'viewer_uid' column.
        preds: One prediction per row of ``test``. It is attached as a column
            using normal pandas column-assignment rules (array-likes are
            matched by position, a Series by index).

    Returns:
        A Series with the same index as ``test`` holding, for every row, the
        mode of the predictions of that row's 'viewer_uid'. When several
        values tie, the first value returned by ``Series.mode`` is used.
    """
    # Selecting with a list creates a new frame, and assign() returns another
    # new one, so the caller's DataFrame is never written to.
    scored = test[['viewer_uid']].assign(preds=preds)

    # One value per user: the most frequent prediction among the user's rows.
    user_mode = scored.groupby('viewer_uid')['preds'].agg(lambda s: s.mode()[0])

    # Broadcast the per-user value back onto every row of that user.
    return test['viewer_uid'].map(user_mode)


def final_score(
    gender_real: pd.DataFrame,
    gender_pred: pd.DataFrame, 
    age_real: pd.DataFrame, 
    age_pred: pd.DataFrame
):
    """
    Combine both tasks into one number and print it.

    The score is ``0.3 * accuracy(gender) + 0.7 * weighted_f1(age)``.

    Args:
        gender_real: True gender labels.
        gender_pred: Predicted gender labels.
        age_real: True age-class labels.
        age_pred: Predicted age-class labels.

    Returns:
        The combined score as a float.
    """
    gender_accuracy = accuracy_score(gender_real, gender_pred)
    age_weighted_f1 = f1_score(age_real, age_pred, average="weighted")

    result = 0.3 * gender_accuracy + 0.7 * age_weighted_f1
    print(f'Final score: {result}')
    return result

## Optuna

### Gender model

In [10]:
# Stratified split on the gender label; 20% of the rows go to the test part.
gender_train, gender_test = split_data(data=dataset, target=target_gender, test_size=0.2)

# The age label is removed before building the pools so it is not fed to the
# gender model as a feature.
gender_train_pool = set_pool(
    data=gender_train.drop(columns=target_age),
    target=target_gender,
    id_columns=id_columns,
)
gender_test_pool = set_pool(
    data=gender_test.drop(columns=target_age),
    target=target_gender,
    id_columns=id_columns,
)

def gender_objective(trial):
    """
    Optuna objective for the gender model.

    Samples bootstrap settings, class weighting and learning rate, trains a
    CatBoost model on ``gender_train_pool`` (evaluated on ``gender_test_pool``),
    aggregates the row-level predictions per user and returns the weighted
    F1-score of those aggregated predictions on ``gender_test``.

    NOTE(review): the value optimized here is weighted F1, while
    ``final_score`` scores gender with accuracy - confirm this is intended.
    """
    # Sampled in a fixed order: bootstrap type, class weights, learning rate.
    bootstrap_choice = trial.suggest_categorical(
        "bootstrap_type", ["Bayesian", "Bernoulli", "Poisson", None]
    )
    class_weight_choice = trial.suggest_categorical(
        "auto_class_weights", ["Balanced", "SqrtBalanced", None]
    )
    learning_rate_choice = trial.suggest_float("learning_rate", 0.03, 0.5)

    tuned_params = {
        "bootstrap_type": bootstrap_choice,
        "auto_class_weights": class_weight_choice,
        "learning_rate": learning_rate_choice,
    }

    # Extra parameter that is only sampled for the matching bootstrap type.
    if bootstrap_choice == "Bayesian":
        tuned_params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif bootstrap_choice == "Bernoulli":
        tuned_params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    # Fixed text-processing configuration (not tuned).
    text_tokenizers = [
        {
            'tokenizer_id': 'Sense',
            'separator_type': 'BySense',
            'lowercasing': 'True',
            'token_types': ['Word', 'Number', 'SentenceBreak'],
            'sub_tokens_policy': 'SeveralTokens',
        }
    ]
    text_dictionaries = [
        {
            'dictionary_id': 'Word',
            'dictionary_type': 'Bpe',
            'max_dictionary_size': '5000',
            'num_bpe_units': 1,
        }
    ]
    text_feature_calcers = ['BoW:top_tokens_count=5000']

    trained_model = fit_model(
        gender_train_pool,
        gender_test_pool,
        verbose=500,
        iterations=1500,
        tokenizers=text_tokenizers,
        dictionaries=text_dictionaries,
        feature_calcers=text_feature_calcers,
        **tuned_params
    )

    # Row-level predictions are collapsed to one label per viewer before scoring.
    row_level_preds = trained_model.predict(gender_test_pool)
    user_level_preds = aggregate_score_by_user(gender_test, row_level_preds)

    return f1_score(gender_test[target_gender], user_level_preds, average="weighted")

In [11]:
if __name__ == "__main__":
    # Seed the (default) TPE sampler with the notebook-wide seed so the
    # sequence of suggested hyperparameters can be reproduced on a re-run;
    # the split and the CatBoost models are already seeded the same way.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    study.optimize(gender_objective, n_trials=50, show_progress_bar=True)

    # Summary of the search: trial count, best objective value, best params.
    print(f"Number of finished trials: {len(study.trials)}")
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

[I 2024-09-28 14:28:14,298] A new study created in memory with name: no-name-12b2b4b6-93b4-4e09-a47c-8a3953a3da63


  0%|          | 0/50 [00:00<?, ?it/s]

0:	learn: 0.7109818	test: 0.7113572	best: 0.7113572 (0)	total: 72ms	remaining: 1m 47s
500:	learn: 0.7904616	test: 0.7878487	best: 0.7878800 (499)	total: 21.6s	remaining: 43.1s
1000:	learn: 0.8005670	test: 0.7957440	best: 0.7957609 (996)	total: 43.7s	remaining: 21.8s
1499:	learn: 0.8078075	test: 0.8009057	best: 0.8009057 (1499)	total: 1m 6s	remaining: 0us
bestTest = 0.8009056778
bestIteration = 1499
[I 2024-09-28 14:30:16,185] Trial 0 finished with value: 0.826623875248326 and parameters: {'bootstrap_type': None, 'auto_class_weights': 'Balanced', 'learning_rate': 0.3450282514286508}. Best is trial 0 with value: 0.826623875248326.
0:	learn: 0.7109818	test: 0.7113572	best: 0.7113572 (0)	total: 53.5ms	remaining: 1m 20s
500:	learn: 0.7753572	test: 0.7744524	best: 0.7744626 (499)	total: 21.1s	remaining: 42s
1000:	learn: 0.7817469	test: 0.7802245	best: 0.7802245 (1000)	total: 41.6s	remaining: 20.7s
1499:	learn: 0.7852265	test: 0.7828991	best: 0.7829083 (1488)	total: 1m 2s	remaining: 0us
bestT