In [1]:
from typing import List, Optional, Tuple

import catboost as cb
import pandas as pd
from catboost import Pool, CatBoostClassifier

# Seed forwarded to CatBoostClassifier(random_seed=...) inside fit_model.
random_state = 42

## Data Processing

In [2]:
# --- Column configuration -------------------------------------------------

# Label columns: one model is trained per target.
target_gender = "sex"
target_age = "age_class"

# Columns handed to CatBoost as categorical features.
cat_features = [
    "region",
    "ua_device_type",
    "ua_client_type",
    "ua_os",
    "ua_client_name",
    "category",
]

# Column handed to CatBoost as a text feature.
text_feature = "title"

# Name of the timestamp column.
date_feature = "event_timestamp"

# Column removed right after the dataset is loaded.
drop_feature = "age"

# Identifier columns; excluded from the feature matrix when building a Pool.
id_columns = [
    "viewer_uid",
    "rutube_video_id",
    "author_id",
]

In [3]:
# Load the prepared dataset, remove the `drop_feature` column and replace
# missing values with the literal string 'none'.
# NOTE(review): fillna('none') is applied to every column, not only the
# categorical/text ones - confirm no numeric column contains NaN.
dataset = (
    pd.read_csv('/kaggle/input/catboost-dataset/catboost_dataset.csv')
    .drop(columns=[drop_feature])
    .fillna('none')
)

# New feature: number of rows sharing the same (viewer_uid, day) pair.
# NOTE(review): the grouping ignores 'month'/'year'; if 'day' is a
# day-of-month value, different months are counted together - confirm.
dataset['videos_per_day'] = dataset.groupby(['viewer_uid', 'day']).transform('size')

# Remove time components that are not used as model features.
dataset = dataset.drop(columns=["second", "minute", "month", "year"])

## Utils

In [4]:
def set_pool(data: pd.DataFrame, target: str, id_columns: List[str]) -> Pool:
    """
    Build a CatBoost ``Pool`` from a DataFrame.

    Args:
        data: Frame holding the features, the target column and the id columns.
        target: Name of the label column.
        id_columns: Columns to exclude from the feature matrix.

    Returns:
        A Pool whose features are ``data`` without ``id_columns`` and ``target``,
        whose label is ``data[target]``, and whose categorical / text features
        come from the module-level ``cat_features`` and ``text_feature``.
    """
    non_feature_columns = id_columns + [target]
    features = data.drop(non_feature_columns, axis=1)
    labels = data[target]

    return Pool(
        data=features,
        label=labels,
        cat_features=cat_features,
        text_features=[text_feature],
    )


def fit_model(
    train_pool: cb.core.Pool,
    verbose: int,
    multiclass: bool = True,
    *,
    eval_pool: Optional[cb.core.Pool] = None,
    **kwargs
) -> cb.core.CatBoostClassifier:
    """
    Trains a CatBoost classifier on the given training pool and returns the fitted model.

    Args:
        train_pool: Pool with the training features and labels.
        verbose: Forwarded to ``CatBoostClassifier.fit`` to control logging.
        multiclass: When True the model uses the ``MultiClass`` loss with the
            ``TotalF1`` metric; otherwise ``Logloss`` with ``Accuracy``.
        eval_pool: Optional validation pool passed to ``fit`` as ``eval_set``.
            When omitted (the default) no validation data is used, in which
            case the ``od_type`` / ``od_wait`` overfitting-detector settings
            have nothing to monitor.
        **kwargs: Additional ``CatBoostClassifier`` parameters. They take
            precedence over the defaults set here (e.g. ``task_type='CPU'``).

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    if multiclass:
        loss_function, eval_metric = 'MultiClass', 'TotalF1'
    else:
        loss_function, eval_metric = 'Logloss', 'Accuracy'

    # Defaults first, caller-supplied parameters last so they can override
    # a default instead of raising a duplicate-keyword TypeError.
    params = {
        'task_type': 'GPU',
        'loss_function': loss_function,
        'eval_metric': eval_metric,
        'od_type': 'Iter',
        'od_wait': 100,
        'random_seed': random_state,
    }
    params.update(kwargs)

    model = CatBoostClassifier(**params)
    return model.fit(
        train_pool,
        eval_set=eval_pool,
        verbose=verbose,
    )

## Age model

Best Optuna params:

```yaml
Params: 
    bootstrap_type: Bernoulli
    auto_class_weights: None
    learning_rate: 0.4976669440909651
    subsample: 0.9354177622692733

```

In [5]:
# Training pool for the age model: the gender target is removed so it
# cannot be used as a feature.
age_train_pool = set_pool(
    data=dataset.drop(columns=target_gender),
    target=target_age,
    id_columns=id_columns,
)

In [6]:
# Text-processing configuration applied to the pool's text feature.
age_text_processing = {
    'tokenizers': [
        {
            'tokenizer_id': 'Sense',
            'separator_type': 'BySense',
            'lowercasing': 'True',
            'token_types': ['Word', 'Number', 'SentenceBreak'],
            'sub_tokens_policy': 'SeveralTokens',
        }
    ],
    'dictionaries': [
        {
            'dictionary_id': 'Word',
            'dictionary_type': 'Bpe',
            'max_dictionary_size': '5000',
            'num_bpe_units': 1,
        }
    ],
    'feature_calcers': ['BoW:top_tokens_count=5000'],
}

# Boosting hyper-parameters for the age model.
age_hyperparams = {
    'iterations': 5000,
    'bootstrap_type': 'Bernoulli',
    'learning_rate': 0.4976669440909651,
    'subsample': 0.9354177622692733,
}

# multiclass is left at its default (True): MultiClass loss, TotalF1 metric.
age_model = fit_model(
    age_train_pool,
    verbose=100,
    **age_hyperparams,
    **age_text_processing,
)

0:	learn: 0.3879344	total: 4.73s	remaining: 6h 33m 52s
100:	learn: 0.4769632	total: 10.4s	remaining: 8m 25s
200:	learn: 0.4959503	total: 16.4s	remaining: 6m 32s
300:	learn: 0.5091697	total: 22.7s	remaining: 5m 54s
400:	learn: 0.5194496	total: 28.9s	remaining: 5m 31s
500:	learn: 0.5283364	total: 35.1s	remaining: 5m 15s
600:	learn: 0.5356860	total: 41.4s	remaining: 5m 3s
700:	learn: 0.5419071	total: 47.9s	remaining: 4m 53s
800:	learn: 0.5477768	total: 54.3s	remaining: 4m 44s
900:	learn: 0.5528745	total: 1m	remaining: 4m 36s
1000:	learn: 0.5578871	total: 1m 7s	remaining: 4m 28s
1100:	learn: 0.5625843	total: 1m 13s	remaining: 4m 21s
1200:	learn: 0.5672158	total: 1m 20s	remaining: 4m 13s
1300:	learn: 0.5714387	total: 1m 26s	remaining: 4m 7s
1400:	learn: 0.5755397	total: 1m 33s	remaining: 4m
1500:	learn: 0.5786810	total: 1m 40s	remaining: 3m 53s
1600:	learn: 0.5816910	total: 1m 46s	remaining: 3m 46s
1700:	learn: 0.5851990	total: 1m 53s	remaining: 3m 39s
1800:	learn: 0.5887442	total: 2m	remai

In [7]:
# Persist the trained age model in CatBoost's binary format.
age_model.save_model(fname="age-catboost-model.cbm", format="cbm", pool=age_train_pool)

## Gender model

Best Optuna params:
```yaml
Params: 
    bootstrap_type: Bernoulli
    auto_class_weights: None
    learning_rate: 0.4983777521803222
    subsample: 0.9606212019353756
```

In [8]:
# Training pool for the gender model: the age target is removed so it
# cannot be used as a feature.
gender_train_pool = set_pool(
    data=dataset.drop(columns=target_age),
    target=target_gender,
    id_columns=id_columns,
)

In [9]:
# Text-processing configuration applied to the pool's text feature.
gender_text_processing = {
    'tokenizers': [
        {
            'tokenizer_id': 'Sense',
            'separator_type': 'BySense',
            'lowercasing': 'True',
            'token_types': ['Word', 'Number', 'SentenceBreak'],
            'sub_tokens_policy': 'SeveralTokens',
        }
    ],
    'dictionaries': [
        {
            'dictionary_id': 'Word',
            'dictionary_type': 'Bpe',
            'max_dictionary_size': '5000',
            'num_bpe_units': 1,
        }
    ],
    'feature_calcers': ['BoW:top_tokens_count=5000'],
}

# Boosting hyper-parameters for the gender model.
gender_hyperparams = {
    'iterations': 5000,
    'bootstrap_type': 'Bernoulli',
    'learning_rate': 0.4983777521803222,
    'subsample': 0.9606212019353756,
}

# NOTE(review): multiclass is left at its default (True), so this trains with
# the MultiClass loss and TotalF1 metric. If `sex` has only two classes,
# check whether multiclass=False (Logloss / Accuracy) was intended.
gender_model = fit_model(
    gender_train_pool,
    verbose=100,
    **gender_hyperparams,
    **gender_text_processing,
)

0:	learn: 0.7099180	total: 59.5ms	remaining: 4m 57s
100:	learn: 0.7783696	total: 3.44s	remaining: 2m 47s
200:	learn: 0.7862559	total: 7.12s	remaining: 2m 49s
300:	learn: 0.7917883	total: 10.7s	remaining: 2m 47s
400:	learn: 0.7967212	total: 14.4s	remaining: 2m 45s
500:	learn: 0.8013284	total: 18.2s	remaining: 2m 43s
600:	learn: 0.8047856	total: 22s	remaining: 2m 40s
700:	learn: 0.8077177	total: 25.8s	remaining: 2m 38s
800:	learn: 0.8101531	total: 29.7s	remaining: 2m 35s
900:	learn: 0.8126063	total: 33.6s	remaining: 2m 32s
1000:	learn: 0.8144257	total: 37.5s	remaining: 2m 29s
1100:	learn: 0.8164586	total: 41.5s	remaining: 2m 26s
1200:	learn: 0.8185886	total: 45.5s	remaining: 2m 23s
1300:	learn: 0.8205268	total: 49.5s	remaining: 2m 20s
1400:	learn: 0.8221318	total: 53.4s	remaining: 2m 17s
1500:	learn: 0.8240110	total: 57.3s	remaining: 2m 13s
1600:	learn: 0.8256800	total: 1m 1s	remaining: 2m 10s
1700:	learn: 0.8268657	total: 1m 5s	remaining: 2m 6s
1800:	learn: 0.8280073	total: 1m 9s	remain

In [10]:
# Persist the trained gender model in CatBoost's binary format.
gender_model.save_model(fname="gender-catboost-model.cbm", format="cbm", pool=gender_train_pool)