In [1]:
from typing import List, Optional, Tuple

import catboost as cb
import pandas as pd
from catboost import CatBoostClassifier, Pool

# Seed handed to CatBoost as `random_seed` in fit_model, so repeated runs start from the same seed.
random_state = 42

## Data Processing

In [2]:
# Column configuration shared by every cell below.

# Prediction targets: one model is trained per target.
target_gender = "sex"
target_age = "age_class"

# Feature columns that need special handling by CatBoost.
cat_features = [
    "region",
    "ua_device_type",
    "ua_client_type",
    "ua_os",
    "ua_client_name",
    "category",
]
text_feature = "title"
date_feature = "event_timestamp"

# Column removed right after the CSV is loaded.
drop_feature = "age"

# Identifier columns; they are excluded from the feature matrix in set_pool.
id_columns = [
    "viewer_uid",
    "rutube_video_id",
    "author_id",
]

In [3]:
# Build the training frame in a single chain so the cell is idempotent on re-run.
dataset = (
    pd.read_csv('/kaggle/input/catboost-dataset/catboost_dataset.csv')
    # Remove the column named by `drop_feature`.
    .drop(columns=drop_feature)
    # Every missing value, in any column, becomes the literal string 'none'.
    .fillna('none')
    # New feature: number of rows sharing the same (viewer_uid, day) pair.
    # NOTE(review): assumes the CSV already provides a `day` column - confirm.
    .assign(
        videos_per_day=lambda frame: frame.groupby(['viewer_uid', 'day']).transform('size')
    )
    # Drop old time-part features judged unimportant.
    .drop(columns=["second", "minute", "month", "year"])
)

## Utils

In [4]:
def set_pool(data: pd.DataFrame, target: str, id_columns: List[str]) -> Pool:
    """
    Wrap a dataframe into a CatBoost Pool.

    The identifier columns and the target column are removed from the feature
    matrix, and the target column is used as the label. Categorical and text
    columns are taken from the module-level `cat_features` and `text_feature`.
    """
    non_feature_columns = id_columns + [target]
    features = data.drop(columns=non_feature_columns)
    labels = data[target]

    return Pool(
        data=features,
        label=labels,
        cat_features=cat_features,
        text_features=[text_feature],
    )


def fit_model(
    train_pool: cb.core.Pool,
    verbose: int,
    multiclass: bool = True,
    *,
    eval_pool: Optional[cb.core.Pool] = None,
    **kwargs
) -> cb.core.CatBoostClassifier:
    """
    Trains a CatBoost classifier on the given training pool.

    Args:
        train_pool: Pool holding the training features and labels.
        verbose: Value forwarded to ``fit(verbose=...)``; with an integer,
            CatBoost logs once per that many iterations.
        multiclass: When True (default) the model uses the ``MultiClass`` loss
            with the ``TotalF1`` metric; when False it uses ``Logloss`` with
            ``Accuracy``.
        eval_pool: Optional validation pool passed to ``fit`` as ``eval_set``.
            Without it the overfitting detector configured below
            (``od_type``/``od_wait``) has nothing to monitor and CatBoost
            switches ``use_best_model`` off.
        **kwargs: Extra ``CatBoostClassifier`` parameters. They take precedence
            over the defaults set here, so e.g. ``task_type='CPU'`` overrides
            the GPU default instead of raising a duplicate-keyword TypeError.

    Returns:
        The fitted classifier.
    """
    if multiclass:
        loss_function, eval_metric = 'MultiClass', 'TotalF1'
    else:
        loss_function, eval_metric = 'Logloss', 'Accuracy'

    # Defaults first, caller overrides second: the later update wins.
    params = {
        'task_type': 'GPU',
        'loss_function': loss_function,
        'eval_metric': eval_metric,
        'od_type': 'Iter',
        'od_wait': 100,
        'random_seed': random_state,
    }
    params.update(kwargs)

    model = CatBoostClassifier(**params)
    return model.fit(
        train_pool,
        eval_set=eval_pool,  # None keeps the original train-only behaviour
        verbose=verbose,
    )

## Age model

Best hyperparameters found by Optuna:

```yaml
Params: 
    bootstrap_type: Bernoulli
    auto_class_weights: None
    learning_rate: 0.8715306415556401
    subsample: 0.9110538663058473

```

In [5]:
# The gender label is removed first so it cannot be used as a feature for the age model.
age_train_pool = set_pool(
    data=dataset.drop(columns=target_gender),
    target=target_age,
    id_columns=id_columns,
)

In [6]:
# Text-processing settings forwarded to CatBoost for the text feature:
# one tokenizer, one BPE dictionary and a bag-of-words feature calcer.
age_text_params = {
    'tokenizers': [
        {
            'tokenizer_id': 'Sense',
            'separator_type': 'BySense',
            'lowercasing': 'True',
            'token_types': ['Word', 'Number', 'SentenceBreak'],
            'sub_tokens_policy': 'SeveralTokens',
        },
    ],
    'dictionaries': [
        {
            'dictionary_id': 'Word',
            'dictionary_type': 'Bpe',
            'max_dictionary_size': '5000',
            'num_bpe_units': 1,
        },
    ],
    'feature_calcers': ['BoW:top_tokens_count=5000'],
}

# Boosting settings; learning_rate and subsample are the tuned values listed above.
age_boosting_params = {
    'iterations': 5000,
    'bootstrap_type': 'Bernoulli',
    'learning_rate': 0.8715306415556401,
    'subsample': 0.9110538663058473,
}

age_model = fit_model(
    age_train_pool,
    verbose=100,
    **age_boosting_params,
    **age_text_params,
)

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 0.3879344	total: 5.09s	remaining: 7h 4m 19s
100:	learn: 0.4817955	total: 10.7s	remaining: 8m 39s
200:	learn: 0.5033066	total: 16.7s	remaining: 6m 37s
300:	learn: 0.5171349	total: 22.7s	remaining: 5m 54s
400:	learn: 0.5280119	total: 28.9s	remaining: 5m 30s
500:	learn: 0.5371418	total: 35.1s	remaining: 5m 15s
600:	learn: 0.5460257	total: 41.4s	remaining: 5m 3s
700:	learn: 0.5525213	total: 47.6s	remaining: 4m 52s
800:	learn: 0.5596827	total: 54.1s	remaining: 4m 43s
900:	learn: 0.5656622	total: 1m	remaining: 4m 35s
1000:	learn: 0.5714563	total: 1m 6s	remaining: 4m 27s
1100:	learn: 0.5767122	total: 1m 13s	remaining: 4m 19s
1200:	learn: 0.5813507	total: 1m 19s	remaining: 4m 12s
1300:	learn: 0.5858698	total: 1m 26s	remaining: 4m 5s
1400:	learn: 0.5900489	total: 1m 32s	remaining: 3m 57s
1500:	learn: 0.5936704	total: 1m 38s	remaining: 3m 50s
1600:	learn: 0.5972904	total: 1m 45s	remaining: 3m 43s
1700:	learn: 0.6009403	total: 1m 51s	remaining: 3m 37s
1800:	learn: 0.6048167	total: 1m 58

In [8]:
# Persist the trained age model in the `cbm` format, passing the training pool along.
age_model_path = "age-catboost-model.cbm"
age_model.save_model(age_model_path, format="cbm", pool=age_train_pool)

## Gender model

Best hyperparameters found by Optuna:
```yaml
Params: 
    bootstrap_type: Bernoulli
    auto_class_weights: None
    learning_rate: 0.48678853590907367
    subsample: 0.9887397939188028
```

In [9]:
# The age label is removed first so it cannot be used as a feature for the gender model.
gender_train_pool = set_pool(
    data=dataset.drop(columns=target_age),
    target=target_gender,
    id_columns=id_columns,
)

In [10]:
# Text-processing settings forwarded to CatBoost for the text feature:
# one tokenizer, one BPE dictionary and a bag-of-words feature calcer.
gender_text_params = {
    'tokenizers': [
        {
            'tokenizer_id': 'Sense',
            'separator_type': 'BySense',
            'lowercasing': 'True',
            'token_types': ['Word', 'Number', 'SentenceBreak'],
            'sub_tokens_policy': 'SeveralTokens',
        },
    ],
    'dictionaries': [
        {
            'dictionary_id': 'Word',
            'dictionary_type': 'Bpe',
            'max_dictionary_size': '5000',
            'num_bpe_units': 1,
        },
    ],
    'feature_calcers': ['BoW:top_tokens_count=5000'],
}

# Boosting settings; learning_rate and subsample are the tuned values listed above.
gender_boosting_params = {
    'iterations': 5000,
    'bootstrap_type': 'Bernoulli',
    'learning_rate': 0.48678853590907367,
    'subsample': 0.9887397939188028,
}

# NOTE(review): `multiclass` is not passed, so fit_model uses its default
# (MultiClass loss, TotalF1 metric). If `sex` is a two-class target,
# `multiclass=False` would select Logloss/Accuracy instead - confirm which is intended.
gender_model = fit_model(
    gender_train_pool,
    verbose=100,
    **gender_boosting_params,
    **gender_text_params,
)

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 0.7120940	total: 60.6ms	remaining: 5m 2s
100:	learn: 0.7780596	total: 3.52s	remaining: 2m 50s
200:	learn: 0.7864304	total: 7.04s	remaining: 2m 48s
300:	learn: 0.7918160	total: 10.6s	remaining: 2m 46s
400:	learn: 0.7965729	total: 14.4s	remaining: 2m 45s
500:	learn: 0.8003984	total: 18.3s	remaining: 2m 44s
600:	learn: 0.8037760	total: 22.2s	remaining: 2m 42s
700:	learn: 0.8071587	total: 26.1s	remaining: 2m 40s
800:	learn: 0.8100635	total: 30.1s	remaining: 2m 37s
900:	learn: 0.8125585	total: 34s	remaining: 2m 34s
1000:	learn: 0.8148148	total: 37.9s	remaining: 2m 31s
1100:	learn: 0.8168899	total: 41.8s	remaining: 2m 28s
1200:	learn: 0.8188357	total: 45.8s	remaining: 2m 24s
1300:	learn: 0.8205654	total: 49.7s	remaining: 2m 21s
1400:	learn: 0.8222122	total: 53.7s	remaining: 2m 17s
1500:	learn: 0.8236927	total: 57.7s	remaining: 2m 14s
1600:	learn: 0.8250337	total: 1m 1s	remaining: 2m 10s
1700:	learn: 0.8265311	total: 1m 5s	remaining: 2m 7s
1800:	learn: 0.8279136	total: 1m 9s	remaini

In [11]:
# Persist the trained gender model in the `cbm` format, passing the training pool along.
gender_model_path = "gender-catboost-model.cbm"
gender_model.save_model(gender_model_path, format="cbm", pool=gender_train_pool)