In [1]:
import pandas as pd
import numpy as np
import catboost as cb

from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report

from typing import List, Tuple

## Dataset processing

In [2]:
# Load the prepared dataset from the Kaggle input directory (the path is environment-specific).
dataset = pd.read_csv('/kaggle/input/catboost-dataset/catboost_dataset.csv')
# Seed shared by the train/test split (`split_data`) and by CatBoost training (`fit_model`).
random_state = 42

In [3]:
# Column configuration shared by the helper functions and cells below.

# Target columns: one model is trained per target.
target_gender = "sex"
target_age = "age_class"

# Columns passed to the CatBoost Pool as categorical features (see `set_pool`).
cat_features = ["region", "ua_device_type", "ua_client_type", "ua_os", "ua_client_name", "category"]
# Column passed to the CatBoost Pool as the single text feature (see `set_pool`).
text_feature = "title"
# NOTE(review): `number_feature` and `date_feature` are not referenced by any visible cell,
# and "event_timestamp" is absent from the `dataset.columns` output — confirm they are still needed.
number_feature = ["total_watchtime", "duration"]
date_feature = "event_timestamp"

# Raw integer age column, dropped before modelling in favour of `age_class`.
drop_feature = "age"

# Identifier columns removed from the feature matrix when building a Pool.
id_columns = ["viewer_uid", "rutube_video_id", "author_id"]

In [4]:
# Preview the first rows to sanity-check the loaded columns and values.
dataset.head()

Unnamed: 0,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid,age,sex,...,year,month,day,hour,minute,second,day_of_week,is_weekend,hour_sin,hour_cos
0,Chelyabinsk,desktop,browser,Windows,Yandex Browser,1883,video_133074,10067243,20,female,...,2024,6,1,8,40,58,5,1,0.866025,-0.5
1,Chelyabinsk,desktop,browser,Windows,Yandex Browser,851,video_61152,10067243,20,female,...,2024,6,1,8,10,14,5,1,0.866025,-0.5
2,Chelyabinsk,desktop,browser,Windows,Yandex Browser,5516,video_96775,10067243,20,female,...,2024,6,2,8,15,7,6,1,0.866025,-0.5
3,Chelyabinsk,desktop,browser,Windows,Yandex Browser,3615,video_402535,10067243,20,female,...,2024,6,3,19,35,40,0,0,-0.965926,0.258819
4,Chelyabinsk,desktop,browser,Windows,Yandex Browser,6144,video_180483,10067243,20,female,...,2024,6,5,21,20,33,2,0,-0.707107,0.707107


In [5]:
# Drop the raw integer AGE column: only the `age_class` target is used for classification.
# `errors='ignore'` keeps this cell re-runnable once `age` has already been removed.
dataset = dataset.drop(columns=drop_feature, errors='ignore')

# Fill missing values only in the categorical/text columns, where the 'none' placeholder
# is a valid category/token. A blanket `fillna('none')` would also inject strings into
# numeric and target columns, silently turning them into object dtype.
dataset = dataset.fillna({column: 'none' for column in cat_features + [text_feature]})

In [6]:
# List the remaining columns after dropping the raw age column.
dataset.columns

Index(['region', 'ua_device_type', 'ua_client_type', 'ua_os', 'ua_client_name',
       'total_watchtime', 'rutube_video_id', 'viewer_uid', 'sex', 'age_class',
       'title', 'category', 'duration', 'author_id', 'value_counts', 'year',
       'month', 'day', 'hour', 'minute', 'second', 'day_of_week', 'is_weekend',
       'hour_sin', 'hour_cos'],
      dtype='object')

## Utils

In [7]:
def split_data(data: pd.DataFrame, target: str, test_size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Produce a stratified train/test split of `data`.

    Args:
        data: Full dataset, including the `target` column.
        target: Name of the column whose class proportions are preserved in both parts.
        test_size: Fraction (or absolute size) of rows placed in the test part.

    Returns:
        A `(train, test)` pair of DataFrames.

    The split is seeded with the module-level `random_state`.
    NOTE(review): rows are split individually, so rows sharing a `viewer_uid` can land
    in both parts — confirm this is intended for user-level targets.
    """
    split_options = dict(
        test_size=test_size,
        stratify=data[target],
        random_state=random_state,
    )
    train_part, test_part = train_test_split(data, **split_options)
    return train_part, test_part

In [8]:
def set_pool(data: pd.DataFrame, target: str, id_columns: List[str]) -> Pool:
    """
    Wrap a DataFrame into a CatBoost `Pool`.

    Args:
        data: Frame holding features, the `target` column and the identifier columns.
        target: Name of the label column.
        id_columns: Identifier columns excluded from the feature matrix.

    Returns:
        A `Pool` whose features are every column except `id_columns` and `target`,
        with the module-level `cat_features` marked as categorical and
        `text_feature` marked as text.
    """
    excluded_columns = id_columns + [target]
    features = data.drop(columns=excluded_columns)
    labels = data[target]
    return Pool(
        data=features,
        label=labels,
        cat_features=cat_features,
        text_features=[text_feature],
    )

In [9]:
def fit_model(
    train_pool: cb.core.Pool, 
    test_pool: cb.core.Pool, 
    multiclass: bool = True, 
    **kwargs
) -> cb.core.CatBoostClassifier:
    """
    Trains a CatBoost classifier on `train_pool`, using `test_pool` as the evaluation set.

    Args:
        train_pool: Pool used for fitting.
        test_pool: Pool passed as `eval_set`; it drives the overfitting detector and
            best-iteration selection (`use_best_model=True`).
        multiclass: When True, trains with `MultiClass` loss and the `TotalF1` metric;
            otherwise with `Logloss` loss and the `Accuracy` metric.
        **kwargs: Extra `CatBoostClassifier` parameters. They take precedence over the
            defaults set here (`task_type`, `loss_function`, `eval_metric`, `od_type`,
            `od_wait`, `random_seed`), e.g. `task_type='CPU'` on a machine without a GPU.

    Returns:
        The fitted classifier.

    NOTE(review): the same pool is used for model selection and for the metrics reported
    later in the notebook, so those metrics may be optimistic — confirm this is acceptable.
    """
    if multiclass:
        objective = {'loss_function': 'MultiClass', 'eval_metric': 'TotalF1'}
    else:
        objective = {'loss_function': 'Logloss', 'eval_metric': 'Accuracy'}

    params = {
        'task_type': 'GPU',
        **objective,
        'od_type': 'Iter',
        'od_wait': 100,
        'random_seed': random_state,
    }
    # Merge instead of passing defaults and **kwargs side by side: a duplicated key
    # would otherwise raise "got multiple values for keyword argument".
    params.update(kwargs)

    model = CatBoostClassifier(**params)
    return model.fit(
        train_pool, 
        eval_set=test_pool, 
        verbose=100,
        use_best_model=True,
    )

In [10]:
def print_classification_result(real: pd.Series, pred: pd.Series, multiclass: bool = True):
    """
    Print a headline metric followed by the full classification report.

    Args:
        real: Ground-truth labels.
        pred: Predicted labels.
        multiclass: When True the headline is the weighted F1-score,
            otherwise it is the accuracy.
    """
    if multiclass:
        headline = f'F1: {f1_score(real, pred, average="weighted")}'
    else:
        headline = f'Accuracy: {accuracy_score(real, pred)}'

    report = classification_report(real, pred)
    print(headline, report, sep='\n')

In [11]:
def aggregate_score_by_user(test: pd.DataFrame, preds: pd.DataFrame):
    """
    Aggregates predictions by user ID by taking the mode of the predicted values.

    Args:
        test: Frame containing a `viewer_uid` column; it is NOT modified.
        preds: Row-level predictions, one per row of `test`. Array-likes are matched
            to rows by position (a single-column 2-D array is flattened); pandas
            objects are aligned on their index, as a column assignment would do.

    Returns:
        A Series indexed like `test` where every row carries the most frequent
        prediction among all rows with the same `viewer_uid`. Ties resolve to the
        first value returned by `Series.mode()`.
    """
    if isinstance(preds, (pd.Series, pd.DataFrame)):
        pred_values = preds
    else:
        # Predictions may arrive as an (n, 1) array; a column needs 1-D values.
        pred_values = np.asarray(preds).ravel()

    # `assign` works on a copy, so the caller's frame never gains a 'preds' column.
    user_preds = test[['viewer_uid']].assign(preds=pred_values)
    mode_by_user = user_preds.groupby('viewer_uid')['preds'].apply(lambda x: x.mode()[0])

    # Broadcast each user's modal prediction back onto that user's rows.
    return test['viewer_uid'].map(mode_by_user)

In [12]:
def final_score(
    gender_real: pd.DataFrame,
    gender_pred: pd.DataFrame, 
    age_real: pd.DataFrame, 
    age_pred: pd.DataFrame
):
    """
    Combine both tasks into one number: 0.3 * gender accuracy + 0.7 * weighted age F1.

    Args:
        gender_real: Ground-truth gender labels.
        gender_pred: Predicted gender labels.
        age_real: Ground-truth age-class labels.
        age_pred: Predicted age-class labels.

    Returns:
        The combined score, which is also printed.
    """
    gender_accuracy = accuracy_score(gender_real, gender_pred)
    age_weighted_f1 = f1_score(age_real, age_pred, average="weighted")

    result = 0.3 * gender_accuracy + 0.7 * age_weighted_f1
    print(f'Final score: {result}')
    return result

## Age target

In [13]:
# Hold out 20% of the rows, stratified by the age class.
age_train, age_test = split_data(dataset, target=target_age, test_size=0.2)

In [14]:
# Build the train/test Pools for the age model; the gender target is removed first
# so that it cannot be used as a feature.
age_train_pool, age_test_pool = (
    set_pool(part.drop(columns=target_gender), target_age, id_columns)
    for part in (age_train, age_test)
)

In [16]:
# Text-processing options applied to the text feature (`title`) of the age model.
age_text_processing = {
    'tokenizers': [
        {
            'tokenizer_id': 'Sense',
            'separator_type': 'BySense',
            'lowercasing': 'True',
            'token_types': ['Word', 'Number', 'SentenceBreak'],
            'sub_tokens_policy': 'SeveralTokens',
        },
    ],
    'dictionaries': [
        {
            'dictionary_id': 'Word',
            'dictionary_type': 'Bpe',
            'max_dictionary_size': '10000',
        },
    ],
    'feature_calcers': ['BoW:top_tokens_count=10000'],
}

# Multiclass age model (default `multiclass=True`).
age_model = fit_model(
    age_train_pool,
    age_test_pool,
    iterations=5000,
    auto_class_weights='SqrtBalanced',
    learning_rate=0.1,
    **age_text_processing,
)

0:	learn: 0.3458689	test: 0.3451817	best: 0.3451817 (0)	total: 9.03s	remaining: 12h 32m 38s
100:	learn: 0.4084335	test: 0.4076540	best: 0.4076540 (100)	total: 17.4s	remaining: 14m 1s
200:	learn: 0.4228786	test: 0.4218426	best: 0.4218426 (200)	total: 24.8s	remaining: 9m 53s
300:	learn: 0.4330425	test: 0.4314734	best: 0.4314734 (300)	total: 32.6s	remaining: 8m 29s
400:	learn: 0.4415456	test: 0.4395170	best: 0.4395170 (400)	total: 40.4s	remaining: 7m 43s
500:	learn: 0.4477857	test: 0.4451344	best: 0.4451344 (500)	total: 48.4s	remaining: 7m 14s
600:	learn: 0.4532647	test: 0.4499612	best: 0.4499612 (600)	total: 56.2s	remaining: 6m 51s
700:	learn: 0.4585291	test: 0.4547666	best: 0.4547666 (700)	total: 1m 4s	remaining: 6m 33s
800:	learn: 0.4628052	test: 0.4582491	best: 0.4582491 (800)	total: 1m 11s	remaining: 6m 17s
900:	learn: 0.4663728	test: 0.4618475	best: 0.4618475 (900)	total: 1m 19s	remaining: 6m 3s
1000:	learn: 0.4693393	test: 0.4646124	best: 0.4646124 (1000)	total: 1m 27s	remaining: 5

In [17]:
# Row-level age-class predictions for the held-out pool.
age_preds = age_model.predict(age_test_pool)

In [19]:
# Row-level quality of the age model: weighted F1 headline plus the per-class report.
print_classification_result(age_test[target_age], age_preds)

F1: 0.5306669757183082
              precision    recall  f1-score   support

           0       0.50      0.29      0.37     13259
           1       0.56      0.63      0.59    130752
           2       0.53      0.47      0.50    134760
           3       0.50      0.53      0.51     73153

    accuracy                           0.53    351924
   macro avg       0.52      0.48      0.49    351924
weighted avg       0.53      0.53      0.53    351924



In [22]:
# Replace every row's age prediction with the most frequent prediction of its viewer_uid.
age_final_preds = aggregate_score_by_user(age_test, age_preds)

In [24]:
# Same metrics as the row-level report, computed on the per-user aggregated predictions.
print_classification_result(age_test[target_age], age_final_preds)

F1: 0.5497243546667704
              precision    recall  f1-score   support

           0       0.53      0.33      0.41     13259
           1       0.56      0.68      0.62    130752
           2       0.55      0.48      0.51    134760
           3       0.54      0.52      0.53     73153

    accuracy                           0.55    351924
   macro avg       0.55      0.50      0.52    351924
weighted avg       0.55      0.55      0.55    351924



## Gender target

In [25]:
# Hold out 20% of the rows, stratified by gender (a split independent of the age one).
gender_train, gender_test = split_data(dataset, target=target_gender, test_size=0.2)

In [26]:
# Build the train/test Pools for the gender model; the age target is removed first
# so that it cannot be used as a feature.
gender_train_pool, gender_test_pool = (
    set_pool(part.drop(columns=target_age), target_gender, id_columns)
    for part in (gender_train, gender_test)
)

In [27]:
# Text-processing options applied to the text feature (`title`) of the gender model.
gender_text_processing = {
    'tokenizers': [
        {
            'tokenizer_id': 'Sense',
            'separator_type': 'BySense',
            'lowercasing': 'True',
            'token_types': ['Word', 'Number', 'SentenceBreak'],
            'sub_tokens_policy': 'SeveralTokens',
        },
    ],
    'dictionaries': [
        {
            'dictionary_id': 'Word',
            'dictionary_type': 'Bpe',
            'max_dictionary_size': '10000',
        },
    ],
    'feature_calcers': ['BoW:top_tokens_count=10000'],
}

# Binary gender model: `multiclass=False` switches to Logloss / Accuracy.
gender_model = fit_model(
    gender_train_pool,
    gender_test_pool,
    multiclass=False,
    iterations=5000,
    auto_class_weights='SqrtBalanced',
    learning_rate=0.1,
    **gender_text_processing,
)

0:	learn: 0.7060282	test: 0.7065187	best: 0.7065187 (0)	total: 6.57s	remaining: 9h 7m 36s
100:	learn: 0.7606267	test: 0.7608294	best: 0.7608294 (100)	total: 13.1s	remaining: 10m 33s
200:	learn: 0.7685760	test: 0.7684209	best: 0.7684209 (200)	total: 18s	remaining: 7m 10s
300:	learn: 0.7727842	test: 0.7725103	best: 0.7725103 (300)	total: 23s	remaining: 5m 58s
400:	learn: 0.7759257	test: 0.7754169	best: 0.7754169 (400)	total: 27.9s	remaining: 5m 20s
500:	learn: 0.7783096	test: 0.7774217	best: 0.7774276 (499)	total: 32.9s	remaining: 4m 55s
600:	learn: 0.7801938	test: 0.7792211	best: 0.7792458 (599)	total: 37.8s	remaining: 4m 36s
700:	learn: 0.7820154	test: 0.7810042	best: 0.7810042 (700)	total: 42.9s	remaining: 4m 22s
800:	learn: 0.7837542	test: 0.7823490	best: 0.7823490 (800)	total: 47.9s	remaining: 4m 11s
900:	learn: 0.7851487	test: 0.7834650	best: 0.7834709 (895)	total: 53.1s	remaining: 4m 1s
1000:	learn: 0.7863878	test: 0.7846385	best: 0.7846385 (1000)	total: 58.2s	remaining: 3m 52s
11

In [28]:
# Row-level gender predictions for the held-out pool.
gender_preds = gender_model.predict(gender_test_pool)

In [30]:
# Row-level quality of the gender model.
# NOTE(review): `multiclass` is left at its default (True), so the headline printed here is
# the weighted F1, while the aggregated gender result further down is reported as accuracy —
# pass `multiclass=False` if the two headlines are meant to be comparable.
print_classification_result(gender_test[target_gender], gender_preds)

F1: 0.8047062144622759
              precision    recall  f1-score   support

      female       0.85      0.84      0.84    216903
        male       0.74      0.75      0.75    135021

    accuracy                           0.80    351924
   macro avg       0.79      0.79      0.79    351924
weighted avg       0.81      0.80      0.80    351924



In [34]:
# Replace every row's gender prediction with the most frequent prediction of its viewer_uid.
gender_final_preds = aggregate_score_by_user(gender_test, gender_preds)

In [38]:
# Accuracy headline and per-class report for the per-user aggregated gender predictions.
print_classification_result(gender_test[target_gender], gender_final_preds, multiclass=False)

Accuracy: 0.8231038519680386
              precision    recall  f1-score   support

      female       0.85      0.87      0.86    216903
        male       0.78      0.75      0.77    135021

    accuracy                           0.82    351924
   macro avg       0.81      0.81      0.81    351924
weighted avg       0.82      0.82      0.82    351924



## Final score

In [39]:
# Combined score from the per-user aggregated predictions of both models.
# The result is bound to `_` so the cell shows only the printed line.
_ = final_score(
    gender_real=gender_test[target_gender],
    gender_pred=gender_final_preds,
    age_real=age_test[target_age],
    age_pred=age_final_preds,
)

Final score: 0.6317382038571508


## Models analysis

### Age model

In [45]:
# Feature importances of the trained age model (computed with the method's default settings).
age_fi = age_model.get_feature_importance()

In [47]:
# Pair each importance with its feature name and list the most important features first.
age_importance_table = pd.DataFrame(
    {
        'feature_importance': age_fi,
        'feature_names': age_test_pool.get_feature_names(),
    }
)
age_importance_table.sort_values(by=['feature_importance'], ascending=False)

Unnamed: 0,feature_importance,feature_names
6,40.251881,title
9,20.613221,value_counts
0,8.663417,region
7,5.867908,category
3,5.124102,ua_os
8,4.285915,duration
4,4.116798,ua_client_name
5,3.120645,total_watchtime
1,2.616837,ua_device_type
2,1.771027,ua_client_type


In [51]:
# Feature importances of the trained gender model (computed with the method's default settings).
gender_fi = gender_model.get_feature_importance()

In [52]:
# Pair each importance with its feature name and list the most important features first.
gender_importance_table = pd.DataFrame(
    {
        'feature_importance': gender_fi,
        'feature_names': gender_test_pool.get_feature_names(),
    }
)
gender_importance_table.sort_values(by=['feature_importance'], ascending=False)

Unnamed: 0,feature_importance,feature_names
6,55.076348,title
9,18.266888,value_counts
7,5.678123,category
1,4.188016,ua_device_type
8,3.811734,duration
2,2.40998,ua_client_type
3,2.364783,ua_os
0,2.172856,region
4,1.869559,ua_client_name
12,1.07297,day


## Итого

Выводы для улучшений:
- можно убрать последние N признаков, исходя из feature importance (например, ["second", "minute", "month","year"])
- можно перебрать ряд параметров через Optuna (например, автобалансировку классов, тип бустинга, бутстрапа и т. д.)
- возможно добавить новые смысловые переменные
- подумать над бейзлайн-параметрами моделей