In [1]:
import pandas as pd
import catboost as cb

from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report

from typing import List

random_state = 42

## Data Processing

In [2]:
# Column names used by the cells below.

# Names held as prediction targets.
target_gender = "sex"
target_age = "age_class"

# Feature columns, grouped by how they are declared to CatBoost in `set_pool`.
cat_features = [
    "region",
    "ua_device_type",
    "ua_client_type",
    "ua_os",
    "ua_client_name",
    "category",
]
text_feature = "title"
date_feature = "event_timestamp"

# Not referenced elsewhere in the visible cells.
drop_feature = "age"

# Identifier columns; passed to `set_pool` as the columns to remove.
id_columns = ["viewer_uid", "rutube_video_id", "author_id"]

In [3]:
# Load the test set and prepare it in one chain:
#   1. replace every missing value with the string 'none',
#   2. add `videos_per_day`, the number of rows sharing the same (viewer_uid, day),
#   3. drop the second/minute/month/year columns.
dataset = (
    pd.read_csv('../../catboost_test.csv')
    .fillna('none')
    .assign(
        videos_per_day=lambda frame: frame.groupby(['viewer_uid', 'day']).transform('size')
    )
    .drop(columns=["second", "minute", "month", "year"])
)

## Utils

In [4]:
def set_pool(data: pd.DataFrame, id_columns: List[str]) -> cb.core.Pool:
    """
    Wrap a DataFrame in a CatBoost Pool.

    The columns named in `id_columns` are removed; every remaining column is
    handed to the Pool as data. Categorical and text columns are declared from
    the module-level `cat_features` and `text_feature` settings. No label is
    attached to the Pool.
    """
    feature_frame = data.drop(columns=id_columns)
    return Pool(
        data=feature_frame,
        cat_features=cat_features,
        text_features=[text_feature],
    )


def print_classification_result(real: pd.Series, pred: pd.Series, multiclass: bool = True):
    """
    Print a headline metric followed by the full classification report.

    The headline is the weighted F1-score when `multiclass` is true and the
    accuracy otherwise. Nothing is returned.
    """
    if multiclass:
        headline = f'F1: {f1_score(real, pred, average="weighted")}'
    else:
        headline = f'Accuracy: {accuracy_score(real, pred)}'

    report = classification_report(real, pred)
    print(headline, report, sep='\n')
    
    
def aggregate_score_by_user(test: pd.DataFrame, preds: pd.DataFrame):
    """
    Aggregate row-level predictions into one prediction per user.

    Each row of `test` is paired positionally with the corresponding entry of
    `preds`, and the most frequent prediction (mode) is taken per `viewer_uid`.
    When several values tie, the first one returned by `Series.mode` is used.

    Parameters:
        test: frame with a `viewer_uid` column, one row per prediction.
            It is not modified.
        preds: predictions for the rows of `test`, in the same order. Anything
            `pd.DataFrame` accepts that yields exactly one column
            (1-D array, (n, 1) array, Series, single-column DataFrame).

    Returns:
        A Series named `preds`, indexed by `viewer_uid`.

    Raises:
        ValueError: if `preds` does not have exactly one column or its length
            differs from the number of rows in `test`.
    """
    pred_frame = pd.DataFrame(preds)
    if pred_frame.shape[1] != 1:
        raise ValueError(
            f"preds must contain exactly one prediction column, got {pred_frame.shape[1]}"
        )
    if len(pred_frame) != len(test):
        raise ValueError(
            f"preds has {len(pred_frame)} rows but test has {len(test)} rows"
        )

    # Build a fresh frame from raw values: the caller's frame is left untouched
    # and the pairing is by position, independent of the index labels of `test`.
    per_row = pd.DataFrame({
        'viewer_uid': test['viewer_uid'].to_numpy(),
        'preds': pred_frame.iloc[:, 0].to_numpy(),
    })

    return per_row.groupby('viewer_uid')['preds'].apply(lambda user_preds: user_preds.mode()[0])


def final_score(
    gender_real: pd.DataFrame,
    gender_pred: pd.DataFrame,
    age_real: pd.DataFrame,
    age_pred: pd.DataFrame
):
    """
    Combine the gender and age metrics into a single score.

    The score is 0.3 * (gender accuracy) + 0.7 * (weighted age F1-score).
    It is printed and also returned.
    """
    gender_accuracy = accuracy_score(gender_real, gender_pred)
    age_f1 = f1_score(age_real, age_pred, average="weighted")

    result = 0.3 * gender_accuracy + 0.7 * age_f1
    print(f'Final score: {result}')
    return result

## Predict

In [5]:
# Age model: build the input pool from the test set, then load the saved classifier.
age_test_pool = set_pool(dataset, id_columns)

age_model = CatBoostClassifier()
age_model.load_model('../models/age-catboost-model.cbm')

In [6]:
# Gender model: build the input pool from the test set, then load the saved classifier.
gender_test_pool = set_pool(dataset, id_columns)

gender_model = CatBoostClassifier()
gender_model.load_model('../models/gender-catboost-model.cbm')

In [7]:
# Predict the age class and the gender with the loaded models, each on its own pool.
age_predict = age_model.predict(age_test_pool)
gender_predict = gender_model.predict(gender_test_pool)

In [10]:
# Per-row prediction frame: viewer id plus the predicted gender and age class.
# Only the id column is taken from `dataset`, so the full feature frame is not
# copied. The prediction arrays are flattened and attached by position, so the
# pairing does not depend on `dataset` carrying a default 0..n-1 index.
dataset_submission = pd.DataFrame({
    'viewer_uid': dataset['viewer_uid'],
    'sex': gender_predict.ravel(),
    'age_class': age_predict.ravel(),
})

In [11]:
dataset_submission

Unnamed: 0,viewer_uid,sex,age_class
0,22206,female,1
1,34531,male,3
2,25830,male,1
3,14838,female,3
4,13718,female,2
...,...,...,...
587730,23170,male,3
587731,43241,male,3
587732,31204,male,3
587733,40889,male,1


In [12]:
# Collapse the row-level predictions to one value per viewer: age first, then gender.
final_age_predict, final_gender_predict = (
    aggregate_score_by_user(dataset_submission, row_level_preds)
    for row_level_preds in (age_predict, gender_predict)
)

In [15]:
# NOTE(review): this import belongs in the first cell with the other imports.
import numpy as np
# Display the distinct viewer ids of the per-row predictions in ascending order.
np.sort(dataset_submission['viewer_uid'].unique())

array([    0,     1,     2, ..., 60001, 60002, 60003])

In [32]:
# Assemble the per-user submission table.
# Both aggregated Series are indexed by viewer_uid, so the frame is built from
# them directly (values are matched on viewer_uid) and the index is then moved
# into a regular column. This stays correct when the viewer ids are not
# exactly 0..n-1.
# NOTE(review): `age` is filled with the predicted age class, same as `age_class`.
submission = (
    pd.DataFrame({
        'age': final_age_predict,
        'sex': final_gender_predict,
        'age_class': final_age_predict,
    })
    .rename_axis('viewer_uid')
    .reset_index()
)

In [33]:
submission.head()

Unnamed: 0,viewer_uid,age,sex,age_class
0,0,3,male,3
1,1,3,male,3
2,2,1,male,1
3,3,2,male,2
4,4,1,female,1


In [34]:
# Load the sample submission file and display it.
sub = pd.read_csv('../test_dataset/subm.csv')
sub

Unnamed: 0,viewer_uid,age,sex,age_class
0,14416,39,female,0
1,5190,12,male,1
2,8887,23,male,0
3,55417,18,female,3
4,8980,48,female,3
...,...,...,...,...
59999,16343,44,female,0
60000,47183,26,female,3
60001,23370,58,male,0
60002,12750,22,male,1


In [35]:
# Viewer ids of the sample submission, in file order.
viewer_uid = sub['viewer_uid']

In [36]:
# Arrange the per-user predictions in the row order of the sample submission.
# `validate` makes pandas raise if either side repeats a viewer id, and the
# assertion fails if the inner join dropped a requested viewer, so an
# incomplete submission cannot be written silently.
final_sub = pd.merge(viewer_uid, submission, on='viewer_uid', validate='one_to_one')
assert len(final_sub) == len(viewer_uid), "some viewer_uid values have no prediction"
final_sub

Unnamed: 0,viewer_uid,age,sex,age_class
0,14416,1,female,1
1,5190,2,male,2
2,8887,2,female,2
3,55417,1,male,1
4,8980,1,female,1
...,...,...,...,...
59999,16343,1,female,1
60000,47183,3,female,3
60001,23370,1,female,1
60002,12750,1,female,1


In [37]:
# Write the final submission next to the notebook, without the index column.
final_sub.to_csv("./submission.csv", index=False)

In [38]:
final_sub.shape

(60004, 4)

In [39]:
viewer_uid.shape

(60004,)