In [4]:
# Enable the widgetsnbextension (Jupyter widgets) notebook extension through the
# shell; the recorded output reports jupyter-js-widgets/extension being enabled.
# NOTE(review): presumably needed for the `plot=True` widgets rendered by the
# CatBoost cells below — confirm.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [5]:
import bisect

import pandas as pd
import numpy as np
import dask.dataframe as dd
import pycaret.classification as pc

from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm, cv
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold

## Препроцессинг

In [6]:
# Read the full feature table with dask and materialise it as a pandas frame.
df = dd.read_parquet('..\\full_data\\figma_age\\full_age_data.parquet').compute()

# Keep user_id plus the URL feature columns, selected by position.
# NOTE(review): the 12:-6 slice assumes a fixed column layout in the parquet
# file — confirm against the file schema.
url_columns = df.columns[12:-6].to_list()
df = df[['user_id'] + url_columns]

# Restrict to the previously selected feature names (user_id is always kept).
# The same .npy path is written by the np.save call in the feature-selection
# part of this notebook, so that file must already exist when this cell runs.
# NOTE(review): allow_pickle=True can unpickle arbitrary objects — only load
# files you trust.
top = np.load('.\\npy_files\\top1500_features_age_urls.npy',allow_pickle=True)
keep_mask = df.columns.isin(top) | df.columns.isin(['user_id'])
df = df.loc[:, keep_mask]

# Attach the target and drop rows whose age is missing or the literal 'NA'.
targets = pd.read_parquet('..\\full_data\\public_train.pqt', columns=['user_id','age'])
df = df.merge(targets, on='user_id', how='inner')
has_age = df['age'].notna() & df['age'].ne('NA')
df = df[has_age]
df['age'] = df['age'].astype('int8')

def age_bucket(x):
    """Return the index (0-5) of the age bucket that ``x`` falls into.

    The values 25, 35, 45, 55 and 65 act as inclusive upper bounds: ages up
    to and including 25 map to 0, 26-35 to 1, 36-45 to 2, 46-55 to 3,
    56-65 to 4 and anything above 65 to 5.
    """
    upper_bounds = (25, 35, 45, 55, 65)
    # bisect_left counts the bounds strictly below x, so an age equal to a
    # bound stays in the lower bucket.
    return bisect.bisect_left(upper_bounds, x)
# Replace raw ages with their bucket index (see age_bucket).
df['age'] = df['age'].map(age_bucket)

display(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
df.info()

Unnamed: 0,user_id,googleads.g.doubleclick.net,yandex.ru,i.ytimg.com,vk.com,avatars.mds.yandex.net,ad.mail.ru,yastatic.net,apple.com,instagram.com,...,other_url,bask_4cpe_type_cd,bask_5cpe_type_cd,bask_0url_host,bask_1url_host,bask_2url_host,bask_3url_host,bask_4url_host,bask_5url_host,age
0,0,0.196891,0.031088,0.025907,0.041451,0.046632,0.031088,0.051813,0.0,0.0,...,0.031088,0.069146,0.013774,0.134186,0.342372,0.281677,0.149741,0.076944,0.01508,1
1,1,0.072588,0.112703,0.040115,0.023878,0.104107,0.034384,0.081184,0.0,0.0,...,0.013372,0.069146,0.013774,0.130193,0.341496,0.28469,0.151575,0.077067,0.014979,2
2,2,0.092457,0.046229,0.126521,0.141119,0.072993,0.038929,0.046229,0.0,0.0,...,0.002433,0.069146,0.013774,0.144648,0.349024,0.276713,0.143092,0.072281,0.014242,0
3,3,0.021818,0.061818,0.007273,0.178182,0.094545,0.08,0.069091,0.0,0.0,...,0.010909,0.069146,0.013774,0.148331,0.353465,0.277041,0.139772,0.068262,0.013129,3
4,4,0.061776,0.030888,0.048906,0.083655,0.046332,0.061776,0.045045,0.0,0.003861,...,0.011583,0.069146,0.013774,0.145739,0.350527,0.276666,0.14205,0.070993,0.014025,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 269999 entries, 0 to 269998
Columns: 1502 entries, user_id to age
dtypes: float32(1492), float64(8), int32(1), int64(1)
memory usage: 1.5 GB


None

In [17]:
# NOTE(review): this cell reads `cpe_manufacturer_name` and `price`; it is not
# evident that the preprocessing cell keeps these columns in `df` — confirm
# before running on a fresh kernel.

# Total number of missing values before imputation.
display(df.isna().sum().sum())

# Fill missing prices with the median price of the same manufacturer, computed
# in one groupby pass instead of rescanning the whole frame once per
# manufacturer. Rows without a manufacturer, and manufacturers with no known
# price at all, keep NaN.
manufacturer_median = df.groupby('cpe_manufacturer_name')['price'].transform('median')
df['price'] = df['price'].fillna(manufacturer_median)

# Floor prices at 1000.
df.loc[df['price'] < 1000, 'price'] = 1000

# Total number of missing values after imputation.
display(df.isna().sum().sum())

6799

215

## Тест на нескольких случайных разбиениях

In [7]:
# Train and evaluate a CatBoost classifier on several stratified random
# 60/40 splits for each iteration budget and print a per-class report.
#
# The split count is deliberately NOT called `cv`: that name is the
# cross-validation function imported from catboost, and rebinding it to an
# integer would break any later `cv(...)` call. For the same reason the loop
# variable does not reuse the builtin name `iter`.
n_splits = 4
class_labels = ['18-25','25-34', '35-44', '45-54', '55-65', '65+']

for n_iterations in [1000,5000,10000,20000,30000]:
    for seed in range(n_splits):
        # The split depends only on the seed, so every iteration budget is
        # evaluated on the same n_splits partitions.
        x_train, x_test, y_train, y_test = train_test_split(
            df.drop(['age','user_id'], axis = 1), df['age'],
            test_size = 0.40, random_state = seed, stratify = df['age'])

        # Object-dtype columns are passed to CatBoost as categorical features.
        cat_columns = list(x_train.select_dtypes(include=['object']).columns)
        pool_train = Pool(x_train, y_train, cat_features = cat_columns)
        pool_test = Pool(x_test, y_test, cat_features = cat_columns)

        model = CatBoostClassifier(iterations=n_iterations,
                                   learning_rate = 0.01,
                                   random_state=42,
                                   task_type='GPU',
                                   eval_metric='AUC')

        # Evaluate on the prepared test pool so the evaluation set carries the
        # same categorical-feature declaration as the training pool.
        model.fit(pool_train, eval_set=pool_test, verbose=False, plot=True)
        print(classification_report(y_test, model.predict(x_test),
                                    target_names = class_labels))

# No Gini/ROC-AUC summary is accumulated here: the binary formula based on
# predict_proba(...)[:, 1] does not apply to the six-class age target
# (sklearn's roc_auc_score requires multi_class='ovr' or 'ovo' for it).

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


ValueError: multi_class must be in ('ovo', 'ovr')

## Отбор фич

In [4]:
# Hold out a stratified 33% of the rows to evaluate the feature selection.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=['age','user_id']),
    df['age'],
    test_size = 0.33,
    random_state = 42,
    stratify = df['age'],
)

# Object-dtype columns are declared to CatBoost as categorical features.
cat_columns = x_train.select_dtypes(include=['object']).columns.tolist()
pool_train = Pool(x_train, y_train, cat_features = cat_columns)
pool_test = Pool(x_test, y_test, cat_features = cat_columns)

model = CatBoostClassifier(iterations=1000,
                           random_state=42,
                           task_type='GPU')

# Recursive elimination by SHAP values over every column of the training
# pool, done in a single step; the final model is trained on the kept features.
# NOTE(review): num_features_to_select is 1100 here, while the recorded output
# of the following cell and the saved file name both say 1500 — confirm which
# value is intended.
all_feature_indices = list(range(pool_train.num_col()))
summary = model.select_features(
    pool_train,
    eval_set=pool_test,
    features_for_select=all_feature_indices,
    num_features_to_select=1100,
    steps=1,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    train_final_model=True,
    plot=True,
    verbose=False
)
print('Selected features:', summary['selected_features_names'])

# Per-class quality of the final model on the held-out rows.
class_labels = ['18-25','25-34', '35-44', '45-54', '55-65', '65+']
print(classification_report(y_test, model.predict(x_test),
                            target_names = class_labels))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.177759
Step #1 out of 1
bestTest = 1.23447829
bestIteration = 998
Shrink model to first 999 iterations.
Feature #98 eliminated
Feature #167 eliminated
Feature #320 eliminated
Feature #241 eliminated
Feature #336 eliminated
Feature #313 eliminated
Feature #93 eliminated
Feature #592 eliminated
Feature #550 eliminated
Feature #201 eliminated
Feature #67 eliminated
Feature #694 eliminated
Feature #231 eliminated
Feature #440 eliminated
Feature #259 eliminated
Feature #205 eliminated
Feature #1427 eliminated
Feature #948 eliminated
Feature #51 eliminated
Feature #112 eliminated
Feature #367 eliminated
Feature #1000 eliminated
Feature #129 eliminated
Feature #505 eliminated
Feature #208 eliminated
Feature #872 eliminated
Feature #943 eliminated
Feature #85 eliminated
Feature #265 eliminated
Feature #190 eliminated
Feature #115 eliminated
Feature #235 eliminated
Feature #453 eliminated
Feature #855 eliminated
Feature #961 eliminated
Feature #214 eliminated
Feature #497

              precision    recall  f1-score   support

       18-25       0.57      0.40      0.47     11127
       25-34       0.52      0.64      0.57     28799
       35-44       0.42      0.53      0.47     25571
       45-54       0.40      0.26      0.32     14006
       55-65       0.43      0.23      0.30      7781
         65+       0.45      0.03      0.06      1816

    accuracy                           0.47     89100
   macro avg       0.47      0.35      0.37     89100
weighted avg       0.47      0.47      0.46     89100



In [5]:
# Report how many features were kept and persist their names; the same path
# is read back by the np.load call in the preprocessing cell.
selected_names = summary['selected_features_names']
print(len(selected_names))
np.save('.\\npy_files\\top1500_features_age_urls.npy', selected_names, allow_pickle=True)

1500


## CV Catboost

In [None]:
# NOTE(review): this cell uses `is_male` as the target with a binary Logloss,
# while the rest of the notebook models `age`; it is not evident that `df`
# contains an `is_male` column — confirm the intended target and loss.
x = df.drop(columns=['is_male','user_id'])
y = df['is_male']

params = dict(
    iterations=1000,
    random_state=42,
    task_type='GPU',
    loss_function='Logloss',
    learning_rate=0.0025,
)

# Object-dtype columns are declared to CatBoost as categorical features.
cat_columns = x.select_dtypes(include=['object']).columns.tolist()
cv_pool = Pool(x, label = y, cat_features = cat_columns)

# `cv` must be the function imported from catboost here; make sure no earlier
# cell has rebound that name.
# Stratified, shuffled 5-fold cross-validation with a fixed partition seed.
cv_fit = cv(
    params = params,
    pool = cv_pool,
    fold_count = 5,
    shuffle = True,
    partition_random_seed = 42,
    stratified = True,
    verbose = False,
    plot = True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
bestTest = 0.5332199718
bestIteration = 999
Training on fold [1/5]


## Подбор гиперпараметров

In [None]:
# NOTE(review): as in the cross-validation cell, the target here is `is_male`
# although the rest of the notebook models `age` — confirm the column exists
# in `df` and that this is the intended target.
x = df.drop(columns=['is_male','user_id'])
y = df['is_male']

# Object-dtype columns are declared to CatBoost as categorical features.
cat_columns = x.select_dtypes(include=['object']).columns.tolist()
pool = Pool(x, label = y, cat_features = cat_columns)

model = CatBoostClassifier(iterations=20000,
                           learning_rate = 0.01,
                           random_state=42,
                           task_type='GPU',
                           eval_metric='AUC')

# Search over L2 leaf regularisation 2..10 and three random-strength values.
grid = {
    'l2_leaf_reg': list(range(2, 11)),
    'random_strength': [1,2,4],
}

# Stratified 60/40 train/validation split for every grid point.
grid_search_result = model.grid_search(
    grid,
    pool,
    stratified=True,
    train_size=0.6,
    plot=True,
    verbose=False,
)
