In [1]:
# Enable the ipywidgets notebook extension through the Jupyter CLI.
# NOTE(review): presumably required so the CatBoost `plot=True` training
# widgets used further down can render — confirm for the target environment.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import pycaret.classification as pc

from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm, cv
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold

## Препроцессинг

In [4]:
# Load the full feature table; dask reads the parquet dataset and `.compute()`
# materialises it as a single in-memory pandas DataFrame.
# Forward slashes are used in all paths so the notebook is not tied to Windows.
df = dd.read_parquet('../full_data/figma_txt_features/full_gender_data.parquet').compute()

# Keep only the previously selected feature columns plus the join key.
# NOTE(review): `allow_pickle=True` unpickles the file contents, so this must
# only ever be pointed at a file produced by this project.
top1100 = np.load('./npy_files/top1100_features_full.npy', allow_pickle=True)
df = df.loc[:, df.columns.isin([*top1100, 'user_id'])]

# Prepare the target BEFORE joining: drop missing / 'NA' labels and cast to
# int8 on the small two-column frame. The inner join then yields the same rows
# as filtering afterwards, without copying the wide feature frame a second
# time and without assigning into a filtered slice.
target_df = pd.read_parquet('../full_data/public_train.pqt', columns=['user_id', 'is_male'])
target_df = target_df.loc[target_df['is_male'].notna() & (target_df['is_male'] != 'NA')]
target_df = target_df.assign(is_male=target_df['is_male'].astype('int8'))
df = df.merge(target_df[['is_male', 'user_id']], on='user_id', how='inner')

display(df.head())
# `DataFrame.info()` prints its report and returns None, so it is called
# directly; wrapping it in `display()` only rendered an extra "None".
df.info()

Unnamed: 0,user_id,region_name,male_fraction_region,city_name,city_count,cpe_manufacturer_name,price,holyday_fraction,morning_fraction,day_fraction,...,megapteka-ru.turbopages.org,127.0.0.1,bukvaprava.ru,ngs24-ru.turbopages.org,forum.mfd.ru,other_url,other_female_urls_frac,other_male_urls_frac,male_probability_by_urls,catboost_top400urls
0,0,Москва,0.461904,Москва,1,Samsung,2990.0,0.243523,0.119,0.554,...,0.0,0.0,0.0,0.0,0.0,0.031088,0.005181,0.0,0.500491,0.076492
1,1,Москва,0.461904,Москва,6,Xiaomi,,0.200573,0.323,0.347,...,0.0,0.0,0.0,0.0,0.0,0.013372,0.00191,0.000955,0.523982,0.105959
2,2,Республика Коми,0.472075,Печора,1,Huawei,5915.0,0.29927,0.187,0.482,...,0.0,0.0,0.0,0.0,0.0,0.002433,0.0,0.0,0.525968,0.458972
3,3,Воронежская область,0.458936,Воронеж,1,Huawei Device Company Limited,13990.0,0.181818,0.178,0.353,...,0.0,0.0,0.0,0.0,0.0,0.010909,0.003636,0.003636,0.519823,0.359567
4,4,Краснодарский край,0.463641,Анапа,9,Huawei,12990.0,0.326898,0.372,0.349,...,0.0,0.0,0.0,0.0,0.0,0.011583,0.003861,0.0,0.51007,0.054218


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415317 entries, 0 to 415316
Columns: 1101 entries, user_id to catboost_top400urls
dtypes: float32(1096), int32(1), int8(1), object(3)
memory usage: 1.7+ GB


None

In [3]:
# Total number of missing values before imputation.
display(df.isna().sum().sum())

# Fill missing prices with the median price of the same manufacturer.
# One grouped transform replaces the per-manufacturer Python loop, which
# rebuilt two full-length boolean masks for every unique manufacturer.
# Rows whose manufacturer is missing, or whose manufacturer has no known
# price at all, stay NaN — same as with the loop.
price_dtype = df['price'].dtype
median_by_maker = df.groupby('cpe_manufacturer_name')['price'].transform('median')
# Cast back so the column keeps its original dtype after the fill.
df['price'] = df['price'].fillna(median_by_maker).astype(price_dtype)

# Floor implausibly low prices at 1000.
df.loc[df['price'] < 1000, 'price'] = 1000

# Total number of missing values after imputation.
display(df.isna().sum().sum())

6799

215

## Тест на нескольких случайных разбиениях

In [None]:
# Repeated stratified hold-out evaluation: train one CatBoost model per random
# split and average the train / test GINI (2 * ROC AUC - 1) over the splits.

# Number of random splits. Deliberately NOT named `cv`: that name is the
# cross-validation function imported from catboost at the top of the notebook,
# and rebinding it to an int breaks the later `cv(...)` call.
n_splits = 2
# Boosting rounds per model. The number of rounds must not depend on the split
# index (the first split would be trained with 0 iterations).
# NOTE(review): 1000 mirrors the other cells of this notebook — adjust if a
# different budget was intended.
n_iterations = 1000

# Loop-invariant inputs are built once instead of on every iteration.
features = df.drop(['is_male', 'user_id'], axis=1)
labels = df['is_male']
cat_cols = list(features.select_dtypes(include=['object']).columns)

total_train = 0
total_test = 0
for i in range(n_splits):
    # The split index doubles as the random seed, so each split is different
    # but reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.40, random_state=i, stratify=labels)

    pool_train = Pool(x_train, y_train, cat_features=cat_cols)
    pool_test = Pool(x_test, y_test, cat_features=cat_cols)

    model = CatBoostClassifier(iterations=n_iterations,
                               learning_rate=0.01,
                               random_strength=1,
                               l2_leaf_reg=8,
                               random_state=42,
                               task_type='GPU',
                               eval_metric='AUC')

    # Evaluate on the prepared test pool so categorical columns are declared
    # explicitly for the eval set as well.
    model.fit(pool_train, eval_set=pool_test, verbose=False, plot=True)

    # Score each part once and reuse the value for printing and accumulation.
    gini_test = 2 * roc_auc_score(y_test, model.predict_proba(pool_test)[:, 1]) - 1
    gini_train = 2 * roc_auc_score(y_train, model.predict_proba(pool_train)[:, 1]) - 1
    print(gini_test)
    total_test += gini_test
    total_train += gini_train

print(f'итог тест {total_test/n_splits}')
print(f'итог трейн {total_train/n_splits}')
print(f'итог разница {total_train/n_splits-total_test/n_splits}')

## Отбор фич

In [None]:
# Recursive SHAP-based feature selection on a single stratified 67/33 split.
features = df.drop(['is_male', 'user_id'], axis=1)
labels = df['is_male']
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.33, random_state=42, stratify=labels)

# Object-typed columns are passed to CatBoost as categorical features.
cat_cols = x_train.select_dtypes(include=['object']).columns.tolist()
pool_train = Pool(x_train, y_train, cat_features=cat_cols)
pool_test = Pool(x_test, y_test, cat_features=cat_cols)

model = CatBoostClassifier(iterations=1000, random_state=42, task_type='GPU')

# Every column of the training pool is a candidate; features are eliminated
# over 5 steps and a final model is trained on the selected set.
selection_options = dict(
    eval_set=pool_test,
    features_for_select=list(range(pool_train.num_col())),
    num_features_to_select=1100,
    steps=5,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    train_final_model=True,
    plot=True,
    verbose=False,
)
summary = model.select_features(pool_train, **selection_options)
print('Selected features:', summary['selected_features_names'])

# Quality of the final model on the hold-out part: GINI = 2 * ROC AUC - 1.
gini = 2 * roc_auc_score(y_test, model.predict_proba(x_test)[:, 1]) - 1
print(f'GINI по полу {gini:2.6f}')
print(classification_report(y_test, model.predict(x_test)))

In [11]:
# Persist the names of the selected features; the preprocessing cell at the top
# of the notebook loads this same file to filter the columns.
selected_names = summary['selected_features_names']
print(len(selected_names))
# Forward slashes keep the path valid on every OS (a literal backslash is only
# a path separator on Windows).
np.save('./npy_files/top1100_features_full.npy', selected_names, allow_pickle=True)

1100


## CV Catboost

In [None]:
# 5-fold stratified cross-validation of a CatBoost classifier on the full data.
y = df['is_male']
x = df.drop(['is_male', 'user_id'], axis=1)

# Object-typed columns are declared as categorical features.
cv_pool = Pool(
    x,
    label=y,
    cat_features=x.select_dtypes(include=['object']).columns.tolist(),
)

params = dict(
    iterations=1000,
    random_state=42,
    task_type='GPU',
    loss_function='Logloss',
    learning_rate=0.0025,
)

# `cv` must be the function imported from catboost at the top of the notebook;
# rebinding that name earlier in the session makes this call fail.
cv_fit = cv(
    params=params,
    pool=cv_pool,
    fold_count=5,
    shuffle=True,
    partition_random_seed=42,
    stratified=True,
    verbose=False,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
bestTest = 0.5332199718
bestIteration = 999
Training on fold [1/5]


## Подбор гиперпараметров

In [None]:
# Grid search over L2 regularisation and random strength for CatBoost.
y = df['is_male']
x = df.drop(['is_male', 'user_id'], axis=1)

# Object-typed columns are declared as categorical features.
categorical_columns = x.select_dtypes(include=['object']).columns.tolist()
pool = Pool(x, label=y, cat_features=categorical_columns)

model = CatBoostClassifier(
    iterations=20000,
    learning_rate=0.01,
    random_state=42,
    task_type='GPU',
    eval_metric='AUC',
)

# Candidate values: l2_leaf_reg 2..10 and three random_strength settings.
grid = {
    'l2_leaf_reg': list(range(2, 11)),
    'random_strength': [1, 2, 4],
}

grid_search_result = model.grid_search(
    grid,
    pool,
    stratified=True,
    train_size=0.6,
    plot=True,
    verbose=False,
)
