In [1]:
import bisect

import pandas as pd
import numpy as np
import dask.dataframe as dd
import pycaret.classification as pc

from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm, cv
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold

# top1500

## train

In [2]:
#df = dd.read_parquet('..\\full_data\\figma_gender\\full_gender_data.parquet').compute()
df = dd.read_parquet('..\\full_data\\figma_age\\full_age_data.parquet').compute()
#берем нужные
#df = df.loc[:,['user_id']+df.columns[12:-6].to_list()]
top = np.load('.\\npy_files\\top1500_features_full_age.npy',allow_pickle=True)
df = df.loc[:,(df.columns.isin(top))|(df.columns.isin(['user_id']))]

# добавляем target, удаляем nan
df = df.merge(pd.read_parquet('..\\full_data\\public_train.pqt',columns=['user_id','age'])[['age','user_id']], on = 'user_id', how = 'inner')
df = df.loc[~(df['age'].isna()) & (df['age'] != 'NA')]
df['age'] = df['age'].astype('int8')

def age_bucket(x):
    return bisect.bisect_left([25,35,45,55,65], x)
df['age'] = df['age'].map(age_bucket)

display(df.head())
display(df.info())

Unnamed: 0,user_id,region_name,region_count,city_name,city_count,cpe_manufacturer_name,price,holyday_fraction,morning_fraction,day_fraction,...,bask_2part_of_day,bask_3part_of_day,bask_5part_of_day,cat_bask_0_urls,cat_bask_1_urls,cat_bask_2_urls,cat_bask_3_urls,cat_bask_4_urls,cat_bask_5_urls,age
0,0,Москва,1,Москва,1,Samsung,2990.0,0.243523,0.119,0.554,...,0.268501,0.135631,0.013485,0.023031,0.18712,0.444597,0.242201,0.092669,0.010382,1
1,1,Москва,3,Москва,6,Xiaomi,,0.200573,0.323,0.347,...,0.270245,0.138227,0.013728,0.006237,0.102354,0.39565,0.340144,0.138546,0.017069,2
2,2,Республика Коми,1,Печора,1,Huawei,5915.0,0.29927,0.187,0.482,...,0.269442,0.136896,0.013536,0.322408,0.387423,0.191007,0.06373,0.027053,0.008379,0
3,3,Воронежская область,1,Воронеж,1,Huawei Device Company Limited,13990.0,0.181818,0.178,0.353,...,0.269107,0.136476,0.013581,0.178116,0.332993,0.306053,0.126063,0.046528,0.010247,3
4,4,Краснодарский край,5,Анапа,9,Huawei,12990.0,0.326898,0.372,0.349,...,0.270618,0.138505,0.013725,0.147992,0.623704,0.194991,0.025293,0.006143,0.001877,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 269999 entries, 0 to 269998
Columns: 1502 entries, user_id to age
dtypes: float32(1495), int32(1), int64(1), int8(2), object(3)
memory usage: 1.5+ GB


None

In [3]:
display(df.isna().sum().sum())
for name in df['cpe_manufacturer_name'].unique():
    df.loc[(df['cpe_manufacturer_name']==name)&(df['price'].isna()),'price'] = \
        df.loc[(df['cpe_manufacturer_name']==name)&(~df['price'].isna()),'price'].median()
df.loc[df['price']<1000,'price'] = 1000
display(df.isna().sum().sum())

6949

221

In [4]:
x_train = df.drop(['age','user_id'],axis=1)
y_train = df['age']

pool_train = Pool(x_train, y_train,
                  cat_features = list(x_train.select_dtypes(include=['object']).columns))

model = CatBoostClassifier(#iterations=2000,
                           #learning_rate = 0.01,
                           random_state=42,
                           task_type='GPU')
model.fit(pool_train, verbose=100)

print(classification_report(y_train, model.predict(x_train), \
                            target_names = ['18-25','25-34', '35-44', '45-54', '55-65', '65+']))

Learning rate set to 0.203367
0:	learn: 1.6088945	total: 313ms	remaining: 5m 12s
100:	learn: 1.2117337	total: 9.56s	remaining: 1m 25s
200:	learn: 1.1902373	total: 17.9s	remaining: 1m 11s
300:	learn: 1.1739048	total: 26.1s	remaining: 1m
400:	learn: 1.1608619	total: 34.1s	remaining: 51s
500:	learn: 1.1492494	total: 42.1s	remaining: 41.9s
600:	learn: 1.1394301	total: 49.7s	remaining: 33s
700:	learn: 1.1305854	total: 57.4s	remaining: 24.5s
800:	learn: 1.1217309	total: 1m 4s	remaining: 16.1s
900:	learn: 1.1143172	total: 1m 12s	remaining: 7.96s
999:	learn: 1.1066063	total: 1m 19s	remaining: 0us
              precision    recall  f1-score   support

       18-25       0.63      0.49      0.55     33718
       25-34       0.58      0.68      0.63     87270
       35-44       0.50      0.61      0.55     77486
       45-54       0.52      0.37      0.43     42442
       55-65       0.53      0.35      0.42     23580
         65+       0.74      0.10      0.17      5503

    accuracy            

## create pred

In [6]:
df = dd.read_parquet('..\\full_data\\figma_age\\full_age_data.parquet').compute()

#берем нужные
#df = df.loc[:,['user_id']+df.columns[12:-6].to_list()]
top = np.load('.\\npy_files\\top1500_features_full_age.npy',allow_pickle=True)
df = df.loc[:,df.columns.isin(top)]

display(df.head())
display(df.info())

Unnamed: 0,region_name,region_count,city_name,city_count,cpe_manufacturer_name,price,holyday_fraction,morning_fraction,day_fraction,evening_fraction,...,bask_1part_of_day,bask_2part_of_day,bask_3part_of_day,bask_5part_of_day,cat_bask_0_urls,cat_bask_1_urls,cat_bask_2_urls,cat_bask_3_urls,cat_bask_4_urls,cat_bask_5_urls
0,Москва,1,Москва,1,Samsung,2990.0,0.243523,0.119,0.554,0.321,...,0.356866,0.268501,0.135631,0.013485,0.023031,0.18712,0.444597,0.242201,0.092669,0.010382
1,Москва,3,Москва,6,Xiaomi,,0.200573,0.323,0.347,0.295,...,0.354712,0.270245,0.138227,0.013728,0.006237,0.102354,0.39565,0.340144,0.138546,0.017069
2,Республика Коми,1,Печора,1,Huawei,5915.0,0.29927,0.187,0.482,0.316,...,0.35598,0.269442,0.136896,0.013536,0.322408,0.387423,0.191007,0.06373,0.027053,0.008379
3,Воронежская область,1,Воронеж,1,Huawei Device Company Limited,13990.0,0.181818,0.178,0.353,0.455,...,0.356166,0.269107,0.136476,0.013581,0.178116,0.332993,0.306053,0.126063,0.046528,0.010247
4,Краснодарский край,5,Анапа,9,Huawei,12990.0,0.326898,0.372,0.349,0.265,...,0.354653,0.270618,0.138505,0.013725,0.147992,0.623704,0.194991,0.025293,0.006143,0.001877


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414723 entries, 0 to 414722
Columns: 1500 entries, region_name to cat_bask_5_urls
dtypes: float32(1495), int8(2), object(3)
memory usage: 2.3+ GB


None

In [7]:
pred_df = pd.DataFrame(data=model.predict_proba(df), index=df.index, columns=['cat_bask_0_urls',
                                                                         'cat_bask_1_urls',
                                                                         'cat_bask_2_urls',
                                                                         'cat_bask_3_urls',
                                                                         'cat_bask_4_urls',
                                                                         'cat_bask_5_urls'])
pred_df = pred_df.astype('float32')
pred_df.head()

Unnamed: 0,cat_bask_0_urls,cat_bask_1_urls,cat_bask_2_urls,cat_bask_3_urls,cat_bask_4_urls,cat_bask_5_urls
0,0.007288,0.182208,0.399055,0.309578,0.097486,0.004386
1,0.001327,0.047935,0.471607,0.32972,0.143314,0.006097
2,0.46701,0.289203,0.139272,0.077972,0.021021,0.005522
3,0.196409,0.288458,0.288645,0.169502,0.05019,0.006795
4,0.042321,0.749595,0.191641,0.011733,0.003298,0.001412


In [13]:
pred_df.to_parquet('..\\full_data\\figma_age\\age_proba_by_1500full.parquet', index=False)

## test

In [21]:
df = dd.read_parquet('..\\full_data\\figma_age\\full_age_data.parquet').compute()

#берем нужные
#df = df.loc[:,['user_id']+df.columns[12:-6].to_list()]
top = np.load('.\\npy_files\\top1500_features_full_age.npy',allow_pickle=True)
# добавляем target, удаляем nan
submit = dd.read_parquet('..\\full_data\\submit_2.pqt').compute()
submit = submit.sort_values('user_id', ignore_index=True)
test_df = submit.merge(df, on='user_id',how ='left')


display(df.head())
display(df.info())

Unnamed: 0,user_id,region_name,reg_count_people,region_count,city_name,city_count,cpe_manufacturer_name,cpe_type_cd,price,holyday_fraction,...,bask_2part_of_day,bask_3part_of_day,bask_4part_of_day,bask_5part_of_day,cat_bask_0_urls,cat_bask_1_urls,cat_bask_2_urls,cat_bask_3_urls,cat_bask_4_urls,cat_bask_5_urls
0,0,Москва,1.0,1,Москва,1,Samsung,smartphone,2990.0,0.243523,...,0.268501,0.135631,0.068173,0.013485,0.023031,0.18712,0.444597,0.242201,0.092669,0.010382
1,1,Москва,1.0,3,Москва,6,Xiaomi,smartphone,,0.200573,...,0.270245,0.138227,0.069619,0.013728,0.006237,0.102354,0.39565,0.340144,0.138546,0.017069
2,2,Республика Коми,0.060277,1,Печора,1,Huawei,smartphone,5915.0,0.29927,...,0.269442,0.136896,0.068823,0.013536,0.322408,0.387423,0.191007,0.06373,0.027053,0.008379
3,3,Воронежская область,0.178155,1,Воронеж,1,Huawei Device Company Limited,smartphone,13990.0,0.181818,...,0.269107,0.136476,0.06865,0.013581,0.178116,0.332993,0.306053,0.126063,0.046528,0.010247
4,4,Краснодарский край,0.448167,5,Анапа,9,Huawei,smartphone,12990.0,0.326898,...,0.270618,0.138505,0.069731,0.013725,0.147992,0.623704,0.194991,0.025293,0.006143,0.001877


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414723 entries, 0 to 414722
Columns: 3038 entries, user_id to cat_bask_5_urls
dtypes: float32(3031), int32(1), int8(2), object(4)
memory usage: 4.7+ GB


None

In [22]:
display(test_df.isna().sum().sum())
for name in df['cpe_manufacturer_name'].unique():
    test_df.loc[(test_df['cpe_manufacturer_name']==name)&(test_df['price'].isna()),'price'] = \
        df.loc[(df['cpe_manufacturer_name']==name)&(~df['price'].isna()),'price'].median()
test_df.loc[test_df['price']<1000,'price'] = 1000
display(test_df.isna().sum().sum())

3801

105

In [23]:
test_df.head()

Unnamed: 0,user_id,region_name,reg_count_people,region_count,city_name,city_count,cpe_manufacturer_name,cpe_type_cd,price,holyday_fraction,...,bask_2part_of_day,bask_3part_of_day,bask_4part_of_day,bask_5part_of_day,cat_bask_0_urls,cat_bask_1_urls,cat_bask_2_urls,cat_bask_3_urls,cat_bask_4_urls,cat_bask_5_urls
0,6,Московская область,0.613485,1,Егорьевск,1,Huawei,smartphone,8990.0,0.265931,...,0.269247,0.136917,0.068976,0.013713,0.109744,0.351965,0.326049,0.140286,0.05756,0.014395
1,7,Москва,1.0,2,Москва,6,Huawei,smartphone,5490.0,0.259758,...,0.267011,0.13629,0.069008,0.013881,0.140196,0.428928,0.288132,0.095871,0.035866,0.011007
2,9,Нижегородская область,0.246186,1,Кстово,2,Apple,smartphone,45368.0,0.301075,...,0.270146,0.138018,0.069489,0.013696,0.409509,0.395715,0.128465,0.046141,0.015689,0.004482
3,10,Нижегородская область,0.246186,1,Нижний Новгород,2,Samsung,smartphone,56815.0,0.1633,...,0.270809,0.13882,0.069891,0.013727,0.025641,0.377277,0.466471,0.105612,0.022054,0.002945
4,11,Приморский край,0.144427,1,Большой Камень,3,Huawei,smartphone,14190.0,0.13,...,0.271976,0.140212,0.070558,0.013736,0.000847,0.008255,0.051437,0.229927,0.544088,0.165446


In [24]:
test_df = test_df.drop('user_id',axis=1)
test_df = Pool(test_df,
               cat_features = list(test_df.select_dtypes(include=['object']).columns))

submit['age'] = model.predict(test_df)
submit['age'] += 1
submit['is_male'] = pd.Series(data=-1.0,index=submit.index)
display(submit)

Unnamed: 0,user_id,age,is_male
0,6,2,-1.0
1,7,2,-1.0
2,9,2,-1.0
3,10,3,-1.0
4,11,5,-1.0
...,...,...,...
144719,415306,2,-1.0
144720,415310,3,-1.0
144721,415314,2,-1.0
144722,415315,3,-1.0


In [25]:
submit.to_csv('age_top_1500_full.csv', index=False)

# full