In [1]:
import bisect

import pandas as pd
import numpy as np
import dask.dataframe as dd
import pycaret.classification as pc

from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm, cv
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold

# top1500

## train

In [2]:
# Materialise the age feature table as a pandas frame (dask performs the parquet read).
# Alternative input kept from the original notebook (gender table):
#   '..\\full_data\\figma_gender\\full_gender_data.parquet'
df = dd.read_parquet('..\\full_data\\figma_age\\full_age_data.parquet').compute()

# Keep user_id plus the positional column slice 12:-6, then narrow that down to
# the names stored in the top-1500 list (user_id is always kept).
sliced_columns = ['user_id'] + df.columns[12:-6].to_list()
df = df.loc[:, sliced_columns]
top = np.load('.\\npy_files\\top1500_features_age_urls.npy', allow_pickle=True)
is_selected = df.columns.isin(top) | df.columns.isin(['user_id'])
df = df.loc[:, is_selected]

# Attach the target and drop rows whose age is missing or equals the string 'NA'.
age_labels = pd.read_parquet('..\\full_data\\public_train.pqt', columns=['user_id', 'age'])[['age', 'user_id']]
df = df.merge(age_labels, on='user_id', how='inner')
age_known = df['age'].notna() & (df['age'] != 'NA')
df = df.loc[age_known]
df['age'] = df['age'].astype('int8')

def age_bucket(x, bounds=(25, 35, 45, 55, 65)):
    """Return the index of the age bucket that ``x`` falls into.

    ``bounds`` lists, in ascending order, the inclusive upper edge of every
    bucket except the last one. With the default edges and integer ages the
    buckets are ``<=25, 26-35, 36-45, 46-55, 56-65, 66+`` mapped to ``0..5``.

    Parameters
    ----------
    x : number
        Age to classify.
    bounds : sequence of numbers, optional
        Sorted upper bucket edges. Defaults to ``(25, 35, 45, 55, 65)``.

    Returns
    -------
    int
        Count of edges strictly smaller than ``x``, i.e. a value in
        ``0..len(bounds)``.
    """
    # bisect_left places a value equal to an edge into the lower bucket.
    return bisect.bisect_left(bounds, x)
# Replace the raw age with its bucket index.
df['age'] = df['age'].map(age_bucket)

display(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only appended a stray "None" to the cell output.
df.info()

Unnamed: 0,user_id,googleads.g.doubleclick.net,yandex.ru,i.ytimg.com,vk.com,avatars.mds.yandex.net,ad.mail.ru,yastatic.net,apple.com,instagram.com,...,other_url,bask_4cpe_type_cd,bask_5cpe_type_cd,bask_0url_host,bask_1url_host,bask_2url_host,bask_3url_host,bask_4url_host,bask_5url_host,age
0,0,0.196891,0.031088,0.025907,0.041451,0.046632,0.031088,0.051813,0.0,0.0,...,0.031088,0.069146,0.013774,0.134186,0.342372,0.281677,0.149741,0.076944,0.01508,1
1,1,0.072588,0.112703,0.040115,0.023878,0.104107,0.034384,0.081184,0.0,0.0,...,0.013372,0.069146,0.013774,0.130193,0.341496,0.28469,0.151575,0.077067,0.014979,2
2,2,0.092457,0.046229,0.126521,0.141119,0.072993,0.038929,0.046229,0.0,0.0,...,0.002433,0.069146,0.013774,0.144648,0.349024,0.276713,0.143092,0.072281,0.014242,0
3,3,0.021818,0.061818,0.007273,0.178182,0.094545,0.08,0.069091,0.0,0.0,...,0.010909,0.069146,0.013774,0.148331,0.353465,0.277041,0.139772,0.068262,0.013129,3
4,4,0.061776,0.030888,0.048906,0.083655,0.046332,0.061776,0.045045,0.0,0.003861,...,0.011583,0.069146,0.013774,0.145739,0.350527,0.276666,0.14205,0.070993,0.014025,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 269999 entries, 0 to 269998
Columns: 1502 entries, user_id to age
dtypes: float32(1492), float64(8), int32(1), int64(1)
memory usage: 1.5 GB


None

In [None]:
# Fill missing prices with the median price of the same manufacturer, then
# raise every price below 1000 up to 1000.
# NOTE(review): the frame summary printed after loading lists only float/int
# columns, so 'cpe_manufacturer_name' / 'price' are presumably not part of the
# top-1500 selection - confirm. Indexing a missing column raises KeyError, so
# the cell is skipped in that case to keep Restart & Run All working.
impute_columns = ['cpe_manufacturer_name', 'price']
absent_columns = [col for col in impute_columns if col not in df.columns]
if absent_columns:
    print(f'Skipping price imputation, columns not in df: {absent_columns}')
else:
    display(df.isna().sum().sum())
    # The mask can be computed once: each pass only fills rows of its own
    # manufacturer, so it never changes which rows other passes read or write.
    price_missing = df['price'].isna()
    for name in df['cpe_manufacturer_name'].unique():
        same_manufacturer = df['cpe_manufacturer_name'] == name
        df.loc[same_manufacturer & price_missing, 'price'] = \
            df.loc[same_manufacturer & ~price_missing, 'price'].median()
    df.loc[df['price'] < 1000, 'price'] = 1000
    display(df.isna().sum().sum())

In [3]:
# Features are every column except the target and the user key.
x_train = df.drop(['age', 'user_id'], axis=1)
y_train = df['age']

# Object-dtype columns (if there are any) are handed to CatBoost as categorical.
pool_train = Pool(x_train, y_train,
                  cat_features=list(x_train.select_dtypes(include=['object']).columns))

model = CatBoostClassifier(iterations=2000,
                           learning_rate=0.01,
                           random_state=42,
                           task_type='GPU')
model.fit(pool_train, verbose=100)

# Class names mirror the bucketing done with bisect_left over
# [25, 35, 45, 55, 65]: an age equal to an edge belongs to the lower bucket, so
# class 0 is <=25, class 1 is 26-35, ... class 5 is 66+. `labels` pins each
# name to its class index even if a class is missing from the data.
# NOTE: the report is computed on the training rows themselves, so it shows how
# well the model fits the data it saw, not held-out quality.
bucket_names = ['<=25', '26-35', '36-45', '46-55', '56-65', '66+']
print(classification_report(y_train, model.predict(x_train),
                            labels=list(range(len(bucket_names))),
                            target_names=bucket_names))

0:	learn: 1.7848758	total: 123ms	remaining: 4m 6s
100:	learn: 1.4710747	total: 10.4s	remaining: 3m 15s
200:	learn: 1.3882294	total: 20.3s	remaining: 3m 1s
300:	learn: 1.3558812	total: 29.4s	remaining: 2m 45s
400:	learn: 1.3388889	total: 37.9s	remaining: 2m 31s
500:	learn: 1.3272154	total: 46.4s	remaining: 2m 18s
600:	learn: 1.3183713	total: 54.9s	remaining: 2m 7s
700:	learn: 1.3113354	total: 1m 4s	remaining: 2m
800:	learn: 1.3053262	total: 1m 17s	remaining: 1m 56s
900:	learn: 1.3000167	total: 1m 26s	remaining: 1m 45s
1000:	learn: 1.2953946	total: 1m 37s	remaining: 1m 37s
1100:	learn: 1.2910439	total: 1m 45s	remaining: 1m 25s
1200:	learn: 1.2871009	total: 1m 52s	remaining: 1m 15s
1300:	learn: 1.2835815	total: 2m 1s	remaining: 1m 5s
1400:	learn: 1.2803290	total: 2m 11s	remaining: 56.4s
1500:	learn: 1.2772614	total: 2m 19s	remaining: 46.5s
1600:	learn: 1.2743665	total: 2m 27s	remaining: 36.8s
1700:	learn: 1.2716068	total: 2m 35s	remaining: 27.3s
1800:	learn: 1.2690040	total: 2m 42s	remain

## create pred

In [6]:
# Reload the age feature table as a pandas frame.
df = dd.read_parquet('..\\full_data\\figma_age\\full_age_data.parquet').compute()

# Keep the needed columns: user_id plus the positional slice 12:-6, then only
# the names stored in the top-1500 list. user_id survives this second step only
# if the list itself contains it.
df = df.loc[:, ['user_id'] + df.columns[12:-6].to_list()]
top = np.load('.\\npy_files\\top1500_features_age_urls.npy', allow_pickle=True)
df = df.loc[:, df.columns.isin(top)]

display(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only appended a stray "None" to the cell output.
df.info()

Unnamed: 0,googleads.g.doubleclick.net,yandex.ru,i.ytimg.com,vk.com,avatars.mds.yandex.net,ad.mail.ru,yastatic.net,apple.com,instagram.com,ads.adfox.ru,...,7info.ru,other_url,bask_4cpe_type_cd,bask_5cpe_type_cd,bask_0url_host,bask_1url_host,bask_2url_host,bask_3url_host,bask_4url_host,bask_5url_host
0,0.196891,0.031088,0.025907,0.041451,0.046632,0.031088,0.051813,0.0,0.0,0.025907,...,0.0,0.031088,0.069146,0.013774,0.134186,0.342372,0.281677,0.149741,0.076944,0.01508
1,0.072588,0.112703,0.040115,0.023878,0.104107,0.034384,0.081184,0.0,0.0,0.019102,...,0.0,0.013372,0.069146,0.013774,0.130193,0.341496,0.28469,0.151575,0.077067,0.014979
2,0.092457,0.046229,0.126521,0.141119,0.072993,0.038929,0.046229,0.0,0.0,0.026764,...,0.0,0.002433,0.069146,0.013774,0.144648,0.349024,0.276713,0.143092,0.072281,0.014242
3,0.021818,0.061818,0.007273,0.178182,0.094545,0.08,0.069091,0.0,0.0,0.0,...,0.0,0.010909,0.069146,0.013774,0.148331,0.353465,0.277041,0.139772,0.068262,0.013129
4,0.061776,0.030888,0.048906,0.083655,0.046332,0.061776,0.045045,0.0,0.003861,0.028314,...,0.0,0.011583,0.069146,0.013774,0.145739,0.350527,0.276666,0.14205,0.070993,0.014025


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414723 entries, 0 to 414722
Columns: 1500 entries, googleads.g.doubleclick.net to bask_5url_host
dtypes: float32(1492), float64(8)
memory usage: 2.3 GB


None

In [12]:
# One probability column per class, in the column order predict_proba returns.
proba_columns = [
    'cat_bask_0_urls',
    'cat_bask_1_urls',
    'cat_bask_2_urls',
    'cat_bask_3_urls',
    'cat_bask_4_urls',
    'cat_bask_5_urls',
]
class_proba = model.predict_proba(df)
# Rows keep df's index; values are stored as float32.
pred_df = pd.DataFrame(data=class_proba, index=df.index, columns=proba_columns).astype('float32')
pred_df.head()

Unnamed: 0,cat_bask_0_urls,cat_bask_1_urls,cat_bask_2_urls,cat_bask_3_urls,cat_bask_4_urls,cat_bask_5_urls
0,0.023031,0.18712,0.444597,0.242201,0.092669,0.010382
1,0.006237,0.102354,0.39565,0.340144,0.138546,0.017069
2,0.322408,0.387423,0.191007,0.06373,0.027053,0.008379
3,0.178116,0.332993,0.306053,0.126063,0.046528,0.010247
4,0.147992,0.623704,0.194991,0.025293,0.006143,0.001877


In [13]:
# Persist the class probabilities; index=False leaves the frame's index out of the file.
proba_path = '..\\full_data\\figma_age\\age_proba_by_1500urls.parquet'
pred_df.to_parquet(proba_path, index=False)

## test

In [None]:
# Reload the age feature table as a pandas frame.
df = dd.read_parquet('..\\full_data\\figma_age\\full_age_data.parquet').compute()

# Keep the needed columns: user_id plus the positional slice 12:-6.
df = df.loc[:, ['user_id'] + df.columns[12:-6].to_list()]

# Load the submission template, order it by user_id and left-join the features
# onto it, so every submission row is kept even when no feature row matches.
submit = dd.read_parquet('..\\full_data\\submit_2.pqt').compute()
submit = submit.sort_values('user_id', ignore_index=True)
test_df = submit.merge(df, on='user_id', how='left')

display(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() would only append a stray "None" to the cell output.
df.info()

In [None]:
# Fill missing prices in test_df with the per-manufacturer median price taken
# from the full feature table df, then raise every price below 1000 up to 1000.
# NOTE(review): df is built from a positional column slice, so whether
# 'cpe_manufacturer_name' / 'price' are present is not visible here - confirm.
# Indexing a missing column raises KeyError, so the cell is skipped in that
# case to keep Restart & Run All working.
impute_columns = ['cpe_manufacturer_name', 'price']
absent_columns = [col for col in impute_columns
                  if col not in df.columns or col not in test_df.columns]
if absent_columns:
    print(f'Skipping price imputation, columns not available: {absent_columns}')
else:
    display(test_df.isna().sum().sum())
    # Both masks are loop-invariant: df is never modified, and each pass only
    # fills test_df rows of its own manufacturer.
    test_price_missing = test_df['price'].isna()
    df_price_known = ~df['price'].isna()
    for name in df['cpe_manufacturer_name'].unique():
        test_df.loc[(test_df['cpe_manufacturer_name'] == name) & test_price_missing, 'price'] = \
            df.loc[(df['cpe_manufacturer_name'] == name) & df_price_known, 'price'].median()
    test_df.loc[test_df['price'] < 1000, 'price'] = 1000
    display(test_df.isna().sum().sum())

In [None]:
# Feed the model exactly the columns it was fitted on, in training order.
# test_df is the submission template joined with the feature table, so besides
# user_id it can carry columns the model never saw (the template's own columns
# and any feature outside the training selection); dropping only user_id would
# pass those through to predict().
test_features = test_df[model.feature_names_]

# Flatten the prediction in case predict() returns a column vector, so it is
# assigned as a plain 1-D column.
submit['age'] = np.ravel(model.predict(test_features))
# Model classes start at 0; the submitted age buckets are shifted up by one.
submit['age'] += 1
# Constant placeholder for the gender column.
submit['is_male'] = -1.0
display(submit)

In [None]:
# Write the submission to CSV without the index column.
submission_path = 'age_urls_top_1500_only.csv'
submit.to_csv(submission_path, index=False)

# full