In [1]:
import bisect

import pandas as pd
import numpy as np
import dask.dataframe as dd
import pycaret.classification as pc

from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm, cv
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold

# full

In [3]:
df = dd.read_parquet('..\\full_data\\figma_plan\\full_gender_data_txt.parquet').compute()

#берем нужные
df = df.loc[:,df.columns[:-3].to_list()]

# добавляем target, удаляем nan
df = df.merge(pd.read_parquet('..\\full_data\\public_train.pqt',columns=['user_id','age'])[['age','user_id']], on = 'user_id', how = 'inner')
df = df.loc[~(df['age'].isna()) & (df['age'] != 'NA')]
df['age'] = df['age'].astype('int8')

def age_bucket(x):
    return bisect.bisect_left([25,35,45,55,65], x)
df['age'] = df['age'].map(age_bucket)

display(df.head())
display(df.info())

Unnamed: 0,user_id,region_name,region_count,city_name,city_count,cpe_manufacturer_name,cpe_type_cd,price,holyday_fraction,morning_fraction,...,megapteka-ru.turbopages.org,127.0.0.1,bukvaprava.ru,belnovosti-by.turbopages.org,ngs24-ru.turbopages.org,forum.mfd.ru,yomed.ru,7info.ru,other_url,age
0,0,Москва,1,Москва,1,Samsung,smartphone,2990.0,0.243523,0.119,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031088,1
1,1,Москва,3,Москва,6,Xiaomi,smartphone,,0.200573,0.323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013372,2
2,2,Республика Коми,1,Печора,1,Huawei,smartphone,5915.0,0.29927,0.187,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002433,0
3,3,Воронежская область,1,Воронеж,1,Huawei Device Company Limited,smartphone,13990.0,0.181818,0.178,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010909,3
4,4,Краснодарский край,5,Анапа,9,Huawei,smartphone,12990.0,0.326898,0.372,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011583,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 269999 entries, 0 to 269999
Columns: 3014 entries, user_id to age
dtypes: float32(3006), int32(1), int64(1), int8(2), object(4)
memory usage: 3.0+ GB


None

In [4]:
display(df.isna().sum().sum())
for name in df['cpe_manufacturer_name'].unique():
    df.loc[(df['cpe_manufacturer_name']==name)&(df['price'].isna()),'price'] = \
        df.loc[(df['cpe_manufacturer_name']==name)&(~df['price'].isna()),'price'].median()
df.loc[df['price']<1000,'price'] = 1000
display(df.isna().sum().sum())

6949

221

In [5]:
x_train = df.drop(['age','user_id'],axis=1)
y_train = df['age']

pool_train = Pool(x_train, y_train,
                  cat_features = list(x_train.select_dtypes(include=['object']).columns))

model = CatBoostClassifier(random_state=42,
                           task_type='GPU')
model.fit(pool_train, verbose=100)

print(classification_report(y_train, model.predict(x_train), \
                            target_names = ['18-25','25-34', '35-44', '45-54', '55-65', '65+']))

Learning rate set to 0.203367
0:	learn: 1.6626736	total: 380ms	remaining: 6m 20s
100:	learn: 1.2560147	total: 17.1s	remaining: 2m 31s
200:	learn: 1.2239644	total: 32.1s	remaining: 2m 7s
300:	learn: 1.2041472	total: 47.7s	remaining: 1m 50s
400:	learn: 1.1879346	total: 1m 2s	remaining: 1m 32s
500:	learn: 1.1750685	total: 1m 16s	remaining: 1m 15s
600:	learn: 1.1644510	total: 1m 30s	remaining: 59.8s
700:	learn: 1.1540671	total: 1m 44s	remaining: 44.4s
800:	learn: 1.1448172	total: 1m 58s	remaining: 29.3s
900:	learn: 1.1361966	total: 2m 12s	remaining: 14.5s
999:	learn: 1.1278655	total: 2m 25s	remaining: 0us
              precision    recall  f1-score   support

       18-25       0.65      0.46      0.54     33718
       25-34       0.57      0.70      0.63     87270
       35-44       0.49      0.61      0.54     77486
       45-54       0.51      0.35      0.41     42442
       55-65       0.54      0.32      0.40     23580
         65+       0.77      0.08      0.14      5503

    accurac

## test

In [6]:
df = dd.read_parquet('..\\full_data\\figma_plan\\full_gender_data_txt.parquet').compute()

#берем нужные
df = df.loc[:,df.columns[:-3].to_list()]

# добавляем target, удаляем nan
submit = dd.read_parquet('..\\full_data\\submit_2.pqt').compute()
submit = submit.sort_values('user_id', ignore_index=True)
test_df = submit.merge(df, on='user_id',how ='left')

display(df.head())
display(df.info())

Unnamed: 0,user_id,region_name,region_count,city_name,city_count,cpe_manufacturer_name,cpe_type_cd,price,holyday_fraction,morning_fraction,...,psy-magic.org,megapteka-ru.turbopages.org,127.0.0.1,bukvaprava.ru,belnovosti-by.turbopages.org,ngs24-ru.turbopages.org,forum.mfd.ru,yomed.ru,7info.ru,other_url
0,0,Москва,1,Москва,1,Samsung,smartphone,2990.0,0.243523,0.119,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031088
1,1,Москва,3,Москва,6,Xiaomi,smartphone,,0.200573,0.323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013372
2,2,Республика Коми,1,Печора,1,Huawei,smartphone,5915.0,0.29927,0.187,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002433
3,3,Воронежская область,1,Воронеж,1,Huawei Device Company Limited,smartphone,13990.0,0.181818,0.178,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010909
4,4,Краснодарский край,5,Анапа,9,Huawei,smartphone,12990.0,0.326898,0.372,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011583


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415317 entries, 0 to 415316
Columns: 3013 entries, user_id to other_url
dtypes: float32(3006), int32(1), int8(2), object(4)
memory usage: 4.7+ GB


None

In [7]:
display(test_df.isna().sum().sum())
for name in df['cpe_manufacturer_name'].unique():
    test_df.loc[(test_df['cpe_manufacturer_name']==name)&(test_df['price'].isna()),'price'] = \
        df.loc[(df['cpe_manufacturer_name']==name)&(~df['price'].isna()),'price'].median()
test_df.loc[test_df['price']<1000,'price'] = 1000
display(test_df.isna().sum().sum())

3801

105

In [10]:
submit['age'] = model.predict(test_df.drop('user_id',axis=1))
submit['is_male'] = pd.Series(data=-1,index=submit.index)

In [12]:
submit.to_csv('prediction.csv', index=False)