In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import pycaret.classification as pc

from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm, cv
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
# Load the full per-user feature table; dask reads the parquet and .compute()
# materialises it as an in-memory pandas DataFrame.
df = dd.read_parquet('..\\full_data\\figma_plan\\full_gender_data_txt.parquet').compute()

# Keep the URL columns we need (column positions 12 .. -1) plus user_id.
df = df.loc[:,df.columns[12:-1].to_list()+['user_id']]
# NOTE(review): allow_pickle=True unpickles arbitrary objects from the file;
# only load top400_features.npy from a trusted source.
top400 = np.load('top400_features.npy',allow_pickle=True)
# Restrict to the selected top-400 features (applied once; the original repeated this line).
df = df.loc[:,(df.columns.isin(top400))|(df.columns.isin(['user_id']))]

# Attach the target and drop rows whose label is missing (NaN or the literal string 'NA').
# The inner join keeps only users present in the labelled training file.
df = df.merge(pd.read_parquet('..\\full_data\\public_train.pqt',columns=['user_id','is_male']), on = 'user_id', how = 'inner')
# .copy() makes the filtered frame independent, so the assignment below does not
# trigger SettingWithCopyWarning.
df = df.loc[~(df['is_male'].isna()) & (df['is_male'] != 'NA')].copy()
df['is_male'] = df['is_male'].astype('int8')

display(df.head())
# DataFrame.info() prints its report and returns None, so call it directly
# instead of wrapping it in display() (which rendered a stray "None").
df.info()

Unnamed: 0,googleads.g.doubleclick.net,yandex.ru,i.ytimg.com,vk.com,avatars.mds.yandex.net,ad.mail.ru,yastatic.net,apple.com,instagram.com,ads.adfox.ru,...,manrule.ru,programma-peredach.com,mcpehub.org,go64.ru,nail-styles.ru,other_url,other_female_urls_frac,other_male_urls_frac,user_id,is_male
0,0.196891,0.031088,0.025907,0.041451,0.046632,0.031088,0.051813,0.0,0.0,0.025907,...,0.0,0.0,0.0,0.0,0.0,0.031088,0.005181,0.0,0,0
1,0.072588,0.112703,0.040115,0.023878,0.104107,0.034384,0.081184,0.0,0.0,0.019102,...,0.0,0.0,0.0,0.0,0.0,0.013372,0.00191,0.000955,1,0
2,0.092457,0.046229,0.126521,0.141119,0.072993,0.038929,0.046229,0.0,0.0,0.026764,...,0.0,0.0,0.0,0.0,0.0,0.002433,0.0,0.0,2,0
3,0.021818,0.061818,0.007273,0.178182,0.094545,0.08,0.069091,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.010909,0.003636,0.003636,3,1
4,0.061776,0.030888,0.048906,0.083655,0.046332,0.061776,0.045045,0.0,0.003861,0.028314,...,0.0,0.0,0.0,0.0,0.0,0.011583,0.003861,0.0,4,0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 264326 entries, 0 to 269999
Columns: 402 entries, googleads.g.doubleclick.net to is_male
dtypes: float32(400), int32(1), int8(1)
memory usage: 406.6 MB


None

# train

In [3]:
# Feature matrix is everything except the target and the user identifier.
x_train = df.drop(columns=['is_male', 'user_id'])
y_train = df['is_male']

# Any object-dtype column is handed to CatBoost as a categorical feature.
cat_columns = x_train.select_dtypes(include=['object']).columns.tolist()
pool_train = Pool(x_train, y_train, cat_features=cat_columns)

model = CatBoostClassifier(
    task_type='GPU',
    iterations=2000,
    learning_rate=0.01,
    l2_leaf_reg=8,
    random_strength=1,
    random_state=42,
)
model.fit(pool_train, plot=True, verbose=False)

# Gini coefficient (2 * ROC AUC - 1), measured on the same data the model was fitted on.
train_proba = model.predict_proba(x_train)[:, 1]
print(2 * roc_auc_score(y_train, train_proba) - 1)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0.7289354188693253


In [None]:
# Persist the fitted model to disk in CatBoost's default 'cbm' format.
model.save_model(fname='catboost_gender_urls', format='cbm')

In [4]:
# Restore the saved model into a fresh classifier and recompute the
# training-set Gini (2 * ROC AUC - 1) to confirm the round trip.
model = CatBoostClassifier()
model.load_model(fname='catboost_gender_urls', format='cbm')
reloaded_proba = model.predict_proba(x_train)[:, 1]
print(2 * roc_auc_score(y_train, reloaded_proba) - 1)

0.7289353568885151


# test

In [2]:
# Load the full per-user feature table; dask reads the parquet and .compute()
# materialises it as an in-memory pandas DataFrame.
df = dd.read_parquet('..\\full_data\\figma_plan\\full_gender_data_txt.parquet').compute()

# Keep the URL columns we need (column positions 12 .. -1) plus user_id.
df = df.loc[:,df.columns[12:-1].to_list()+['user_id']]
# NOTE(review): allow_pickle=True unpickles arbitrary objects from the file;
# only load top400_features.npy from a trusted source.
top400 = np.load('top400_features.npy',allow_pickle=True)
df = df.loc[:,(df.columns.isin(top400))|(df.columns.isin(['user_id']))]

# Keep only the ids required for the test submission.
submit = dd.read_parquet('..\\full_data\\submit_2.pqt').compute()
submit = submit.sort_values('user_id', ignore_index=True)
# The left merge keeps every submit row in order; ids with no match in df get NaN features.
# validate='many_to_one' raises MergeError if user_id is duplicated in df, which would
# otherwise add rows and break the row-for-row alignment between test_df and submit
# that the later prediction assignment relies on.
test_df = submit.merge(df, on='user_id', how='left', validate='many_to_one')

In [4]:
# Load the trained model from disk.
model = CatBoostClassifier()
model.load_model('catboost_gender_urls')
# test_df still carries user_id (and any other submit columns), which were not
# training features. Select exactly the model's features, in training order,
# instead of relying on CatBoost to match columns by name.
display(model.predict_proba(test_df[model.feature_names_])[:,1])

array([0.20734899, 0.71953531, 0.17580539, ..., 0.46747215, 0.48323137,
       0.48212971])

In [6]:
# Probability of the positive class (is_male == 1) for each submission row.
# Only the model's own feature columns are passed, so user_id is never used as input.
# The array is assigned positionally, which relies on test_df having the same row
# order as submit.
submit['is_male'] = model.predict_proba(test_df[model.feature_names_])[:,1]
# Constant -1 for every row; presumably a placeholder because age is not
# predicted in this notebook (confirm against the submission format).
submit['age'] = -1

In [7]:
# Write the submission table to CSV without the DataFrame index column.
submit.to_csv(path_or_buf='top400_male.csv', index=False)