In [1]:
import random
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import fbeta_score
import gc
import optuna

In [4]:
# Global seed shared by the Python RNG, NumPy, the CV splitter and CatBoost.
seed = 777
random.seed(seed)
np.random.seed(seed)

# Input file locations (relative to the notebook's working directory).
train_payments, train_targets = 'data/payments_train.csv', 'data/target_train.csv'
test_payments, test_cliend_ids = 'data/payments_test.csv', 'data/client_id_test.csv'

# Column dtypes for the payments tables: fixed columns first, then the
# twelve boolean flag_0 .. flag_11 columns.
payments_dtypes = {
    'client_id': str,
    'contractor_id': str,
    'is_outgoing': bool,
    'amount': 'uint64',
    'dt_day': 'uint16',
    'dt_hour': 'uint8',
    'channel': pd.CategoricalDtype(),
    **{f'flag_{i}': bool for i in range(12)},
}
payments = pd.read_csv(train_payments, dtype=payments_dtypes)

# Target table: one integer column per label, type_0 .. type_34, indexed by client.
target_dtypes = {'client_id': str, **{f'type_{i}': int for i in range(35)}}
target = pd.read_csv(train_targets, dtype=target_dtypes).set_index('client_id')

In [7]:
def _slot_share_table(events, slot_col, counted_col, prefix):
    """Per-client share of rows of ``events`` falling into each value of ``slot_col``.

    Counts non-null ``counted_col`` entries per (client_id, slot) pair,
    divides every slot count by the client's total and appends the total
    itself as ``<prefix>_summs``. Columns are renamed to ``<prefix>_<slot>``.
    """
    counts = pd.pivot_table(
        events,
        index='client_id',
        columns=slot_col,
        values=counted_col,
        aggfunc='count'
    )
    totals = counts.sum(axis=1)
    shares = counts.div(totals, axis=0)
    shares['summs'] = totals
    shares.columns = [f'{prefix}_{label}' for label in shares.columns]
    return shares


def generate_features(pay):
    """Aggregate a payments table into one feature row per client.

    Parameters
    ----------
    pay : pandas.DataFrame
        Payment rows with the columns ``client_id``, ``contractor_id``,
        ``is_outgoing``, ``amount``, ``dt_day``, ``dt_hour``, ``channel`` and
        ``flag_0`` .. ``flag_11``. The frame is not modified: helper columns
        are added to a shallow copy only.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``client_id`` with flat string column names; missing
        values are replaced by 0. Contains amount statistics, per-flag
        statistics, activity-day and contractor statistics, per-hour and
        per-day-of-week shares, per-category pivots (``cat_<agg>_<category>``)
        and outgoing/ingoing statistics.
    """
    # Shallow copy: the helper columns created below stay local instead of
    # leaking into the caller's frame, without duplicating the payment data.
    pay = pay.copy(deep=False)

    pay['dt_day_of_week'] = pay['dt_day'] % 7

    # Contractor popularity measured inside this table: number of payments
    # and number of distinct clients per contractor. Rows without a
    # contractor keep NaN in both helper columns.
    has_contractor = pay['contractor_id'].notna()
    deals_per_contractor = pay.loc[has_contractor, 'contractor_id'].value_counts()
    clients_per_contractor = (
        pay.loc[has_contractor, ['client_id', 'contractor_id']]
        .drop_duplicates()['contractor_id']
        .value_counts()
    )
    pay['contractor_num_deals'] = pay['contractor_id'].map(deals_per_contractor)
    pay['contractor_num_clients'] = pay['contractor_id'].map(clients_per_contractor)

    # One-hot encode the channel as flag_12 .. flag_15.
    # NOTE(review): 'amt' looks like it could be a typo for 'atm' - confirm
    # against the actual values of the channel column.
    for offset, channel_name in enumerate(['app', 'web', 'pos', 'amt']):
        pay[f'flag_{offset + 12}'] = pay['channel'] == channel_name

    # 'category' is the 16-character 0/1 signature of all flags of a payment.
    flag_cols = [f'flag_{k}' for k in range(16)]
    category = pd.Series('', index=pay.index)
    for col in flag_cols:
        category = category + pay[col].astype('int').astype('str')
    pay['category'] = category

    fts = pay.groupby('client_id')['amount'].agg(['mean', 'median', 'std', 'min', 'max', 'count'])
    fts.columns = [f'amount_{x}' for x in fts.columns]

    # Amount statistics restricted to payments carrying each flag; the count
    # is turned into a share of the client's payments.
    for col in flag_cols:
        flagged = pay[pay[col].eq(True)].groupby('client_id')['amount'].agg(['median', 'count', 'mean', 'sum'])
        flagged.columns = [f'{col}_{stat}' for stat in flagged.columns]
        fts = fts.merge(right=flagged, how='left', left_index=True, right_index=True)
        fts[f'{col}_count'] /= fts['amount_count']

    by_client = pay.groupby('client_id')
    with_contractor = pay[has_contractor].groupby('client_id')

    fts['unique_active_days'] = by_client['dt_day'].nunique()
    fts['avg_day'] = by_client['dt_day'].mean()
    fts['median_day'] = by_client['dt_day'].median()
    fts['unique_contractor'] = with_contractor['contractor_id'].nunique()
    fts['num_deals_with_contractor'] = with_contractor['contractor_id'].count()
    fts['avg_deals_with_contractor'] = fts['num_deals_with_contractor'] / fts['unique_contractor']
    fts['median_contractor_deals'] = with_contractor['contractor_num_deals'].median()
    fts['mean_contractor_deals'] = with_contractor['contractor_num_deals'].mean()
    fts['median_contractor_clients'] = with_contractor['contractor_num_clients'].median()
    fts['unique_categories'] = by_client['category'].nunique()

    # Equality comparison (rather than boolean indexing) keeps the original
    # semantics for non-bool encodings of is_outgoing.
    outgoing = pay[pay['is_outgoing'].eq(True)].groupby('client_id')['amount'].agg(['mean', 'median', 'count'])
    outgoing.columns = [f'outgoing_{x}' for x in outgoing.columns]
    ingoing = pay[pay['is_outgoing'].eq(False)].groupby('client_id')['amount'].agg(['mean', 'median', 'count'])
    ingoing.columns = [f'ingoing_{x}' for x in ingoing.columns]

    # Share of distinct (day, hour) activity slots per hour of day.
    payments_per_hour = _slot_share_table(
        pay[['client_id', 'dt_hour', 'dt_day']].drop_duplicates(),
        slot_col='dt_hour',
        counted_col='dt_day',
        prefix='per_hour',
    )

    # Share of distinct (day, hour) activity slots per day of week.
    payments_per_day = _slot_share_table(
        pay[['client_id', 'dt_hour', 'dt_day', 'dt_day_of_week']].drop_duplicates(),
        slot_col='dt_day_of_week',
        counted_col='dt_hour',
        prefix='per_day',
    )

    cats_embed = pd.pivot_table(
        pay[['client_id', 'category', 'amount']],
        index='client_id',
        columns='category',
        values='amount',
        aggfunc=['count', 'sum', 'mean']
    )
    # The pivot has (aggregation, category) MultiIndex columns. pandas >= 2.0
    # refuses to merge frames whose columns have different numbers of levels,
    # and older versions produced tuple column names, so flatten them first.
    flat_names = []
    for label in cats_embed.columns.to_flat_index():
        parts = label if isinstance(label, tuple) else (label,)
        flat_names.append('cat_' + '_'.join(str(part) for part in parts))
    cats_embed.columns = flat_names

    for block in (payments_per_hour, payments_per_day, cats_embed, outgoing, ingoing):
        fts = fts.merge(right=block, how='left', left_index=True, right_index=True)
    fts = fts.fillna(0)

    fts['outgoing_count'] /= fts['amount_count']
    fts['ingoing_count'] /= fts['amount_count']

    gc.collect()
    return fts

In [9]:
features = generate_features(payments)

# The training loop pairs rows of `features` and `target` purely by position
# (`.iloc`), so both frames must contain exactly the same clients in exactly
# the same order. Sorting each frame on its own only guarantees that when the
# two id sets are identical; restrict both to the shared, sorted ids so a
# client missing from either table can never shift labels onto other rows.
common_ids = features.index.intersection(target.index).sort_values()
features = features.loc[common_ids]
target = target.loc[common_ids]
assert features.index.equals(target.index), 'features and target rows are not aligned'

In [None]:
def objective(trial):
    """Optuna objective: micro-averaged F0.5 of the thresholded validation predictions.

    Samples one decision threshold in [0.05, 0.95] for each of the 35 labels
    (trial parameters named '0' .. '34'), binarises a copy of the
    module-level ``valid_preds`` with them and scores the result against the
    module-level ``y_valid``.
    """
    thresholds = {
        f'type_{label}': trial.suggest_float(f'{label}', 0.05, 0.95)
        for label in range(35)
    }
    binarized = valid_preds.copy(deep=True)
    for column, cutoff in thresholds.items():
        binarized[column] = (binarized[column] > cutoff).astype('int')
    return fbeta_score(y_valid, binarized, beta=0.5, average='micro', zero_division=0)

# Cross-validated training: for every fold fit a CatBoost multilabel model,
# tune one decision threshold per label with Optuna on the validation fold,
# and accumulate threshold-shifted test probabilities over the folds.
client_ids = features.index
kfold = KFold(n_splits=6, random_state=seed, shuffle=True)
fold_scores = []

payments_test = pd.read_csv(test_payments, dtype=payments_dtypes)
features_test = generate_features(payments_test)

# Every fold trains on the same columns, so align the test frame to the
# training columns once: columns missing from the test features are added as
# zeros, and the column order matches what the models were fitted on.
feature_list = features.columns.tolist()
features_test = features_test.reindex(columns=feature_list, fill_value=0)

test_predictions = np.zeros((len(features_test), 35))

for i, (train_index, valid_index) in enumerate(kfold.split(client_ids)):
    X_train, y_train = features.iloc[train_index], target.iloc[train_index]
    X_valid, y_valid = features.iloc[valid_index], target.iloc[valid_index]

    model = CatBoostClassifier(
        depth=5,
        iterations=1500,
        loss_function='MultiCrossEntropy',
        random_seed=seed,
        thread_count=7,
        save_snapshot=False,
        early_stopping_rounds=200,
    )
    model.fit(Pool(X_train, y_train), eval_set=Pool(X_valid, y_valid), plot=True, verbose=50)
    valid_preds = pd.DataFrame(model.predict_proba(X_valid), index=X_valid.index, columns=[f'type_{i}' for i in range(35)])

    # Seed the sampler explicitly: Optuna's default sampler draws its own
    # random seed, so without this the tuned thresholds (and therefore the
    # submission) change on every run despite the seeds set above.
    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=seed))
    study.optimize(objective, n_trials=3000)
    borders = [study.best_params[str(x)] for x in range(35)]
    for x in range(35): valid_preds[f'type_{x}'] = (valid_preds[f'type_{x}'] > borders[x]).astype('int')
    # NOTE: thresholds are tuned and scored on the same validation fold, so
    # this per-fold score is an optimistic estimate.
    final_score = fbeta_score(y_valid, valid_preds, beta=0.5, average='micro', zero_division=0)
    fold_scores.append(final_score)

    # Shift probabilities by the fold's thresholds so that, after summing
    # over folds, a positive value means "predict the label".
    preds_test = model.predict_proba(features_test)
    preds_test -= borders
    test_predictions += preds_test

In [11]:
# Mean of the per-fold scores collected in `fold_scores`.
mean_fold_score = np.mean(fold_scores)
f'FINAL VALIDATION SCORE: {mean_fold_score}'

'FINAL VALIDATION SCORE: 0.41805122135321165'

In [12]:
# A label is predicted when its accumulated (probability - threshold) over
# all folds is positive; write the 0/1 matrix indexed by test client id.
type_columns = [f'type_{i}' for i in range(35)]
sub_df = pd.DataFrame(test_predictions, index=features_test.index, columns=type_columns)
sub_df = sub_df.gt(0).astype('int')

sub_df.to_csv('submit.csv')