In [1]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
from model import predict
from check_budget import check_budget
from sklearn import metrics
import pickle
import matplotlib.pyplot as plt

import warnings
# Suppress every warning for the rest of the session to keep cell output readable.
warnings.simplefilter('ignore')

# Fixed seed for Python's and NumPy's global random generators; the same value
# is also passed to `predict` as `random_seed` throughout the notebook.
SEED = 123253132
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Model artefacts and quantiles file; the first two are handed to `predict`,
# the last one to `check_budget`.
bins_path = "../data/model/nn_bins.pickle"
model_path = "../data/model/nn_weights.ckpt"
quantiles_path = "../data/quantiles.json"
# Number of transactions per user that the attack selects as donor rows and as
# rows to overwrite.
BUDGET = 10

# Labelled data used to measure the effect of the attack locally.
train_tr_path = '../data/transactions_finetune.csv'
train_target_path = '../data/target_finetune.csv'

# Transactions that have to be attacked for the submission.
attack_path = '../data/sample_submission.csv'

# Where the attacked submission file is written.
submit_path = 'submit.csv'

# Use a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open(quantiles_path, 'r') as quantiles_file:
    quantiles = json.load(quantiles_file)
target = pd.read_csv(train_target_path)


class CFG:
    """Tunable knobs of the attack, kept in one place."""

    # How many users are taken from each end of the score ranking to act as
    # donors of transactions.
    top_users = 100

    # How many transactions per user are evaluated for their influence on the
    # model score.
    num_tr_check = 120

    # How many times the dataset is reshuffled and re-predicted; the
    # predictions are averaged to obtain a better score.
    num_folds_prediction = 20

Предсказываем дефолт клиента по среднему предсказанию модели на перемешанных данных, чтобы улучшить точность предсказаний модели

In [3]:
def big_predict(tr_path, final_col='target'):
    """Predict with the model several times on reshuffled data and average.

    The transactions file is read once, then shuffled and scored
    ``CFG.num_folds_prediction`` times. The per-user predictions of all runs
    are averaged.

    Parameters
    ----------
    tr_path : str
        Path to a transactions CSV with the columns ``user_id``, ``mcc_code``,
        ``currency_rk``, ``transaction_amt`` and ``transaction_dttm``.
    final_col : str, default 'target'
        Name of the prediction column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last ``predict`` call, with ``final_col``
        replaced by the mean prediction over all runs.

    Raises
    ------
    ValueError
        If ``CFG.num_folds_prediction`` is smaller than 1.
    """
    if CFG.num_folds_prediction < 1:
        raise ValueError('CFG.num_folds_prediction must be at least 1')

    df_transactions = pd.read_csv(
        tr_path,
        parse_dates=["transaction_dttm"],
        dtype={"user_id": int, "mcc_code": int, "currency_rk": int, "transaction_amt": float},
    )

    fold_results = []
    for _ in tqdm(range(CFG.num_folds_prediction)):
        shuffled = df_transactions.sample(frac=1)
        fold_results.append(
            predict(bins_path, model_path, random_seed=SEED, final_col=final_col, source_df=shuffled)
        )

    # Every fold sees the rows in a different order, so align the folds on
    # user_id before averaging instead of trusting that `predict` always
    # returns the users in the same positions.
    averaged = fold_results[-1].copy()
    user_order = averaged['user_id'].values
    aligned = [
        fold.set_index('user_id')[final_col].reindex(user_order).values
        for fold in fold_results
    ]
    averaged[final_col] = np.mean(aligned, axis=0)
    return averaged

- Для каждого пользователя найдем транзакции, которые сильнее всего влияют на предсказание модели
- Надежным пользователям добавляем рискованные транзакции
- Рискованным пользователям добавляем надежные транзакции

In [4]:
def _rank_transactions(rows_by_user, nums_by_user, users, strongest_increase):
    """Pick, per user, the row labels of the BUDGET most influential transactions.

    ``rows_by_user`` maps a user to the labels of their rows (file order) and
    ``nums_by_user`` maps a user to their ``num`` values ordered by ascending
    score change; ``num`` counts transactions from the end of the user's
    history. With ``strongest_increase`` the BUDGET largest changes are taken,
    otherwise the BUDGET smallest ones (returned in reversed order).
    """
    no_nums = np.array([], dtype=np.int64)
    picked = {}
    for user in users:
        newest_first = rows_by_user[user][::-1]
        nums = nums_by_user.get(user, no_nums)
        if strongest_increase:
            chosen = nums[max(len(nums) - BUDGET, 0):]
        else:
            chosen = nums[:BUDGET][::-1]
        picked[user] = [newest_first[k] for k in chosen]
    return picked


def _gather_rows(src_df, idx_map):
    """Read the rows referenced by ``idx_map`` once: user -> (labels, amount signs, mcc codes, amounts)."""
    gathered = {}
    for user, labels in idx_map.items():
        idx = pd.Index(labels, dtype='int64')
        amounts = src_df.loc[idx, 'transaction_amt'].values
        gathered[user] = (idx, np.sign(amounts), src_df.loc[idx, 'mcc_code'].values, amounts)
    return gathered


def _copy_matching(dst_df, target_rows, donor_rows):
    """Overwrite the target's rows in ``dst_df`` with the donor's mcc code and amount where the amount signs agree."""
    idx_to, sign_to = target_rows[0], target_rows[1]
    _, sign_from, mcc_from, amt_from = donor_rows
    # Rows are paired positionally; tolerate lists of different length.
    n = min(len(idx_to), len(sign_from))
    same_sign = sign_to[:n] == sign_from[:n]
    rows = idx_to[:n][same_sign]
    if len(rows) == 0:
        return
    dst_df.loc[rows, 'mcc_code'] = mcc_from[:n][same_sign]
    dst_df.loc[rows, 'transaction_amt'] = amt_from[:n][same_sign]


def _transplant_best_donor(df_transactions, starter_df, donors, targets,
                           donor_rows, target_rows, choose, desc):
    """Try every donor on every target, then apply to ``df_transactions`` the donor picked by ``choose``.

    ``choose`` is ``np.argmin`` or ``np.argmax`` and is applied per target
    over the predictions obtained with each donor.
    """
    if len(donors) == 0 or len(targets) == 0:
        return

    # Only the targets' rows are modified and scored, so work on that subset.
    is_target = starter_df.user_id.isin(targets)
    scores = []
    for donor in tqdm(donors, desc=desc):
        trial = starter_df[is_target].copy(deep=True)
        for target in targets:
            _copy_matching(trial, target_rows[target], donor_rows[donor])
        pred = predict(bins_path, model_path, random_seed=SEED, final_col=str(donor), source_df=trial)
        # Align on user_id so the result does not depend on predict's row order.
        scores.append(pred.set_index('user_id')[str(donor)].reindex(targets).values)

    best = choose(np.array(scores), axis=0)
    for target, donor_pos in zip(targets, best):
        _copy_matching(df_transactions, target_rows[target], donor_rows[donors[donor_pos]])


def attack_file(attacked_file_path, final_path, seq_len=300):
    """Attack a transactions file by swapping in other users' transactions.

    Steps:

    1. For every user build ``CFG.num_tr_check`` truncated histories (the
       first ``seq_len``, ``seq_len - 1``, ... transactions) and predict each
       one, which shows how the score changes when one more transaction is
       added.
    2. Split users by the averaged prediction of ``big_predict`` (threshold is
       half of the maximum) and select ``CFG.top_users`` donors from each end
       of the plain prediction ranking.
    3. For users above the threshold overwrite their BUDGET most
       score-increasing transactions with transactions of the reliable donor
       that lowers their score most; for the remaining users overwrite their
       BUDGET most score-decreasing transactions with transactions of the
       risky donor that raises their score most. Only ``mcc_code`` and
       ``transaction_amt`` are copied, and only where the sign of the amount
       matches.
    4. Write the result to ``final_path`` and print the ``check_budget``
       verdict.

    Parameters
    ----------
    attacked_file_path : str
        Path to the transactions CSV to attack.
    final_path : str
        Path the attacked CSV is written to.
    seq_len : int, default 300
        Length of the longest truncated history built in step 1.

    Returns
    -------
    None
    """
    df_transactions = pd.read_csv(
        attacked_file_path,
        parse_dates=["transaction_dttm"],
        dtype={"user_id": np.int32, "mcc_code": np.int16, "currency_rk": np.int8, "transaction_amt": float},
    )

    # Every (user, x) pair gets the synthetic id  user_position * n_check + x,
    # which can be decoded arithmetically after prediction.
    n_check = CFG.num_tr_check
    user_codes, users = pd.factorize(df_transactions['user_id'])
    users = np.asarray(users)
    base_id = user_codes.astype(np.int64) * n_check
    rank_in_user = df_transactions.groupby('user_id').cumcount().values

    sequences = []
    for x in range(n_check):
        # Keep the first seq_len, seq_len - 1, ... transactions of every user
        # to see how the prediction changes when a transaction is added.
        keep = rank_in_user < seq_len - x
        sequences.append(df_transactions[keep].assign(user_id=base_id[keep] + x))

    a = pd.concat(sequences)
    del sequences
    a['user_id'] = a['user_id'].astype(np.int32)

    display('Сгенерированы все последовательности', a)

    num_checks = 15  # predict in several parts so that it fits into RAM
    change = []  # prediction for every truncated history
    un_users = a.user_id.unique()
    step = len(un_users) // num_checks + 1
    for x in tqdm(range(num_checks)):
        chunk_ids = un_users[step*x:step*(x+1)]
        if len(chunk_ids) == 0:
            continue
        cur = a[a.user_id.isin(chunk_ids)]
        change.append(predict(bins_path, model_path, random_seed=SEED, final_col='pred', source_df=cur))
        del cur

    change = pd.concat(change).reset_index(drop=True)
    del a

    # Decode the synthetic ids back into (original user, number of dropped transactions).
    seq_id = change['user_id'].values.astype(np.int64)
    change['num'] = seq_id % n_check
    change['user_id'] = users[seq_id // n_check].astype(np.int64)

    # prev_pred is the prediction with one transaction less; pair the rows by
    # (user_id, num) explicitly instead of relying on the row order.
    ordered = change.sort_values(['user_id', 'num'])
    change['prev_pred'] = ordered.groupby('user_id')['pred'].shift(-1)
    change['diff'] = change['pred'] - change['prev_pred']
    change = change.dropna().sort_values('diff')

    display('Сделаны предсказания для всех последовательностей', change)

    df_transactions = pd.read_csv(
        attacked_file_path,
        parse_dates=["transaction_dttm"],
        dtype={"user_id": int, "mcc_code": int, "currency_rk": int, "transaction_amt": float},
    )

    result = predict(bins_path, model_path, random_seed=SEED, final_col='target', source_df=df_transactions)
    res_new = big_predict(attacked_file_path, final_col='pred_big')
    result = result.merge(res_new, how='left', on='user_id')

    threshold = result.pred_big.max() / 2

    all_users = result.user_id.values
    one_users = all_users[(result['pred_big'] > threshold).values]
    zero_users = all_users[(result['pred_big'] <= threshold).values]

    by_score = result.sort_values('target').user_id.values
    reliable_users = by_score[:CFG.top_users]  # n users with the lowest predictions
    risk_users = by_score[-CFG.top_users:]  # n users with the highest predictions

    # Pristine snapshot: every sign check and every copied value is read from it.
    starter_df = df_transactions.copy(deep=True)

    # Group once instead of scanning the whole frame for every user.
    rows_by_user = df_transactions.groupby('user_id').groups
    nums_by_user = {
        user: nums.values
        for user, nums in change.groupby('user_id', sort=False)['num']
    }

    # Reliable donors: transactions that lower the score most (given to risky users).
    reliable_idx_from = _rank_transactions(rows_by_user, nums_by_user, reliable_users, strongest_increase=False)
    # Risky donors: transactions that raise the score most (given to reliable users).
    risk_idx_from = _rank_transactions(rows_by_user, nums_by_user, risk_users, strongest_increase=True)
    # Users above the threshold: their most score-increasing transactions get replaced.
    one_idx_to = _rank_transactions(rows_by_user, nums_by_user, one_users, strongest_increase=True)
    # Users at or below the threshold: their most score-decreasing transactions get replaced.
    zero_idx_to = _rank_transactions(rows_by_user, nums_by_user, zero_users, strongest_increase=False)

    reliable_rows = _gather_rows(starter_df, reliable_idx_from)
    risk_rows = _gather_rows(starter_df, risk_idx_from)
    one_rows = _gather_rows(starter_df, one_idx_to)
    zero_rows = _gather_rows(starter_df, zero_idx_to)

    # For every user above the threshold pick the reliable donor that lowers the score most.
    _transplant_best_donor(
        df_transactions, starter_df, reliable_users, one_users,
        reliable_rows, one_rows, np.argmin, desc='reliable donors',
    )
    # For every other user pick the risky donor that raises the score most.
    _transplant_best_donor(
        df_transactions, starter_df, risk_users, zero_users,
        risk_rows, zero_rows, np.argmax, desc='risk donors',
    )

    df_transactions.to_csv(final_path, index=False)

    bud = check_budget(attacked_file_path, final_path, quantiles_path)
    print('check_budget:', bud)

In [5]:
# Attack the labelled finetune transactions first; the attacked copy is
# written to `at_path` so that the effect can be measured below.
at_path = 'tmp.csv'
attack_file(train_tr_path, at_path)

'Сгенерированы все последовательности'

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
0,0,5541,48,-342.897920,2021-03-05 02:52:36
1,0,5533,48,-1251.881200,2021-03-05 09:43:28
2,0,5331,48,-87.309240,2021-03-05 11:17:23
3,0,5921,48,-1822.177000,2021-03-05 13:41:03
4,0,5311,48,-427.123630,2021-03-05 19:14:23
...,...,...,...,...,...
2123876,849599,4111,48,10.918672,2021-01-01 09:21:31
2123877,849599,4111,48,-108.903710,2021-01-03 02:25:37
2123878,849599,4111,48,-127.501495,2021-01-03 02:35:05
2123879,849599,5411,48,-1689.813200,2021-01-04 08:25:50


100%|██████████| 15/15 [03:06<00:00, 12.46s/it]


'Сделаны предсказания для всех последовательностей'

Unnamed: 0,user_id,pred,num,prev_pred,diff
40905,627842,0.223474,0,0.298688,-0.075213
40129,615643,0.106336,0,0.167135,-0.060799
24617,372727,0.124304,0,0.184854,-0.060550
20433,307587,0.142665,0,0.201862,-0.059197
47713,729485,0.242050,0,0.301118,-0.059068
...,...,...,...,...,...
25065,379712,0.200174,0,0.145239,0.054935
17473,262664,0.243206,0,0.188028,0.055177
35186,539818,0.123908,1,0.064188,0.059720
48817,746479,0.172169,0,0.112266,0.059903


100%|██████████| 20/20 [00:37<00:00,  1.88s/it]
100%|██████████| 100/100 [09:45<00:00,  5.85s/it]
100%|██████████| 2124000/2124000 [00:27<00:00, 76122.18it/s]

check_budget: True





In [6]:
def _auc_report(title, start_pred, end_pred):
    """Join both prediction frames onto the ground truth and print their ROC AUC.

    Prints ``title``, then the ROC AUC of the ``start_pred`` column (before the
    attack) and of the ``end_pred`` column (after the attack), each rounded to
    7 digits. Returns the merged frame.
    """
    scored = (
        pd.read_csv(train_target_path)
        .merge(start_pred, how='left', on='user_id')
        .merge(end_pred, how='left', on='user_id')
    )
    print(title)
    print(round(metrics.roc_auc_score(scored['target'], scored['start_pred']), 7))
    print(round(metrics.roc_auc_score(scored['target'], scored['end_pred']), 7))
    return scored


# Single model pass. The "before" file is the same `train_tr_path` that was
# attacked above, rather than a repeated hard-coded path.
start_pred = predict(bins_path, model_path, random_seed=SEED, final_col='start_pred', source_file=train_tr_path)
end_pred = predict(bins_path, model_path, random_seed=SEED, final_col='end_pred', source_file=at_path)
target = _auc_report('SMALL PREDICT', start_pred, end_pred)

# Predictions averaged over reshuffled copies of the data.
start_pred = big_predict(train_tr_path, final_col='start_pred')
end_pred = big_predict(at_path, final_col='end_pred')

print('_________________________')
target = _auc_report('BIG PREDICT', start_pred, end_pred)

SMALL PREDICT
0.6818659
0.6331338


100%|██████████| 20/20 [00:37<00:00,  1.88s/it]
100%|██████████| 20/20 [00:37<00:00,  1.86s/it]

_________________________
BIG PREDICT
0.6897968
0.6737117





In [7]:
# Run the same attack on the submission transactions and write the result to `submit_path`.
attack_file(attack_path, submit_path)

'Сгенерированы все последовательности'

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
0,0,5812,48,-1842.949000,2021-05-17 11:30:42
1,0,4112,48,-283.283720,2021-05-17 12:59:02
2,0,5814,48,-73.820390,2021-05-17 14:04:09
3,0,4111,48,10.971557,2021-05-17 15:03:10
4,0,7991,48,-214.796420,2021-05-17 15:33:43
...,...,...,...,...,...
1259876,503999,5533,48,-396.953670,2021-04-29 02:12:22
1259877,503999,5193,48,-2312.781200,2021-04-29 03:07:41
1259878,503999,5533,48,-954.333200,2021-04-29 08:57:08
1259879,503999,6012,48,-6561.576000,2021-04-29 12:15:14


100%|██████████| 15/15 [01:42<00:00,  6.84s/it]


'Сделаны предсказания для всех последовательностей'

Unnamed: 0,user_id,pred,num,prev_pred,diff
12185,313630,0.232315,0,0.302342,-0.070028
29369,758107,0.079547,0,0.146352,-0.066805
7753,207241,0.143522,0,0.209989,-0.066467
5785,155636,0.189799,0,0.253383,-0.063584
28921,747174,0.235816,0,0.295240,-0.059424
...,...,...,...,...,...
21729,554372,0.145422,0,0.096626,0.048796
1546,40989,0.132914,1,0.083308,0.049606
25548,652935,0.135059,3,0.083921,0.051138
31021,803412,0.124163,4,0.071764,0.052399


100%|██████████| 20/20 [00:22<00:00,  1.14s/it]
100%|██████████| 100/100 [05:46<00:00,  3.47s/it]
100%|██████████| 1260000/1260000 [00:16<00:00, 75034.64it/s]

check_budget: True





TO DO:
- Искать суммы транзакций, наиболее сильно воздействующие на модель
- По-другому ранжировать транзакции у каждого пользователя, сейчас используется далеко не оптимальный вариант
- Искать одиночные транзакции, которые сильнее всего изменяют скор, а не копировать транзакции другого клиента
- Более точно подобрать threshold