In [35]:
import gc
import wandb
import pickle
import numpy as np 
import pandas as pd
from tqdm import tqdm 

import torch 
from torch import nn
from torch.utils.data import DataLoader 

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from src.model.utils import load_embeddings
from src.model.data_utils import APNDataset, EMBDataset
from src.model.model import SModel
from src.model.train_utils import train_model
from src.utils import load_code_to_idx

from src.transactions_emb import create_transactions_embeddings
from src.clickstream_emb import create_clickstream_embeddings


from src.model.data_utils import EMBDataset
from src.model.utils import load_model
from typing import Dict

In [36]:
TRAIN_MATCHING_PATH = "/home/glebk/VSProjects/projects/Matching/data/train_matching.csv"
TRANSACTIONS_PATH = "/home/glebk/VSProjects/projects/Matching/data/transactions.csv"
CLICKSTREAM_PATH = "/home/glebk/VSProjects/projects/Matching/data/clickstream.csv"


In [37]:
def create_vtb_embeddings_for_model(
    transactions, model, mcc_code_to_idx, device
) -> Dict:

    bank_trans_emb = create_transactions_embeddings(transactions, mcc_code_to_idx)

    emb_dataset = EMBDataset(bank_trans_emb, "vtb")
    emb_dataset_loader = DataLoader(emb_dataset, 256, shuffle=False)

    embs = []
    emb_iter = iter(emb_dataset_loader)
    for i in range(len(emb_iter)):
        data = next(emb_iter)
        emb_batch = model(data.to(device), mode="anchor")
        embs.append(emb_batch)

    embs = torch.cat(embs, dim=0)
    embs = embs.detach().cpu().numpy()

    uids = bank_trans_emb["user_id"].values

    embs_dict = dict(zip(uids, embs))

    print("vtb embs shape: ", embs.shape)

    return embs_dict

In [38]:
def create_rtk_embeddings_for_model(
    clickstream, model, cat_code_to_idx, device
) -> Dict:

    rtk_emb = create_clickstream_embeddings(clickstream, cat_code_to_idx)

    emb_dataset = EMBDataset(rtk_emb, "rtk")
    emb_dataset_loader = DataLoader(emb_dataset, 256, shuffle=False)

    embs = []
    emb_iter = iter(emb_dataset_loader)
    for i in range(len(emb_iter)):
        data = next(emb_iter)
        emb_batch = model(data.to(device), mode="positive")
        embs.append(emb_batch)

    embs = torch.cat(embs, dim=0)
    embs = embs.detach().cpu().numpy()

    uids = rtk_emb["user_id"].values

    embs_dict = dict(zip(uids, embs))

    print("rtk embs shape: ", embs.shape)

    return embs_dict

In [39]:
# Load raw transactions:

transactions = pd.read_csv(TRANSACTIONS_PATH)
print(transactions.shape)

(19821910, 5)


In [40]:
# Load raw clickstream:

clickstream = pd.read_csv(CLICKSTREAM_PATH)
clickstream.head()


Unnamed: 0,user_id,cat_id,timestamp,new_uid
0,000143baebad4467a23b98c918ccda19,165,2021-01-30 20:08:12,1873448
1,000143baebad4467a23b98c918ccda19,165,2021-01-31 20:06:29,1873448
2,000143baebad4467a23b98c918ccda19,308,2021-01-31 20:12:00,1873448
3,000143baebad4467a23b98c918ccda19,931,2021-01-31 22:12:00,1873448
4,000143baebad4467a23b98c918ccda19,931,2021-02-01 16:57:00,1873448


In [41]:
WEIGHTS = "./weights/SModel_0.606.pth"
device = "cuda:0"
model = SModel().to(device)
model = load_model(WEIGHTS, model, device)

In [42]:
mcc_code_to_idx, cat_code_to_idx = load_code_to_idx(
    "./submission/mcc_code_to_idx.pickle",
    "./submission/cat_code_to_idx.pickle",
)

In [43]:
vtb_emb = create_vtb_embeddings_for_model(
    transactions, model, mcc_code_to_idx, device
)

vtb embs shape:  (22533, 256)


In [44]:
rtk_emb = create_rtk_embeddings_for_model(
    clickstream, model, cat_code_to_idx, device
)

rtk embs shape:  (19623, 256)


## Prepare train df:

In [50]:
train_match = pd.read_csv(TRAIN_MATCHING_PATH)


tgt = np.zeros(len(train_match))
tgt[:14671] = 1

train_match["tgt"] = tgt.astype(np.int16)
train_match.head()


Unnamed: 0,bank,rtk,tgt
0,178b387813ac4a63a2200274a9e3876e,e19c0f141e9442c5bdb0d5751f6fba1c,1
1,47cffa46e6b04389ba91a70735d2c3ff,7df3371aabd349e4ac2de8299238627d,1
2,f73b767cfd72472aa06df9a69e746aaf,b23d62c7e41145a7ad140a4fcfaafefa,1
3,48da649603734185b98dcea93484dbc7,63ad789541c54463ad54d0a707b3ab8b,1
4,37304ef19de542ee8fe8a6f050236525,c0e96de5dd594d948a8b9fde9e6980cb,1


In [51]:
rtk_ids = train_match[:14671].rtk.values

for i in range(14671, len(train_match)):
    train_match.loc[i, "rtk"] = np.random.choice(rtk_ids)

In [52]:
def get_negative_sample(match_true: pd.DataFrame, pos_sample: str) -> str:
    found = False
    while not found:
        sample = np.random.choice(match_true.rtk)
        if sample != pos_sample:
            return sample

In [53]:
N = len(train_match)
for i in tqdm(range(12000)):
    idx = np.random.randint(len(train_match[:14671]))
    vtb_id = train_match.iloc[idx].bank
    pos_id = train_match.iloc[idx].rtk
    neg_sample = get_negative_sample(train_match, pos_id)
    data = [vtb_id, neg_sample, 0]
    train_match.loc[N + i] = data

100%|██████████| 12000/12000 [00:17<00:00, 680.66it/s]


In [54]:
sum(train_match.tgt == 1), sum(train_match.tgt == 0)

(14671, 14910)

In [55]:
train_match.head(-10)

Unnamed: 0,bank,rtk,tgt
0,178b387813ac4a63a2200274a9e3876e,e19c0f141e9442c5bdb0d5751f6fba1c,1
1,47cffa46e6b04389ba91a70735d2c3ff,7df3371aabd349e4ac2de8299238627d,1
2,f73b767cfd72472aa06df9a69e746aaf,b23d62c7e41145a7ad140a4fcfaafefa,1
3,48da649603734185b98dcea93484dbc7,63ad789541c54463ad54d0a707b3ab8b,1
4,37304ef19de542ee8fe8a6f050236525,c0e96de5dd594d948a8b9fde9e6980cb,1
...,...,...,...
29566,69e2a51e498340218dac121b8e095d0b,23f446e84f3c4fb3abc82712856c7623,0
29567,405fef9c60c441c1957aa3ae865b7822,1bf1c3919d834aa5aa0e01eb708b8382,0
29568,c48fb7c619cf45b59afc9a163e6ee48c,63e5fe68837b4bd3b9d10fa6f67a18b5,0
29569,95cd33c2f87d42c381c394dc2e61f0e0,a1c263b422894d059f945fc618062fea,0


## Prepare train test split:

In [56]:
X = []
for i in train_match.index:
    vtb_id = train_match.iloc[i].bank
    rtk_id = train_match.iloc[i].rtk
    vtb_emb_ = vtb_emb[vtb_id]
    rtk_emb_ = rtk_emb[rtk_id]
    x = np.concatenate([vtb_emb_, rtk_emb_])
    X.append(x)

X = np.vstack(X)
y = train_match.tgt.values

print(X.shape)
print(y.shape)


(29581, 512)
(29581,)


In [57]:
from sklearn.model_selection import train_test_split

In [58]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)

print(len(X_train), len(y_train))
print(len(X_test), len(y_test))


23664 23664
5917 5917


## Train XGBoost classifier:

In [59]:
import xgboost as xgb

In [78]:
# training, we set the early stopping rounds parameter
xgb_cl = xgb.XGBClassifier(
    n_estimators=10000,
    max_depth=4,
    learning_rate=0.01,
    subsample=0.8,
    gamma=0.5,
    reg_lambda=10,
    reg_alpha=5,
    colsample_bytree=1
)

xgb_cl.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=20,
)




[0]	validation_0-logloss:0.69261	validation_1-logloss:0.69271
[1]	validation_0-logloss:0.69222	validation_1-logloss:0.69241
[2]	validation_0-logloss:0.69175	validation_1-logloss:0.69204
[3]	validation_0-logloss:0.69117	validation_1-logloss:0.69164
[4]	validation_0-logloss:0.69059	validation_1-logloss:0.69120
[5]	validation_0-logloss:0.69027	validation_1-logloss:0.69100
[6]	validation_0-logloss:0.68989	validation_1-logloss:0.69072
[7]	validation_0-logloss:0.68939	validation_1-logloss:0.69037
[8]	validation_0-logloss:0.68890	validation_1-logloss:0.69000
[9]	validation_0-logloss:0.68851	validation_1-logloss:0.68970
[10]	validation_0-logloss:0.68816	validation_1-logloss:0.68943
[11]	validation_0-logloss:0.68785	validation_1-logloss:0.68919
[12]	validation_0-logloss:0.68722	validation_1-logloss:0.68868
[13]	validation_0-logloss:0.68683	validation_1-logloss:0.68842
[14]	validation_0-logloss:0.68630	validation_1-logloss:0.68803
[15]	validation_0-logloss:0.68596	validation_1-logloss:0.68784
[1

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.5, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=10000,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=5, reg_lambda=10, ...)

## Score accuracy:

In [80]:
from sklearn.metrics import accuracy_score

# Predict
preds = xgb_cl.predict(X_test)

# Score
accuracy_score(y_test, preds)

0.7049180327868853

In [81]:
# Save model:
xgb_cl.save_model("./submission/model_xgboost.txt")