In [1]:
import gc
import wandb
import pickle
import numpy as np 
import pandas as pd
from tqdm import tqdm 

import torch 
from torch import nn
from torch.utils.data import DataLoader 

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from src.model.utils import load_embeddings
from src.model.data_utils import APNDataset, EMBDataset
from src.model.model import SModel
from src.model.train_utils import train_model
from src.utils import load_code_to_idx, load_embs

from src.transactions_emb import create_transactions_embeddings
from src.clickstream_emb import create_clickstream_embeddings


In [2]:
# Input CSV locations; all three files live in the same data directory.
DATA_DIR = "./data"

TRAIN_MATCHING_PATH = f"{DATA_DIR}/train_matching.csv"
TRANSACTIONS_PATH = f"{DATA_DIR}/transactions.csv"
CLICKSTREAM_PATH = f"{DATA_DIR}/clickstream.csv"

In [3]:
# Read the matching table and preview it; the preview shows two id columns, `bank` and `rtk`.
train_match = pd.read_csv(TRAIN_MATCHING_PATH)
train_match.head()

Unnamed: 0,bank,rtk
0,178b387813ac4a63a2200274a9e3876e,e19c0f141e9442c5bdb0d5751f6fba1c
1,47cffa46e6b04389ba91a70735d2c3ff,7df3371aabd349e4ac2de8299238627d
2,f73b767cfd72472aa06df9a69e746aaf,b23d62c7e41145a7ad140a4fcfaafefa
3,48da649603734185b98dcea93484dbc7,63ad789541c54463ad54d0a707b3ab8b
4,37304ef19de542ee8fe8a6f050236525,c0e96de5dd594d948a8b9fde9e6980cb


In [4]:
# Separate matched and unmatched:
#
# Unmatched rows are the ones whose `rtk` is the literal string "0" (this is
# what the sanity checks below verify). The split is derived from the column
# itself instead of a hard-coded row offset, so it does not depend on the
# CSV having a particular length or row order.

is_paired = train_match["rtk"] != "0"

train_match_paired = train_match[is_paired]
train_match_unpaired = train_match[~is_paired]

# Number of paired rows; kept under its original name in case another cell reads it.
idx = len(train_match_paired)

# Sanity check: the two parts cover the table, and every row is on the right side.
assert len(train_match_paired) + len(train_match_unpaired) == len(train_match)
assert (train_match_paired["rtk"] != "0").all()
assert (train_match_unpaired["rtk"] == "0").all()


In [5]:
# K-Fold split of the paired rows.
# Result: folds[k] == {"train": DataFrame, "test": DataFrame} for k in 0..4.

kf = KFold(n_splits=5, shuffle=True, random_state=42)
folds = {}

for fold_id, (train_pos, test_pos) in enumerate(kf.split(train_match_paired)):
    # kf.split yields positional indices, hence .iloc; .copy() detaches each
    # part from the parent frame.
    folds[fold_id] = {
        "train": train_match_paired.iloc[train_pos].copy(),
        "test": train_match_paired.iloc[test_pos].copy(),
    }

In [8]:
# Run configuration. `fold`, `bs`, `lr` and `wandb_logging` are read in this
# notebook; the whole dict is also handed to train_model.
# NOTE(review): the remaining keys are presumably consumed inside train_model -- confirm there.
cfg = dict(
    wandb_logging=False,  # gate for the wandb.init cell
    fold=0,  # key into `folds`
    test_size=0.2,
    n_iters=7000,
    n_iters_val=1000,
    lr=3e-4,  # Adam learning rate
    bs=32,  # DataLoader batch size
    save_best_val=True,
    checkpoint_path="./weights",
)


In [9]:
# Optional Weights & Biases tracking, switched on via cfg["wandb_logging"].
if cfg["wandb_logging"]:
    # Pass the hyperparameters to wandb.init so they are stored with the run.
    # Rebinding the module attribute (`wandb.config = cfg`) merely swaps in a
    # plain dict, which the run itself never records.
    wandb.init(project="matching", entity="glebk", config=cfg)

In [10]:
# Load the two code-to-index objects from the pickles under ./submission.
# NOTE(review): presumably dicts mapping raw MCC / category codes to column indices -- confirm in src.utils.
MCC_CODE_TO_IDX_PATH = "./submission/mcc_code_to_idx.pickle"
CAT_CODE_TO_IDX_PATH = "./submission/cat_code_to_idx.pickle"

mcc_code_to_idx, cat_code_to_idx = load_code_to_idx(
    MCC_CODE_TO_IDX_PATH, CAT_CODE_TO_IDX_PATH
)


In [11]:
# Load the two embedding objects from the pickles under ./submission.
# NOTE(review): their structure is defined by src.utils.load_embs -- confirm there.
MCC_EMB_PATH = "./submission/mcc_code_emb_seq.pickle"
CLC_EMB_PATH = "./submission/clc_code_emb_seq.pickle"

mcc_embs, clc_embs = load_embs(MCC_EMB_PATH, CLC_EMB_PATH)


In [12]:
# Load raw transactions and report the table shape.
transactions = pd.read_csv(TRANSACTIONS_PATH)
print(transactions.shape)

# Keep only rows whose mcc_code differs from -1, then report the shape again.
has_mcc = transactions["mcc_code"].ne(-1)
transactions = transactions.loc[has_mcc]
print(transactions.shape)

transactions.head()


(19821910, 5)
(19198619, 5)


Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
0,000932580e404dafbecd5916d4640938,5411,48,-361.0723,2020-08-03 08:05:23
1,000932580e404dafbecd5916d4640938,5499,48,-137.31398,2020-08-05 01:27:40
2,000932580e404dafbecd5916d4640938,5499,48,-138.84981,2020-08-05 03:28:11
3,000932580e404dafbecd5916d4640938,4829,48,-309.47653,2020-08-06 00:36:29
4,000932580e404dafbecd5916d4640938,5411,48,-133.4737,2020-08-09 00:30:13


In [13]:
# Process transactions:
# Turns the filtered transactions into a wide numeric table keyed by user_id (see preview below).
# NOTE(review): columns look like per-code counts indexed through mcc_code_to_idx -- confirm in src.transactions_emb.

bank_trans_emb = create_transactions_embeddings(transactions, mcc_code_to_idx)
bank_trans_emb.head()


Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384
0,000932580e404dafbecd5916d4640938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
1,0009e36b42cb4caeb928a1e596819495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
2,000b29acb6bd44f99473c1be5ca28f3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
3,000c5327d99941fe934169838c65b92c,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,4.0,0.0,0.0
4,000e0d54d7c945ebb8f6f855972e8396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Load raw clickstream:
# The preview shows columns user_id, cat_id, timestamp and new_uid.

clickstream = pd.read_csv(CLICKSTREAM_PATH)
clickstream.head()


Unnamed: 0,user_id,cat_id,timestamp,new_uid
0,000143baebad4467a23b98c918ccda19,165,2021-01-30 20:08:12,1873448
1,000143baebad4467a23b98c918ccda19,165,2021-01-31 20:06:29,1873448
2,000143baebad4467a23b98c918ccda19,308,2021-01-31 20:12:00,1873448
3,000143baebad4467a23b98c918ccda19,931,2021-01-31 22:12:00,1873448
4,000143baebad4467a23b98c918ccda19,931,2021-02-01 16:57:00,1873448


In [15]:
# Process clickstream:
# Turns the raw clickstream into a wide numeric table keyed by user_id (see preview below).
# NOTE(review): columns look like per-category counts indexed through cat_code_to_idx -- confirm in src.clickstream_emb.

rtk_emb = create_clickstream_embeddings(clickstream, cat_code_to_idx)
rtk_emb.head()

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401
0,000143baebad4467a23b98c918ccda19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,33.0,0.0,0.0,7.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000a8d3cdef3455d990e97730a2cef43,52.0,0.0,0.0,0.0,0.0,233.0,0.0,1.0,41.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,000c399352314e759041dc1651fe6980,5.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,278.0,0.0,1.0,25.0,8.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,4.0,0.0,3.0,130.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,000eb2d39f4f4df0afd630c4f11ca049,0.0,0.0,0.0,0.0,0.0,0.0,94.0,0.0,11.0,0.0,2.0,4.0,9.0,0.0,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,000fc5be9e974980a1c9728f4866db11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,18.0,16.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Pick the configured fold and wrap both of its parts in APNDataset.
selected_fold = folds[cfg["fold"]]
train_df, test_df = selected_fold["train"], selected_fold["test"]

# Both datasets are built from the same feature/embedding sources; only the pair table differs.
emb_sources = (bank_trans_emb, rtk_emb, mcc_embs, clc_embs)
train_dataset = APNDataset(train_df, *emb_sources)
test_dataset = APNDataset(test_df, *emb_sources)

print("Train size: ", len(train_dataset))
print("Test size: ", len(test_dataset))


Train size:  11736
Test size:  2935


In [19]:
# Batch both datasets with the configured batch size; only the training loader shuffles.
batch_size = cfg["bs"]
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [20]:
# Use the first CUDA device when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [21]:
# Seed torch's global RNG before building the model so that weight
# initialisation (and later draws from the default generator, such as
# DataLoader shuffling) repeat across Restart & Run All.
# 42 mirrors the random_state already used for the KFold split.
# NOTE(review): if the dataset or training loop draws from numpy / `random`,
# seed those as well.
torch.manual_seed(42)

# Instantiate the network and move its parameters to the selected device.
model = SModel().to(device)

In [22]:
def cosine_distance(x, y):
    """Return ``1 - cosine_similarity(x, y)``, used as the triplet-loss distance."""
    return 1.0 - nn.functional.cosine_similarity(x, y)


# Triplet margin loss measured with cosine distance instead of the default metric.
criterion = nn.TripletMarginWithDistanceLoss(distance_function=cosine_distance)

# Adam at the configured learning rate, decayed by a factor of 0.9 every 1000 scheduler steps.
# verbose=True makes the scheduler print each learning-rate adjustment.
# NOTE(review): `verbose` is deprecated in recent PyTorch releases -- check against the installed version.
optimizer = torch.optim.Adam(params=model.parameters(), lr=cfg["lr"])
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer=optimizer, step_size=1000, gamma=0.9, verbose=True
)


Adjusting learning rate of group 0 to 3.0000e-04.


In [None]:
# Launch training with the model, loaders, loss, device, optimizer, scheduler and config built above.
# NOTE(review): how the cfg keys (n_iters, n_iters_val, save_best_val, checkpoint_path, ...) are
# consumed is defined in src.model.train_utils -- confirm there.
train_model(
    model, train_loader, val_loader, criterion, device, optimizer, scheduler, cfg
)
