In [1]:
!pip install numpy
!pip install pandas
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu114
!pip install pyarrow
!pip install ipywidgets==7.4.2
!pip install tqdm

Looking in indexes: https://download.pytorch.org/whl/cu114


In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch.backends.cudnn as cudnn
from tqdm import tqdm
import os

%load_ext autoreload

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
cudnn.enabled = True
cudnn.benchmark = False
cudnn.deterministic = True

In [4]:
data = pd.read_parquet('data/train_transactions/part_000_0_to_23646.parquet')

In [5]:
from nppr_dataset import NPPRDataset
from nppr_module import NPPRModel
from utils import prepare_data, train, encode

In [6]:
# Define columns
numeric_cols = ['amnt']
categorical_cols = [
    'currency', 'operation_kind', 'card_type', 'operation_type', 'operation_type_group',
    'ecommerce_flag', 'payment_system', 'income_flag', 'mcc', 'country',
    'city', 'mcc_category', 'day_of_week', 'hour', 'days_before', 'weekofyear'
]
time_col = 'hour_diff'

In [7]:
dataloader = prepare_data(data, numeric_cols, categorical_cols, time_col, max_past_events=5, batch_size=32, train=True)

In [8]:
embedding_dims = {
        feature_name: {"in": data[feature_name].nunique(), "out": 16}
        for feature_name in categorical_cols
    }
embedding_size = 512
hidden_size_enc = 512
hidden_size_dec = 512
learning_rate = 1e-3
alpha = 0.001
lambda_param = 2 * 30 * 24
epochs = 5
num_numerical_features = 1
num_categories = [dims["in"] for dims in embedding_dims.values()]

In [9]:
model = NPPRModel(
    embedding_dims=embedding_dims,
    embedding_size=embedding_size,
    hidden_size_enc=hidden_size_enc,
    hidden_size_dec=hidden_size_dec,
    num_numerical_features=num_numerical_features,
    num_categories=num_categories
)

optimizer = Adam(model.parameters(), lr=learning_rate)

In [10]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [11]:
torch.autograd.set_detect_anomaly(True)

model = train(
    model=model,
    dataloader=dataloader,
    optimizer=optimizer,
    epochs=1,
    lambda_param=lambda_param,
    alpha=alpha,
    save_path="/home/efadeev/nppr/weights/nppr_test_run_2.pth",
    device=device
)

Epoch 1/1: 100%|██████████| 711/711 [06:30<00:00,  1.82it/s]

Epoch 1: Loss = 993.3914
Model saved to /home/efadeev/nppr/weights/nppr_test_run_2.pth





In [12]:
embeddings = encode(model, dataloader, load_path='/home/efadeev/nppr/weights/nppr_test_run.pth', device=device)

Encoding:   0%|          | 0/711 [00:00<?, ?it/s]

Encoding: 100%|██████████| 711/711 [02:27<00:00,  4.83it/s]


In [13]:
embeddings.shape

(22743, 512)

In [14]:
embeddings

array([[0.72747993, 0.18430959, 0.4124095 , ..., 0.15258698, 0.31361198,
        0.23356082],
       [0.6927131 , 0.14980339, 0.236428  , ..., 0.17197116, 0.12143982,
        0.20507906],
       [0.717432  , 0.23072107, 0.33503148, ..., 0.20522703, 0.278514  ,
        0.26679495],
       ...,
       [0.72902757, 0.11140182, 0.28956112, ..., 0.06299622, 0.03436748,
        0.09194176],
       [0.609064  , 0.22386388, 0.30721483, ..., 0.24465068, 0.17251596,
        0.19983017],
       [0.74629885, 0.16407868, 0.2701781 , ..., 0.12362931, 0.13276584,
        0.19838779]], dtype=float32)