In [None]:
import sys
if 'google.colab' in str(get_ipython()):
  ! {sys.executable} -m pip install pytorch-lifestream
  ! pip install "torch<2"
  ! pip install torchvision==0.13.1

In [11]:
import torch
import pytorch_lightning as pl

from sklearn.metrics import accuracy_score, classification_report

import logging

from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
import torch
import torch.nn as nn
import torch.optim as optim


In [None]:
import os

# Fetch the dataset only when the transactions CSV is not already on disk,
# so re-running the notebook does not download the archive again.
if not os.path.exists('data/transactions_train.csv'):
    # Create the target directory (no error if it already exists).
    ! mkdir -p data
    # Download the archive into the current working directory.
    ! curl -OL https://storage.yandexcloud.net/ptls-datasets/age-prediction-nti-sbebank-2019.zip
    # Extract only the CSV members under the archive's data/ folder into ./data,
    # dropping their directory prefix (-j) and overwriting existing files (-o).
    ! unzip -j -o age-prediction-nti-sbebank-2019.zip 'data/*.csv' -d data
    # Keep the downloaded archive next to the extracted files.
    ! mv age-prediction-nti-sbebank-2019.zip data/

In [8]:
import os
import pandas as pd

# Directory that holds the dataset's CSV files; reused by later cells.
data_path = 'data/'

# Load the raw transaction log and show it as the cell output.
transactions_csv = os.path.join(data_path, 'transactions_train.csv')
source_data = pd.read_csv(transactions_csv)
source_data

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341
...,...,...,...,...
26450572,43300,727,25,7.602
26450573,43300,727,15,3.709
26450574,43300,727,1,6.448
26450575,43300,727,11,24.669


In [12]:
# Per-client target table; later cells read its 'client_id' and 'bins' columns.
target_csv = os.path.join(data_path, 'train_target.csv')
df_target = pd.read_csv(target_csv)

In [13]:
# Hold out 7000 clients for testing, stratified by the 'bins' label.
df_target_train, df_target_test = train_test_split(
    df_target,
    test_size=7000,
    stratify=df_target['bins'],
    random_state=142,
)

# Carve a 3000-client validation set out of the remainder, again stratified.
df_target_train, df_target_valid = train_test_split(
    df_target_train,
    test_size=3000,
    stratify=df_target_train['bins'],
    random_state=142,
)

# Report the size of the full table followed by the three splits.
target_frames = [df_target, df_target_train, df_target_valid, df_target_test]
print('Split {} records to train: {}, valid: {}, test: {}'.format(*map(len, target_frames)))

Split 30000 records to train: 20000, valid: 3000, test: 7000


In [14]:
# Reuse the transactions frame already loaded into `source_data` instead of
# parsing the same ~26M-row CSV a second time; this avoids a duplicate read and
# a second in-memory copy. Both names refer to the same DataFrame, so treat
# `df_trx` as read-only (the following cells only merge it into new frames).
df_trx = source_data
df_trx.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


In [15]:
def _trx_for_clients(clients):
    """Inner-join ``df_trx`` with the 'client_id' column of ``clients``."""
    return df_trx.merge(clients['client_id'], on='client_id', how='inner')


# Route every transaction to the split its client belongs to.
df_trx_train = _trx_for_clients(df_target_train)
df_trx_valid = _trx_for_clients(df_target_valid)
df_trx_test = _trx_for_clients(df_target_test)

# Report the total number of transactions followed by the per-split counts.
trx_frames = (df_trx, df_trx_train, df_trx_valid, df_trx_test)
print('Split {} transactions to train: {}, valid: {}, test: {}'.format(*map(len, trx_frames)))

Split 26450577 transactions to train: 17622321, valid: 2634248, test: 6194008


In [16]:
from ptls.preprocessing import PandasDataPreprocessor

# Settings for turning the flat transaction table into per-client sequences.
preprocessor_config = dict(
    # Column identifying the client and column holding the event time.
    col_id='client_id',
    col_event_time='trans_date',
    # NOTE(review): 'none' presumably leaves the time values untransformed —
    # confirm against the ptls documentation.
    event_time_transformation='none',
    # Feature columns, split by how they are declared to the preprocessor.
    cols_category=['small_group'],
    cols_numerical=['amount_rur'],
    # Later cells call pd.merge / .to_dict on the result, i.e. expect a DataFrame.
    return_records=False,
)

preprocessor = PandasDataPreprocessor(**preprocessor_config)

In [17]:
%%time
df_data_train_no_target = preprocessor.fit_transform(df_trx_train)
df_data_valid_no_target = preprocessor.transform(df_trx_valid)
df_data_test_no_target = preprocessor.transform(df_trx_test)

CPU times: user 52.1 s, sys: 10.2 s, total: 1min 2s
Wall time: 1min 7s


In [18]:
# Number of rows in each preprocessed split (train, valid, test).
dataset_sizes = map(
    len,
    (df_data_train_no_target, df_data_valid_no_target, df_data_test_no_target),
)
print('Record in dataset, train {}, valid {}, test {}\nEach record is a client with list of transactions'.format(*dataset_sizes))

Record in dataset, train 20000, valid 3000, test 7000
Each record is a client with list of transactions


In [19]:
# Rename the label column 'bins' to 'target_bin' before it is merged into the
# per-client feature frames.
df_target = df_target.rename({'bins': 'target_bin'}, axis='columns')

In [20]:
# Labelled variants: attach the target table to each split by client_id.
df_data_train = df_data_train_no_target.merge(df_target, on='client_id')
df_data_valid = df_data_valid_no_target.merge(df_target, on='client_id')
df_data_test = df_data_test_no_target.merge(df_target, on='client_id')

# Unlabelled variants: convert each frame into a list of per-row dicts.
df_data_train_no_target = df_data_train_no_target.to_dict('records')
df_data_valid_no_target = df_data_valid_no_target.to_dict('records')
df_data_test_no_target = df_data_test_no_target.to_dict('records')

In [21]:
# Convert the labelled frames into lists of per-row dicts as well; from here on
# the `df_data_*` names hold lists of records, not DataFrames.
df_data_train = df_data_train.to_dict('records')
df_data_valid = df_data_valid.to_dict('records')
df_data_test = df_data_test.to_dict('records')

In [22]:
def remove_extra_feature(name, feature='trans_date'):
    """Delete one key from every record of a dataset, in place.

    Args:
        name: Sequence of record dicts (despite the parameter name, this is
            the dataset itself). The records are modified in place.
        feature: Key to remove from each record. Defaults to ``'trans_date'``,
            which keeps existing single-argument calls working unchanged.

    Returns:
        None. Records that do not contain ``feature`` are left untouched.
    """
    for record in name:
        if feature in record:
            del record[feature]

In [23]:
# Strip the 'trans_date' key from every record list, unlabelled splits first
# and labelled splits second.
for records in (
    df_data_train_no_target,
    df_data_valid_no_target,
    df_data_test_no_target,
    df_data_train,
    df_data_valid,
    df_data_test,
):
    remove_extra_feature(records)

In [24]:
from ptls.data_load.datasets import MemoryMapDataset

# Wrap each labelled record list in a MemoryMapDataset (train, valid, test).
dataset_train, dataset_valid, dataset_test = map(
    MemoryMapDataset,
    (df_data_train, df_data_valid, df_data_test),
)

In [25]:
import torch
import torchmetrics
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head

In [34]:
# Per-transaction encoder: two embedded features plus one numeric feature.
# NOTE(review): 'in'/'out' presumably mean dictionary size and embedding
# dimension — confirm against the ptls TrxEncoder documentation.
trx_encoder = TrxEncoder(
    embeddings={
        'small_group': {'in': 250, 'out': 16},
        'event_time': {'in': 800, 'out': 8},
    },
    numeric_values={'amount_rur': 'log'},
    embeddings_noise=0.003,
)

# Sequence encoder on top of the transaction encoder; the embeddings produced
# later in the notebook have 48 columns, matching hidden_size.
seq_encoder = RnnSeqEncoder(trx_encoder=trx_encoder, hidden_size=48)

In [35]:
from functools import partial
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.frames import PtlsDataModule

In [36]:
from ptls.frames.coles import CoLESModule

# Optimizer and LR-scheduler factories handed to the CoLES module.
adam_partial = partial(torch.optim.Adam, lr=0.001)
step_lr_partial = partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9)

# CoLES module wrapping the sequence encoder defined earlier.
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=adam_partial,
    lr_scheduler_partial=step_lr_partial,
)

In [37]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

def _make_coles_dataset(records):
    """Build a ColesDataset over ``records``.

    Train and validation use identical settings: a SeqLenFilter with
    min_seq_len=25 and a SampleSlices splitter with split_count=5,
    cnt_min=25, cnt_max=200.
    """
    memory_dataset = MemoryMapDataset(
        data=records,
        i_filters=[SeqLenFilter(min_seq_len=25)],
    )
    slicer = SampleSlices(split_count=5, cnt_min=25, cnt_max=200)
    return ColesDataset(memory_dataset, splitter=slicer)


# Data module for CoLES training, built from the unlabelled record lists.
train_dl = PtlsDataModule(
    train_data=_make_coles_dataset(df_data_train_no_target),
    valid_data=_make_coles_dataset(df_data_valid_no_target),
    train_num_workers=2,
    train_batch_size=256,
)

In [38]:
# Use one GPU when CUDA is available, otherwise none (bool -> 1 / 0).
gpu_count = int(torch.cuda.is_available())

# Single-epoch trainer with the progress bar turned off.
trainer = pl.Trainer(max_epochs=1, gpus=gpu_count, enable_progress_bar=False)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [39]:
%%time
# Print the version of the trainer's logger before training starts.
print(f'logger.version = {trainer.logger.version}')
# Train the CoLES model; at this point `train_dl` is the PtlsDataModule that
# carries both the training and the validation dataset.
trainer.fit(model, train_dl)
# Show the metrics the trainer logged during the run.
print(trainer.logged_metrics)

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 21.2 K
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
21.2 K    Trainable params
0         Non-trainable params
21.2 K    Total params
0.085     Total estimated model params size (MB)


logger.version = 1


  self.pid = os.fork()
  self.pid = os.fork()


{'loss': tensor(93.4118), 'seq_len': tensor(110.0687), 'valid/recall_top_k': tensor(0.0236)}
CPU times: user 7.47 s, sys: 773 ms, total: 8.24 s
Wall time: 14 s


In [40]:
from ptls.data_load.datasets import inference_data_loader

# NOTE(review): this rebinds `train_dl`, which previously held the
# PtlsDataModule used for training, to an inference loader. Re-running the
# training cell after this one will therefore receive the wrong object.
train_dl = inference_data_loader(dataset_train, batch_size=256, num_workers=0)
train_batches = trainer.predict(model, train_dl)
train_embeds = torch.vstack(train_batches)

# Same procedure for the held-out test clients.
test_dl = inference_data_loader(dataset_test, batch_size=256, num_workers=0)
test_batches = trainer.predict(model, test_dl)
test_embeds = torch.vstack(test_batches)

train_embeds.shape, test_embeds.shape

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


(torch.Size([20000, 48]), torch.Size([7000, 48]))

In [41]:
# Reload the target table, index it by client and expose the label as 'target'.
df_target = (
    pd.read_csv(os.path.join(data_path, 'train_target.csv'))
    .set_index('client_id')
    .rename(columns={"bins": "target"})
)


def _embeddings_frame(embeds, dataset):
    """Build a frame of embed_* columns plus client_id, left-joined with df_target."""
    column_names = [f'embed_{idx}' for idx in range(embeds.shape[1])]
    frame = pd.DataFrame(data=embeds, columns=column_names)
    # Client ids are taken from the dataset in iteration order, one per row.
    frame['client_id'] = [record['client_id'] for record in dataset]
    return frame.merge(df_target, how='left', on='client_id')


train_df = _embeddings_frame(train_embeds, dataset_train)
test_df = _embeddings_frame(test_embeds, dataset_test)

print(train_df.shape, test_df.shape)

(20000, 50) (7000, 50)


In [42]:
# Feature columns are those whose name starts with 'embed'.
embed_columns = [column for column in train_df.columns if column.startswith('embed')]

# Features and labels for the downstream classifier.
x_train = train_df[embed_columns]
y_train = train_df['target']
x_test = test_df[embed_columns]
y_test = test_df['target']

In [43]:
# Hyper-parameters of the downstream classifier trained on the embeddings.
# NOTE(review): several entries look like alias pairs where the sklearn-style
# name is explicitly set to None next to the LightGBM-native name
# (colsample_bytree/feature_fraction, reg_alpha/lambda_l1,
# reg_lambda/lambda_l2, min_child_samples/min_data_in_leaf) — confirm against
# the LightGBM parameter documentation.
lgb_params = dict(
    # Task definition.
    objective='multiclass',
    num_class=4,
    metric='multi_error',
    boosting_type='gbdt',
    # Boosting schedule.
    n_estimators=1000,
    learning_rate=0.02,
    # Row / column subsampling.
    subsample=0.75,
    subsample_freq=1,
    feature_fraction=0.75,
    colsample_bytree=None,
    # Tree shape.
    max_depth=12,
    num_leaves=50,
    min_data_in_leaf=50,
    min_child_samples=None,
    # Regularisation.
    lambda_l1=1,
    reg_alpha=None,
    lambda_l2=1,
    reg_lambda=None,
    # Reproducibility and parallelism.
    random_state=42,
    n_jobs=4,
)

lgb_clf = lgb.LGBMClassifier(**lgb_params)

In [None]:
# Train the classifier on the training-split embeddings and their labels.
lgb_clf.fit(X=x_train, y=y_train)

In [45]:
# Score the fitted classifier on the held-out test embeddings.
lgb_clf.score(X=x_test, y=y_test)

0.40814285714285714