In [1]:
import pandas as pd
import torch
from joblib import cpu_count

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from sklearn.model_selection import train_test_split
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.preprocessing.pandas.pandas_preprocessor import PandasDataPreprocessor
from pyinstrument import Profiler
import pytorch_lightning as pl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
BATCH_SIZE = 48

In [3]:
profiler = Profiler()

def define_data():
    source_data = pd.read_csv(
        'https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/transactions_train.csv.gz?download=true',
        compression='gzip')

    preprocessor = PandasDataPreprocessor(
        col_id='client_id',
        col_event_time='trans_date',
        event_time_transformation='none',
        cols_category=['small_group'],
        cols_numerical=['amount_rur'],
        return_records=True
    )
    return preprocessor, source_data


def define_model():
    trx_encoder_params = dict(embeddings_noise=0.003,
                              numeric_values={'amount_rur': 'identity'},
                              embeddings={'trans_date': {'in': 800, 'out': 16},
                                          'small_group': {'in': 250, 'out': 16}})

    seq_encoder = RnnSeqEncoder(trx_encoder=TrxEncoder(**trx_encoder_params), hidden_size=256, type='gru')
    model = CoLESModule(seq_encoder=seq_encoder, optimizer_partial=partial(torch.optim.Adam, lr=0.001),
                        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9))
    return model

In [4]:
if __name__ == "__main__":
    accelerator = "cuda" if torch.cuda.is_available() else "cpu"
    device = 1 if torch.cuda.is_available() else "auto"
    data_loader_workers = 1 if torch.cuda.is_available() else cpu_count()
    preprocessor, source_data = define_data()
    model = define_model()

    dataset = preprocessor.fit_transform(source_data)
    train, test = train_test_split(dataset, test_size=0.2, random_state=42)
    
    len_filter = SeqLenFilter(min_seq_len=25)
    in_memory_dataset = MemoryMapDataset(data=train, i_filters=[len_filter])
    data_splitter = SampleSlices(split_count=5, cnt_min=25, cnt_max=200)
    coles_df = ColesDataset(data=in_memory_dataset, splitter=data_splitter)
    train_dl = PtlsDataModule(
        train_data=coles_df,
        train_num_workers=data_loader_workers,
        train_batch_size=BATCH_SIZE,
    )
    trainer = pl.Trainer(
        max_epochs=1,
        accelerator=accelerator,
        devices=device,
        enable_progress_bar=True,
        # prepare_data_per_node=False,
        # replace_sampler_ddp=False,
        # sync_batchnorm=True
    )

Creating Dask Server
Link Dask Server - http://192.168.0.163:8787/status


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    1.0s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    1.1s
[Parallel(n_jobs=1)]: Done 1799 tasks      | elapsed:    1.2s
[Parallel(n_jobs=1)]: Done 2449 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 3199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 4049 tasks      | elapsed:    1.5s
[Parallel(n_jobs=1)]: Done 4999 tasks      | elapsed:    1.7s
[Parallel(n_jobs=1)]: Done 6049 tasks      | elapsed:    1.8s
[Parallel(n_jobs=1)]: Done 7199 tasks      | elapsed:    2.2s
[Parallel(n_jobs=1)]: Done 8449 tasks      | elapsed:    2.4s
[Parallel(n_jobs=1)]: Done 9799 tasks      | elapsed:    2.7s
[Parallel(n_jobs=1)]: Done 11249 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done 12799 tasks      | elapsed:    3.2s
[Parallel(

In [None]:
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

### Save sequence encoder for other experiments

In [5]:
torch.save(model.state_dict(), "coles-emb.pt")

In [5]:
model.load_state_dict(torch.load("coles-emb.pt"))

<All keys matched successfully>

## Inference 

In [7]:
from ptls.data_load.datasets import inference_data_loader
train_dl = inference_data_loader(train, num_workers=0, batch_size=BATCH_SIZE)
batch = next(iter(train_dl))
out = model(batch)
out

tensor([[-0.5118,  0.2972,  0.5622,  ..., -0.3912, -0.9931, -0.5068],
        [-0.4401,  0.2680,  0.2137,  ..., -0.3336, -0.9934, -0.5672],
        [-0.3206,  0.3152,  0.6050,  ..., -0.2922, -0.9941, -0.4643],
        ...,
        [-0.3805,  0.3260,  0.6107,  ..., -0.3531, -0.9937, -0.5188],
        [-0.4753,  0.2806,  0.3469,  ..., -0.4049, -0.9912, -0.2656],
        [-0.3687,  0.2517,  0.5571,  ..., -0.2954, -0.9936, -0.4598]],
       grad_fn=<SliceBackward0>)

In [8]:
from ptls.frames.inference_module import InferenceModule

inf_model = InferenceModule(model, model_out_name="coles-emb.pt")

batch = next(iter(train_dl))
out = inf_model(batch)
out

tensor([[-0.5137,  0.2966,  0.5583,  ..., -0.3873, -0.9930, -0.5075],
        [-0.4423,  0.2671,  0.2093,  ..., -0.3298, -0.9933, -0.5676],
        [-0.3224,  0.3146,  0.6018,  ..., -0.2885, -0.9940, -0.4647],
        ...,
        [-0.3825,  0.3252,  0.6071,  ..., -0.3491, -0.9936, -0.5192],
        [-0.4771,  0.2799,  0.3424,  ..., -0.4007, -0.9911, -0.2658],
        [-0.3703,  0.2509,  0.5534,  ..., -0.2918, -0.9935, -0.4601]],
       grad_fn=<SliceBackward0>)

## ONNX Inference 

In [9]:
from sklearn.ensemble import RandomForestClassifier
from ptls.frames.inference_module import ONNXInferenceModule
from ptls.data_load.datasets import inference_data_loader

%load_ext pyinstrument

torch.set_float32_matmul_precision('high')

train_dl = inference_data_loader(train, num_workers=0, batch_size=BATCH_SIZE, onnx=True)
test_dl = inference_data_loader(test, num_workers=0, batch_size=BATCH_SIZE, onnx=True)

onnx_model = ONNXInferenceModule(model, model_out_name="coles-emb.onnx", dl=train_dl).to(accelerator)
batch = next(iter(train_dl))
out = onnx_model(batch)
out

tensor([[[  0.0000,   1.0000,   2.0000,  ...,   0.0000,   0.0000,   0.0000],
         [ 13.0000,  14.0000,  31.0000,  ...,   0.0000,   0.0000,   0.0000],
         [  1.0000,   2.0000,   2.0000,  ...,   0.0000,   0.0000,   0.0000],
         ...,
         [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
         [  5.0000,   7.0000,   9.0000,  ...,   0.0000,   0.0000,   0.0000],
         [  1.0000,   1.0000,   6.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[ 40.7560,  40.4130,  36.7880,  ...,   0.0000,   0.0000,   0.0000],
         [  4.9910,  31.5980,   5.4560,  ...,   0.0000,   0.0000,   0.0000],
         [ 82.0650,   1.8970,  12.9600,  ...,   0.0000,   0.0000,   0.0000],
         ...,
         [ 17.0820,   2.7020,  91.5910,  ...,   0.0000,   0.0000,   0.0000],
         [ 19.1880,  49.4130,  21.4780,  ...,   0.0000,   0.0000,   0.0000],
         [ 18.2860,  10.5330,  17.2200,  ...,   0.0000,   0.0000,   0.0000]],

        [[  1.0000,   1.0000,   1.0000,  ...

  assert x.payload.size()[1] > 0, "Batch can'not have 0 transactions"


tensor([[[ 0.6376,  0.2678, -0.4230,  ..., -0.5958, -1.0622,  0.0603],
         [ 0.6376,  0.2678, -0.4230,  ..., -0.5958, -1.0622,  0.0588],
         [ 0.6750, -0.4338, -0.3970,  ..., -0.5958, -1.0622,  0.0433],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.1143],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.1143],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.1143]],

        [[ 0.1355, -1.6823, -2.2415,  ..., -0.0052,  0.5342, -0.0930],
         [ 0.3867, -1.2478, -1.3098,  ...,  1.2940, -1.4396,  0.0210],
         [ 0.9172, -1.6079, -0.6308,  ..., -0.5958, -1.0622, -0.0910],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.1143],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.1143],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.1143]],

        [[ 0.7729,  1.1921,  0.5007,  ..., -0.5958, -1.0622,  0.2372],
         [ 0.9573, -0.7268, -0.2775,  ..., -0



tensor([[-0.5181,  0.2925,  0.5493,  ..., -0.3882, -0.9927, -0.5098],
        [-0.4417,  0.2703,  0.2119,  ..., -0.3433, -0.9927, -0.5874],
        [-0.3198,  0.3198,  0.6079,  ..., -0.3013, -0.9937, -0.4856],
        ...,
        [-0.3833,  0.3254,  0.6064,  ..., -0.3625, -0.9932, -0.5410],
        [-0.4783,  0.2793,  0.3374,  ..., -0.4045, -0.9907, -0.2717],
        [-0.3689,  0.2507,  0.5537,  ..., -0.3003, -0.9932, -0.4722]],
       dtype=torch.float16)

### Inference with pl.trainer

In [10]:
%%pyinstrument

train_embeds = torch.vstack(trainer.predict(onnx_model, train_dl, ))
test_embeds = torch.vstack(trainer.predict(onnx_model, test_dl))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\Kaefsky\Python\ptls\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Predicting DataLoader 0: |          | 500/? [00:17<00:00, 27.79it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Predicting DataLoader 0: |          | 125/? [00:04<00:00, 28.03it/s]


In [9]:
# join target and embeddings

df_target = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')
df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)

(24000, 258) (6000, 258)


In [10]:
embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.5538333333333333

### Inference w/o pl.trainer (float32)

In [11]:
%%pyinstrument

train_embeds = torch.vstack(onnx_model.predict(train_dl, dtype=torch.float32))
test_embeds = torch.vstack(onnx_model.predict(test_dl, dtype=torch.float32))

Predicting DataLoader: 500it [00:18, 27.53it/s]
Predicting DataLoader: 125it [00:04, 28.07it/s]


In [12]:
# join target and embeddings

df_target = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')
df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)

(24000, 258) (6000, 258)


In [13]:
%%pyinstrument

embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.551

### Inference w/o pl.trainer (float16)

In [14]:
%%pyinstrument

train_embeds = torch.vstack(onnx_model.predict(train_dl))
test_embeds = torch.vstack(onnx_model.predict(test_dl))

Predicting DataLoader: 500it [00:18, 27.24it/s]
Predicting DataLoader: 125it [00:04, 27.09it/s]


In [15]:
# join target and embeddings

df_target = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')
df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)

(24000, 258) (6000, 258)


In [16]:
%%pyinstrument

embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.5558333333333333

### Regular Inference

In [17]:
train_dl = inference_data_loader(train, num_workers=0, batch_size=BATCH_SIZE)
test_dl = inference_data_loader(test, num_workers=0, batch_size=BATCH_SIZE)

In [18]:
%%pyinstrument

train_embeds = torch.vstack(trainer.predict(model, train_dl, ))
test_embeds = torch.vstack(trainer.predict(model, test_dl))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\Kaefsky\Python\ptls\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Predicting DataLoader 0: |          | 500/? [00:19<00:00, 25.65it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Predicting DataLoader 0: |          | 125/? [00:04<00:00, 25.12it/s]


In [19]:
# join target and embeddings

df_target = pd.read_csv('https://huggingface.co/datasets/dllllb/age-group-prediction/resolve/main/train_target.csv?download=true')
df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)

(24000, 258) (6000, 258)


In [20]:
%%pyinstrument

embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.546