In [None]:
import sys
if 'google.colab' in str(get_ipython()):
  ! {sys.executable} -m pip install pytorch-lifestream
  ! pip install "torch<2"
  ! pip install torchvision==0.13.1

In [3]:
%load_ext autoreload
%autoreload 2

import torch
import pytorch_lightning as pl

In [None]:
import os

if not os.path.exists('data/transactions.csv'):
    ! mkdir data
    ! curl -OL https://storage.yandexcloud.net/ptls-datasets/trans-gender-2019.zip
    ! unzip trans-gender-2019.zip -d data
    ! mv trans-gender-2019.zip data/

In [6]:
import pandas as pd

data_path = 'data/'

# Read the raw transaction log from the data directory and preview two rows.
transactions_csv = os.path.join(data_path, 'transactions.csv')
source_data = pd.read_csv(transactions_csv)
source_data.head(2)

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id
0,39026145,0 10:23:26,4814,1030,-2245.92,
1,39026145,1 10:19:29,6011,7010,56147.89,


In [7]:
from datetime import timedelta
def convert_to_day(time_string):
    """Return the leading whitespace-separated token of ``time_string`` as an int.

    For values such as ``'208 10:23:26'`` this yields the day number ``208``;
    whatever follows the first token is ignored.
    """
    day_token = time_string.split(maxsplit=1)[0]
    return int(day_token)

In [8]:
# Replace the raw 'tr_datetime' strings by their integer day number.
# The dtype guard keeps this cell re-runnable: once the column holds integers,
# calling convert_to_day again would fail because ints have no `.split()`.
if not pd.api.types.is_integer_dtype(source_data['tr_datetime']):
    source_data['tr_datetime'] = source_data['tr_datetime'].apply(convert_to_day)

In [10]:
# Keep only fully populated rows: a row is dropped if any of its columns is missing.
source_data = source_data.dropna(axis=0, how='any')

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
# Map the raw terminal identifiers to integer codes.
label_enc = LabelEncoder()
encoded_term_ids = label_enc.fit_transform(source_data['term_id'])
source_data['term_id'] = encoded_term_ids

In [14]:
# Preview the frame after day conversion, row filtering and term_id encoding.
source_data.head(2)

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id
377,39026145,208,5499,1010,-7254.31,273471
378,39026145,208,5499,1010,-1392.47,273370


In [15]:
from ptls.preprocessing import PandasDataPreprocessor

# Describe how the flat transaction table is turned into per-customer records.
preprocessor = PandasDataPreprocessor(
    # Grouping key: one record per customer.
    col_id='customer_id',
    # 'tr_datetime' already holds integer day numbers, so it is used as is.
    col_event_time='tr_datetime',
    event_time_transformation='none',
    # Feature columns, split by kind.
    cols_numerical=['amount'],
    cols_category=['mcc_code', 'tr_type', 'term_id'],
    return_records=True,
)

In [16]:
%%time

# Fit the preprocessor on the full transaction table and build the records.
# `%%time` reports the cost of this step (about half a minute in the saved run).
dataset = preprocessor.fit_transform(source_data)

CPU times: user 25.9 s, sys: 2.17 s, total: 28.1 s
Wall time: 30.7 s


In [17]:
import pickle

# Persist the fitted preprocessor so that it can be reloaded later.
with open('preprocessor.p', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

In [18]:

# Order the records by customer id so that the seeded split below always sees the same sequence.
dataset = sorted(dataset, key=lambda record: record['customer_id'])

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the customer records; the fixed seed makes the split repeatable.
train, test = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

len(train), len(test)

(11978, 2995)

In [21]:
def remove_extra_feature(name):
    """Delete the 'tr_datetime' entry from every record in ``name``, in place.

    ``name`` is a list of dict records. Records without the key are left
    untouched. Nothing is returned; the records themselves are mutated.
    """
    for record in name:
        record.pop('tr_datetime', None)

In [22]:
# Strip the 'tr_datetime' entry from both splits (the records are modified in place).
for split_records in (train, test):
    remove_extra_feature(split_records)


In [24]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule

# Seed the Python, NumPy and torch RNGs before any weights are created, so that
# parameter initialisation and the subsequent training run are repeatable.
# `workers=True` extends the seeding to dataloader worker processes.
pl.seed_everything(42, workers=True)

# Per-transaction encoder: 'amount' is passed through as a numeric value, every
# other feature gets an embedding table with 'in' rows and 'out' dimensions.
trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'amount': 'identity'},
    embeddings={
        'event_time': {'in': 500, 'out': 16},
        'tr_type' : {'in': 80, 'out': 16},
        'mcc_code' : {'in': 200, 'out': 16},
        'term_id' : {'in' : 440400, 'out' : 32}

    },
)

# GRU sequence encoder stacked on top of the transaction encoder.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

# CoLES training module: Adam optimiser, learning rate multiplied by 0.9 every 30 scheduler steps.
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [25]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

# In-memory training records, passed through a sequence-length filter (min_seq_len=25).
train_memory_ds = MemoryMapDataset(
    data=train,
    i_filters=[SeqLenFilter(min_seq_len=25)],
)

# Sub-sequence sampler handed to the CoLES dataset (5 slices, lengths between 25 and 200).
slice_splitter = SampleSlices(split_count=5, cnt_min=25, cnt_max=200)

train_dl = PtlsDataModule(
    train_data=ColesDataset(train_memory_ds, splitter=slice_splitter),
    train_num_workers=2,
    train_batch_size=256,
)

In [27]:
import torch
import pytorch_lightning as pl

import logging

# TensorBoard runs are written under logs/coles/.
logger = pl.loggers.TensorBoardLogger("logs", name="coles")

# Use one GPU when CUDA is available, otherwise fall back to CPU (gpus=0).
trainer = pl.Trainer(
    logger=logger,
    max_epochs=35,
    gpus=int(torch.cuda.is_available()),
    enable_progress_bar=False,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [28]:
%%time
# Print the TensorBoard run version so the matching log directory can be found.
print(f'logger.version = {trainer.logger.version}')
# Train the CoLES model; no validation dataloader is supplied, so the
# validation loop is skipped (see the warning in the saved output).
trainer.fit(model, train_dl)
# Show the metrics logged by the end of training.
print(trainer.logged_metrics)

  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")


logger.version = 0


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 14.4 M
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
14.4 M    Trainable params
0         Non-trainable params
14.4 M    Total params
57.464    Total estimated model params size (MB)
  rank_zero_warn(
  self.pid = os.fork()


{'loss': tensor(60.1287), 'seq_len': tensor(90.4657)}
CPU times: user 5min 44s, sys: 47.6 s, total: 6min 32s
Wall time: 8min 26s


In [29]:
# Store the trained sequence encoder's parameters on disk.
encoder_weights = seq_encoder.state_dict()
torch.save(encoder_weights, "coles-emb.pt")

In [30]:
from ptls.data_load.datasets import inference_data_loader

# Embed every record of both splits with the trained model.
# The training-split loader gets its own name: rebinding `train_dl` here would
# replace the PtlsDataModule passed to `trainer.fit`, so re-running the training
# cell after this one would silently receive an inference loader instead.
train_inference_dl = inference_data_loader(train, num_workers=0, batch_size=256)
train_embeds = torch.vstack(trainer.predict(model, train_inference_dl))

test_dl = inference_data_loader(test, num_workers=0, batch_size=256)
test_embeds = torch.vstack(trainer.predict(model, test_dl))


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [31]:
# Load the gender labels and index them by customer id.
gender_csv = os.path.join(data_path, 'gender_train.csv')
df_target = pd.read_csv(gender_csv).set_index('customer_id')

In [32]:
# One row per training record: embedding columns plus the record's customer id,
# inner-joined with the labels (records without a label are dropped).
train_embed_names = [f'embed_{i}' for i in range(train_embeds.shape[1])]
train_customer_ids = [record['customer_id'] for record in train]
train_df = (
    pd.DataFrame(data=train_embeds, columns=train_embed_names)
    .assign(customer_id=train_customer_ids)
    .merge(df_target, how='inner', on='customer_id')
)

In [38]:
# One row per held-out record: embedding columns plus the record's customer id,
# inner-joined with the labels (records without a label are dropped).
test_embed_names = [f'embed_{i}' for i in range(test_embeds.shape[1])]
test_customer_ids = [record['customer_id'] for record in test]
test_df = (
    pd.DataFrame(data=test_embeds, columns=test_embed_names)
    .assign(customer_id=test_customer_ids)
    .merge(df_target, how='inner', on='customer_id')
)

In [39]:
from sklearn.ensemble import RandomForestClassifier

# Features are the embedding columns; the target is the 'gender' label.
embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['gender']
x_test, y_test =  test_df[embed_columns], test_df['gender']

# Fix the forest's seed so the reported accuracy is reproducible between runs;
# an unseeded forest gives a slightly different score every time.
clf = RandomForestClassifier(random_state=42)
clf.fit(x_train, y_train)
# Mean accuracy on the held-out split.
clf.score(x_test, y_test)


0.7065613608748481

In [40]:
import lightgbm as lgb

# Gradient-boosted classifier used as a second downstream model on the embeddings.
lgb_clf = lgb.LGBMClassifier(
    # Task and boosting schedule.
    objective='binary',
    metric='auc',
    n_estimators=500,
    learning_rate=0.02,
    # Tree shape and regularisation.
    max_depth=6,
    min_data_in_leaf=50,
    lambda_l1=1,
    lambda_l2=1,
    # Row and feature subsampling.
    subsample=0.5,
    subsample_freq=1,
    feature_fraction=0.75,
    # Runtime.
    random_state=42,
    n_jobs=8,
    # scikit-learn-style aliases of lambda_l1, lambda_l2, feature_fraction and
    # min_data_in_leaf; set to None, presumably so that only the native names
    # above take effect.
    reg_alpha=None,
    reg_lambda=None,
    colsample_bytree=None,
    min_child_samples=None,
)

In [None]:
# Train the LightGBM classifier on the training-split embeddings.
lgb_clf.fit(x_train, y_train)


In [42]:
# Evaluate on the held-out split.
# NOTE(review): `score` on a scikit-learn-style classifier reports accuracy, not
# the 'auc' metric configured on the model — confirm which one is intended.
lgb_clf.score(x_test, y_test)



0.7284325637910085