In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from IPython.display import clear_output
! pip install --upgrade numpy pandas tqdm torch catalyst==20.09
clear_output()

In [None]:
from IPython.display import clear_output

import os
import json
import pickle
from bisect import bisect_left, bisect_right
from datetime import datetime, timedelta
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

unk_token = "<UNK>"

# GPU hack if you need
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Data

Columns
- `party_rk` – client unique identifier
- `account_rk` – client account unique identifier
- `financial_account_type_cd` – debit/credit card flag
- `transaction_dttm` – operation datetime
- `transaction_type_desc` – purchase/payment/...
- `transaction_amt_rur` – transaction price
- `merchant_type` - DUTY FREE STORES/FUEL DEALERS/RESTAURANTS/ etc
- `merchant_group_rk` - McDonald's/Wildberries/ etc

It's important that table is already sorted by `transaction_dttm` column!!!

In [None]:
#@title Load and unpack data
from IPython.display import clear_output

def load_from_gdrive(gid, filename='temp'):
    !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id='$gid -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id="$gid -O $filename && rm -rf /tmp/cookies.txt

# !rm -rf data*
load_from_gdrive('1-DNWpDk1RnMHaiEnbHVpv_vo9XBbO0iA', 'data.zip')
!unzip data.zip -d /content/data/
clear_output()

In [None]:
DATADIR = "/content/data" # "./data"

transactions_path = f"{DATADIR}/avk_hackathon_data_transactions.csv"
socdem_path = f"{DATADIR}/avk_hackathon_data_party_x_socdem.csv"

In [None]:
BUILD_MAPPINGS = False
BUILD_PATRY_DATA = False

TARGET_VARIABLE = 'category' # `merchant_type` or `category`

## Mappings
~1 min

In [None]:
# # Prepare & save mappings

def create_mapping(values):
    mapping = {unk_token: 0}
    for v in values:
        if not pd.isna(v) and v != unk_token:
            mapping[str(v)] = len(mapping)

    return mapping

if BUILD_MAPPINGS:
    mappings = defaultdict(dict)

    for col in tqdm(
        [
            "transaction_type_desc",
            "merchant_rk",
            "merchant_type",
            "merchant_group_rk",
            "category",
            "financial_account_type_cd",
        ]
    ):

        col_values = (
            pd.read_csv(transactions_path, usecols=[col])[col]
            .fillna(unk_token)
            .astype(str)
        )
        mappings[col] = create_mapping(col_values.unique())
        del col_values

    for col in tqdm(
        [
            "gender_cd",
            "age",
            "marital_status_desc",
            "children_cnt",
            "region_flg",
        ]
    ):

        col_values = (
            pd.read_csv(socdem_path, usecols=[col])[col]
            .fillna(unk_token)
            .astype(str)
        )
        mappings[col] = create_mapping(col_values.unique())
        del col_values


    with open(f"{DATADIR}/mappings.json", "w") as f:
        json.dump(mappings, f)

else:
    # load mappings
    with open(f"{DATADIR}/mappings.json", 'r') as f:
        mappings = json.load(f)

## Parse transactions by users
~ 20 min

In [None]:
if BUILD_PATRY_DATA:
    usecols = [
        "party_rk",
        "transaction_dttm",
        "transaction_amt_rur",
        "merchant_type",
        "transaction_type_desc",
        "financial_account_type_cd",
        "category",
    ]

    str_cols = [
        "transaction_dttm",
        "merchant_type",
        "transaction_type_desc",
        "financial_account_type_cd",
        "category",
    ]

    df = pd.read_csv(transactions_path, usecols=usecols)
    df["transaction_amt_rur"] = df["transaction_amt_rur"].fillna(0)
    df[str_cols] = df[str_cols].fillna(unk_token).astype(str)
    # df.head()

In [None]:
if BUILD_PATRY_DATA:
    party_list = df.party_rk.unique()

    # Prepare & save client data
    party2dates = dict()  # for each party save a series of the transaction dates 
    party2sum = dict()  # for each party save a series of the transaction costs 
    party2merchant_type = dict()  # for each party save a series of the transaction_type 
    party2trans_type = dict()  # for each party save a series of the transaction merchant_type
    party2category = dict()
    party2fin_acc_type = dict()

    for party_rk in tqdm(party_list):
        party_rows = df[df.party_rk == party_rk]
        party_rows = party_rows.sort_values('transaction_dttm')

        party2dates[party_rk] = list(party_rows.transaction_dttm.values)
        party2sum[party_rk] = list(party_rows.transaction_amt_rur.values)
        party2merchant_type[party_rk] = list(party_rows.merchant_type.values)
        party2trans_type[party_rk] = list(party_rows.transaction_type_desc.values)
        party2category[party_rk] = list(party_rows.category.values)
        party2fin_acc_type[party_rk] = list(party_rows.financial_account_type_cd.values)
        

    # map values
    party2merchant_type = {x[0]: [mappings["merchant_type"][y] for y in x[1]] for x in party2merchant_type.items()}
    party2trans_type = {x[0]: [mappings["transaction_type_desc"][y] for y in x[1]] for x in party2trans_type.items()}
    party2category = {x[0]: [mappings["category"][y] for y in x[1]] for x in party2category.items()}
    party2fin_acc_type = {x[0]: [mappings["financial_account_type_cd"][y] for y in x[1]] for x in party2fin_acc_type.items()}

    # add month and week days
    dates = pd.date_range("2019-01-01", "2020-02-28").tolist()#.strftime("%Y-%m-%d").tolist()
    weekdays = {date.strftime("%Y-%m-%d"): date.weekday() for date in dates}
    monthdays = {date.strftime("%Y-%m-%d"): date.day - 1 for date in dates}

    def add_weekdays():
        party2weekday = dict()
        party2day = dict()
        party_list = pd.read_csv(transactions_path, usecols=['party_rk']).party_rk.unique()

        for party_rk in party_list:
            party2weekday[party_rk] = [weekdays[date] for date in party2dates[party_rk]]
            party2day[party_rk] = [monthdays[date] for date in party2dates[party_rk]]

        return party2weekday, party2day

    party2weekday, party2day = add_weekdays()

    # add user profiles
    def get_user_profiles():
        party2user = dict()
        party_table = pd.read_csv(socdem_path).fillna(unk_token).astype(str)

        for index, row in party_table.iterrows():
            party2user[int(row.party_rk)] = {
                col: mappings[col][row[col]]
                for col in ["gender_cd", "age", "marital_status_desc", "children_cnt", "region_flg"]
            }

        return party2user

    party2user = get_user_profiles()

    # save data
    party_data = {
        'dates': party2dates,
        'sum': party2sum,
        'merchant_type': party2merchant_type,
        'trans_type': party2trans_type,
        'category': party2category,
        'fin_acc_type': party2fin_acc_type,
        'day': party2day,
        'weekday': party2weekday,
        'user': party2user,
    }

    for name, data in party_data.items():
        data_x = {str(x[0]): x[1] for x in data.items()}

        with open(f"{DATADIR}/party2{name}.json", "w") as f:
            json.dump(data_x, f)

else:
    # load data
    party_data = {
        'dates': None,
        'sum': None,
        'merchant_type': None,
        'trans_type': None,
        'category': None,
        'fin_acc_type': None,
        'day': None,
        'weekday': None,
        'user': None,
    }

    for name in party_data.keys():
        with open(f"{DATADIR}/party2{name}.json", "r") as f:
            data = json.load(f)

        data = {int(x[0]): x[1] for x in data.items()}
        party_data[name] = data

    party_data['day'] = {key: [v - 1 for v in values] for key, values in party_data['day'].items()}

## PyTorch dataset

In [None]:
from sklearn.model_selection import train_test_split

train_party, valid_party = train_test_split(
    pd.read_csv(transactions_path, usecols=['party_rk']).party_rk.unique(), 
    train_size=0.8, random_state=42
)
# train_party = pd.read_csv(transactions_path, usecols=['party_rk']).party_rk.unique()

print(f'Train: {len(train_party)} Val: {len(valid_party)}')

Train: 40000 Val: 10000


In [None]:
predict_period_len = 60  # -- days
train_predict_dates = (
    # pd.date_range("2019-03-01", "2019-10-31", freq="MS")
    pd.date_range("2019-03-01", "2019-12-31", freq="MS")
    .strftime("%Y-%m-%d")
    .tolist()
)
valid_predict_dates = (
    pd.date_range("2019-11-01", "2019-12-31", freq="MS")
    .strftime("%Y-%m-%d")
    .tolist()
)
submission_predict_dates = (
    pd.date_range("2020-01-01", "2020-02-28", freq="2MS")
    .strftime("%Y-%m-%d")
    .tolist()
)

In [None]:
def prepare_data(party_list, mode="train", target=TARGET_VARIABLE):
    """
    This function define the pipeline of the creation of train and valid samples.
    We consider each client from party_list. For each client take each 
    predict_period_start from predict_dates list. All client transaction before
    this date is our features. Next, we look at the customer's transactions in 
    the next two months. This transactions should be predicted. It will form 
    our labels vector.
    """

    data = {
        'user': [],
        'sum': [],
        'trans_type': [],
        'merchant_type': [],
        'category': [],
        'fin_acc_type': [],
        'day': [],
        'weekday': [],
        'labels_merchant_type': [],
        'labels_category': [],
    }

    for party_rk in tqdm(party_list):
        user_info = party_data['user'][party_rk]
        date_series = party_data['dates'][party_rk]
        sum_series = party_data['sum'][party_rk]
        merch_type_series = party_data['merchant_type'][party_rk]
        trans_type_series = party_data['trans_type'][party_rk]
        category_series = party_data['category'][party_rk]
        fin_acc_type_series = party_data['fin_acc_type'][party_rk]
        day_series = party_data['day'][party_rk]
        weekday_series = party_data['weekday'][party_rk]

        if mode == "train":
            predict_dates = train_predict_dates
        elif mode == "valid":
            predict_dates = valid_predict_dates
        elif mode == "submission":
            predict_dates = submission_predict_dates
        else:
            raise Exception("Unknown mode")

        for predict_period_start in predict_dates:

            predict_period_end = datetime.strftime(
                datetime.strptime(predict_period_start, "%Y-%m-%d")
                + timedelta(days=predict_period_len),
                "%Y-%m-%d",
            )

            l, r = (
                bisect_left(date_series, predict_period_start),
                bisect_right(date_series, predict_period_end),
            )

            history_merch_type = merch_type_series[:l]
            history_sum = sum_series[:l]
            history_trans_type = trans_type_series[:l]
            history_category = category_series[:l]
            history_fin_ccc_type = fin_acc_type_series[:l]
            history_day = day_series[:l]
            history_weekday = weekday_series[:l]
            
            predict_merch = merch_type_series[l:r]
            predict_category = category_series[l:r]
            non_empty = any(predict_merch) if target == 'merchant_type' else any(predict_category)

            if non_empty and l or mode not in ("train", "valid"):
                data['user'].append(user_info)
                data['sum'].append(history_sum)
                data['trans_type'].append(history_trans_type)
                data['merchant_type'].append(history_merch_type)
                data['category'].append(history_category)
                data['fin_acc_type'].append(history_fin_ccc_type)
                data['merchant_type'].append(history_merch_type)
                data['day'].append(history_day)
                data['weekday'].append(history_weekday)
                data['labels_merchant_type'].append(predict_merch)
                data['labels_category'].append(predict_category)

    return data

In [None]:
train_data = prepare_data(train_party, mode="train")
valid_data = prepare_data(valid_party, mode="valid")

100%|██████████| 40000/40000 [00:25<00:00, 1543.02it/s]
100%|██████████| 10000/10000 [00:00<00:00, 10764.87it/s]


## PyTorch loaders

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

In [12]:
MERCH_TYPE_NCLASSES = len(mappings['merchant_type'])
TRANS_TYPE_NCLASSES = len(mappings['transaction_type_desc'])
SIZES = dict(map(lambda x: (x[0], len(x[1])), mappings.items()))

SIZES['trans_type'] = 5
SIZES['fin_acc_type'] = 3
SIZES['day'] = 31
SIZES['weekday'] = 7

PADDING_LEN = 256

In [13]:
class RSDataset(Dataset):
    def __init__(self, data, target=TARGET_VARIABLE):
        super(RSDataset, self).__init__()
        self.data = data
        self.target = target
        assert self.target in ('merchant_type', 'category')

    def __len__(self):
        return len(self.data['sum'])

    def __getitem__(self, idx):
        targets = np.zeros((SIZES[self.target] - 1,), dtype=np.float32)
        for m in self.data[f'labels_{self.target}'][idx]:
            if m:  # skip UNK, UNK-token should not be predicted
                targets[m - 1] = 1.0

        item = {
            "features": {},
            "targets": targets,
        }

        sum_feature = np.array(self.data['sum'][idx][-PADDING_LEN:])
        sum_feature = np.vectorize(lambda s: np.log(1 + s))(sum_feature)
        if sum_feature.shape[0] < PADDING_LEN:
            pad = np.zeros(
                (PADDING_LEN - sum_feature.shape[0],), dtype=np.float32
            )
            sum_feature = np.hstack((sum_feature, pad))
        item["features"]["sum"] = torch.from_numpy(sum_feature).float()

        for feature_name in ["trans_type", "merchant_type", "category", "fin_acc_type", "day", "weekday"]:
            feature_values = self.data[feature_name][idx]
            feature_values = np.array(feature_values[-PADDING_LEN:])
            mask = np.ones(feature_values.shape[0], dtype=np.float32)
            if feature_values.shape[0] < PADDING_LEN:
                feature_values = np.append(
                    feature_values,
                    np.zeros(
                        PADDING_LEN - feature_values.shape[0], dtype=np.int64
                    ),
                )
                mask = np.append(
                    mask,
                    np.zeros(PADDING_LEN - mask.shape[0], dtype=np.float32),
                )
            item["features"][feature_name] = torch.from_numpy(feature_values).long()
        
        item["features"]["mask"] = torch.from_numpy(mask).float()

        for feature_name in ["gender_cd", "age", "marital_status_desc", "children_cnt", "region_flg"]:
            item["features"][feature_name] = torch.tensor(self.data['user'][idx][feature_name]).long()

        return item

In [14]:
train_dataset = RSDataset(train_data, target=TARGET_VARIABLE)
valid_dataset = RSDataset(valid_data, target=TARGET_VARIABLE)

In [15]:
train_loader = DataLoader(
    train_dataset, batch_size=64, shuffle=True, num_workers=2
)
valid_loader = DataLoader(
    valid_dataset, batch_size=64, shuffle=False, num_workers=2
)

In [None]:
# sanity check
# for i in tqdm(range(len(train_loader))):
#     batch = next(iter(train_loader))

## Model

This is the baseline model for predicting purchases in `merchant_type` in the next 2 months

In [16]:
import torch.nn as nn
from collections import OrderedDict

In [17]:
params = {
    'emb_dim': {
        'merchant_type': 87,
        'category': 16,
        'trans_type': 3,
        'fin_acc_type': 1,
        'day': 5,
        'weekday': 3,
        'gender_cd': 1,
        'age': 4,
        'marital_status_desc': 3,
        'children_cnt': 3,
        'region_flg': 1
    },
    'transformer_nhead': 4,
    'transformer_dim_feedforward': 256,
    'transformer_dropout': 0.1,
    'dense_unit': 256,
    'num_layers': 3,
}

In [18]:
MERCH_TYPE_NCLASSES, TRANS_TYPE_NCLASSES
sum(params['emb_dim'].values())

127

In [19]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()

        self.embedding_layers = nn.ModuleDict({
            name: nn.Embedding(SIZES[name], params["emb_dim"][name])
            for name in params["emb_dim"]
        })

        embedding_size = sum(params['emb_dim'].values()) + 1

        transformer_blocks = []
        for i in range(params["num_layers"]):
            transformer_block = nn.TransformerEncoderLayer(
                d_model=embedding_size,
                nhead=params["transformer_nhead"],
                dim_feedforward=params["transformer_dim_feedforward"],
                dropout=params["transformer_dropout"],
            )
            transformer_blocks.append(
                (f"transformer_block_{i}", transformer_block)
            )

        self.transformer_encoder = nn.Sequential(
            OrderedDict(transformer_blocks)
        )

        self.linear = nn.Linear(
            in_features=embedding_size, out_features=params["dense_unit"]
        )
        self.scorer = nn.Linear(
            in_features=params["dense_unit"],
            out_features=SIZES[TARGET_VARIABLE] - 1,
        )

    def forward(self, features):
        seq_embs = [
            self.embedding_layers[feature_name](features[feature_name]) * features["mask"].unsqueeze(-1)
            # for feature_name in ["merchant_type", "trans_type", "fin_acc_type", "day", "weekday"]
            for feature_name in ["merchant_type", "trans_type", "category", "fin_acc_type", "day", "weekday"]
        ]
        seq_embs.append(features["sum"].unsqueeze(-1))

        user_embs = [
            self.embedding_layers[feature_name](features[feature_name].view(-1, 1)) * features["mask"].unsqueeze(-1)
            for feature_name in ["gender_cd", "age", "marital_status_desc", "children_cnt", "region_flg"]
        ]
        embeddings = torch.cat(seq_embs + user_embs, dim=-1)

        transformer_output = self.transformer_encoder(embeddings)

        input_mask = features["mask"]
        lengths = input_mask.sum(dim=1).unsqueeze(1)
        x = transformer_output * input_mask.unsqueeze(-1)
        pooling = torch.sum(x, dim=1) / lengths

        # pooling = torch.mean(transformer_output, dim=1)

        linear = torch.tanh(self.linear(pooling))
        merch_logits = self.scorer(linear)

        return merch_logits

### One-batch-check

In [20]:
model = Model()
criterion = nn.BCEWithLogitsLoss()
batch = next(iter(train_loader))
with torch.no_grad():
    output = model(batch['features'])
    loss = criterion(output, batch['targets'])
print(loss)

tensor(0.7177)


In [None]:
# # sanity check
# for i in tqdm(range(len(train_loader))):
#     batch = next(iter(train_loader))
#     with torch.no_grad():
#         output = model(batch['features'])

## Train loop with [Catalyst](https://github.com/catalyst-team/catalyst)

[A comprehensive step-by-step guide to basic and advanced features](https://github.com/catalyst-team/catalyst#step-by-step-guide).

---



In [None]:
from catalyst import dl, utils
from catalyst.utils import metrics

## Custom metrics for this hackathon

In [None]:
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
from catalyst.utils.metrics.functional import preprocess_multi_label_metrics
from catalyst.utils.torch import get_activation_fn


def multi_label_metrics(
    outputs: torch.Tensor,
    targets: torch.Tensor,
    threshold: Union[float, torch.Tensor],
    activation: Optional[str] = None,
    eps: float = 1e-7,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Computes multi-label precision for the specified activation and threshold.

    Args:
        outputs (torch.Tensor): NxK tensor that for each of the N examples
            indicates the probability of the example belonging to each of
            the K classes, according to the model.
        targets (torch.Tensor): binary NxK tensort that encodes which of the K
            classes are associated with the N-th input
            (eg: a row [0, 1, 0, 1] indicates that the example is
            associated with classes 2 and 4)
        threshold (float): threshold for for model output
        activation (str): activation to use for model output
        eps (float): epsilon to avoid zero division
    
    Extended version of 
        https://github.com/catalyst-team/catalyst/blob/master/catalyst/utils/metrics/accuracy.py#L58

    Returns:
        computed multi-label metrics
    """
    outputs, targets, _ = preprocess_multi_label_metrics(
        outputs=outputs, targets=targets
    )
    activation_fn = get_activation_fn(activation)
    outputs = activation_fn(outputs)

    outputs = (outputs > threshold).long()

    accuracy = (targets.long() == outputs.long()).sum().float() / np.prod(
        targets.shape
    )

    intersection = (outputs.long() * targets.long()).sum(axis=1).float()
    num_predicted = outputs.long().sum(axis=1).float()
    num_relevant = targets.long().sum(axis=1).float()
    union = num_predicted + num_relevant

    # Precision = ({predicted items} && {relevant items}) / {predicted items}
    precision = intersection / (num_predicted + eps * (num_predicted == 0))
    # Recall = ({predicted items} && {relevant items}) / {relevant items}
    recall = intersection / (num_relevant + eps * (num_relevant == 0))
    # IoU = ({predicted items} && {relevant items}) / ({predicted items} || {relevant items})
    iou = (intersection + eps * (union == 0)) / (union - intersection + eps)

    return accuracy, precision.mean(), recall.mean(), iou.mean()


def precision_at_k(
    actual: torch.Tensor, 
    predicted: torch.Tensor, 
    k: int,
):
    """
    Computes precision at cutoff k for one sample

    Args:
       actual: (torch.Tensor): tensor of length K with predicted item_ids sorted by relevance
       predicted (torch.Tensor): binary tensor that encodes which of the K
           classes are associated with the N-th input
           (eg: a row [0, 1, 0, 1] indicates that the example is
           associated with classes 2 and 4)
       k (int): parameter k of precison@k

    Returns:
       Computed value of precision@k for given sample
    """
    p_at_k = 0.0
    for item in predicted[:k]:
        if actual[item]:
            p_at_k += 1
    p_at_k /= k

    return p_at_k


def average_precision_at_k(
    actual: torch.Tensor, 
    predicted: torch.Tensor, 
    k: int,
) -> float:
    """
    Computes average precision at cutoff k for one sample

    Args:
      actual: (torch.Tensor): tensor of length K with predicted item_ids sorted by relevance
      predicted (torch.Tensor): binary tensor that encodes which of the K
          classes are associated with the N-th input
          (eg: a row [0, 1, 0, 1] indicates that the example is
          associated with classes 2 and 4)
      k (int): parameter k of AP@k

    Returns:
        Computed value of AP@k for given sample
    """
    ap_at_k = 0.0
    for idx, item in enumerate(predicted[:k]):
        if actual[item]:
            ap_at_k += precision_at_k(actual, predicted, k=idx + 1)
    ap_at_k /= min(k, actual.sum().cpu().numpy())
    

    return ap_at_k


def mean_average_precision_at_k(
    output: torch.Tensor, target: torch.Tensor, top_k: Tuple[int, ...] = (1,)
) -> List[float]:
    """
    Computes mean_average_precision_at_k at set of cutoff parameters K

    Args:
       outputs (torch.Tensor): NxK tensor that for each of the N examples
           indicates the probability of the example belonging to each of
           the K classes, according to the model.
       targets (torch.Tensor): binary NxK tensort that encodes which of the K
           classes are associated with the N-th input
           (eg: a row [0, 1, 0, 1] indicates that the example is
           associated with classes 2 and 4)
       top_k (tuple): list of parameters k at which map@k will be computed


    Returns:
       List of computed values of map@k at each cutoff k from topk
    """
    max_k = max(top_k)
    batch_size = target.size(0)

    _, top_indices = output.topk(k=max_k, dim=1, largest=True, sorted=True)

    result = []
    for k in top_k:  # loop over k
        map_at_k = 0.0
        for actual_target, predicted_items in zip(
            target, top_indices
        ):  # loop over samples
            map_at_k += average_precision_at_k(
                actual_target, predicted_items, k
            )
        map_at_k = map_at_k / batch_size
        result.append(map_at_k)

    return result

In [None]:
# What is Runner?
# https://catalyst-team.github.io/catalyst/api/core.html#runner
class CustomRunner(dl.Runner):

    def _handle_batch(self, batch):
        # model train/valid step
        features, targets = batch["features"], batch["targets"]
        logits = self.model(features)
        scores = torch.sigmoid(logits)

        loss = self.criterion(logits, targets)
        accuracy, precision, recall, iou = multi_label_metrics(
            logits, targets, threshold=0.5, activation="Sigmoid"
        )
        map05, map10 = mean_average_precision_at_k(
            scores, targets, top_k=(5, 10)
        )
        batch_metrics = {
            "loss": loss,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "iou": iou,
            "map05": map05,
            "map10": map10,
            # "map20": map20,
            # "map30": map30
        }
        
        self.input = {"features": features, "targets": targets}
        self.output = {"logits": logits, "scores": scores}
        self.batch_metrics.update(batch_metrics)

        if self.is_train_loader:
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()
    
    def predict_batch(self, batch):
        # model inference step
        batch = utils.maybe_recursive_call(batch, "to", device=self.device)
        logits = self.model(batch["features"])
        scores = torch.sigmoid(logits)
        return scores

In [None]:
model = Model()
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

loaders = {"train": train_loader, "valid": valid_loader}

In [None]:
# !rm -rf logs

In [None]:
# %reload_ext tensorboard
# %tensorboard --logdir ./logs

In [None]:
# For other minimal examples, please follow the link below
# https://github.com/catalyst-team/catalyst#minimal-examples
runner = CustomRunner()
# model training
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=None,
    loaders=loaders,
    logdir="./logs",
    num_epochs=10,
    verbose=True,
    load_best_on_end=True,
    overfit=False,  #  <<<--- DO NOT FORGET TO MAKE IT ``False`` 
                    #  (``True`` uses only one batch to check pipeline correctness)
    callbacks=[
        # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html
        # dl.AveragePrecisionCallback(input_key="targets", output_key="scores", prefix="ap"),
        # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
        # dl.AUCCallback(input_key="targets", output_key="scores", prefix="auc"),
    ],
    main_metric="iou", # "ap/mean", 
    minimize_metric=False,
)

1/10 * Epoch (train): 100% 5841/5841 [19:56<00:00,  4.88it/s, accuracy=0.944, iou=0.500, loss=0.174, map05=0.750, map10=0.875, precision=1.000, recall=0.500]
1/10 * Epoch (valid): 100% 255/255 [00:41<00:00,  6.17it/s, accuracy=0.778, iou=0.318, loss=0.486, map05=0.667, map10=0.704, precision=0.667, recall=0.318]
[2020-09-20 00:17:28,561] 
1/10 * Epoch 1 (train): accuracy=0.8689 | iou=0.4376 | loss=0.3118 | map05=0.7585 | map10=0.7303 | precision=0.7458 | recall=0.5580
1/10 * Epoch 1 (valid): accuracy=0.8770 | iou=0.4599 | loss=0.2936 | map05=0.7783 | map10=0.7503 | precision=0.7214 | recall=0.6106
2/10 * Epoch (train): 100% 5841/5841 [20:05<00:00,  4.85it/s, accuracy=0.889, iou=0.500, loss=0.284, map05=0.543, map10=0.597, precision=0.800, recall=0.571]
2/10 * Epoch (valid): 100% 255/255 [00:41<00:00,  6.16it/s, accuracy=0.778, iou=0.348, loss=0.383, map05=0.667, map10=0.700, precision=0.571, recall=0.382]
[2020-09-20 00:38:15,286] 
2/10 * Epoch 2 (train): accuracy=0.8761 | iou=0.4628 |

In [None]:
# model inference example
# for prediction in runner.predict_loader(loader=loaders["valid"]):
#     assert prediction.detach().cpu().numpy().shape[-1] == MERCH_TYPE_NCLASSES-1

In [None]:
# import plotly  # required for contrib
# from catalyst.contrib.utils import plot_tensorboard_log

# plot_tensorboard_log(logdir="./logs", step="batch")

In [None]:
import plotly  # required for contrib
from catalyst.contrib.utils import plot_tensorboard_log

plot_tensorboard_log(
    logdir="./logs", 
    step="epoch", 
    metrics=[
        "loss", "accuracy", "precision", "recall", "iou", 
        "map05", "map10", "map20",
        "ap/mean", "auc/mean"
    ]
)

## Your task

We suggest you to improve this baseline. Feel free to use any kind of model architectures, loss functions, inputs, etc. in your experiments.


YOUR TASK is to predict purchases in `merchant_type` in **January-February 2020** for all the clients (50k) from the given dataset.

SUBMISSION FORMAT: You should submit a `.csv` file in the following format. 

The submission file should contain two columns:
* `party_rk` -- client unique identifier
* `recommendations` -- list of the **top 30** predicted `merchant_type`, sorted by predicted proba (pay attention!) **separated by commas**. 

The `.csv` file separator should be **semicolon (";")**. The submission file example can be generated by the pipeline shown below.

EVALUATION: Your submission will be evaluated by metric **MAP@30**. Scores for this part of the hackathon will be given according to the value of this metric.

Good luck!

## Submission file example

In [None]:
# create data loader for submission
full_party = pd.read_csv(transactions_path, usecols=['party_rk']).party_rk.unique()
full_data = prepare_data(
    full_party, mode="submission"
)
full_dataset = RSDataset(
   full_data
)
full_loader = DataLoader(
    full_dataset, batch_size=64, shuffle=False, num_workers=8, drop_last=False)

100%|██████████| 50000/50000 [00:07<00:00, 6754.24it/s] 


In [None]:
# get predictions from the model
predictions = []
predictions_scores = []

for scores in tqdm(runner.predict_loader(loader=full_loader), total = len(full_loader)):
    _, top_indices = scores.topk(k=30, dim=1, largest=True, sorted=True)
    top_indices += 1
    predictions += top_indices.detach().cpu().tolist()
    predictions_scores += scores.detach().cpu().tolist()

100%|██████████| 782/782 [00:39<00:00, 19.99it/s]


In [None]:
# # inverse mapping for merchant_type in predictions
# category_inverse_mapping = {k: v for v, k in mappings['category'].items()}
# def inverse_mapping(x):
#     return list(map(category_inverse_mapping.get, x))

# predictions = list(map(inverse_mapping, predictions))

In [None]:
# create submission table
submission = pd.DataFrame({
    "party_rk" : full_party, 
    "recommendations" : predictions
})
submission['recommendations'] = submission['recommendations'].apply(lambda x: ",".join(map(str, x)))


submission.to_csv('submission_PEPEtoners_categories_ids.csv', index=False, sep=";")

In [None]:
# create submission table
submission = pd.DataFrame({
    "party_rk" : full_party, 
    "recommendations" : predictions_scores
})
submission['recommendations'] = submission['recommendations'].apply(lambda x: ",".join(map(lambda y: str(round(y, 4)), x)))


submission.to_csv('submission_PEPEtoners_categories_scores.csv', index=False, sep=";")