In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("transactions_training_data.csv",parse_dates=["Date"]).query("Date>='2017-07-01'")
sep = " "
data["feature_string"] = \
(data.Category + sep + \
data.Date.dt.day_name() + sep + \
data["Account Name"] + sep + \
data["Transaction Type"] + sep + \
data.Description).str.lower()

In [3]:
data["label"] = ~data.Labels.isna()
data["feature_float"] = data.Amount
data["weights"] = np.ones_like(data.Amount)

In [4]:
### tokenize

In [5]:
import youtokentome as yttm

train_data_path = "train_data.txt"
model_path = "vocab.model"

open(train_data_path,"w").write("\n".join(data.feature_string.values))

yttm.BPE.train(data=train_data_path, vocab_size=1000, model=model_path)



<youtokentome.youtokentome.BPE at 0x7fd23054dd10>

In [6]:
bpe = yttm.BPE(model=model_path)
features_ids = bpe.encode(data.feature_string.values.tolist(), output_type=yttm.OutputType.ID)


In [7]:
max_len = max(len(f) for f in features_ids)

features_ids = [ f[:max_len] + [0]*(max_len-len(f)) for f in features_ids ]
features_ids = np.array(features_ids)

In [8]:
import torch
from torch.nn import functional
import pytorch_lightning as pl
from torch.utils.data import DataLoader, random_split, TensorDataset

In [9]:
# dataset = TensorDataset(torch.tensor(features_ids,dtype=int),
#                         torch.tensor(data["feature_float"].values.reshape(-1,1),dtype=torch.float),
#                         torch.tensor(data["label"].values,dtype=int),
#                         torch.tensor(data["weights"].values,dtype=torch.float))
# ntrain = int(.75*len(dataset))
# dataset_train, dataset_test = random_split(dataset,[ntrain, len(dataset)-ntrain])
# train_dataloader = DataLoader(dataset,batch_size=64,shuffle=True)
# test_dataloader = DataLoader(dataset,batch_size=64)

In [10]:
from collections import defaultdict

In [18]:
1-data["label"].mean()

0.6703856415231628

In [55]:

def acc(y,logits):
    yhat = logits.argmax(dim=1)
    return (1.0*(yhat==y)).mean()
    

    

class TransClassifier(pl.LightningModule):
    def __init__(self, h = 32, batch_size = 64, dropout_rate = .2, num_emb = 1000, seq_type = None):
        super().__init__()
        self.h = h
        self.batch_size = batch_size
        self.emb = torch.nn.Embedding(num_emb, embedding_dim=h,padding_idx=0)
        
        if seq_type=="cnn":
            self.seq_encoder = torch.nn.Conv1d(h,h,3)
        elif seq_type="lstm":
            self.seq_encoder = torch.nn.LSTM(h,h,batch_first=True)
        else:
            self.seq_encoder = None
            
        self.lin1 = torch.nn.Linear(h+1,h+1)
        self.cls = torch.nn.Linear(h+1,2)
        self.dropout_rate = dropout_rate
        self.drop = torch.nn.Dropout(dropout_rate)
        
    def input_drop_out(self,x):
        if self.training:
            to_zero = torch.rand(*x.shape)<=self.dropout_rate
            x[to_zero] = 0
        return x
        
        
        
    def forward(self,x_s,x_f,y=None,w = None):
        
        x_s = self.input_drop_out(x_s)
        
        x_emb = self.emb(x_s)
        
        x = self.cnn(x_emb.permute(0,2,1))#.permute(0,1,2)

        x_reduced,_ = x.max(dim=2)
        
        x_comb = torch.cat([x_reduced, x_f], dim = 1)
        x_comb = self.drop(x_comb)
#         x_comb = self.lin1(x_comb)
        x_comb = torch.nn.functional.relu(x_comb)
        logits = self.cls(x_comb)
        
        out = (logits,)
        
        if y is not None:
            temp = functional.cross_entropy(logits,y,reduction='none')
            if w is not None:
                loss = (temp*w).mean()
            else:
                loss = temp.mean()
            
            out = out + (loss, acc(y,logits))
            
        return out
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)
    
    def _step(self, batch, batch_idx):
        x_s,x_f,y,w = batch
        yhat, l, a = self.forward(x_s,x_f,y,w)
        return dict(loss = l, acc = a)
                                
    def training_step(self, batch, batch_idx):
        out_dict = self._step(batch, batch_idx)
        return dict(loss = out_dict["loss"],log = out_dict)
    
    def validation_step(self, batch, batch_idx):
        out_dict = self._step(batch, batch_idx)
        return {f'val_{k}':v for k,v in out_dict.items()}
        
    def validation_epoch_end(self, outputs):
        temp = defaultdict(list)
        for output in outputs:
            for k,v in output.items():
                temp[k].append(v)
                
        for k,v in temp.items():
            temp[k] = torch.stack(v).mean()
            
        temp = {k:v for k,v in temp.items()}
                
        temp["log"] = {k:v for k,v in temp.items()}
        return temp
        
        
    def prepare_data(self):
        dataset = TensorDataset(torch.tensor(features_ids,dtype=int),
                        torch.tensor(data["feature_float"].values.reshape(-1,1),dtype=torch.float),
                        torch.tensor(data["label"].values,dtype=int),
                        torch.tensor(data["weights"].values,dtype=torch.float))
        ntrain = int(.75*len(dataset))
        dataset_train, dataset_val = random_split(dataset,[ntrain, len(dataset)-ntrain])
        self.dataset_train = dataset
        self.dataset_val = dataset_val
        
    
    def train_dataloader(self):
        return DataLoader(self.dataset_train,batch_size=self.batch_size,shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(self.dataset_val,batch_size=self.batch_size,shuffle=True)


        
            
    
    

In [56]:
model = TransClassifier(dropout_rate=0, h = 512, batch_size=256)
trainer = pl.Trainer(max_epochs=300,
                     min_epochs=10,
#                      early_stop_callback = pl.callbacks.EarlyStopping(monitor="val_acc", patience = 10, verbose=True),
                    progress_bar_refresh_rate = 0)
trainer.fit(model)


INFO:lightning:
  | Name | Type      | Params
-------------------------------
0 | emb  | Embedding | 512 K 
1 | cnn  | Conv1d    | 786 K 
2 | lin1 | Linear    | 263 K 
3 | cls  | Linear    | 1 K   
4 | drop | Dropout   | 0     
INFO:lightning:Detected KeyboardInterrupt, attempting graceful shutdown...


1