In [4]:
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch as T
import numpy as np
import pytorch_lightning as pl
import pandas as pd
import pickle
from functools import reduce
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans

In [5]:
with open("../preprocessor.pkt", "rb") as f:
    preprocessor = pickle.load(f)
preprocessor

{'scaler': StandardScaler(),
 'les': {'ecfg': LabelEncoder(),
  'flbmk': LabelEncoder(),
  'flg_3dsmk': LabelEncoder(),
  'insfg': LabelEncoder(),
  'ovrlt': LabelEncoder()},
 'mms': MinMaxScaler()}

In [6]:
# raw_data = pd.read_csv("../data/preprocess_train.csv")
# raw_data = raw_data.drop("txkey", 1)
# raw_data = raw_data.drop("oversea_flag_0", 1)
# raw_data = raw_data.drop("oversea_flag_1", 1)
# raw_data = raw_data.drop("most_freq", 1)

In [7]:
data = pd.read_csv("../data/train_norm.csv")
data = data.drop("Unnamed: 0", 1)
x_train, x_val, y_train, y_val = train_test_split(data.drop("fraud_ind", 1), data["fraud_ind"], test_size=0.15, random_state=0)

In [8]:
raw_test_data = pd.read_csv("../data/test.csv")

In [9]:
def preprocess_test_data(raw_test_data):
    labels = raw_test_data["fraud_ind"]
    df = raw_test_data.drop(["fraud_ind", "txkey"], 1)
    df = df.fillna("NA")
    
    for c in df.keys():
        if not (df[c].dtype == np.int64 or df[c].dtype == np.float64):
            df.loc[:,c] = preprocessor['les'][c].transform(df.loc[:,c])
    
    x = preprocessor['scaler'].transform(df)
    df = pd.DataFrame(data=x, index=df.index, columns=df.columns)
    df = pd.DataFrame(preprocessor['mms'].transform(df.values), index=df.index, columns=df.columns)
    
    df["fraud_ind"] = labels
    
    return df

test_data = preprocess_test_data(raw_test_data)
x_test, y_test = test_data.drop("fraud_ind", 1), test_data['fraud_ind']

In [17]:
x_train

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,...,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd
938646,0.876235,0.646040,0.152137,0.090812,0.833333,0.826667,1.0,0.2,0.0,0.0,...,0.0,0.0,0.213483,0.912108,0.538126,0.760229,0.0,0.871983,0.953271,0.0
659914,0.999564,0.186682,0.005545,0.071274,0.833333,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.808989,0.771435,0.995643,0.574337,0.0,0.000000,0.953271,0.0
777518,0.975596,0.653619,0.855775,0.056580,0.833333,0.826667,0.0,0.5,0.0,0.0,...,0.0,0.0,0.550562,0.807128,0.612200,0.350954,0.0,0.872433,0.953271,0.0
139433,0.983295,0.473435,0.667760,0.133905,0.833333,0.826667,0.0,0.4,0.0,0.0,...,0.0,0.0,0.550562,0.871351,0.590414,0.795561,0.0,0.636636,0.953271,0.0
642191,0.905142,0.093085,0.314155,0.054195,0.833333,0.826667,0.0,0.5,0.0,0.0,...,0.0,0.0,0.404494,0.601977,0.546841,0.188767,0.0,0.871983,0.953271,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359783,0.876235,0.923073,0.515661,0.063263,0.833333,0.826667,0.0,0.4,0.0,0.0,...,0.0,0.0,0.325843,0.609169,0.653595,0.788766,0.0,0.537850,0.953271,0.0
152315,0.999564,0.254383,0.983913,0.071274,0.833333,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.134831,0.490094,0.995643,0.575247,1.0,0.000000,0.953271,0.0
963395,0.905142,0.032590,0.460077,0.059160,0.833333,0.826667,0.0,0.5,0.0,0.0,...,0.0,0.0,0.033708,0.858378,0.636166,0.128036,0.0,0.871983,0.953271,0.0
117952,0.823068,0.628302,0.343269,0.106712,0.833333,0.826667,0.0,0.2,0.0,0.0,...,0.0,0.0,0.977528,0.684530,0.747277,0.522249,0.0,0.871983,0.953271,0.0


In [10]:
y_val.value_counts()/y_test.count()

0    0.440190
1    0.007791
Name: fraud_ind, dtype: float64

In [11]:
y_train.value_counts()/y_train.count()

0    0.981993
1    0.018007
Name: fraud_ind, dtype: float64

In [12]:
y_test.value_counts()/y_test.count()

0    0.986668
1    0.013332
Name: fraud_ind, dtype: float64

# Autoencoder & KNN

In [13]:
class MyDataset(Dataset):
    def __init__(self, df, labels):
        self.data = df.values.astype(np.float32)
        self.labels = labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return T.tensor(self.data[idx]), self.labels[idx]

In [14]:
class Model(pl.LightningModule):
    def __init__(self, dim, x_train, y_train, x_val, y_val, K=3):
        super(Model, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(dim, 18),
            nn.Tanh(),
            nn.Linear(18, 14),
            nn.Tanh(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(14, 18),
            nn.Tanh(),
            nn.Linear(18, dim),
            nn.Sigmoid(),
        )
        self.K = K
        self.x_train = x_train
        self.y_train = T.tensor(y_train.values)
        self.x_val = x_val
        self.y_val = T.tensor(y_val.values)
        self.x_train_embedded = None

    def forward(self, x):
        z = self.encoder(x)
        z = self.decoder(z)
        return z
    
    def train_dataloader(self):
        return DataLoader(MyDataset(self.x_train, self.y_train), batch_size=train_batch_size, shuffle=False, pin_memory=True, num_workers=8)
    
    def val_dataloader(self):
        return DataLoader(MyDataset(self.x_val, self.y_val), batch_size=val_batch_size, shuffle=False, pin_memory=True, num_workers=8)
    
    def training_step(self, batch, batch_idx):
        data, labels = batch
        loss = F.mse_loss(self(data), data)
        return loss
    
    def on_train_end(self):
        self._update_embedding()
    
    def on_validation_epoch_start(self):
        self._update_embedding()
    
    def validation_step(self, batch, batch_idx):
        data, labels = batch
        return {
            "preds": self._KNN(self.encoder(data)),
            "labels": labels,
        }

    def validation_epoch_end(self, outputs):
        preds = np.array(reduce(lambda a, b : a + b["preds"].cpu().detach().tolist(), outputs, []))
        labels = np.array(reduce(lambda a, b : a + b["labels"].cpu().detach().tolist(), outputs, []))
        ((TN, FP), (FN, TP)) = metrics.confusion_matrix(labels, preds, labels=[0, 1])
        
        logging_metrics = {
            "precision": 0,
            "recall": 0,
            "f_score": 0,
        }

        if (TP + FP) != 0 and (TP + FN) != 0:
            precision = TP / (TP + FP)
            recall = TP / (TP + FN)
            f_score = 2 * (precision * recall) / (precision + recall)
            logging_metrics["precision"] = precision
            logging_metrics["recall"] = recall
            logging_metrics["f_score"] = f_score
        
        self.log_dict(logging_metrics, prog_bar=True)
        
    def _update_embedding(self):
        x_train_embedded = []
        for batch, labels in self.train_dataloader():
            x_train_embedded.append(self.encoder(batch.cuda()))
        self.x_train_embedded = T.cat(x_train_embedded)
    
    def _euclidean_distances(self, x_train, x_test):
        return T.linalg.norm(x_train - x_test.unsqueeze(1), dim=2)

    def _KNN(self, x_test):
        distance = self._euclidean_distances(self.x_train_embedded, x_test)
        top_k_xvals, top_k_indices = T.topk(T.neg(distance), self.K)
        prediction_indices = self.y_train[top_k_indices]
        prediction = T.mode(prediction_indices)
        return prediction.values

    def configure_optimizers(self):
        optimizer = T.optim.Adam(self.parameters(), lr=1e-2)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': T.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[x for x in range(0, num_train_epochs, 30)], gamma=0.1),
                'interval': 'epoch'
            }
        }

model = Model(x_train.shape[1], x_train, y_train, x_val, y_val, K=5)

In [15]:
train_batch_size = 2048
val_batch_size = 64
num_train_epochs = 100
check_val_every_n_epoch = 20

class ValProgressBar(pl.callbacks.progress.ProgressBar):
    def on_validation_start(self, trainer, pl_module):
        super().on_validation_start(trainer, pl_module)
        self.val_progress_bar.reset(total=len(trainer.val_dataloaders[0]))
    
trainer = pl.Trainer(
    gpus=1,
    max_epochs=num_train_epochs,
    check_val_every_n_epoch=check_val_every_n_epoch,
    callbacks=[ValProgressBar()]
)
trainer.fit(model)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 662   
1 | decoder | Sequential | 669   
---------------------------------------
1.3 K     Trainable params
0         Non-trainable params
1.3 K     Total params
0.005     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

In [48]:
test_dataloader = DataLoader(MyDataset(x_test, T.tensor(y_test.values)), batch_size=val_batch_size, shuffle=False, pin_memory=True, num_workers=16)

In [25]:
from tqdm.notebook import tqdm

preds = []
labels = []

model = model.cuda()
for x, y in tqdm(test_dataloader):
    x = x.cuda()
    
    preds.append(model._KNN(model.encoder(x)))
    labels.append(y)

  0%|          | 0/5945 [00:00<?, ?it/s]

In [32]:
preds = T.cat(preds)
labels = T.cat(labels)

In [37]:
((TN, FP), (FN, TP)) = metrics.confusion_matrix(labels, preds, labels=[0, 1])

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f_score = 2 * (precision * recall) / (precision + recall)
print(f"precision: {precision}\nrecall: {recall}\nf_score: {f_score}")

precision: 0.6632950787784477
recall: 0.6723186119873817
f_score: 0.6677763634583375
