In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
from torchvision.utils import save_image
from torch.utils.data.sampler import SubsetRandomSampler

from imblearn.over_sampling import SMOTE

import numpy as np
import pandas as pd 
import datetime
import os, sys


from matplotlib.pyplot import imshow, imsave
%matplotlib inline

In [2]:
MODEL_NAME = 'fraudClassifier'
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
df = pd.read_csv("transaction_dataset.csv")

#Rename columns for easier access
df.columns = df.columns.str.strip().str.replace(' ','_').str.lower()

#Remove weird stuff 
df.drop(columns=['unnamed:_0'], inplace=True)

#Remove duplicate accounts
df.drop_duplicates(subset=['address'], inplace=True)

#Remove accounts 
df.drop(columns=['address'], inplace=True)

#Remove index
df.drop(columns=['index'], inplace=True)

#Remove token names 
df.drop(columns=['erc20_most_sent_token_type','erc20_most_rec_token_type'], inplace = True)

#Remove var=0 columns
df.drop(df.var(numeric_only=True)[df.var(numeric_only=True) == 0].index, axis = 1, inplace = True)

#Remove small distribution columns
small_distr_col = []
for col in df.columns[3:] :
    if len(df[col].value_counts()) < 10:
        small_distr_col.append(col)
df.drop(columns=small_distr_col,inplace = True)


#Replace nan values by median 
df.fillna(df.median(numeric_only=True), inplace=True)
# Remove negative values 
df[df<0] = None 
df.dropna(inplace=True)

df_n = df.copy()

#Normalization 
for col in df_n.columns[1:]:
    df_n[col] = (df_n[col]-df_n[col].mean())/df_n[col].std()
means = [df[col].mean() for col in df.columns[1:]]
stds = [df[col].std() for col in df.columns[1:]]


class Accounts(torch.utils.data.Dataset):
    def __init__(self,df):

        x=df.iloc[:,1:].values
        y=df.iloc[:,0].values

        self.x_train=torch.tensor(x,dtype=torch.float32)
        self.y_train=torch.tensor(y,dtype=torch.long)

    def __len__(self):
        return len(self.y_train)
    
    def __getitem__(self,idx):
        return self.x_train[idx],self.y_train[idx]

    
dataset = Accounts(df_n)
batch_size = 64
validation_split = .2
shuffle_dataset = True
random_seed= 42
features_size= 33

# Creating data indices for training and validation splits:
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, 
                                           sampler=train_sampler,drop_last=True)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                sampler=valid_sampler,drop_last=True)

for i, (data, labels) in enumerate(train_loader):
    print(data.shape, labels.shape)
    break;

torch.Size([64, 33]) torch.Size([64])


In [4]:
def to_onehot(x, num_classes=2):
    assert isinstance(x, int) or isinstance(x, (torch.LongTensor, torch.cuda.LongTensor))
    if isinstance(x, int):
        c = torch.zeros(1, num_classes).long()
        c[0][x] = 1
    else:
        x = x.cpu()
        c = torch.LongTensor(x.size(0), num_classes)
        c.zero_()
        c.scatter_(1, x, 1) # dim, index, src value
    return c


In [5]:
class Classifier(nn.Module):
    """
        Simple Discriminator w/ MLP
    """
    def __init__(self, input_size=features_size, num_classes=1):
        super(Classifier, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, num_classes),
            nn.Sigmoid(),
        )
    
    def forward(self, x):        
        x = x.view(x.size(0), -1)
        y_ = self.layer(x)
        return y_

In [6]:
model = Classifier().to(DEVICE)

In [7]:
criterion = nn.BCELoss()
model_opt = torch.optim.Adam(model.parameters(), lr=0.001)
max_epoch = 100
step = 0

In [8]:
@torch.no_grad()
def validate(data = validation_loader):
    model.eval()  # Important: eval mode (affects dropout, batch norm etc)
    test_loss = 0
    correct = 0
    for address, labels in data:
        x,y = address.to(DEVICE), labels.to(DEVICE)
        output = model(x)
        test_loss += criterion(output, y.view(batch_size,1).float()).item() * len(address)
        pred = output.round()
        correct+= pred.eq(labels.view_as(pred)).sum().item()

    test_loss /= len(validation_loader.dataset)
    correct/=len(validation_loader.dataset)
    return test_loss, correct*100

In [9]:
for epoch in range(max_epoch):
    for idx, (address, labels) in enumerate(train_loader):
        model.train()
        # Training Discriminator
        x,y = address.to(DEVICE), labels.to(DEVICE)
        output = model(x)
        loss = criterion(output,y.view(batch_size,1).float())
        
        model_opt.zero_grad()
        loss.backward()
        model_opt.step()
        
        test_loss,acc = validate(validation_loader)
        
        if step % 1000 == 0:
            print('Epoch: {}/{}, Step: {}, Train Loss: {:.4g}, Test Loss: {:.4g}, Accuracy: {:.4g}%'.format(epoch, max_epoch, step, loss.item(),test_loss,acc))
    
        step += 1
torch.save(model.state_dict(), "./models/fraud_classifier")

Epoch: 0/100, Step: 0, Train Loss: 0.6616, Test Loss: 0.126, Accuracy: 14.9%
Epoch: 9/100, Step: 1000, Train Loss: 0.05724, Test Loss: 0.03473, Accuracy: 18.83%
Epoch: 18/100, Step: 2000, Train Loss: 0.07023, Test Loss: 0.02483, Accuracy: 18.72%
Epoch: 27/100, Step: 3000, Train Loss: 0.05631, Test Loss: 0.01675, Accuracy: 18.88%
Epoch: 36/100, Step: 4000, Train Loss: 0.03492, Test Loss: 0.02601, Accuracy: 18.86%
Epoch: 45/100, Step: 5000, Train Loss: 0.02531, Test Loss: 0.01876, Accuracy: 19%
Epoch: 54/100, Step: 6000, Train Loss: 0.02349, Test Loss: 0.02115, Accuracy: 18.89%
Epoch: 63/100, Step: 7000, Train Loss: 0.02078, Test Loss: 0.02274, Accuracy: 18.88%
Epoch: 72/100, Step: 8000, Train Loss: 0.05295, Test Loss: 0.02206, Accuracy: 18.81%
Epoch: 81/100, Step: 9000, Train Loss: 0.003935, Test Loss: 0.03495, Accuracy: 18.88%
Epoch: 90/100, Step: 10000, Train Loss: 0.04135, Test Loss: 0.02445, Accuracy: 18.98%


In [10]:
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(df_n.iloc[:,1:].values,df_n.iloc[:,0].values)
print(y_train.shape)

(13618,)


In [11]:
class AccountsAug(torch.utils.data.Dataset):
    def __init__(self):

        x=X_train
        y=y_train

        self.x_train=torch.tensor(x,dtype=torch.float32)
        self.y_train=torch.tensor(y,dtype=torch.long)

    def __len__(self):
        return len(self.y_train)
    
    def __getitem__(self,idx):
        return self.x_train[idx],self.y_train[idx]
dataset = AccountsAug()
# Creating data indices for training and validation splits:
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

aug_train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, 
                                           sampler=train_sampler,drop_last=True)
aug_validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                sampler=valid_sampler,drop_last=True)

for i, (data, labels) in enumerate(aug_train_loader):
    print(data.shape, labels.shape)
    break;

torch.Size([64, 33]) torch.Size([64])


In [12]:
model = Classifier()
step = 0
for epoch in range(max_epoch):
    for idx, (address, labels) in enumerate(aug_train_loader):
        model.train()
        # Training Discriminator
        x,y = address.to(DEVICE), labels.to(DEVICE)
        output = model(x)
        loss = criterion(output,y.view(batch_size,1).float())
        
        model_opt.zero_grad()
        loss.backward()
        model_opt.step()
        
        test_loss,acc = validate(aug_validation_loader)
        
        if step % 1000 == 0:
            print('Epoch: {}/{}, Step: {}, Train Loss: {:.4g}, Test Loss: {:.4g}, Accuracy: {:.4g}%'.format(epoch, max_epoch, step, loss.item(),test_loss,acc))
    
        step += 1

Epoch: 0/100, Step: 0, Train Loss: 0.6909, Test Loss: 0.2108, Accuracy: 15.23%
Epoch: 5/100, Step: 1000, Train Loss: 0.6941, Test Loss: 0.2108, Accuracy: 15.17%
Epoch: 11/100, Step: 2000, Train Loss: 0.6979, Test Loss: 0.2108, Accuracy: 15.17%
Epoch: 17/100, Step: 3000, Train Loss: 0.6882, Test Loss: 0.2108, Accuracy: 15.14%
Epoch: 23/100, Step: 4000, Train Loss: 0.6908, Test Loss: 0.2108, Accuracy: 15.16%
Epoch: 29/100, Step: 5000, Train Loss: 0.6949, Test Loss: 0.2108, Accuracy: 15.12%
Epoch: 35/100, Step: 6000, Train Loss: 0.6892, Test Loss: 0.2108, Accuracy: 15.19%
Epoch: 41/100, Step: 7000, Train Loss: 0.7009, Test Loss: 0.2108, Accuracy: 15.16%
Epoch: 47/100, Step: 8000, Train Loss: 0.6903, Test Loss: 0.2108, Accuracy: 15.19%
Epoch: 52/100, Step: 9000, Train Loss: 0.689, Test Loss: 0.2108, Accuracy: 15.15%
Epoch: 58/100, Step: 10000, Train Loss: 0.6906, Test Loss: 0.2108, Accuracy: 15.11%
Epoch: 64/100, Step: 11000, Train Loss: 0.6944, Test Loss: 0.2108, Accuracy: 15.19%
Epoch: 7