We'll explore some unsupervised models and see what works best for our purposes in this notebook.

In [3]:
import pandas as pd
import numpy as np

# Load the transaction dataset; the first CSV column is used as the row index.
df = pd.read_csv("./data/flagright.csv", index_col=[0])

In [4]:
# Preview the first few rows to check the columns that were loaded.
df.head()

Unnamed: 0,destinationCountry,destinationCurrency,destinationAmount,originCountry,originCurrency,originAmount,state,destinationMethod,originMethod,transactionId,originUserId,destinationUserId,hour_sin,hour_cos,day_sin,day_cos,week_day_sin,week_day_cos
0,IN,INR,10132.8,IN,INR,10132.8,CREATED,GENERIC_BANK_ACCOUNT,GENERIC_BANK_ACCOUNT,bd70fcaebc254c23b07b29fd994ba5f2,29529892-22d3-4a74-b6f2-fbe1d5ee8b6f,,0.258819,-0.965926,-0.903356,-0.428892,0.433884,-0.900969
1,IN,INR,145653.93,IN,INR,145653.93,CREATED,GENERIC_BANK_ACCOUNT,GENERIC_BANK_ACCOUNT,c9f8913d0bd548838e97bd6a609dbc45,0b85951b-c817-499e-ad17-453e5feaf87c,,0.258819,-0.965926,-0.903356,-0.428892,0.433884,-0.900969
2,IN,INR,6311.0,IN,INR,6311.0,CREATED,GENERIC_BANK_ACCOUNT,GENERIC_BANK_ACCOUNT,4de9f33636cf44378f748f723ee4ac87,29529892-22d3-4a74-b6f2-fbe1d5ee8b6f,,0.258819,-0.965926,-0.903356,-0.428892,0.433884,-0.900969
3,IN,INR,400000.0,IN,INR,400000.0,CREATED,GENERIC_BANK_ACCOUNT,GENERIC_BANK_ACCOUNT,12b97d4eb51940d0886f609903fb2154,2f6ec341-9075-4aaa-9db2-9fd5d8597f97,0b85951b-c817-499e-ad17-453e5feaf87c,0.258819,-0.965926,-0.903356,-0.428892,0.433884,-0.900969
4,IN,INR,45.0,IN,INR,45.0,CREATED,GENERIC_BANK_ACCOUNT,GENERIC_BANK_ACCOUNT,32c6bc06f67b4e2b8222f76c8268cea9,0b85951b-c817-499e-ad17-453e5feaf87c,,0.258819,-0.965926,-0.903356,-0.428892,0.433884,-0.900969


In [None]:
# Columns handed to the models, grouped by the pre-processing they need.
# NOTE(review): the df.head() preview in this notebook lists
# destinationAmount/originAmount and cyclical hour/day encodings but no
# 'timestamp' or 'amount' column - confirm these names match the CSV in use.
numerical_columns = ['timestamp', 'amount']
categorical_columns = ['transactionId', 'originUserId', 'destinationUserId']

# Working frame: the numeric columns first, then the ID columns, on df's index.
features = df[numerical_columns + categorical_columns].copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the ID columns so the tree-based model can consume them.
# LabelEncoder assigns arbitrary integer codes, which implies an ordering the
# IDs do not have. The scikit-learn docs reserve LabelEncoder for target labels
# and recommend OneHotEncoder/OrdinalEncoder for features, so treat this as a
# quick baseline encoding rather than an equivalent of one-hot encoding.
# NOTE(review): destinationUserId is empty for several rows in the preview -
# confirm how missing values are encoded by the installed scikit-learn version.

# .copy() gives X_labelEncoder its own data, so adding the encoded columns does
# not raise pandas' SettingWithCopyWarning or depend on view/copy semantics.
X_labelEncoder = features[numerical_columns].copy()

# One fitted encoder per column, so every mapping stays available for encoding
# new rows at inference time (a single re-fitted encoder keeps only the last).
label_encoders = {}
for id_column in ('transactionId', 'originUserId', 'destinationUserId'):
    label_encoders[id_column] = LabelEncoder()
    X_labelEncoder[id_column] = label_encoders[id_column].fit_transform(features[id_column])

# Kept for backward compatibility: as before, this name ends up bound to the
# encoder that was fitted on destinationUserId.
labelencoder_X = label_encoders['destinationUserId']

In [None]:
# Preview the encoded feature matrix that the model will be fitted on.
X_labelEncoder.head()

In [None]:
from sklearn.ensemble import IsolationForest
# Fit an isolation forest on the encoded features; the fixed seed keeps the
# fitted trees reproducible between runs.
clf = IsolationForest(random_state=42)
clf.fit(X_labelEncoder)

# Sanity check: labels for the first five rows.
clf.predict(X_labelEncoder.head(5))

In [None]:
# Anomaly scores for the same five rows; per the scikit-learn docs, lower
# values are more abnormal and negative values correspond to outliers.
clf.decision_function(X_labelEncoder[:5])

In [None]:
# Label every row; per the scikit-learn docs, predict returns +1 for inliers
# and -1 for outliers.
preds = clf.predict(X_labelEncoder)

In [None]:
# Number of rows labelled 1 (inliers in IsolationForest's convention).
np.count_nonzero(preds == 1)

In [None]:
# Number of rows labelled -1 (outliers in IsolationForest's convention).
np.count_nonzero(preds == -1)

In [None]:
from sklearn.metrics import calinski_harabasz_score

# Calinski-Harabasz score, treating the model's two labels as a two-cluster
# assignment of the rows.
# NOTE(review): the features are unscaled, so large-magnitude columns
# (presumably the timestamp) will dominate the dispersion - confirm the score
# is meaningful before comparing approaches with it.
print(calinski_harabasz_score(X_labelEncoder, preds))

In [None]:
# Mean transaction amount, as a reference point for the distribution.
features['amount'].mean()

In [None]:
import seaborn as sns
# Histogram of the transaction amounts.
sns.histplot(features['amount'])

We can check the distribution of the transaction amounts to get a baseline amount above which transactions are flagged. This lets us compare our model with a rule-based approach in which you automatically flag the biggest transactions.

In [None]:
# 95th percentile of the amounts - a candidate cut-off for the rule-based baseline.
features['amount'].quantile(0.95)

In [None]:
# Rule-based baseline: flag every amount strictly above a fixed cut-off
# (presumably rounded from the 95th percentile of the amounts - confirm).
features['isFlagged'] = features['amount'].gt(135000)

The calinski_harabasz_score is a cluster analysis metric that tells you how well separated the clusters produced by your model are. The score is defined as the ratio of the between-cluster dispersion to the within-cluster dispersion, so a higher score indicates denser, better-separated clusters.

In [None]:
# Same metric for the rule-based isFlagged labels, for comparison with the
# score obtained from the IsolationForest labels.
print(calinski_harabasz_score(X_labelEncoder, features['isFlagged']))

In [None]:
import joblib

# Persist the fitted forest, then reload it straight away to confirm that the
# saved artefact round-trips.
# joblib files are pickle-based: only ever load them from a trusted source.
joblib.dump(clf, 'model.joblib')
predictor = joblib.load('model.joblib')

# Show the first training row as an example of the expected input layout.
first_row = X_labelEncoder.iloc[0:1]
print("Testing following input: ")
print(first_row)

In [None]:
# Hand-written sample row in the training column order:
# [timestamp, amount, transactionId, originUserId, destinationUserId].
sampInput = [[1662358419786, 10132.8, 1598, 3, 22]]
print(type(sampInput))

# Attach the training column names before predicting: the model was fitted on a
# DataFrame, so a bare list makes scikit-learn warn about missing feature names
# and leaves the column order implicit. A wrong number of values now fails here.
sample_frame = pd.DataFrame(sampInput, columns=X_labelEncoder.columns)
print(predictor.predict(sample_frame))

Below, I tried an autoencoder approach.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Define your autoencoder model
class Autoencoder(nn.Module):
    """Fully connected autoencoder for tabular rows.

    The encoder narrows ``input_size -> 64 -> 32 -> encoding_size`` and the
    decoder mirrors it back to ``input_size``. Every linear layer is followed
    by a ReLU except the final decoder layer, so the reconstruction is
    unbounded while the bottleneck code is non-negative.

    Args:
        input_size: number of features in each input row (and output row).
        encoding_size: width of the bottleneck representation.
    """

    def __init__(self, input_size, encoding_size):
        super().__init__()

        # Layers are created encoder-first so parameter initialisation order
        # (and therefore seeded results) stays the same.
        encoder_layers = [
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, encoding_size),
            nn.ReLU(),
        ]
        decoder_layers = [
            nn.Linear(encoding_size, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_size),
        ]

        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` after passing through the bottleneck."""
        return self.decoder(self.encoder(x))

# Hold out everything after the first 1700 rows for scoring.
# .copy() makes both splits independent frames, so the column assignments
# below do not raise pandas' SettingWithCopyWarning or depend on whether the
# slice is a view of `features`.
train_data = features[:1700].copy()
test_data = features[1700:].copy()

# Preprocess categorical features: replace each value by an integer code.
# NOTE(review): destinationUserId is empty for several rows in the preview -
# confirm how missing values are encoded by the installed scikit-learn version.
categorical_columns = ['transactionId','originUserId','destinationUserId','isFlagged']
for col in categorical_columns:
    le = LabelEncoder()
    # Fit on the values of both splits so transform() never meets an unseen label.
    le.fit(list(train_data[col].values) + list(test_data[col].values))
    train_data[col] = le.transform(train_data[col])
    test_data[col] = le.transform(test_data[col])

# Normalize with statistics from the training split only, so nothing about the
# held-out rows leaks into the scaling.
mean = train_data.mean()
# A column that is constant in the training split has std 0; dividing by it
# would fill the column with NaN/inf, so such columns are left unscaled.
std = train_data.std().replace(0, 1)
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std

# Define a custom dataset for your data
class TabularDataset(Dataset):
    """Expose the rows of a DataFrame as float32 vectors for a DataLoader.

    Args:
        data: frame whose values can all be cast to float32; each row becomes
            one sample.
    """

    def __init__(self, data):
        raw_values = data.values
        # Cast once up front so every sample is served as float32.
        self.data = raw_values.astype(np.float32)

    def __len__(self):
        """Return the number of rows."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the row(s) selected by ``idx`` as a float32 array."""
        row = self.data[idx]
        return row
    
# Wrap both splits as datasets and batch them; only the training batches are
# shuffled, so test scores stay aligned with the test rows.
train_dataset = TabularDataset(train_data)
test_dataset = TabularDataset(test_data)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# One input unit per feature column, squeezed through a 10-unit bottleneck.
input_size = train_data.shape[1]
encoding_size = 10
model = Autoencoder(input_size=input_size, encoding_size=encoding_size)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Reconstruction error: mean squared difference between output and input.
criterion = nn.MSELoss()

# Train the autoencoder to reproduce its own input, reporting the mean batch
# loss after every epoch.
num_epochs = 10
for epoch in range(num_epochs):
    batch_losses = []
    for batch in train_loader:
        optimizer.zero_grad()
        reconstruction = model(batch)
        # The target is the input itself: this is a reconstruction objective.
        loss = criterion(reconstruction, batch)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, sum(batch_losses)/len(train_loader)))

# Evaluate the autoencoder: score each held-out row by its summed squared
# reconstruction error (one score per row, in test order).
model.eval()
anomaly_scores = []
with torch.no_grad():
    for batch in test_loader:
        squared_error = (model(batch) - batch) ** 2
        anomaly_scores.extend(squared_error.sum(dim=1).cpu().numpy().tolist())

# Detect anomalies using a fixed cut-off on the score.
# NOTE(review): 0.1 is hard-coded - confirm it suits the scale of the scores.
threshold = 0.1
anomaly_labels = [int(score > threshold) for score in anomaly_scores]

In [None]:
# Number of held-out rows whose reconstruction error exceeds the threshold.
sum(anomaly_labels)

In [None]:
# Per-row flags for the held-out split (1 = score above the threshold, else 0).
anomaly_labels

In [None]:
# Per-row summed squared reconstruction errors for the held-out split.
anomaly_scores

After trying a couple of approaches, I decided on using IsolationForest since it provides what we need (an outlier boolean and a confidence score) out of the box.