In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from scipy.stats import rankdata
from scipy import sparse


In [2]:
df = pd.read_csv("../data/raw/fraud_dataset_v1.csv")

labels_orig = df["label"].values        # ONLY for evaluation
df_feat = df.drop(columns=["label"])

num_cols = df_feat.select_dtypes(include=np.number).columns
cat_cols = df_feat.select_dtypes(include=object).columns

X_num = StandardScaler().fit_transform(df_feat[num_cols])

if len(cat_cols) > 0:
    enc = OneHotEncoder(sparse_output=True, handle_unknown="ignore")
    X_cat = enc.fit_transform(df_feat[cat_cols])
    X = sparse.hstack([sparse.csr_matrix(X_num), X_cat]).tocsr()
else:
    X = sparse.csr_matrix(X_num)

X_dense = X.astype(np.float32).toarray()
N, input_dim = X_dense.shape


In [3]:
dataset = TensorDataset(torch.from_numpy(X_dense))
train_size = int(0.8 * N)
val_size = N - train_size

train_ds, val_ds = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_ds, batch_size=1024, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=1024)


In [4]:
class DenoisingAutoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim=8):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim)
        )

    def forward(self, x):
        noise = torch.randn_like(x) * 0.05
        x_noisy = x + noise
        z = self.encoder(x_noisy)
        x_hat = self.decoder(z)
        return x_hat, z


In [5]:
ae = DenoisingAutoencoder(input_dim, latent_dim=8)
optimizer = torch.optim.Adam(ae.parameters(), lr=1e-3)
criterion = nn.MSELoss()

best_val = np.inf
patience, counter = 5, 0
best_state = None

for epoch in range(50):
    ae.train()
    train_loss = 0
    for (x,) in train_loader:
        optimizer.zero_grad()
        x_hat, _ = ae(x)
        loss = criterion(x_hat, x)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * x.size(0)

    ae.eval()
    val_loss = 0
    with torch.no_grad():
        for (x,) in val_loader:
            x_hat, _ = ae(x)
            val_loss += criterion(x_hat, x).item() * x.size(0)

    train_loss /= train_size
    val_loss   /= val_size
    print(f"Epoch {epoch+1} | Train {train_loss:.6f} | Val {val_loss:.6f}")

    if val_loss < best_val:
        best_val = val_loss
        best_state = ae.state_dict()
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping")
            break

ae.load_state_dict(best_state)
ae.eval()


Epoch 1 | Train 0.004507 | Val 0.002604
Epoch 2 | Train 0.002286 | Val 0.002183
Epoch 3 | Train 0.001593 | Val 0.001363
Epoch 4 | Train 0.001303 | Val 0.001274
Epoch 5 | Train 0.001258 | Val 0.001207
Epoch 6 | Train 0.001271 | Val 0.001183
Epoch 7 | Train 0.001160 | Val 0.001107
Epoch 8 | Train 0.001101 | Val 0.001065
Epoch 9 | Train 0.001054 | Val 0.000984
Epoch 10 | Train 0.000966 | Val 0.001009
Epoch 11 | Train 0.000930 | Val 0.000872
Epoch 12 | Train 0.000834 | Val 0.000790
Epoch 13 | Train 0.000772 | Val 0.000731
Epoch 14 | Train 0.000762 | Val 0.001742
Epoch 15 | Train 0.000736 | Val 0.000618
Epoch 16 | Train 0.000599 | Val 0.000577
Epoch 17 | Train 0.000559 | Val 0.000539
Epoch 18 | Train 0.000519 | Val 0.000495
Epoch 19 | Train 0.000476 | Val 0.000461
Epoch 20 | Train 0.000473 | Val 0.000479
Epoch 21 | Train 0.000414 | Val 0.000394
Epoch 22 | Train 0.000400 | Val 0.000374
Epoch 23 | Train 0.000351 | Val 0.000335
Epoch 24 | Train 0.000329 | Val 0.000366
Epoch 25 | Train 0.000323

DenoisingAutoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=618, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=8, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=8, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=618, bias=True)
  )
)

In [6]:
rec_error, latent = [], []

with torch.no_grad():
    for i in range(0, N, 1024):
        batch = torch.from_numpy(X_dense[i:i+1024])
        x_hat, z = ae(batch)
        rec_error.append(((batch - x_hat)**2).mean(dim=1).numpy())
        latent.append(z.numpy())

rec_error = np.concatenate(rec_error)
latent = np.concatenate(latent)


In [7]:
if_model = IsolationForest(
    n_estimators=300,
    contamination=0.01,
    random_state=42,
    n_jobs=-1
)
if_model.fit(latent)
if_score = -if_model.score_samples(latent)

rec_rank = rankdata(rec_error) / N
if_rank  = rankdata(if_score) / N

ensemble_global = 0.6 * rec_rank + 0.4 * if_rank


In [8]:
X_lof = np.column_stack([latent, rec_error])

lof = LocalOutlierFactor(
    n_neighbors=35,
    metric="euclidean"
)

lof_score = -lof.fit_predict(X_lof)
lof_rank = rankdata(lof_score) / N


In [9]:
TH_GLOBAL = 0.9985
pred_global = ensemble_global >= TH_GLOBAL


In [10]:
TH_LOCAL = 0.999
pred_local_candidate = lof_rank >= TH_LOCAL


In [11]:
K_LOCAL = int(0.001 * N)

local_idx = np.where(pred_local_candidate)[0]
if len(local_idx) > K_LOCAL:
    local_idx = local_idx[np.argsort(lof_rank[local_idx])[-K_LOCAL:]]

pred_local = np.zeros(N, dtype=bool)
pred_local[local_idx] = True


In [12]:
preds = (pred_global | pred_local).astype(int)


In [13]:
label_map = {"regular": 0, "local": 1, "global": 1}
labels_numeric = np.array([label_map[l] for l in labels_orig])

print("Anomalies predicted:", preds.sum())
print("Precision:", precision_score(labels_numeric, preds))
print("Recall:", recall_score(labels_numeric, preds))
print("F1:", f1_score(labels_numeric, preds))
print("ROC-AUC:", roc_auc_score(labels_numeric, ensemble_global))

print("\nDetected anomalies by type:")
print(pd.Series(labels_orig[preds == 1]).value_counts())


Anomalies predicted: 269
Precision: 0.26022304832713755
Recall: 0.7
F1: 0.3794037940379404
ROC-AUC: 0.8867683131641613

Detected anomalies by type:
regular    199
global      70
Name: count, dtype: int64
