In [None]:
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier, IsolationForest

import torch
from utils import load_mat, preprocess_features



In [None]:
# Load Amazon dataset from ./data/Amazon.mat using the TAM utils.
# Only adj, feat_sp and ano_labels are used by the cells visible below.
adj, feat_sp, ano_labels, str_labels, attr_labels = load_mat("Amazon")

# Report the shape of each loaded object that the analysis relies on.
print(f"Adjacency shape: {adj.shape}")
print(f"Feature matrix (sparse) shape: {feat_sp.shape}")
print(f"Anomaly labels shape: {ano_labels.shape}")

# Basic stats: graph size, feature width and anomaly prevalence.
n_nodes, n_features = adj.shape[0], feat_sp.shape[1]
n_anomalies = int(ano_labels.sum())
anomaly_pct = n_anomalies / n_nodes * 100
print(f"Nodes: {n_nodes}, Features: {n_features}, Anomalies: {n_anomalies} ({anomaly_pct:.2f}%)")


In [None]:
# Row-normalize features using the same preprocessing as TAM.
# The second return value (sparse tuple form) is kept for parity with the
# TAM pipeline but is not used by the cells visible below.
feat_dense_norm, feat_sparse_tuple = preprocess_features(feat_sp)

# preprocess_features returns a numpy.matrix; convert to a plain ndarray so
# downstream indexing behaves like a regular 2-D array.
X, y = np.asarray(feat_dense_norm), ano_labels.astype(int)

print(f"Dense normalized feature matrix shape: {X.shape}")
print(f"Label distribution (0=normal, 1=anomaly): {np.bincount(y)}")


In [None]:
# Keep enough components to capture most variance (e.g., 50), but never ask
# for more than PCA can produce: n_components must not exceed either the
# number of samples or the number of features.
n_components = min(50, *X.shape)
pca = PCA(n_components=n_components, random_state=6220)
X_pca = pca.fit_transform(X)

# Per-component variance ratio and its running total.
explained = pca.explained_variance_ratio_
cum_explained = np.cumsum(explained)

plt.figure(figsize=(6,4))
plt.plot(range(1, n_components+1), cum_explained, marker='o')
plt.axhline(0.90, color='red', linestyle='--', label='90% variance')
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative explained variance")
plt.title("PCA – Cumulative Explained Variance (Amazon)")
plt.grid(True)
plt.legend()
plt.show()

# np.searchsorted returns len(cum_explained) when the threshold is never
# reached, so a naive "+ 1" would report a component count that does not
# exist. Detect that case and say so explicitly instead.
for threshold in (0.90, 0.95):
    n_needed = int(np.searchsorted(cum_explained, threshold)) + 1
    if n_needed > n_components:
        print(f"Components needed for {threshold:.0%} variance:",
              f"more than {n_components} (threshold not reached)")
    else:
        print(f"Components needed for {threshold:.0%} variance:", n_needed)


In [None]:
# Boolean masks selecting each class; kept as named variables so later cells
# can reuse them.
normal_mask = (y == 0)
anom_mask = (y == 1)

# Project onto the first two principal components, one scatter layer per
# class. Anomalies are drawn larger and more opaque so they stay visible.
fig, ax = plt.subplots(figsize=(6, 5))
for class_mask, marker_size, opacity, class_label in (
    (normal_mask, 5, 0.4, "Normal"),
    (anom_mask, 10, 0.8, "Anomaly"),
):
    ax.scatter(X_pca[class_mask, 0], X_pca[class_mask, 1],
               s=marker_size, alpha=opacity, label=class_label)

ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("PCA Projection (PC1 vs PC2) – Amazon")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Supervised ML baselines using anomaly labels as target.
# Stratified 80/20 split so both partitions keep the same anomaly ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=6220, stratify=y
)

print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])
print("Anomalies in train:", int(y_train.sum()))
print("Anomalies in test:", int(y_test.sum()))

# RandomForest: supervised baseline. Later cells read rf_auc / rf_ap, so the
# model has to be fitted and scored here for Restart & Run All to succeed.
# NOTE(review): hyperparameters mirror the IsolationForest cell (200 trees,
# same seed); adjust if the reported baseline used different settings.
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=6220,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# Use the predicted probability of the anomaly class (label 1) as the score;
# look the column up via classes_ rather than assuming its position.
anomaly_col = list(rf.classes_).index(1)
rf_scores = rf.predict_proba(X_test)[:, anomaly_col]

rf_auc = roc_auc_score(y_test, rf_scores)
rf_ap = average_precision_score(y_test, rf_scores)

print(f"RandomForest – AUROC: {rf_auc:.4f}, AUPRC: {rf_ap:.4f}")


In [None]:
# IsolationForest: unsupervised anomaly detection.
# contamination is set to the anomaly fraction of the full label vector y
# (i.e. the true anomaly ratio over train and test together).
iso_params = dict(
    n_estimators=200,
    contamination=float(y.mean()),
    random_state=6220,
    n_jobs=-1,
)
iso = IsolationForest(**iso_params)

# Fit on features only; the labels are never shown to the model.
iso.fit(X_train)

# decision_function: higher = more normal, so negate it to obtain a score
# where larger means more anomalous (what the metrics below expect).
iso_scores = np.negative(iso.decision_function(X_test))

iso_auc, iso_ap = (
    roc_auc_score(y_test, iso_scores),
    average_precision_score(y_test, iso_scores),
)

print(f"IsolationForest – AUROC: {iso_auc:.4f}, AUPRC: {iso_ap:.4f}")


In [None]:
# Side-by-side comparison of the two baselines.
# rf_auc / rf_ap (RandomForest metrics) and iso_auc / iso_ap (IsolationForest
# metrics) must already exist in the kernel when this cell runs.
models = ["RandomForest", "IsolationForest"]
aucs = [rf_auc, iso_auc]
aps = [rf_ap, iso_ap]

# One group per model, with the AUROC bar left of centre and AUPRC right.
x = np.arange(len(models))
width = 0.35

plt.figure(figsize=(6,4))
plt.bar(x - width/2, aucs, width, label="AUROC")
plt.bar(x + width/2, aps, width, label="AUPRC")
plt.xticks(x, models)
plt.ylim(0.0, 1.0)
plt.ylabel("Score")
plt.title("Baseline Anomaly Detection Performance (Amazon)")
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.show()

# Print both metrics with four decimals (the AUPRC spec was previously
# ".44f", which printed 44 decimal places).
for name, a, p in zip(models, aucs, aps):
    print(f"{name:15s}  AUROC = {a:.4f},  AUPRC = {p:.4f}")


In [None]:
import os

# Directory (relative to the notebook's working directory) that the figure
# cells below save their PNG files into.
FIG_DIR = "figures"

# Create it if needed; an already existing directory is not an error.
os.makedirs(name=FIG_DIR, exist_ok=True)


In [None]:
# Re-draw the cumulative explained-variance curve and write it to disk.
# Relies on n_components and cum_explained from the PCA cell.
fig, ax = plt.subplots(figsize=(6, 4))
component_counts = range(1, n_components + 1)
ax.plot(component_counts, cum_explained, marker='o')
ax.axhline(0.90, color='red', linestyle='--', label='90% variance')
ax.set_xlabel("Number of principal components")
ax.set_ylabel("Cumulative explained variance")
ax.set_title("PCA – Cumulative Explained Variance (Amazon)")
ax.grid(True)
ax.legend()

# Save before show(): the figure must still be live when it is written.
fig.savefig(f"{FIG_DIR}/fig_pca_variance_amazon.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Re-draw the PC1/PC2 scatter (one layer per class) and write it to disk.
# Relies on X_pca from the PCA cell and the label vector y.
fig, ax = plt.subplots(figsize=(6, 5))

for class_value, marker_size, opacity, class_label in (
    (0, 5, 0.4, "Normal"),
    (1, 10, 0.8, "Anomaly"),
):
    members = (y == class_value)
    ax.scatter(X_pca[members, 0], X_pca[members, 1],
               s=marker_size, alpha=opacity, label=class_label)

ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("PCA Projection (PC1 vs PC2) – Amazon")
ax.legend()
ax.grid(True)

# Save before show(): the figure must still be live when it is written.
fig.savefig(f"{FIG_DIR}/fig_pca_scatter_amazon.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Re-draw the baseline comparison bar chart and write it to disk.
# rf_auc / rf_ap and iso_auc / iso_ap must already exist in the kernel.
models = ["RandomForest", "IsolationForest"]
aucs = [rf_auc, iso_auc]
aps = [rf_ap, iso_ap]

# One group per model; the two metric bars sit either side of the tick.
x = np.arange(len(models))
width = 0.35
half_width = width / 2

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(x - half_width, aucs, width, label="AUROC")
ax.bar(x + half_width, aps, width, label="AUPRC")
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.set_ylim(0, 1)
ax.set_ylabel("Score")
ax.set_title("Baseline Anomaly Detection Performance (Amazon)")
ax.legend()
ax.grid(axis="y", alpha=0.3)

# Save before show(): the figure must still be live when it is written.
fig.savefig(f"{FIG_DIR}/fig_baseline_amazon.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Plot the TAM metric curves over evaluation steps and write them to disk.
# NOTE(review): xs, auc_values and ap_values are not defined in any cell
# visible in this notebook; presumably they come from a TAM training run.
# Confirm they are populated before this cell, or it raises NameError.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(xs, auc_values, marker='o', label="AUROC")
ax.plot(xs, ap_values, marker='s', label="AUPRC")
ax.set_xlabel("Evaluation index")
ax.set_ylabel("Score")
ax.set_title("TAM Performance on Amazon")
ax.set_ylim(0, 1)
ax.grid(True)
ax.legend()

# Save before show(): the figure must still be live when it is written.
fig.savefig(f"{FIG_DIR}/fig_tam_amazon.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Final summary: raw (unrounded) metric values for every method.
# rf_auc / rf_ap and iso_auc / iso_ap are the baseline metrics.
# NOTE(review): best_auc and best_ap are not defined in any cell visible in
# this notebook; presumably they are produced by a TAM run. Confirm they
# exist in the kernel before this cell, or it raises NameError.
print("RandomForest  AUROC:", rf_auc, "AUPRC:", rf_ap)
print("IsolationForest AUROC:", iso_auc, "AUPRC:", iso_ap)
print("TAM Best AUROC:", best_auc)
print("TAM Best AUPRC:", best_ap)
