In [2]:
import sys
import os

# Go up one level: from notebooks/ → project root
project_root = os.path.abspath("..")
sys.path.append(project_root)

print("Added to sys.path:", project_root)
print("Current working directory:", os.getcwd())


Added to sys.path: C:\Users\sadek\OneDrive\Desktop\DSAI4101-project
Current working directory: C:\Users\sadek\OneDrive\Desktop\DSAI4101-project\notebooks


In [3]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from torchvision import datasets

from src.b_models_impl import MyEmbeddingClient  # uses your new SimpleCNN


In [4]:
emb_client = MyEmbeddingClient(
    model_path="../models/classifier/simple_cnn.pth",
    classes_path="../models/classifier/classes.json"
)

device = emb_client.device
model = emb_client.model
model.eval()

print("Device:", device)
print("Classes:", emb_client.idx_to_class)


Device: cpu
Classes: ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']


In [5]:
from torch.utils.data import DataLoader

# 1) Load TRAIN data (normal classes)
train_ds = datasets.ImageFolder(
    root="../data/split/train",        # your 6 normal classes
    transform=emb_client.transform    # use EXACT same transform as classifier
)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=False)

# 2) Extract embeddings
all_embs = []

with torch.no_grad():
    for imgs, _ in train_loader:
        imgs = imgs.to(device)
        embs = model.forward_features(imgs)   # (B, 256) with new model
        all_embs.append(embs.cpu().numpy())

X_known = np.concatenate(all_embs, axis=0)
print("Known embeddings shape:", X_known.shape)


Known embeddings shape: (1767, 256)


In [22]:
# Anomaly dataset (e.g. images in ../data/rare_classes/anomaly)
anom_ds = datasets.ImageFolder(
    root="../data/rare_classes",
    transform=emb_client.transform
)

anom_loader = DataLoader(anom_ds, batch_size=32, shuffle=False)

all_anom = []

with torch.no_grad():
    for imgs, _ in anom_loader:
        imgs = imgs.to(device)
        embs = model.forward_features(imgs)
        all_anom.append(embs.cpu().numpy())

X_anom = np.concatenate(all_anom, axis=0)
print("Anomaly embeddings shape:", X_anom.shape)


Anomaly embeddings shape: (75, 256)


In [16]:
from sklearn.ensemble import IsolationForest

iso = IsolationForest(
    n_estimators=200,
    contamination=0.05,
    random_state=42
)
iso.fit(X_known)

scores_known = iso.decision_function(X_known)
scores_anom  = iso.decision_function(X_anom)

print("Known score range :", scores_known.min(), "→", scores_known.max())
print("Anomaly score range:", scores_anom.min(), "→", scores_anom.max())

# Try different thresholds (percentiles of NORMAL scores)
percentiles = [1, 2, 5, 10, 15, 20, 25, 30]

print("\nPercentile | TP  FN  FP  TN")
print("---------------------------------")

for p in percentiles:
    thr = np.percentile(scores_known, p)  # p% of normal data will be below thr

    pred_known = scores_known < thr   # below thr → anomaly
    pred_anom  = scores_anom  < thr

    tp = pred_anom.sum()
    fn = len(pred_anom) - tp
    fp = pred_known.sum()
    tn = len(pred_known) - fp

    print(f"{p:9d} | {tp:2d}  {fn:2d}  {fp:3d}  {tn:4d}")


Known score range : -0.07430385966504749 → 0.1138009755979244
Anomaly score range: -0.037781342821330166 → 0.09939493102825198

Percentile | TP  FN  FP  TN
---------------------------------
        1 |  0  75   18  1749
        2 |  1  74   36  1731
        5 |  5  70   89  1678
       10 | 11  64  177  1590
       15 | 17  58  265  1502
       20 | 19  56  354  1413
       25 | 26  49  442  1325
       30 | 31  44  530  1237


In [23]:
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
import numpy as np

# 1) Fit PCA on TRAIN normals only
pca = PCA(n_components=0.95, random_state=42)  # keep 95% variance
pca.fit(X_known_train)

print("Original dim:", X_known_train.shape[1])
print("PCA dim     :", pca.n_components_)

# 2) Transform train normals, test normals, anomalies
Z_known_train = pca.transform(X_known_train)
Z_known_test  = pca.transform(X_known_test)
Z_anom        = pca.transform(X_anom)

# 3) Train IsolationForest on PCA features
iso_pca = IsolationForest(
    n_estimators=200,
    contamination=0.05,
    random_state=42
)
iso_pca.fit(Z_known_train)

scores_train_pca = iso_pca.decision_function(Z_known_train)
scores_test_pca  = iso_pca.decision_function(Z_known_test)
scores_anom_pca  = iso_pca.decision_function(Z_anom)

print("Score ranges:")
print("  Test normals:", scores_test_pca.min(), "→", scores_test_pca.max())
print("  Anomalies   :", scores_anom_pca.min(), "→", scores_anom_pca.max())


Original dim: 256
PCA dim     : 10
Score ranges:
  Test normals: -0.09953923975420187 → 0.14330614374589679
  Anomalies   : -0.03390019545616607 → 0.12567741436099822


In [24]:
percentiles = [1, 2, 5, 10, 15, 20, 25, 30]

print("\n[IF+PCA] Percentile | TP  FN  FP  TN")
print("----------------------------------------")

for p in percentiles:
    thr = np.percentile(scores_train_pca, p)  # based on TRAIN normals

    pred_normals = scores_test_pca < thr
    pred_anom    = scores_anom_pca < thr

    tp = pred_anom.sum()
    fn = len(pred_anom) - tp
    fp = pred_normals.sum()
    tn = len(pred_normals) - fp

    print(f"{p:9d} | {tp:2d}  {fn:2d}  {fp:3d}  {tn:4d}")



[IF+PCA] Percentile | TP  FN  FP  TN
----------------------------------------
        1 |  0  75    5   526
        2 |  1  74   15   516
        5 |  4  71   31   500
       10 |  4  71   48   483
       15 | 13  62   77   454
       20 | 14  61  102   429
       25 | 24  51  127   404
       30 | 33  42  151   380
