In [14]:
import os
import torch
import numpy as np
from torch import nn
from torch.utils.data import Dataset, DataLoader, ConcatDataset

# ------------------------- CONFIG -------------------------
TRAIN_SOURCE_DIR = "C:/DCASE_Temp/FineTuned/Source_Embeddings"
TRAIN_TARGET_DIR = "C:/DCASE_Temp/FineTuned/Target_Embeddings"
CHECKPOINT_DIR = "K:/DCASE"

MACHINE_TYPES = ["bearing", "fan", "gearbox", "slider", "toycar", "toytrain", "valve"]
INPUT_DIM = 768
LATENT_DIM = 128
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ------------------------- DATASET -------------------------
class NPYDataset(Dataset):
    def __init__(self, folder):
        self.files = sorted([
            os.path.join(folder, f)
            for f in os.listdir(folder)
            if f.endswith('.npy')
        ])

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        data = np.load(self.files[idx])
        if data.ndim > 1:
            data = data.mean(axis=0)
        assert data.shape[0] == INPUT_DIM, f"Shape mismatch: {data.shape} in file {self.files[idx]}"
        return torch.tensor(data, dtype=torch.float32), self.files[idx]

# ------------------------- MODEL -------------------------
class MahalanobisAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512), nn.ReLU(),
            nn.Linear(512, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 512), nn.ReLU(),
            nn.Linear(512, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat, z

# ---------------------- MIXUP ----------------------
def mixup_data(data, num_augmented=100):
    mixed = []
    for _ in range(num_augmented):
        a, b = data[np.random.randint(len(data))], data[np.random.randint(len(data))]
        lam = np.random.beta(0.5, 0.5)
        mixed_sample = lam * a + (1 - lam) * b
        mixed.append(mixed_sample.clone().detach())
    return mixed

# ---------------------- TRAINING ----------------------
def run_training_all_machines():
    print("========== TRAINING: ALL MACHINE TYPES ==========")

    # Load and concatenate source datasets
    all_source_datasets = [NPYDataset(os.path.join(TRAIN_SOURCE_DIR, m)) for m in MACHINE_TYPES]
    source_dataset = ConcatDataset(all_source_datasets)
    source_loader = DataLoader(source_dataset, batch_size=16, shuffle=True)

    # Collect and mix all target data
    all_target_data = []
    for m in MACHINE_TYPES:
        target_dataset = NPYDataset(os.path.join(TRAIN_TARGET_DIR, m))
        target_loader = DataLoader(target_dataset, batch_size=1)
        all_target_data.extend([x.squeeze(0) for x, _ in target_loader])

    target_aug = mixup_data(all_target_data, num_augmented=700)  # 100 per machine

    # Initialize model
    model = MahalanobisAE(INPUT_DIM, LATENT_DIM).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    # Training loop
    model.train()
    for epoch in range(40):
        total_loss = 0.0

        for x, _ in source_loader:
            x = x.to(DEVICE)
            x_hat, _ = model(x)
            loss = loss_fn(x_hat, x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/40, Loss: {total_loss / len(source_loader):.4f}")

    # Save final model
    torch.save(model.state_dict(), os.path.join(CHECKPOINT_DIR, "mahalanobis2_ae_all.pt"))
    print("✅ Global model saved as 'mahalanobis2_ae_all.pt'")

# ---------------------- RUN ----------------------
if __name__ == "__main__":
    run_training_all_machines()


Epoch 1/40, Loss: 0.0334
Epoch 2/40, Loss: 0.0103
Epoch 3/40, Loss: 0.0079
Epoch 4/40, Loss: 0.0067
Epoch 5/40, Loss: 0.0060
Epoch 6/40, Loss: 0.0051
Epoch 7/40, Loss: 0.0048
Epoch 8/40, Loss: 0.0044
Epoch 9/40, Loss: 0.0043
Epoch 10/40, Loss: 0.0040
Epoch 11/40, Loss: 0.0037
Epoch 12/40, Loss: 0.0038
Epoch 13/40, Loss: 0.0035
Epoch 14/40, Loss: 0.0034
Epoch 15/40, Loss: 0.0032
Epoch 16/40, Loss: 0.0033
Epoch 17/40, Loss: 0.0030
Epoch 18/40, Loss: 0.0030
Epoch 19/40, Loss: 0.0030
Epoch 20/40, Loss: 0.0028
Epoch 21/40, Loss: 0.0029
Epoch 22/40, Loss: 0.0054
Epoch 23/40, Loss: 0.0031
Epoch 24/40, Loss: 0.0028
Epoch 25/40, Loss: 0.0028
Epoch 26/40, Loss: 0.0028
Epoch 27/40, Loss: 0.0026
Epoch 28/40, Loss: 0.0027
Epoch 29/40, Loss: 0.0026
Epoch 30/40, Loss: 0.0026
Epoch 31/40, Loss: 0.0025
Epoch 32/40, Loss: 0.0024
Epoch 33/40, Loss: 0.0025
Epoch 34/40, Loss: 0.0025
Epoch 35/40, Loss: 0.0024
Epoch 36/40, Loss: 0.0023
Epoch 37/40, Loss: 0.0024
Epoch 38/40, Loss: 0.0025
Epoch 39/40, Loss: 0.

In [10]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

# ---------------------- CONFIG ----------------------
TEST_SOURCE_DIR = "K:/DCASE/BEATs/Test_Embeddings/Source_Embeddings"
TEST_TARGET_DIR = "K:/DCASE/BEATs/Test_Embeddings/Target_Embeddings"
LABEL_DIR = "K:/DCASE/generated_labels"
CHECKPOINT_DIR = "K:/DCASE"
MACHINE_TYPES = ["bearing", "fan", "gearbox", "slider", "toycar", "toytrain", "valve"]
INPUT_DIM = 768
LATENT_DIM = 128
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ---------------------- DATASET ----------------------
class NPYDataset(Dataset):
    def __init__(self, folder):
        self.files = sorted([os.path.join(folder, f) for f in os.listdir(folder) if f.endswith('.npy')])

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        data = np.load(self.files[idx])
        if data.ndim > 1:
            data = data.mean(axis=0)
        return torch.tensor(data, dtype=torch.float32), self.files[idx]

# ---------------------- MODEL ----------------------
class MahalanobisAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512), nn.ReLU(),
            nn.Linear(512, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 512), nn.ReLU(),
            nn.Linear(512, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat, z

# ---------------------- EVALUATION ----------------------
def calculate_mahalanobis_scores(model, dataloader):
    model.eval()
    zs = []
    fnames = []

    with torch.no_grad():
        for x, fname in dataloader:
            x = x.to(DEVICE)
            _, z = model(x)
            zs.append(z.cpu())
            fnames += [os.path.basename(f) for f in fname]

    zs = torch.cat(zs, dim=0)
    mean = zs.mean(dim=0)
    cov = torch.cov(zs.T)
    cov_inv = torch.linalg.pinv(cov + 1e-6 * torch.eye(cov.shape[0]))

    scores = []
    for z in zs:
        diff = (z - mean).unsqueeze(0)
        dist = torch.sqrt(torch.matmul(torch.matmul(diff, cov_inv), diff.T))
        scores.append(dist.item())

    return fnames, scores

def calculate_auc(labels, scores):
    auc = roc_auc_score(labels, scores)
    top_k = int(len(labels) * 0.1)
    top_k_idxs = np.argsort(scores)[-top_k:]
    labels_at_top_k = np.array(labels)[top_k_idxs]
    pauc = np.mean(labels_at_top_k)
    return auc, pauc

def run_evaluation():
    print(f"\n🔍 Loading global model: mahalanobis_ae_all.pt\n")
    model = MahalanobisAE(INPUT_DIM, LATENT_DIM).to(DEVICE)
    model.load_state_dict(torch.load(os.path.join(CHECKPOINT_DIR, "mahalanobis_ae_all.pt")))
    model.eval()

    def clean_filename(f):
        f = os.path.basename(f).strip()
        if f.startswith("BEATs_aug_"):
            f = f[len("BEATs_aug_"):]
        return f

    for machine in MACHINE_TYPES:
        print(f"\n========== [TESTING: {machine.upper()}] ==========")
        label_path = os.path.join(LABEL_DIR, f"test_{machine}.csv")
        label_df = pd.read_csv(label_path)
        label_df["filename"] = label_df["filename"].str.strip()
        label_df = label_df.set_index("filename")

        for domain, test_dir in [("SOURCE", TEST_SOURCE_DIR), ("TARGET", TEST_TARGET_DIR)]:
            test_dataset = NPYDataset(os.path.join(test_dir, machine))
            test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

            fnames, scores = calculate_mahalanobis_scores(model, test_loader)
            fnames_clean = [clean_filename(f) for f in fnames]

            label_df_filtered = label_df[label_df.index.str.contains(domain.lower())]

            try:
                labels = [label_df_filtered.loc[f]["label"] for f in fnames_clean]
            except KeyError as e:
                print(f"[{domain}] ❌ Missing label for file: '{e.args[0]}'")
                continue

            if not labels:
                print(f"[{domain}] ❌ No matching labels found.")
                continue

            auc, pauc = calculate_auc(labels, scores)
            print(f"[{domain}] ✅ AUC: {auc:.4f}, pAUC: {pauc:.4f}")
# ---------------------- RUN ----------------------
if __name__ == "__main__":
    run_evaluation()



🔍 Loading global model: mahalanobis_ae_all.pt


[SOURCE] ❌ Missing label for file: 'section_00_source_test_anomaly_0001_pro_A_vel_4_loc_A.npy'
[TARGET] ❌ Missing label for file: 'section_00_target_test_anomaly_0001_pro_A_vel_4_loc_E.npy'

[SOURCE] ❌ Missing label for file: 'section_00_source_test_anomaly_0001_n_A.npy'
[TARGET] ❌ Missing label for file: 'section_00_target_test_anomaly_0001_n_X.npy'

[SOURCE] ❌ Missing label for file: 'section_00_source_test_anomaly_0001_noAttribute.npy'
[TARGET] ❌ Missing label for file: 'section_00_target_test_anomaly_0001_noAttribute.npy'

[SOURCE] ❌ Missing label for file: 'section_00_source_test_anomaly_0001_noAttribute.npy'
[TARGET] ❌ Missing label for file: 'section_00_target_test_anomaly_0001_noAttribute.npy'

[SOURCE] ❌ Missing label for file: 'section_00_source_test_anomaly_0001_car_B1_spd_31V_mic_1.npy'
[TARGET] ❌ Missing label for file: 'section_00_target_test_anomaly_0001_car_A1_spd_28V_mic_2.npy'

[SOURCE] ❌ Missing label for file: 'sectio

In [15]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

# ---------------------- CONFIG ----------------------
TEST_SOURCE_DIR = "K:/DCASE/BEATs/Test_Embeddings/Source_Embeddings"
TEST_TARGET_DIR = "K:/DCASE/BEATs/Test_Embeddings/Target_Embeddings"
LABEL_DIR = "K:/DCASE/generated_labels"
CHECKPOINT_DIR = "K:/DCASE"
MACHINE_TYPES = ["bearing", "fan", "gearbox", "slider", "toycar", "toytrain", "valve"]
INPUT_DIM = 768
LATENT_DIM = 128
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ---------------------- DATASET ----------------------
class NPYDataset(Dataset):
    def __init__(self, folder):
        self.files = sorted([os.path.join(folder, f) for f in os.listdir(folder) if f.endswith('.npy')])

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        data = np.load(self.files[idx])
        if data.ndim > 1:
            data = data.mean(axis=0)
        return torch.tensor(data, dtype=torch.float32), self.files[idx]

# ---------------------- MODEL ----------------------
class MahalanobisAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512), nn.ReLU(),
            nn.Linear(512, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 512), nn.ReLU(),
            nn.Linear(512, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat, z

# ---------------------- EVALUATION ----------------------
def calculate_mahalanobis_scores(model, dataloader):
    model.eval()
    zs = []
    fnames = []

    with torch.no_grad():
        for x, fname in dataloader:
            x = x.to(DEVICE)
            _, z = model(x)
            zs.append(z.cpu())
            fnames += [os.path.basename(f) for f in fname]

    zs = torch.cat(zs, dim=0)
    mean = zs.mean(dim=0)
    cov = torch.cov(zs.T)
    cov_inv = torch.linalg.pinv(cov + 1e-6 * torch.eye(cov.shape[0]))

    scores = []
    for z in zs:
        diff = (z - mean).unsqueeze(0)
        dist = torch.sqrt(torch.matmul(torch.matmul(diff, cov_inv), diff.T))
        scores.append(dist.item())

    return fnames, scores

def calculate_auc(labels, scores):
    auc = roc_auc_score(labels, scores)
    top_k = int(len(labels) * 0.1)
    top_k_idxs = np.argsort(scores)[-top_k:]
    labels_at_top_k = np.array(labels)[top_k_idxs]
    pauc = np.mean(labels_at_top_k)
    return auc, pauc

def clean_filename(f):
    f = os.path.basename(f).strip()
    for prefix in ["BEATs_", "aug_"]:
        if f.startswith(prefix):
            f = f[len(prefix):]
    if f.endswith(".npy"):
        f = f[:-4]
    return f

def run_evaluation():
    print(f"\n🔍 Loading global model: mahalanobis2_ae_all.pt\n")
    model = MahalanobisAE(INPUT_DIM, LATENT_DIM).to(DEVICE)
    model.load_state_dict(torch.load(os.path.join(CHECKPOINT_DIR, "mahalanobis2_ae_all.pt"), map_location=DEVICE))
    model.eval()

    for machine in MACHINE_TYPES:
        print(f"\n========== [TESTING: {machine.upper()}] ==========")
        label_path = os.path.join(LABEL_DIR, f"test_{machine}.csv")
        label_df = pd.read_csv(label_path)

        # Clean and index labels
        label_df["filename"] = label_df["filename"].str.strip()
        label_df["filename"] = label_df["filename"].str.replace(".npy", "", regex=False)
        label_df["filename"] = label_df["filename"].str.replace("BEATs_", "", regex=False)
        label_df["filename"] = label_df["filename"].str.replace("aug_", "", regex=False)
        label_df = label_df.set_index("filename")

        for domain, test_dir in [("SOURCE", TEST_SOURCE_DIR), ("TARGET", TEST_TARGET_DIR)]:
            domain_dir = os.path.join(test_dir, machine)
            if not os.path.exists(domain_dir):
                print(f"[{domain}] ⚠️ Directory does not exist: {domain_dir}")
                continue

            test_dataset = NPYDataset(domain_dir)
            test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

            fnames, scores = calculate_mahalanobis_scores(model, test_loader)
            fnames_clean = [clean_filename(f) for f in fnames]

            labels = []
            for f in fnames_clean:
                if f in label_df.index:
                    labels.append(label_df.loc[f]["label"])
                else:
                    print(f"[{domain}] ❌ Missing label for file: '{f}'")

            if not labels:
                print(f"[{domain}] ❌ No matching labels found.")
                continue

            auc, pauc = calculate_auc(labels, scores)
            print(f"[{domain}] ✅ AUC: {auc:.4f}, pAUC: {pauc:.4f}")

# ---------------------- RUN ----------------------
if __name__ == "__main__":
    run_evaluation()


🔍 Loading global model: mahalanobis2_ae_all.pt


[SOURCE] ✅ AUC: 0.4876, pAUC: 0.5000
[TARGET] ✅ AUC: 0.4368, pAUC: 0.4000

[SOURCE] ✅ AUC: 0.4944, pAUC: 0.5000
[TARGET] ✅ AUC: 0.3988, pAUC: 0.6000

[SOURCE] ✅ AUC: 0.4856, pAUC: 0.5000
[TARGET] ✅ AUC: 0.4904, pAUC: 0.5000

[SOURCE] ✅ AUC: 0.4656, pAUC: 0.4000
[TARGET] ✅ AUC: 0.3576, pAUC: 0.3000

[SOURCE] ✅ AUC: 0.4188, pAUC: 0.6000
[TARGET] ✅ AUC: 0.4814, pAUC: 0.4000

[SOURCE] ✅ AUC: 0.5164, pAUC: 0.6000
[TARGET] ✅ AUC: 0.5480, pAUC: 0.5000

[SOURCE] ✅ AUC: 0.4728, pAUC: 0.4000
[TARGET] ✅ AUC: 0.4720, pAUC: 0.3000


In [13]:
import os
import pandas as pd

# ---------------- CONFIG ----------------
LABEL_DIR = "K:/DCASE/generated_labels"
MACHINE_TYPES = ["bearing", "fan", "gearbox", "slider", "toycar", "toytrain", "valve"]

# ---------------- SPLIT FUNCTION ----------------
def split_labels_by_domain():
    for machine in MACHINE_TYPES:
        original_csv = os.path.join(LABEL_DIR, f"test_{machine}.csv")
        
        if not os.path.exists(original_csv):
            print(f"⚠️ Missing label file: {original_csv}")
            continue

        df = pd.read_csv(original_csv)

        # Check column presence
        if "filename" not in df.columns or "label" not in df.columns:
            print(f"⚠️ Incorrect format in {original_csv}. Expected columns: 'filename', 'label'")
            continue

        # Split based on 'source' and 'target' in filename
        source_df = df[df["filename"].str.contains("source")]
        target_df = df[df["filename"].str.contains("target")]

        # Save them separately
        source_path = os.path.join(LABEL_DIR, f"test_{machine}_source.csv")
        target_path = os.path.join(LABEL_DIR, f"test_{machine}_target.csv")

        source_df.to_csv(source_path, index=False)
        target_df.to_csv(target_path, index=False)

        print(f"✅ Split '{machine}' into source ({len(source_df)}) and target ({len(target_df)})")

# ---------------- MAIN ----------------
if __name__ == "__main__":
    split_labels_by_domain()


✅ Split 'bearing' into source (100) and target (100)
✅ Split 'fan' into source (100) and target (100)
✅ Split 'gearbox' into source (100) and target (100)
✅ Split 'slider' into source (100) and target (100)
✅ Split 'toycar' into source (100) and target (100)
✅ Split 'toytrain' into source (100) and target (100)
✅ Split 'valve' into source (100) and target (100)


PermissionError: [Errno 13] Permission denied: 'K:/DCASE/generated_labels'