# SemanticMask: A Contrastive View Design for Anomaly Detection in Tabular Data

This notebook is a user guide to SemanticMask and its variants, demonstrated on the saheart dataset.

### Import the necessary packages and functions

In [1]:
import numpy as np
import torch
import torch.optim as optim
from data_loader import load_saheart
from train import ContrastiveEncoder,train_dnn
#from train_position import ContrastiveEncoder_position,train_encoder_position
from semanticmask_augmentation import MyDataset,MyDataset_position,MyDataset_description,MyDataset_test
import random,os
from evaluate import evaluate
from evaluate_position import evaluate_position



### Load data

Load the original saheart dataset and preprocess the loaded data.

In [2]:
# load_saheart() returns six arrays, unpacked here as the features and labels
# of the train, validation and test parts.
(
    x_train, y_train,
    x_valid, y_valid,
    x_test, y_test,
) = load_saheart()

# Optional: persist this particular split so that later runs can reuse it.
# np.save('data/X_train_saheart.npy', x_train)
# np.save('data/y_train_saheart.npy', y_train)   # all zeros
# np.save('data/X_valid_saheart.npy', x_valid)
# np.save('data/y_valid_saheart.npy', y_valid)   # all zeros
# np.save('data/X_test_saheart.npy', x_test)
# np.save('data/y_test_saheart.npy', y_test)


The shape of data: (462, 9)
The shape of normal data: (302, 9)
The shape of anomalous data: (160, 9)


In [3]:
# load_saheart() permutes and partitions the normal data, so its split differs
# between runs and the results fluctuate slightly. For better reproducibility,
# load the preprocessed split used in the paper instead.
X_train, y_train = np.load('data/X_train_saheart.npy'), np.load('data/y_train_saheart.npy')
X_valid, y_valid = np.load('data/X_valid_saheart.npy'), np.load('data/y_valid_saheart.npy')
X_test, y_test = np.load('data/X_test_saheart.npy'), np.load('data/y_test_saheart.npy')

### Data augmentation

In [4]:
# f_label: the feature-group label of each column, obtained by applying Sentence-BERT and k-means clustering to the column names (see group.ipynb).
# One group label per feature column; passed to the augmenting dataset classes.
f_label = np.array([1, 0, 1, 0, 1, 0, 0, 0, 0])

# Datasets: the SemanticMask training set plus plain (MyDataset_test) wrappers
# of the train / validation / test splits.
data_train__SemanticMask = MyDataset(X_train, y_train, f_label)
data_train = MyDataset_test(X_train, y_train)
data_valid = MyDataset_test(X_valid, y_valid)
data_test = MyDataset_test(X_test, y_test)

# Loaders for the datasets above.
trainloader_SemanticMask = torch.utils.data.DataLoader(
    dataset=data_train__SemanticMask,
    batch_size=151,
)
trainloader = torch.utils.data.DataLoader(
    dataset=data_train,
    batch_size=151,
)
validloader = torch.utils.data.DataLoader(
    dataset=data_valid,
    batch_size=75,
)
testloader = torch.utils.data.DataLoader(
    dataset=data_test,
    batch_size=236,
)

# Training data for the "SemanticMask + position" variant.
data_train_position = MyDataset_position(X_train, y_train, f_label)
trainloader_position = torch.utils.data.DataLoader(
    dataset=data_train_position,
    batch_size=151,
)

# For this dataset, SemanticMask and SemanticMask + description use the same
# feature partition, so the group labels are identical to the ones above.
f_label = np.array([1, 0, 1, 0, 1, 0, 0, 0, 0])
data_train_description = MyDataset_description(X_train, y_train, f_label)
trainloader_description = torch.utils.data.DataLoader(
    dataset=data_train_description,
    batch_size=151,
)

**Hyperparameter setting**

In [5]:
# Shared training settings, handed to the training functions in the cells below.
temperature, epochs = 0.01, 1000

### Train the SemanticMask

In [6]:
# Train a fresh SemanticMask encoder on the SemanticMask training loader.
# The model is created without .cuda(), so no GPU is requested in this cell.
# NOTE(review): whether train_dnn/evaluate move data to another device is not
# visible from this notebook -- confirm in train.py / evaluate.py.
net = ContrastiveEncoder()
optimizer = optim.Adam(net.parameters(), lr=0.001)

# train_dnn returns the trained model together with the recorded training loss.
net, training_loss = train_dnn(net, temperature, epochs, optimizer, trainloader_SemanticMask)

# Score the trained encoder using the loaders built from MyDataset_test.
auroc = evaluate(net, trainloader, validloader, testloader)
print(auroc)


100%|██████████| 1000/1000 [00:04<00:00, 217.64it/s]


[[66 10]
 [90 70]]
AUCROC: 0.7209703947368422
0.7209703947368422


**Report the prediction performance of our pretrained models**

In [None]:
# Evaluate the five provided SemanticMask checkpoints and summarise their AUROC.
# NOTE(review): torch.load unpickles a whole model object, which can execute
# arbitrary code -- only load checkpoint files from a trusted source.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects full-model pickles -- confirm against the installed version.
AUC = []
for i in range(5):
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    net = torch.load("model/SemanticMask_"+str(i)+".pkl", map_location=torch.device('cpu'))
    auroc = evaluate(net, trainloader, validloader, testloader)
    AUC.append(auroc)
AUC = np.array(AUC)
print(AUC)
print("The average value of AUCROC:", np.mean(AUC))
print("The standard deviation of AUCROC:", np.std(AUC))

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

### Train the SemanticMask + position

In [None]:
from train_position import ContrastiveEncoder,train_encoder_position

# NOTE: the import above rebinds ContrastiveEncoder to the train_position
# version for every cell executed after this one.

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell no longer raises on a CPU-only machine (a bare .cuda() does).
# NOTE(review): train_encoder_position / evaluate_position may place batches on
# a device themselves -- confirm that they also run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = ContrastiveEncoder().to(device)
optimizer = optim.Adam(net.parameters(), lr=0.001)

# The training function returns the trained model and the recorded training loss.
net, training_loss = train_encoder_position(net, temperature, epochs, optimizer, trainloader_position)
auroc = evaluate_position(net, trainloader, validloader, testloader)
print(auroc)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:16<00:00, 59.28it/s]


[[ 68   8]
 [102  58]]
AUCROC: 0.7056743421052631
0.7056743421052631


**Report the prediction performance of our pretrained models**

In [None]:
from train_position import ContrastiveEncoder,train_encoder_position

# Evaluate the five provided "position" checkpoints and summarise their AUROC
# (printed below as the per-model values, their mean and their standard deviation).
# Without map_location, torch.load raises a RuntimeError on a CPU-only machine
# when the checkpoint was saved from a CUDA device, so map to whatever is available.
# NOTE(review): torch.load unpickles a whole model object, which can execute
# arbitrary code -- only load checkpoint files from a trusted source.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects full-model pickles -- confirm against the installed version.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
AUC = []
for i in range(5):
    net = torch.load("model/position_"+str(i)+".pkl", map_location=device)
    auroc = evaluate_position(net, trainloader, validloader, testloader)
    AUC.append(auroc)
AUC = np.array(AUC)
print(AUC)
print(np.mean(AUC))
print(np.std(AUC))

[[67  9]
 [90 70]]
AUCROC: 0.7412006578947369
[[67  9]
 [96 64]]
AUCROC: 0.6863486842105263
[[63 13]
 [86 74]]
AUCROC: 0.7053453947368421
[[ 63  13]
 [100  60]]
AUCROC: 0.6828947368421053
[[ 66  10]
 [102  58]]
AUCROC: 0.7114309210526315
[0.74120066 0.68634868 0.70534539 0.68289474 0.71143092]
0.7054440789473684
0.02091646186391073


### Train the SemanticMask + description

In [None]:
from train import ContrastiveEncoder,train_dnn

# NOTE: the import above rebinds ContrastiveEncoder to the version from train
# (an earlier cell imports a class of the same name from train_position).

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell no longer raises on a CPU-only machine (a bare .cuda() does).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = ContrastiveEncoder().to(device)
optimizer = optim.Adam(net.parameters(), lr=0.001)

# Bind the model returned by train_dnn back to `net`, as the other training
# cells do, so the model that is evaluated is the one the training call returned.
# NOTE(review): presumably train_dnn returns the same object it was given --
# confirm in train.py.
net, training_loss = train_dnn(net, temperature, epochs, optimizer, trainloader_description)
auroc = evaluate(net, trainloader, validloader, testloader)
print(auroc)
    
    

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:08<00:00, 117.31it/s]


[[64 12]
 [81 79]]
AUCROC: 0.7168585526315789
0.7168585526315789


**Report the prediction performance of our pretrained models**

In [None]:
from train import ContrastiveEncoder,train_dnn

# Evaluate the five provided "description" checkpoints and summarise their AUROC
# (printed below as the per-model values, their mean and their standard deviation).
# Without map_location, torch.load raises a RuntimeError on a CPU-only machine
# when the checkpoint was saved from a CUDA device, so map to whatever is available.
# NOTE(review): torch.load unpickles a whole model object, which can execute
# arbitrary code -- only load checkpoint files from a trusted source.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects full-model pickles -- confirm against the installed version.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
AUC = []
for i in range(5):
    net = torch.load("model/description_"+str(i)+".pkl", map_location=device)
    auroc = evaluate(net, trainloader, validloader, testloader)
    AUC.append(auroc)
AUC = np.array(AUC)
print(AUC)
print(np.mean(AUC))
print(np.std(AUC))

[[63 13]
 [86 74]]
AUCROC: 0.7111019736842106
[[68  8]
 [92 68]]
AUCROC: 0.7615131578947368
[[67  9]
 [99 61]]
AUCROC: 0.7041940789473684
[[60 16]
 [86 74]]
AUCROC: 0.7092927631578949
[[63 13]
 [95 65]]
AUCROC: 0.7038651315789474
[0.71110197 0.76151316 0.70419408 0.70929276 0.70386513]
0.7179934210526315
0.021941701827476186
