### 1. Import libraries and load data

In [None]:
import numpy as np
import pandas as pd
import pybnesian as pbn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics
from tqdm import tqdm
import matplotlib.pyplot as plt
from importlib import reload
import functions
import load_functions

import functions
functions = reload(functions)

# Placeholder cell: the right-hand side is left blank and must be replaced with
# the project's own loading call before this cell can run.
# NOTE(review): presumably the loader lives in the imported load_functions
# module - confirm.
(x_train, x_test) = # Load the training and testing datasets as Pandas DataFrames (each row is one sample, each column is one attribute)
                    # The class variable must be named 'class'

### 2. Parameters

Características de esta versión:
<ul>
    <li>Inicializa el generador en forma de toro</li>
    <li>Genera las muestras en forma de toro</li>
    <li>No fuerza arcos</li>
    <li>Mogollón de épocas de entrenamiento</li>
</ul>


In [None]:
# --- Generator noise ---------------------------------------------------------
# How many noise nodes the generator network gets.
noise_dim = 15

# Mean / variance passed when the initial Gen-BN noise CPDs are created.
initial_noise_mean = 0
initial_noise_variance = 0.75

# Mean / variance of the noise used whenever samples are generated.
# The variance is defined as half of the mean.
sample_mean = 0.75
sample_variance = sample_mean / 2

# Sample size for the visualization tests.
test_sample_size = 10

# --- Generator training ------------------------------------------------------
# Training stops once two consecutive mean log-likelihoods differ by less than
# `tol`, or after `max_iterations` epochs, whichever happens first.
tol = 0.01
max_iterations = 100_000

# Rows generated per epoch, and how many of the best-scoring ones are kept to
# relearn the generator.
generation_size = 100_000
gen_train_size = 10_000

# --- Reverse generator -------------------------------------------------------
# Number of rows generated for the reverse generator.
rgen_train_size = 10_000

# Mean / variance of the noise intended for reverse-generator training.
# NOTE(review): these two names are not referenced by the visible cells.
sample_mean_rgen = 0
sample_variance_rgen = sample_variance

# --- Anomaly scoring ---------------------------------------------------------
# How many noise-sampling rounds are averaged into the mean anomaly score.
mean_iter = 10

# Exponent of the anomaly-score distance (1 = Manhattan, 2 = Euclidean, ...).
ano_power = 2

In [None]:
# Echo the run configuration so it is stored next to the results.
# NOTE(review): the second line labels sample_variance as "sigma"; confirm
# whether the value is meant as a variance or as a standard deviation.
print('Model data:')
print(f'Noise nodes: {noise_dim}')
print(f'Noise mean and variance: mu={sample_mean}, sigma={sample_variance}')
print(f'Logl convergence tolerance: {tol}')
print(f'Max. number of epochs: {max_iterations}')
print(f'Generation size: {generation_size}')
print(f'Selection size: {gen_train_size}, ratio: {gen_train_size / generation_size}')

### 3. Net initialization

In [None]:
# Discriminator: a network learnt from the training data with the PC structure
# algorithm. (Author's note: AUC for UNSW goes around 0.97.)
disc_bn = functions.learn_net(x_train, structure_algorithm='PC')
assert disc_bn.fitted()
base_nodes = disc_bn.nodes()

# Score each test row with its negated log-likelihood, so that a less likely
# row gets a higher score, and compare against the 'class' labels.
y_true = x_test['class'].to_numpy()
y_score = -disc_bn.logl(x_test)
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)
disc_roc_auc = metrics.auc(fpr, tpr)

print(f'Disc ROC AUC: {disc_roc_auc!s}')

In [None]:
# Mean discriminator log-likelihood over the test rows labelled class 0.
disc_bn.logl(x_test.loc[x_test['class'] == 0]).mean()

In [None]:
# Mean discriminator log-likelihood over the test rows labelled class 1.
disc_bn.logl(x_test.loc[x_test['class'] == 1]).mean()

In [None]:
# Noise node names: 'noise1', 'noise2', ..., one per noise dimension.
noise_nodes = [f'noise{idx}' for idx in range(1, noise_dim + 1)]

# Every (noise, base) pair, grouped by noise node.
noise_arcs = [(noise, base) for noise in noise_nodes for base in base_nodes]

# Every (base, noise) pair, grouped by base node; passed later as banned arcs.
banned_arcs = [(base, noise) for base in base_nodes for noise in noise_nodes]

# Every ordered pair of base nodes (self-pairs included); passed later as
# banned edges.
banned_edges = [(first, second) for first in base_nodes for second in base_nodes]

In [None]:
# Initial CPDs for the generator, built from the configured initial noise
# mean/variance with the torus option switched off.
init_cpds = functions.create_initial_cpds(
    base_nodes,
    noise_nodes,
    noise_mean=initial_noise_mean,
    noise_variance=initial_noise_variance,
    torus=False,
)

# Fresh generator network over the noise and base nodes with the noise arcs,
# then populated with the CPDs above.
# NOTE(review): presumably the noise nodes act as interface nodes - confirm in
# functions.reset_interface_net.
gen_bn = functions.reset_interface_net(noise_nodes, base_nodes, noise_arcs)
gen_bn.add_cpds(init_cpds)
assert gen_bn.fitted()

# Column order used when selecting generator training data.
gen_nodes = noise_nodes + base_nodes

### 4. Training

In [None]:
# Iterative generator training.
# Each epoch: sample a batch from the current generator, score it with the
# discriminator, keep the best-scoring rows and relearn the generator from
# them. Stops when the mean log-likelihood changes by less than `tol` between
# two consecutive epochs, or after `max_iterations` epochs.
logl_array = []
need_training = True

while need_training and len(logl_array) < max_iterations:
    # Fresh batch from the current generator, scored by the discriminator.
    batch_samples = functions.gen_samples(
        gen_bn, generation_size, sample_mean, sample_variance, torus=True
    )
    batch_samples['sample_logl'] = disc_bn.logl(batch_samples)
    logl_mean = batch_samples['sample_logl'].mean()
    logl_array.append(logl_mean)
    epoch = len(logl_array)
    print(f'Iteración: {epoch}')
    print(f'Logl media: {logl_mean!s}')

    # Highest log-likelihood first; the top `gen_train_size` rows become the
    # training data for the next generator.
    batch_samples = batch_samples.sort_values(
        'sample_logl', ascending=False, ignore_index=True
    )
    selected_samples = batch_samples.iloc[:gen_train_size]
    # Disabled alternative kept by the author: functions.learn_condnet on the
    # whole batch instead of the selected rows.
    gen_bn = functions.learn_condfromnet(
        learn_data=selected_samples[gen_nodes],
        interface_nodes=noise_nodes,
        nodes=base_nodes,
        banned_arcs=banned_arcs,
        banned_edges=banned_edges,
    )
    assert gen_bn.fitted()

    # The convergence test needs at least two recorded epochs.
    if epoch < 2:
        continue
    logl_difference = abs(logl_array[-1] - logl_array[-2])
    print(f'Diferencia con la anterior logl: {logl_difference!s}')
    need_training = not (logl_difference < tol)

##### 4.1. Gen testing

In [None]:
# Mean batch log-likelihood per training epoch.
plt.plot(logl_array, label="Logl")
plt.xlabel('Epoch')
plt.ylabel('Log-likelihood')
# Empty, markerless series: they only add the first/last values to the legend.
for caption, value in (('Start logl: ', logl_array[0]), ('End logl: ', logl_array[-1])):
    plt.plot([], [], ' ', label=f'{caption}{round(value, 4)!s}')
plt.legend()
plt.show()

# CPD of every base node in the trained generator.
print('Learnt CPDs:')
for base_node in base_nodes:
    print(gen_bn.cpd(base_node))

In [None]:
# Same log-likelihood curve on a symmetric-log y axis.
fig, ax = plt.subplots()
ax.plot(logl_array, label="Logl")
ax.set_xlabel('Epoch')
ax.set_ylabel('Log-likelihood')
ax.set_yscale('symlog')
# Fixed negative ticks, five decades apart, down to -1e26.
ax.set_yticks([-1e26, -1e21, -1e16, -1e11, -1e6, -10])
# Empty, markerless series: they only add the first/last values to the legend.
ax.plot([], [], ' ', label=f'Start logl: {logl_array[0]:.2e}')
ax.plot([], [], ' ', label=f'End logl: {round(logl_array[-1], 4)!s}')
ax.legend()
plt.show()

### 5. Anomaly detection

In [None]:
# (base, noise) pairs, grouped by noise node.
# NOTE(review): this list is not used again in the visible cells.
noise_arcs_reversed = [(base, noise) for noise in noise_nodes for base in base_nodes]

# Samples drawn from the trained generator for the reverse generator.
# NOTE(review): the learner below is fed batch_samples, not reverse_data, and
# sample_mean/sample_variance are used here rather than the *_rgen parameters -
# confirm this is intended.
reverse_data = functions.gen_samples(
    gen_bn, rgen_train_size, sample_mean, sample_variance, torus=True
)

# (noise, base) pairs, grouped by base node; banned for the reverse generator.
reversed_banned_arcs = [(noise, base) for base in base_nodes for noise in noise_nodes]

# Reverse generator: base nodes are the interface, noise nodes are learnt.
# Trained on the last batch left over from the generator training loop.
rgen_bn = functions.learn_condfromnet(
    learn_data=batch_samples[gen_nodes],
    interface_nodes=base_nodes,
    nodes=noise_nodes,
    banned_arcs=reversed_banned_arcs,
    banned_edges=banned_edges,
)
assert rgen_bn.fitted()

# CPD of every noise node in the reverse generator.
for noise_node in noise_nodes:
    print(rgen_bn.cpd(noise_node))

##### 5.1. By anomaly score

In [None]:
test_data = x_test.copy()


def _row_ano_score(row):
    """Score one sampled row with functions.ano_score over the noise nodes."""
    return functions.ano_score(row, noise_nodes, power=ano_power)


# Sample the noise for the test rows `mean_iter` times and add up the per-row
# scores of every round.
for i in range(mean_iter):
    noise_sample = rgen_bn.sample(
        evidence=test_data, concat_evidence=True, ordered=True
    ).to_pandas()
    round_score = noise_sample.apply(_row_ano_score, axis=1)
    ano_score = round_score if i == 0 else round_score + ano_score

# Average over the rounds and attach the result positionally to the test rows.
mean_ano_score = ano_score / mean_iter
test_data['ano_score'] = mean_ano_score.to_numpy()

In [None]:
# ROC curve of the mean anomaly score against the 'class' labels.
y_true = test_data['class'].to_numpy()
y_score_ano = test_data['ano_score'].to_numpy()

fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score_ano)
roc_auc = metrics.auc(fpr, tpr)
print(f'Area under the ROC curve: {roc_auc!s}')

# NOTE(review): inside a notebook, binding the name `display` hides IPython's
# display() helper for later cells.
display = metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name='BayesGEN NoiseAnoScore',
)
display.plot()
plt.show()

##### 5.2. By sample reconstruction

In [None]:
# Round trip: test rows -> sampled noise (reverse generator) -> regenerated
# base values (generator).
test_data = x_test.copy()
noise_sample = rgen_bn.sample(
    evidence=test_data, concat_evidence=True, ordered=True
).to_pandas()

reconstructed_sample = gen_bn.sample(
    evidence=noise_sample[noise_nodes], concat_evidence=True, ordered=True
).to_pandas()


def _row_rec_error(row):
    """Apply functions.euclidean_mod to one difference row over the base nodes."""
    return functions.euclidean_mod(row, base_nodes)


# Per-node difference between the original and the reconstructed base values,
# reduced to one reconstruction error per row.
diff_sample = noise_sample.copy()
diff_sample[base_nodes] = noise_sample[base_nodes] - reconstructed_sample[base_nodes]
modulo = diff_sample.apply(_row_rec_error, axis=1)
diff_sample['rec_error'] = modulo

# ROC curve of the reconstruction error against the 'class' labels.
y_true = diff_sample['class'].to_numpy()
y_score_rec = diff_sample['rec_error'].to_numpy()

fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score_rec)
roc_auc = metrics.auc(fpr, tpr)
print(f'Area under the ROC curve: {roc_auc!s}')

# NOTE(review): inside a notebook, binding the name `display` hides IPython's
# display() helper for later cells.
display = metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name='BayesGEN Rec. Error',
)
display.plot()
plt.show()

##### 5.3. Applying both

In [None]:
# Combined score: element-wise product of the anomaly score and the
# reconstruction error. `y_true` is whatever the previously executed cell left.
y_score_comb = y_score_ano * y_score_rec

fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score_comb)
roc_auc = metrics.auc(fpr, tpr)
print(f'Area under the ROC curve: {roc_auc!s}')

# NOTE(review): inside a notebook, binding the name `display` hides IPython's
# display() helper for later cells.
display = metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name='BayesGEN Comb. score',
)
display.plot()
plt.show()