##### [Notice]
##### When you implement the code, please change the data path (refer in Rawcode/Github_WADI.py file).
##### Jupyter file shows the result of running code in author PC.

## Import package and definition

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

import tensorflow as tf
from tqdm import tqdm
import numpy as np
import os
import pandas as pd
import random
import time
from sklearn.model_selection import train_test_split

import joblib
import pickle
import matplotlib.pyplot as plt

#from eval_utils import *

from sklearn.metrics import f1_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

import optuna
import torch.nn as nn

from thop import profile

from torchsummary import summary


torch.manual_seed(42)


tf.config.experimental_run_functions_eagerly(True)

def find_best_f1(pred, label, min_thd, max_thd, n_bins):
    f1_scores = []
    term = (max_thd - min_thd)/(n_bins-1)
    if isinstance(pred, torch.Tensor):
        pred = pred.cpu().numpy()
    for i in range(n_bins):
        pred_labels = put_labels(pred, min_thd + i*term)
        f1_scores.append(f1_score(label, pred_labels))
    
    max_id = f1_scores.index(max(f1_scores))

    if f1_scores[max(max_id-1, 0)] == f1_scores[max_id] == f1_scores[min(max_id+1, n_bins-1)]:
        return min_thd + max_id*term, f1_scores[max_id]
    else:
        return find_best_f1(pred, label, max(min_thd + max_id*term - term/2, min_thd), min(min_thd + max_id*term + term/2, max_thd), n_bins)

def put_labels(distance, threshold):
    distance = np.array(distance)
    threshold = np.array(threshold)  
    xs = np.zeros_like(distance)
    xs[distance > threshold] = 1
    return xs

def calc_p2p(predict, actual):
    tp = np.sum(predict * actual)
    tn = np.sum((1-predict) * (1-actual))
    fp = np.sum(predict * (1-actual))
    fn = np.sum((1-predict) * actual)
    
    precision = tp / (tp + fp + 0.000001)
    recall = tp / (tp + fn + 0.000001)
    f1 = 2 * precision * recall / (precision + recall + 0.000001)
    return f1, precision, recall, tp, tn, fp, fn

def get_trad_f1(score, label):
    score = np.asarray(score)
    maxx = float(score.max())
    minn = float(score.min())
    
    label = np.asarray(label)
    actual = label > 0.1
    
    grain = 1000
    max_f1 = 0.0
    max_f1_thres = 0.0
    p = 0
    r = 0
    for i in range(grain):
        thres = (maxx-minn)/grain * i + minn
        predict = score > thres
        f1, precision, recall, tp, tn, fp, fn = calc_p2p(predict, actual)
        if f1 > max_f1:
            max_f1 = f1
            max_f1_thres = thres
            p = precision
            r = recall
            
    return max_f1, max_f1_thres, p, r

def get_test_f1(score, label,thres):
    score = np.asarray(score)
    maxx = float(score.max())
    minn = float(score.min())
    
    label = np.asarray(label)
    actual = label > 0.1
    
    grain = 1000
    max_f1 = 0.0
    max_f1_thres = 0.0
    p = 0
    r = 0
       
    predict = score > thres
    f1, precision, recall, tp, tn, fp, fn = calc_p2p(predict, actual)
    max_f1 = f1
    max_f1_thres = thres
    p = precision
    r = recall
            
    
    return max_f1, max_f1_thres, p, r


    
def get_best_f1(score, label):
    score = np.asarray(score)
    maxx = float(score.max())
    minn = float(score.min())
    
    grain = 10
    max_f1 = 0.0
    max_f1_thres = 0.0
    p = 0
    r = 0
    for i in range(grain):
        thres = (maxx-minn)/grain * i + minn
        # thres = i / grain
        predict, actual = point_adjust(score, label, thres=thres)
        f1, precision, recall, tp, tn, fp, fn = calc_p2p(predict, actual)
        if f1 > max_f1:
            max_f1 = f1
            max_f1_thres = thres
            p = precision
            r = recall
            
    return max_f1, max_f1_thres, p, r


## Dataset load & split

In [3]:
#=========================================================== Data load======================================================================================
Training_WADI_RAW = pd.read_csv("/home/bedro/000_KD/WADI/WADI_train.csv")

TEST_WADI_RAW = pd.read_csv("/home/bedro/000_KD/WADI/WADI_test.csv")


C_TEST_WADI_RAW=TEST_WADI_RAW.drop(['attack'], axis = 1)


MTS_cad_WADI_1 = pd.read_csv("/home/bedro/000_KD/2024_dataset/WADI_prediction_value/1_WADI_MTS_CAD_prediction_score.csv")
MTAD_gat_2 = pd.read_csv("/home/bedro/000_KD/2024_dataset/WADI_prediction_value/2_WADI_mtad_gat_prediction_score.csv")
GANF_3 = pd.read_csv("/home/bedro/000_KD/2024_dataset/WADI_prediction_value/3_WADI_ganf_prediction_score.csv")
ANOMALY_transformer_4 = pd.read_csv("/home/bedro/000_KD/2024_dataset/WADI_prediction_value/4_WADI_anomaly_transformer_prediction_score.csv")
RANSynCoder_5 = pd.read_csv("/home/bedro/000_KD/2024_dataset/WADI_prediction_value/5_WADI_RANSyn_prediction_score.csv")
Autoencoder_6 = pd.read_csv("/home/bedro/000_KD/2024_dataset/WADI_prediction_value/6_WADI_Autoencoder_prediction_score.csv")
USAD_7 = pd.read_csv("/home/bedro/000_KD/2024_dataset/WADI_prediction_value/7_WADI_USAD_prediction_score.csv")
GDN_8 = pd.read_csv("/home/bedro/000_KD/2024_dataset/WADI_prediction_value/8_WADI_GDN_w_prediction_scores.csv")
LSTM_9 = pd.read_csv("/home/bedro/000_KD/2024_dataset/WADI_prediction_value/9_WADI_lstm_prediction_score.csv")
MSCRED_10 =pd.read_csv("/home/bedro/000_KD/2024_dataset/WADI_prediction_value/10_WADI_mscred_prediction_score.csv")


list_WADI_model=[MTS_cad_WADI_1['score'],MTAD_gat_2['score'],GANF_3['score'],ANOMALY_transformer_4['score'],RANSynCoder_5['score'],Autoencoder_6['score'],USAD_7['score'],GDN_8['score'], LSTM_9['score'],MSCRED_10['score']] ###########


WADI_anomaly_score_concate = pd.concat((list_WADI_model[0], list_WADI_model[1], list_WADI_model[2], list_WADI_model[3], list_WADI_model[4], list_WADI_model[5], list_WADI_model[6], list_WADI_model[7], list_WADI_model[8], list_WADI_model[9]), axis = 1)


WADI_label=TEST_WADI_RAW['attack']


#=========================================================== Data split======================================================================================


X_train, X_test, y_train, y_test = train_test_split(WADI_anomaly_score_concate, WADI_label, test_size=0.92,  random_state=1234)

C_X_train, C_X_test, C_y_train, C_y_test = train_test_split(C_TEST_WADI_RAW, WADI_label, test_size = 0.92, random_state=1234)



WADI_feature_score_concate = pd.concat((C_X_train,X_train), axis = 1)

WADI_feature_score_concate_valid = pd.concat((C_X_train,X_train), axis = 1)

WADI_feature_score_concate_test = pd.concat((C_X_test,X_test), axis = 1)


train_dataset = TensorDataset(torch.FloatTensor(WADI_feature_score_concate.values), torch.FloatTensor(y_train.values))##############

valid_dataset = TensorDataset(torch.FloatTensor(WADI_feature_score_concate_valid.values), torch.FloatTensor(y_train.values))##############

test_dataset = TensorDataset(torch.FloatTensor(WADI_feature_score_concate_test.values), torch.FloatTensor(C_y_test.values))###################


train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


## Class of Meta-learner model (teacher model)

In [4]:
class NeuralNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, hidden_dim2, hidden_dim3, hidden_dim4, activation_fn_name):
        super(NeuralNet, self).__init__()
        
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        #self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.activation_fn1 = self._get_activation_fn(activation_fn_name)
        #self.dropout1 = nn.Dropout(dropout_rate)
        
        self.fc2 = nn.Linear(hidden_dim, hidden_dim2)
        #self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.activation_fn2 = self._get_activation_fn(activation_fn_name)
        #self.dropout2 = nn.Dropout(dropout_rate)
        
        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        #self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.activation_fn3 = self._get_activation_fn(activation_fn_name)
        #self.dropout3 = nn.Dropout(dropout_rate)
        
        self.fc4 = nn.Linear(hidden_dim3, hidden_dim4)
        #self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.activation_fn4 = self._get_activation_fn(activation_fn_name)
        #self.dropout4 = nn.Dropout(dropout_rate)
        
        self.fc5= nn.Linear(hidden_dim4, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        #x = self.bn1(x)
        x = self.activation_fn1(x)
        #x = self.dropout1(x)
        
        x = self.fc2(x)
        #x = self.bn2(x)
        x = self.activation_fn2(x)
        #x = self.dropout2(x)
        
        x = self.fc3(x)
        #x = self.bn3(x)
        x = self.activation_fn3(x)
        #x = self.dropout3(x)
        
        x = self.fc4(x)
        #x = self.bn3(x)
        x = self.activation_fn4(x)
        #x = self.dropout4(x)
        
        x = self.fc5(x)
        x = self.sigmoid(x)
        return x

    def _get_activation_fn(self, name):
        """Return an activation function given its name."""
        if name == "ReLU":
            return nn.ReLU()
        elif name == "LeakyReLU":
            return nn.LeakyReLU()
        elif name == "Tanh":
            return nn.Tanh()
        elif name == "Sigmoid":
            return nn.Sigmoid()
        else:
            raise ValueError(f"Unsupported activation function: {name}")

## Evaluation of Teacher model

In [5]:
teacher_load_student = joblib.load('/home/bedro/000_KD/WADI_teacher_student/best_optuna.pkl')

df_teacher = teacher_load_student.trials_dataframe().drop(['number','datetime_start','datetime_complete','duration','state'], axis=1)



trial_num = 4

best_params = teacher_load_student.trials[trial_num].params


teacher_model = NeuralNet(input_dim=WADI_feature_score_concate.shape[1], 
                  hidden_dim=best_params["hidden_dim"], 
                  hidden_dim2=best_params["hidden_dim2"],
                  hidden_dim3=best_params["hidden_dim3"],
                  hidden_dim4=best_params["hidden_dim4"],
                  activation_fn_name=best_params["activation_fn"])
             
                  
teacher_model.load_state_dict(torch.load(f'/home/bedro/000_KD/WADI_teacher_student/Teacher_model_trial_{trial_num}.pth'))
teacher_model.eval()


input_data = torch.randn(1, 133)
summary(teacher_model, input_size=input_data)


input_tensor_meta = torch.tensor(WADI_feature_score_concate.iloc[0].to_numpy(), dtype=torch.float32)

meta_flops, meta_params = profile(teacher_model, inputs=(input_tensor_meta,))

print(f"meta-learner FLOPs: {meta_flops}, meta-learner Parameters: {meta_params}")


y_pred_values_valid=[]
y_true_valid=[]

with torch.no_grad():
    for data, target in valid_loader:  
        output_valid = teacher_model(data).squeeze()
        y_pred_values_valid.extend(output_valid.tolist())
        y_true_valid.extend(target.tolist())

y_pred_values_test = []
y_true_test = []


with torch.no_grad():
    for data, target in test_loader:
        output = teacher_model(data).squeeze()
        #y_pred_values_test.extend(output.tolist())
        y_pred_values_test.extend(output.flatten().tolist())
        batch_true_labels_test = [int(label) for label in target.tolist()]
        y_true_test.extend(batch_true_labels_test)


thresholds = np.linspace(0, 1, 100)
best_threshold = 0
max_f1 = 0
for thd in thresholds:
    y_pred = [1 if y > thd else 0 for y in y_pred_values_test]
    f1 = f1_score(y_true_test, y_pred, zero_division=1)
    if f1 > max_f1:
        max_f1 = f1
        best_threshold = thd


valid_f1,valid_treshold,_,_=get_trad_f1(y_pred_values_valid, y_true_valid)

test_f1,test_treshold,precision,recall=get_test_f1(y_pred_values_test, y_true_test,valid_treshold)

print("Teacher model f1 score is %f and threshold is %f\n" %(test_f1, test_treshold))

Layer (type:depth-idx)                   Param #
├─Linear: 1-1                            9,514
├─LeakyReLU: 1-2                         --
├─Linear: 1-3                            10,944
├─LeakyReLU: 1-4                         --
├─Linear: 1-5                            26,316
├─LeakyReLU: 1-6                         --
├─Linear: 1-7                            29,237
├─LeakyReLU: 1-8                         --
├─Linear: 1-9                            170
├─Sigmoid: 1-10                          --
Total params: 76,181
Trainable params: 76,181
Non-trainable params: 0
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
[INFO] Register count_relu() for <class 'torch.nn.modules.activation.LeakyReLU'>.
meta-learner FLOPs: 75616.0, meta-learner Parameters: 76181.0
Teacher model f1 score is 0.695874 and threshold is 0.996000



## Definition of knowledge distillation

In [6]:
from tensorflow import keras
import torch.optim as optim


import torch.nn.functional as F


def knowledge_distillation_loss(y_true, student_y_pred, teacher_preds_value, alpha, temperature): #0.5 1.0
    # Ensure that the student predictions have the same shape as the true labels
    student_y_pred = torch.squeeze(student_y_pred)

    # Cross-entropy loss
    ce_loss = F.binary_cross_entropy_with_logits(student_y_pred, y_true)#F.binary_cross_entropy_with_logits(student_y_pred, y_true)

    # Soften predictions and calculate distillation loss
    teacher_soft = torch.sigmoid(teacher_preds_value / temperature)
    student_soft = torch.sigmoid(student_y_pred / temperature)
    kd_loss = F.mse_loss(student_soft, teacher_soft)#F.binary_cross_entropy_with_logits(student_soft, teacher_soft) ################################### MSE, CE

    # Combine losses
    combined_loss = (1 - alpha) * kd_loss + alpha * ce_loss
    return combined_loss


def train_on_batch(model, dataset_zip, optimizer, alpha, temp):
    total_loss = 0
    for (teacher_pred, X_train), true_label in dataset_zip:
        # Convert TensorFlow tensors to PyTorch tensors
        teacher_pred = torch.from_numpy(teacher_pred.numpy()).float()
        X_train = torch.from_numpy(X_train.numpy()).float()
        true_label = torch.from_numpy(true_label.numpy()).float()

        # Forward pass
        optimizer.zero_grad()  # Clear existing gradients
        student_y_pred = model(X_train)
        loss = knowledge_distillation_loss(true_label, student_y_pred, teacher_pred, alpha, temp)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    return total_loss / len(dataset_zip)

## Class of student model

In [7]:
class StudentModel(torch.nn.Module):
    def __init__(self):
        super(StudentModel, self).__init__()
        self.fc1 = torch.nn.Linear(123, 71)
        self.fc2 = torch.nn.Linear(71, 152)
        self.fc3 = torch.nn.Linear(152, 172)
        self.fc4 = torch.nn.Linear(172, 169)
        self.fc5 = torch.nn.Linear(169, 1)  

    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        x = torch.tanh(self.fc4(x))
        return torch.sigmoid(self.fc5(x)) 

## Evaluation of student model

In [8]:
def get_trad_f1_final(score, label):
    score = np.asarray(score)
    maxx = float(score.max())
    minn = float(score.min())
    
    label = np.asarray(label)
    actual = label > 0.1
    
    predict = score > valid_thresh
    max_f1, p, r, tp, tn, fp, fn = calc_p2p(predict, actual)
    
    
    max_f1_thres= valid_thresh       
    print("Student model f1 score is %f and threshold is %f\n" %(max_f1, valid_thresh))
    return max_f1, max_f1_thres, p, r

y_train_pred_values_pretrain_teacher = []
y_train_true = []

# Pre-trained teacher model, we put train dataset to get a predictive output
with torch.no_grad():
    for data, target in train_loader:
        output = teacher_model(data).squeeze()
        y_train_pred_values_pretrain_teacher.extend(output.tolist())
        batch_true_labels = [int(label) for label in target.tolist()]
        y_train_true.extend(batch_true_labels)

student_model = StudentModel()  # Create an instance of the model




optimizer = optim.Adam(student_model.parameters(),lr=0.0001354368553070506)  # Pass the model instance  0.00001

print("student parameter")

input_data = torch.randn(1, 123)
summary(teacher_model, input_size=input_data)

epochs = 20
batch_size = 64

alpha_value = 0
temperature_value = 10



dataset_12 = tf.data.Dataset.from_tensor_slices((y_train_pred_values_pretrain_teacher, C_X_train))
dataset_label = tf.data.Dataset.from_tensor_slices(C_y_train)
dataset_zip = tf.data.Dataset.zip((dataset_12, dataset_label)).batch(batch_size)



for epoch in tqdm(range(epochs), desc="Training", unit="epoch"):
    print(f'Epoch {epoch + 1}/{epochs}')
    loss = train_on_batch(student_model, dataset_zip, optimizer, alpha=alpha_value, temp=temperature_value)
    print(f'Loss: {loss}')



student_model.eval()


#####use validation set to decide threshold 



with torch.no_grad():  # Disable gradient calculation
    y_predicted_valid = student_model(torch.tensor(C_X_train.to_numpy(), dtype=torch.float32))
    #y_predicted = student_model(torch.tensor(SWaT_feature_score_concate_test.to_numpy(), dtype=torch.float32))



y_predicted_np_valid = int(y_predicted_valid.numpy()) if y_predicted_valid.requires_grad else y_predicted_valid.detach().numpy()


predict_valid = y_predicted_np_valid.reshape(-1)
actual_valid = C_y_train.to_numpy().reshape(-1)


unique_values_valid_predict, counts_valid_predict = np.unique(y_predicted_np_valid, return_counts=True)


unique_values_ground_valid_actual, counts_ground_valid_actual = np.unique(actual_valid, return_counts=True)


valid_f1,valid_thresh,valid_p,valid_c = get_trad_f1(predict_valid,actual_valid)


input_tensor = torch.tensor(C_X_test.iloc[[0]].to_numpy(), dtype=torch.float32)
flops, params = profile(student_model, inputs=(input_tensor,))

print(f"FLOPs: {flops}, Parameters: {params}")




#### use test dataset

with torch.no_grad():  # Disable gradient calculation
    y_predicted = student_model(torch.tensor(C_X_test.to_numpy(), dtype=torch.float32))
    #y_predicted = student_model(torch.tensor(SWaT_feature_score_concate_test.to_numpy(), dtype=torch.float32))


y_predicted_np = int(y_predicted.numpy()) if y_predicted.requires_grad else y_predicted.detach().numpy()


predict = y_predicted_np.reshape(-1)
actual = C_y_test.to_numpy().reshape(-1)


unique_values, counts = np.unique(y_predicted_np, return_counts=True)


unique_values_ground, counts_ground = np.unique(actual, return_counts=True)

print(get_trad_f1_final(predict,actual))

student parameter
Layer (type:depth-idx)                   Param #
├─Linear: 1-1                            9,514
├─LeakyReLU: 1-2                         --
├─Linear: 1-3                            10,944
├─LeakyReLU: 1-4                         --
├─Linear: 1-5                            26,316
├─LeakyReLU: 1-6                         --
├─Linear: 1-7                            29,237
├─LeakyReLU: 1-8                         --
├─Linear: 1-9                            170
├─Sigmoid: 1-10                          --
Total params: 76,181
Trainable params: 76,181
Non-trainable params: 0


2024-02-18 21:22:20.949329: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-18 21:22:23.134800: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30971 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:02:00.0, compute capability: 7.0
2024-02-18 21:22:23.135516: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 30668 MB memory:  -> device: 1, name: Tesla V100-PCIE-32GB, pci bus id: 0000:03:00.0, compute capability: 7.0
2024-02-18 21:22:23.136137: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device

Epoch 1/20


Training:   5%|██████▌                                                                                                                             | 1/20 [00:00<00:13,  1.44epoch/s]

Loss: 5.017042043307272e-05
Epoch 2/20


Training:  10%|█████████████▏                                                                                                                      | 2/20 [00:01<00:11,  1.51epoch/s]

Loss: 2.264330447298301e-05
Epoch 3/20


Training:  15%|███████████████████▊                                                                                                                | 3/20 [00:02<00:11,  1.46epoch/s]

Loss: 1.4841169993297222e-05
Epoch 4/20


Training:  20%|██████████████████████████▍                                                                                                         | 4/20 [00:02<00:11,  1.44epoch/s]

Loss: 1.2096368227109832e-05
Epoch 5/20


Training:  25%|█████████████████████████████████                                                                                                   | 5/20 [00:03<00:10,  1.49epoch/s]

Loss: 9.478677891242118e-06
Epoch 6/20


Training:  30%|███████████████████████████████████████▌                                                                                            | 6/20 [00:04<00:09,  1.42epoch/s]

Loss: 8.785603597861859e-06
Epoch 7/20


Training:  35%|██████████████████████████████████████████████▏                                                                                     | 7/20 [00:04<00:09,  1.40epoch/s]

Loss: 8.619445251586464e-06
Epoch 8/20


Training:  40%|████████████████████████████████████████████████████▊                                                                               | 8/20 [00:05<00:08,  1.40epoch/s]

Loss: 7.951898677880951e-06
Epoch 9/20


Training:  45%|███████████████████████████████████████████████████████████▍                                                                        | 9/20 [00:06<00:07,  1.43epoch/s]

Loss: 8.048395915784937e-06
Epoch 10/20


Training:  50%|█████████████████████████████████████████████████████████████████▌                                                                 | 10/20 [00:06<00:06,  1.50epoch/s]

Loss: 9.51092281636203e-06
Epoch 11/20


Training:  55%|████████████████████████████████████████████████████████████████████████                                                           | 11/20 [00:07<00:06,  1.29epoch/s]

Loss: 6.881900914322302e-06
Epoch 12/20


Training:  60%|██████████████████████████████████████████████████████████████████████████████▌                                                    | 12/20 [00:08<00:06,  1.23epoch/s]

Loss: 1.240067616105499e-05
Epoch 13/20


Training:  65%|█████████████████████████████████████████████████████████████████████████████████████▏                                             | 13/20 [00:09<00:05,  1.27epoch/s]

Loss: 9.850091499008506e-06
Epoch 14/20


Training:  70%|███████████████████████████████████████████████████████████████████████████████████████████▋                                       | 14/20 [00:10<00:05,  1.14epoch/s]

Loss: 1.237508027149384e-05
Epoch 15/20


Training:  75%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                | 15/20 [00:11<00:04,  1.13epoch/s]

Loss: 7.532258769474379e-06
Epoch 16/20


Training:  80%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 16/20 [00:12<00:03,  1.10epoch/s]

Loss: 6.705424064306158e-06
Epoch 17/20


Training:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 17/20 [00:13<00:02,  1.20epoch/s]

Loss: 7.103950348316576e-06
Epoch 18/20


Training:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 18/20 [00:13<00:01,  1.24epoch/s]

Loss: 1.212703138964772e-05
Epoch 19/20


Training:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 19/20 [00:14<00:00,  1.28epoch/s]

Loss: 1.0163939479678484e-05
Epoch 20/20


Training: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00,  1.31epoch/s]

Loss: 1.2136950410880694e-05
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.





FLOPs: 74906.0, Parameters: 75471.0
Student model f1 score is 0.689333 and threshold is 0.953983

(0.6893329132029887, 0.9539827458004146, 0.738196352643243, 0.6465375778747907)
