## Installation list

In [1]:
!pip list

Package                      Version
---------------------------- --------------------
absl-py                      1.4.0
alembic                      1.13.1
anyio                        4.2.0
argon2-cffi                  23.1.0
argon2-cffi-bindings         21.2.0
arrow                        1.3.0
asttokens                    2.4.1
astunparse                   1.6.3
async-lru                    2.0.4
attrs                        23.2.0
Babel                        2.14.0
backcall                     0.2.0
beautifulsoup4               4.12.3
bleach                       6.1.0
Brotli                       1.0.9
cachetools                   5.3.1
certifi                      2023.11.17
cffi                         1.15.0
chardet                      5.2.0
charset-normalizer           2.0.4
cmaes                        0.10.0
colorlog                     6.8.0
comm                         0.2.1
contourpy                    1.1.0
cryptography                 41.0.3
cycler                  

## Import package and definition

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

import tensorflow as tf
from tqdm import tqdm
import numpy as np
import os
import pandas as pd
import random
import time
from sklearn.model_selection import train_test_split

import joblib
import pickle
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

import optuna
import torch.nn as nn

from thop import profile

from torchsummary import summary

torch.manual_seed(42)


tf.config.experimental_run_functions_eagerly(True)

def find_best_f1(pred, label, min_thd, max_thd, n_bins):
    f1_scores = []
    term = (max_thd - min_thd)/(n_bins-1)
    if isinstance(pred, torch.Tensor):
        pred = pred.cpu().numpy()
    for i in range(n_bins):
        pred_labels = put_labels(pred, min_thd + i*term)
        f1_scores.append(f1_score(label, pred_labels))
    
    max_id = f1_scores.index(max(f1_scores))

    if f1_scores[max(max_id-1, 0)] == f1_scores[max_id] == f1_scores[min(max_id+1, n_bins-1)]:
        return min_thd + max_id*term, f1_scores[max_id]
    else:
        return find_best_f1(pred, label, max(min_thd + max_id*term - term/2, min_thd), min(min_thd + max_id*term + term/2, max_thd), n_bins)

def put_labels(distance, threshold):
    distance = np.array(distance)
    threshold = np.array(threshold)  
    xs = np.zeros_like(distance)
    xs[distance > threshold] = 1
    return xs

def calc_p2p(predict, actual):
    tp = np.sum(predict * actual)
    tn = np.sum((1-predict) * (1-actual))
    fp = np.sum(predict * (1-actual))
    fn = np.sum((1-predict) * actual)
    
    precision = tp / (tp + fp + 0.000001)
    recall = tp / (tp + fn + 0.000001)
    f1 = 2 * precision * recall / (precision + recall + 0.000001)
    return f1, precision, recall, tp, tn, fp, fn

def get_trad_f1(score, label):
    score = np.asarray(score)
    maxx = float(score.max())
    minn = float(score.min())
    
    label = np.asarray(label)
    actual = label > 0.1
    
    grain = 1000
    max_f1 = 0.0
    max_f1_thres = 0.0
    p = 0
    r = 0
    for i in range(grain):
        thres = (maxx-minn)/grain * i + minn
        predict = score > thres
        f1, precision, recall, tp, tn, fp, fn = calc_p2p(predict, actual)
        if f1 > max_f1:
            max_f1 = f1
            max_f1_thres = thres
            p = precision
            r = recall
            
    return max_f1, max_f1_thres, p, r

def get_test_f1(score, label,thres):
    score = np.asarray(score)
    maxx = float(score.max())
    minn = float(score.min())
    
    label = np.asarray(label)
    actual = label > 0.1
    
    grain = 1000
    max_f1 = 0.0
    max_f1_thres = 0.0
    p = 0
    r = 0
       
    predict = score > thres
    f1, precision, recall, tp, tn, fp, fn = calc_p2p(predict, actual)
    max_f1 = f1
    max_f1_thres = thres
    p = precision
    r = recall
            
    
    return max_f1, max_f1_thres, p, r


    
def get_best_f1(score, label):
    score = np.asarray(score)
    maxx = float(score.max())
    minn = float(score.min())
    
    grain = 10
    max_f1 = 0.0
    max_f1_thres = 0.0
    p = 0
    r = 0
    for i in range(grain):
        thres = (maxx-minn)/grain * i + minn
        # thres = i / grain
        predict, actual = point_adjust(score, label, thres=thres)
        f1, precision, recall, tp, tn, fp, fn = calc_p2p(predict, actual)
        if f1 > max_f1:
            max_f1 = f1
            max_f1_thres = thres
            p = precision
            r = recall
            
    return max_f1, max_f1_thres, p, r


2024-02-18 21:27:10.912491: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-18 21:27:11.827817: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /data/kiho/cuda-11.0/extras/CUPTI/lib64:/data/kiho/cuda-11.0/lib64:/data/kiho/cuda-11.0/extras/CUPTI/lib64:/data/kiho/cuda-11.0/lib64::/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0/extras/CUPTI/lib64:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0/extras/CUPTI/lib64:/usr/local/cuda-11.7/lib64
2024-02-18 21:27:11.827931: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


## Dataset load & split

In [3]:
#=========================================================== Data load======================================================================================


Training_SWaT_RAW = pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT/SWaT_Dataset_Normal_v1.csv")

TEST_SWaT_RAW = pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT/SWaT_Dataset_Attack_v0.csv") 


C_TEST_SWaT_RAW=TEST_SWaT_RAW.drop([' Timestamp','label'], axis = 1)



MTS_cad_SWaT_1 = pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT_prediction_value/1_MTS_CAD_prediction_score.csv")
MTAD_gat_2 = pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT_prediction_value/2_mtad_gat_prediction_score.csv")
GANF_3 = pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT_prediction_value/3_ganf_prediction_score.csv")
ANOMALY_transformer_4 = pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT_prediction_value/4_anomaly_transformer_prediction_score.csv")
RANSynCoder_5 = pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT_prediction_value/5_RANS_prediction_score.csv")
Autoencoder_6 = pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT_prediction_value/6_autoencoder_prediction_score.csv")
USAD_7 = pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT_prediction_value/7_usad_prediction_score.csv")
GDN_8 = pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT_prediction_value/8_gdn_prediction_score.csv")
LSTM_9 = pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT_prediction_value/9_lstm_prediction_score.csv")
MSCRED_10 =pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT_prediction_value/10_mscred_prediction_score.csv")


list_SWaT_model=[MTS_cad_SWaT_1['score'],MTAD_gat_2['score'],GANF_3['score'],ANOMALY_transformer_4['score'],RANSynCoder_5['score'],Autoencoder_6['score'],USAD_7['score'],GDN_8['score'], LSTM_9['score'],MSCRED_10['score']] ###########

SWaT_anomaly_score_concate = pd.concat((list_SWaT_model[0], list_SWaT_model[1], list_SWaT_model[2], list_SWaT_model[3], list_SWaT_model[4], list_SWaT_model[5], list_SWaT_model[6], list_SWaT_model[7], list_SWaT_model[8], list_SWaT_model[9]), axis = 1)



SWaT_label=TEST_SWaT_RAW['label']



#=========================================================== Data split======================================================================================


X_train, X_test, y_train, y_test = train_test_split(SWaT_anomaly_score_concate, SWaT_label, test_size=0.95,  random_state=1234)

C_X_train, C_X_test, C_y_train, C_y_test = train_test_split(C_TEST_SWaT_RAW, SWaT_label, test_size = 0.95, random_state=1234)


SWaT_feature_score_concate = pd.concat((C_X_train,X_train), axis = 1)

SWaT_feature_score_concate_valid = pd.concat((C_X_train,X_train), axis = 1)

SWaT_feature_score_concate_test = pd.concat((C_X_test,X_test), axis = 1)


train_dataset = TensorDataset(torch.FloatTensor(SWaT_feature_score_concate.values), torch.FloatTensor(y_train.values))

valid_dataset = TensorDataset(torch.FloatTensor(SWaT_feature_score_concate_valid.values), torch.FloatTensor(y_train.values))

test_dataset = TensorDataset(torch.FloatTensor(SWaT_feature_score_concate_test.values), torch.FloatTensor(C_y_test.values))


train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

  Training_SWaT_RAW = pd.read_csv("/home/bedro/000_KD/2024_dataset/SWaT/SWaT_Dataset_Normal_v1.csv")


## Class of Meta-learner model (teacher model)

In [4]:
class NeuralNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, hidden_dim2, hidden_dim3, hidden_dim4, activation_fn_name):
        super(NeuralNet, self).__init__()
        
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        #self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.activation_fn1 = self._get_activation_fn(activation_fn_name)
        #self.dropout1 = nn.Dropout(dropout_rate)
        
        self.fc2 = nn.Linear(hidden_dim, hidden_dim2)
        #self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.activation_fn2 = self._get_activation_fn(activation_fn_name)
        #self.dropout2 = nn.Dropout(dropout_rate)
        
        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        #self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.activation_fn3 = self._get_activation_fn(activation_fn_name)
        #self.dropout3 = nn.Dropout(dropout_rate)
        
        self.fc4 = nn.Linear(hidden_dim3, hidden_dim4)
        #self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.activation_fn4 = self._get_activation_fn(activation_fn_name)
        #self.dropout4 = nn.Dropout(dropout_rate)
        
        self.fc5= nn.Linear(hidden_dim4, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        #x = self.bn1(x)
        x = self.activation_fn1(x)
        #x = self.dropout1(x)
        
        x = self.fc2(x)
        #x = self.bn2(x)
        x = self.activation_fn2(x)
        #x = self.dropout2(x)
        
        x = self.fc3(x)
        #x = self.bn3(x)
        x = self.activation_fn3(x)
        #x = self.dropout3(x)
        
        x = self.fc4(x)
        #x = self.bn3(x)
        x = self.activation_fn4(x)
        #x = self.dropout4(x)
        
        x = self.fc5(x)
        x = self.sigmoid(x)
        return x

    def _get_activation_fn(self, name):
        """Return an activation function given its name."""
        if name == "ReLU":
            return nn.ReLU()
        elif name == "LeakyReLU":
            return nn.LeakyReLU()
        elif name == "Tanh":
            return nn.Tanh()
        elif name == "Sigmoid":
            return nn.Sigmoid()
        else:
            raise ValueError(f"Unsupported activation function: {name}")

## Evaluation of Teacher model

In [5]:
teacher_load_student = joblib.load('/home/bedro/000_KD/teacher_student/best_optuna.pkl')

df_teacher = teacher_load_student.trials_dataframe().drop(['number','datetime_start','datetime_complete','duration','state'], axis=1)

trial_num = 1

best_params = teacher_load_student.trials[trial_num].params

print("best_params: ", best_params)


teacher_model = NeuralNet(input_dim=SWaT_feature_score_concate.shape[1], 
                  hidden_dim=best_params["hidden_dim"], 
                  hidden_dim2=best_params["hidden_dim2"],
                  hidden_dim3=best_params["hidden_dim3"],
                  hidden_dim4=best_params["hidden_dim4"],
                  activation_fn_name=best_params["activation_fn"])
    
print("teacher_model.summary(): ",teacher_model)              
                  
teacher_model.load_state_dict(torch.load(f'/home/bedro/000_KD/teacher_student/Teacher_model_trial_{trial_num}.pth'))
teacher_model.eval()


input_data = torch.randn(1, 61)
summary(teacher_model, input_size=input_data)




input_tensor_meta = torch.tensor(SWaT_feature_score_concate.iloc[0].to_numpy(), dtype=torch.float32)

meta_flops, meta_params = profile(teacher_model, inputs=(input_tensor_meta,))

print(f"meta-learner FLOPs: {meta_flops}, meta-learner Parameters: {meta_params}")


y_pred_values_valid=[]
y_true_valid=[]

with torch.no_grad():
    for data, target in valid_loader:  #test_loader to valid loader
        output_valid = teacher_model(data).squeeze()
        y_pred_values_valid.extend(output_valid.tolist())
        y_true_valid.extend(target.tolist())




y_pred_values_test = []
y_true_test = []


with torch.no_grad():
    for data, target in test_loader:
        output = teacher_model(data).squeeze()
        y_pred_values_test.extend(output.tolist())
        batch_true_labels_test = [int(label) for label in target.tolist()]
        y_true_test.extend(batch_true_labels_test)

# Use the threshold that gave the best F1 score during training
thresholds = np.linspace(0, 1, 100)
best_threshold = 0
max_f1 = 0
for thd in thresholds:
    y_pred = [1 if y > thd else 0 for y in y_pred_values_test]
    f1 = f1_score(y_true_test, y_pred, zero_division=1)
    if f1 > max_f1:
        max_f1 = f1
        best_threshold = thd


valid_f1,valid_treshold,_,_=get_trad_f1(y_pred_values_valid, y_true_valid)

test_f1,test_treshold,p,r=get_test_f1(y_pred_values_test, y_true_test,valid_treshold)

print("Teacher model f1 score is %f and threshold is %f\n" %(test_f1, test_treshold))

best_params:  {'hidden_dim': 149, 'hidden_dim2': 154, 'hidden_dim3': 141, 'hidden_dim4': 77, 'lr': 3.1884778106480725e-05, 'activation_fn': 'Tanh'}
teacher_model.summary():  NeuralNet(
  (fc1): Linear(in_features=61, out_features=149, bias=True)
  (activation_fn1): Tanh()
  (fc2): Linear(in_features=149, out_features=154, bias=True)
  (activation_fn2): Tanh()
  (fc3): Linear(in_features=154, out_features=141, bias=True)
  (activation_fn3): Tanh()
  (fc4): Linear(in_features=141, out_features=77, bias=True)
  (activation_fn4): Tanh()
  (fc5): Linear(in_features=77, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)
Layer (type:depth-idx)                   Param #
├─Linear: 1-1                            9,238
├─Tanh: 1-2                              --
├─Linear: 1-3                            23,100
├─Tanh: 1-4                              --
├─Linear: 1-5                            21,855
├─Tanh: 1-6                              --
├─Linear: 1-7                            10,934
├─Tan

## Definition of knowledge distillation

In [6]:
from tensorflow import keras
import torch.optim as optim

import torch.nn.functional as F


def knowledge_distillation_loss(y_true, student_y_pred, teacher_preds_value, alpha, temperature): #0.5 1.0
    # Ensure that the student predictions have the same shape as the true labels
    student_y_pred = torch.squeeze(student_y_pred)

    # Cross-entropy loss
    ce_loss = F.binary_cross_entropy_with_logits(student_y_pred, y_true)#F.binary_cross_entropy_with_logits(student_y_pred, y_true)

    # Soften predictions and calculate distillation loss
    teacher_soft = torch.sigmoid(teacher_preds_value / temperature)
    student_soft = torch.sigmoid(student_y_pred / temperature)
    kd_loss = F.mse_loss(student_soft, teacher_soft)#F.mse_loss(student_soft, teacher_soft)#F.binary_cross_entropy_with_logits(student_soft, teacher_soft)

    # Combine losses
    combined_loss = (1 - alpha) * kd_loss + alpha * ce_loss
    return combined_loss


def train_on_batch(model, dataset_zip, optimizer, alpha, temp):
    total_loss = 0
    for (teacher_pred, X_train), true_label in dataset_zip:
        # Convert TensorFlow tensors to PyTorch tensors
        teacher_pred = torch.from_numpy(teacher_pred.numpy()).float()
        X_train = torch.from_numpy(X_train.numpy()).float()
        true_label = torch.from_numpy(true_label.numpy()).float()

        # Forward pass
        optimizer.zero_grad()  # Clear existing gradients
        student_y_pred = model(X_train)
        loss = knowledge_distillation_loss(true_label, student_y_pred, teacher_pred, alpha, temp)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    return total_loss / len(dataset_zip)

## Class of student model

In [7]:
class StudentModel(torch.nn.Module):
    def __init__(self):
        super(StudentModel, self).__init__()
        self.fc1 = torch.nn.Linear(51, 149)
        self.fc2 = torch.nn.Linear(149, 154)
        self.fc3 = torch.nn.Linear(154, 141)
        self.fc4 = torch.nn.Linear(141, 77)
        self.fc5 = torch.nn.Linear(77, 1)  

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.relu(self.fc4(x))
        return torch.sigmoid(self.fc5(x))  


## Evaluation of student model

In [8]:
def get_trad_f1_final(score, label):
    score = np.asarray(score)
    maxx = float(score.max())
    minn = float(score.min())
    
   
    label = np.asarray(label)
    actual = label > 0.1
    
    predict = score > valid_thresh
    max_f1, p, r, tp, tn, fp, fn = calc_p2p(predict, actual)
    
    
    max_f1_thres= valid_thresh       
    print("Student model f1 score is %f and threshold is %f\n" %(max_f1, valid_thresh))
    return max_f1, max_f1_thres, p, r


y_train_pred_values_pretrain_teacher = []
y_train_true = []

# Pre-trained teacher model, we put train dataset to get a predictive output
with torch.no_grad():
    for data, target in train_loader:
        output = teacher_model(data).squeeze()
        y_train_pred_values_pretrain_teacher.extend(output.tolist())
        batch_true_labels = [int(label) for label in target.tolist()]
        y_train_true.extend(batch_true_labels)

student_model = StudentModel()  # Create an instance of the model



optimizer = optim.Adam(student_model.parameters(),lr=3.1884778106480725e-05)  # Pass the model instance  0.00001

print("student parameter")

input_data = torch.randn(1, 51)
summary(teacher_model, input_size=input_data)

epochs = 30
batch_size = 64

alpha_value = 0
temperature_value = 1

dataset_12 = tf.data.Dataset.from_tensor_slices((y_train_pred_values_pretrain_teacher, C_X_train))
dataset_label = tf.data.Dataset.from_tensor_slices(C_y_train)
dataset_zip = tf.data.Dataset.zip((dataset_12, dataset_label)).batch(batch_size)




for epoch in tqdm(range(epochs), desc="Training", unit="epoch"):
    print(f'Epoch {epoch + 1}/{epochs}')
    loss = train_on_batch(student_model, dataset_zip, optimizer, alpha=alpha_value, temp=temperature_value)
    print(f'Loss: {loss}')


student_model.eval()

#####use validation set to decide threshold 


with torch.no_grad():  # Disable gradient calculation
    y_predicted_valid = student_model(torch.tensor(C_X_train.to_numpy(), dtype=torch.float32))
    #y_predicted = student_model(torch.tensor(SWaT_feature_score_concate_test.to_numpy(), dtype=torch.float32))

y_predicted_np_valid = int(y_predicted_valid.numpy()) if y_predicted_valid.requires_grad else y_predicted_valid.detach().numpy()


predict_valid = y_predicted_np_valid.reshape(-1)
actual_valid = C_y_train.to_numpy().reshape(-1)


unique_values_valid_predict, counts_valid_predict = np.unique(y_predicted_np_valid, return_counts=True)


unique_values_ground_valid_actual, counts_ground_valid_actual = np.unique(actual_valid, return_counts=True)


valid_f1,valid_thresh,valid_p,valid_c = get_trad_f1(predict_valid,actual_valid)


input_tensor = torch.tensor(C_X_test.iloc[[0]].to_numpy(), dtype=torch.float32)
flops, params = profile(student_model, inputs=(input_tensor,))

print(f"FLOPs: {flops}, Parameters: {params}")




#### use test dataset

with torch.no_grad():  # Disable gradient calculation
    y_predicted = student_model(torch.tensor(C_X_test.to_numpy(), dtype=torch.float32))
    #y_predicted = student_model(torch.tensor(SWaT_feature_score_concate_test.to_numpy(), dtype=torch.float32))



y_predicted_np = int(y_predicted.numpy()) if y_predicted.requires_grad else y_predicted.detach().numpy()


predict = y_predicted_np.reshape(-1)
actual = C_y_test.to_numpy().reshape(-1)


unique_values, counts = np.unique(y_predicted_np, return_counts=True)

unique_values_ground, counts_ground = np.unique(actual, return_counts=True)



print(get_trad_f1_final(predict,actual))

student parameter
Layer (type:depth-idx)                   Param #
├─Linear: 1-1                            9,238
├─Tanh: 1-2                              --
├─Linear: 1-3                            23,100
├─Tanh: 1-4                              --
├─Linear: 1-5                            21,855
├─Tanh: 1-6                              --
├─Linear: 1-7                            10,934
├─Tanh: 1-8                              --
├─Linear: 1-9                            78
├─Sigmoid: 1-10                          --
Total params: 65,205
Trainable params: 65,205
Non-trainable params: 0


2024-02-18 21:28:07.766718: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-18 21:28:10.037890: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 339 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:02:00.0, compute capability: 7.0
2024-02-18 21:28:10.038630: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 30183 MB memory:  -> device: 1, name: Tesla V100-PCIE-32GB, pci bus id: 0000:03:00.0, compute capability: 7.0
2024-02-18 21:28:10.039207: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:G

Epoch 1/30


Training:   3%|████▍                                                                                                                               | 1/30 [00:01<00:35,  1.21s/epoch]

Loss: 0.00305538923434638
Epoch 2/30


Training:   7%|████████▊                                                                                                                           | 2/30 [00:02<00:33,  1.21s/epoch]

Loss: 0.0016736587832714247
Epoch 3/30


Training:  10%|█████████████▏                                                                                                                      | 3/30 [00:03<00:32,  1.22s/epoch]

Loss: 0.001551306899459219
Epoch 4/30


Training:  13%|█████████████████▌                                                                                                                  | 4/30 [00:04<00:32,  1.24s/epoch]

Loss: 0.0014302696898870306
Epoch 5/30


Training:  17%|██████████████████████                                                                                                              | 5/30 [00:05<00:29,  1.18s/epoch]

Loss: 0.001341354801266706
Epoch 6/30


Training:  20%|██████████████████████████▍                                                                                                         | 6/30 [00:07<00:27,  1.14s/epoch]

Loss: 0.0012599068149008665
Epoch 7/30


Training:  23%|██████████████████████████████▊                                                                                                     | 7/30 [00:08<00:26,  1.15s/epoch]

Loss: 0.0011832667897599756
Epoch 8/30


Training:  27%|███████████████████████████████████▏                                                                                                | 8/30 [00:09<00:24,  1.10s/epoch]

Loss: 0.0011038043559210564
Epoch 9/30


Training:  30%|███████████████████████████████████████▌                                                                                            | 9/30 [00:10<00:23,  1.13s/epoch]

Loss: 0.0010291710280357206
Epoch 10/30


Training:  33%|███████████████████████████████████████████▋                                                                                       | 10/30 [00:11<00:24,  1.23s/epoch]

Loss: 0.0009660114727952615
Epoch 11/30


Training:  37%|████████████████████████████████████████████████                                                                                   | 11/30 [00:12<00:22,  1.16s/epoch]

Loss: 0.0009146514510121051
Epoch 12/30


Training:  40%|████████████████████████████████████████████████████▍                                                                              | 12/30 [00:13<00:20,  1.13s/epoch]

Loss: 0.000873642941650649
Epoch 13/30


Training:  43%|████████████████████████████████████████████████████████▊                                                                          | 13/30 [00:15<00:19,  1.13s/epoch]

Loss: 0.0008397652921830327
Epoch 14/30


Training:  47%|█████████████████████████████████████████████████████████████▏                                                                     | 14/30 [00:16<00:18,  1.17s/epoch]

Loss: 0.0008114779105441853
Epoch 15/30


Training:  50%|█████████████████████████████████████████████████████████████████▌                                                                 | 15/30 [00:17<00:17,  1.17s/epoch]

Loss: 0.0007849864286736296
Epoch 16/30


Training:  53%|█████████████████████████████████████████████████████████████████████▊                                                             | 16/30 [00:18<00:16,  1.16s/epoch]

Loss: 0.0007633460978139924
Epoch 17/30


Training:  57%|██████████████████████████████████████████████████████████████████████████▏                                                        | 17/30 [00:19<00:14,  1.13s/epoch]

Loss: 0.0007437553517503788
Epoch 18/30


Training:  60%|██████████████████████████████████████████████████████████████████████████████▌                                                    | 18/30 [00:20<00:13,  1.15s/epoch]

Loss: 0.0007250980304162754
Epoch 19/30


Training:  63%|██████████████████████████████████████████████████████████████████████████████████▉                                                | 19/30 [00:21<00:12,  1.14s/epoch]

Loss: 0.0007082156220070084
Epoch 20/30


Training:  67%|███████████████████████████████████████████████████████████████████████████████████████▎                                           | 20/30 [00:23<00:11,  1.19s/epoch]

Loss: 0.0006754811593261613
Epoch 21/30


Training:  70%|███████████████████████████████████████████████████████████████████████████████████████████▋                                       | 21/30 [00:24<00:10,  1.20s/epoch]

Loss: 0.0006529406702627098
Epoch 22/30


Training:  73%|████████████████████████████████████████████████████████████████████████████████████████████████                                   | 22/30 [00:25<00:09,  1.21s/epoch]

Loss: 0.000640034887149971
Epoch 23/30


Training:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                              | 23/30 [00:27<00:08,  1.28s/epoch]

Loss: 0.0006306164975378372
Epoch 24/30


Training:  80%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 24/30 [00:28<00:07,  1.32s/epoch]

Loss: 0.0006237624324825219
Epoch 25/30


Training:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 25/30 [00:29<00:06,  1.29s/epoch]

Loss: 0.000616787868757094
Epoch 26/30


Training:  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 26/30 [00:31<00:05,  1.31s/epoch]

Loss: 0.0006111081491711316
Epoch 27/30


Training:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 27/30 [00:32<00:03,  1.28s/epoch]

Loss: 0.0006051829861775104
Epoch 28/30


Training:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 28/30 [00:33<00:02,  1.25s/epoch]

Loss: 0.0006002363364762624
Epoch 29/30


Training:  97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 29/30 [00:34<00:01,  1.23s/epoch]

Loss: 0.0005964301242322924
Epoch 30/30


Training: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:35<00:00,  1.19s/epoch]

Loss: 0.0005910536501115938





[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
FLOPs: 63193.0, Parameters: 63715.0
Student model f1 score is 0.863134 and threshold is 0.136007

(0.8631335230222661, 0.13600705543425284, 0.9276897506719626, 0.8069783132374558)
