In [1]:
import pickle
import os

In [2]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import Parameter
from torch.nn.modules.module import Module
from tqdm import tqdm
from time import time 
from utils import save_model
from utils import DistilLog
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
folderpath = '..\\newdatasets'
file_names = [f"BGL_20l_train_{x}.pkl" for x in [0.1, 0.2, 0.4, 0.6]]

In [3]:
with open('../newdatasets/BGL_20l_train_0.1.pkl', 'rb') as file:
    raw_data = pickle.load(file)


In [4]:
full_data = raw_data[0] + raw_data[1]

In [5]:
len(full_data)

235675

In [6]:
num_classes = 2
batch_size = 50
learning_rate = 0.0003
hidden_size = 128
input_size = 300
sequence_length = 20
num_layers = 2


save_teacher_path = '../newdatasets/bgl_20_teacher.pth'
save_noKD_path = '../newdatasets/bgl_20_noKD.pth'

Teacher = DistilLog(input_size, hidden_size, num_layers, num_classes, is_bidirectional=False).to(device)


In [20]:
def train(model, train_loader, learning_rate):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0001)
    model.train()

    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    total_loss = 0
    for batch_idx, (data, target) in pbar:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output, _ = model(data)
        target = target.long()
        loss = criterion(output, target)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()


        if (batch_idx + 1) % 10 == 0:
            done = (batch_idx + 1) * len(data)
            percentage = 100. * batch_idx / len(train_loader)
            pbar.set_description(
                f'Train Epoch: [{done:5}/{len(train_loader.dataset)} ({percentage:3.0f}%)]  Loss: {total_loss:.6f}')

    return model

In [52]:
input_size = 300
def load_data(train_x, train_y, batch_size):
    #train_x = np.reshape(train_x, (train_x.shape[0], -1, input_size))
    train_y = train_y.astype(int)
    tensor_x = torch.Tensor(train_x)
    tensor_y = torch.from_numpy(train_y)
    train_dataset = TensorDataset(tensor_x, tensor_y)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, drop_last=True)
    return train_loader

def process_data_in_chunks(data, chunk_size=20000):
    for i in range(0, len(data), chunk_size):
        chunk = data[i:i + chunk_size]
        log_vec_chunk = [np.array(item['Embeddings']) for item in chunk]
        label = [np.array(item['Label']) for item in chunk]
        yield log_vec_chunk, label

for i in range (2):
    print("epoch: ", i+1)    
    for log, label in process_data_in_chunks(tmp_data):
        log = np.array(log)
        label = np.array(label)
        data_loader = load_data(log, label, batch_size)
        Teacher = train(Teacher, data_loader, learning_rate)


epoch:  1


0it [00:00, ?it/s]


epoch:  2


0it [00:00, ?it/s]


In [22]:
save_model(Teacher, save_teacher_path)

In [23]:
with open('../newdatasets/BGL_20l_train_0.8.pkl', 'rb') as file:
    raw_test_data = pickle.load(file)

test_data = raw_test_data[0] + raw_test_data[1]


In [30]:
def test(model, criterion = nn.CrossEntropyLoss()):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        TP = 0 
        FP = 0
        FN = 0 
        TN = 0

        for log, label in process_data_in_chunks(test_data):
            log = np.array(log)
            label = np.array(label)
            test_loader = load_data(log, label, batch_size)           
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output, _ = model(data)
                target = target.long()
                test_loss += criterion(output, target) # sum up batch loss
                
                output = torch.sigmoid(output)[:, 0].cpu().detach().numpy()
                predicted = (output < 0.2).astype(int)
                target = np.array([y.cpu() for y in target])

                TP += ((predicted == 1) * (target == 1)).sum()
                FP += ((predicted == 1) * (target == 0)).sum()
                FN += ((predicted == 0) * (target == 1)).sum()
                TN += ((predicted == 0) * (target == 0)).sum()
        P = 100 * TP / (TP + FP)
        R = 100 * TP / (TP + FN)
        F1 = 2 * P * R / (P + R)   
        accuracy = 100 * (TP + TN)/(TP + TN + FP + FN)
        #MCC = 100*(TP*TN + FP*FN)/math.sqrt((TP+FP)*(TN+FN)*(TN+FP)*(TP+FN))         
    return accuracy, test_loss, P, R, F1, TP, FP, TN, FN

In [31]:
accuracy, test_loss, P, R, F1, TP, FP, TN, FN = test(Teacher, criterion = nn.CrossEntropyLoss())

In [32]:
print('Result of testing teacher model')
print('false positive (FP): {}, false negative (FN): {}, true positive (TP): {}, true negative (TN): {}'.format(FP, FN, TP, TN))
print(f'Test set: Average loss: {test_loss:.4f}, Accuracy: {accuracy:.2f}%).')
print('Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%' .format(P, R, F1))

Result of testing teacher model
false positive (FP): 0, false negative (FN): 233, true positive (TP): 101, true negative (TN): 1666
Test set: Average loss: 40.8008, Accuracy: 88.35%).
Precision: 100.000%, Recall: 30.240%, F1-measure: 46.437%


In [72]:
import torch.nn.functional as F
def train_step(
    Teacher,
    Student,
    data_loader,
    optimizer,
    student_loss_fn,
    divergence_loss_fn,
    temp,
    alpha,
    epoch,
    device
):
    losses = []
    for data, targets in data_loader:
        # Get data to cuda if possible
        data = data.to(device)
        targets = targets.to(device)

        # forward
        with torch.no_grad():
            teacher_preds, _ = Teacher(data)

        student_preds, __ = Student(data)
        targets = targets.long()
        student_loss = student_loss_fn(student_preds, targets)
        
        ditillation_loss = divergence_loss_fn(
            F.softmax(student_preds / temp, dim=1),
            F.softmax(teacher_preds / temp, dim=1)
        )
        loss = alpha * student_loss + (1 - alpha) * ditillation_loss
        losses.append(loss.item())

        # backward
        optimizer.zero_grad()
        loss.backward()

        optimizer.step()
    
    avg_loss = sum(losses) / len(losses)
    return avg_loss

In [73]:
def teach(epochs, Teacher, Student, data_loader, temp, alpha):
    Teacher = Teacher.to(device)
    Student = Student.to(device)
    student_loss_fn = nn.CrossEntropyLoss()
    divergence_loss_fn = nn.KLDivLoss(reduction="batchmean")
    optimizer = torch.optim.Adam(Student.parameters(), lr=0.01)

    Teacher.eval()
    Student.train()
    for epoch in range(epochs):
        print("Epochs: ", epoch+1)
        loss = train_step(
            Teacher,
            Student,
            data_loader,
            optimizer,
            student_loss_fn,
            divergence_loss_fn,
            temp,
            alpha,
            epoch,
            device
        )

        print(f"Loss:{loss:.2f}")

In [55]:
Student = DistilLog(input_size = input_size, hidden_size=4, num_layers = 1, num_classes = num_classes, is_bidirectional=False).to(device)



In [56]:
tmp_data = full_data[:20000]

In [74]:
for i in range (2):
    print("epoch: ", i+1)    
    for log, label in process_data_in_chunks(tmp_data):
        log = np.array(log)
        label = np.array(label)
        data_loader = load_data(log, label, batch_size)
        teach(epochs=1, Teacher=Teacher, Student=Student, data_loader=data_loader, temp=7, alpha=0.3)

#save_model(Student, save_student_path)

epoch:  1
Epochs:  1
Loss:-0.77
epoch:  2
Epochs:  1
Loss:-0.80


In [75]:
accuracy, test_loss, P, R, F1, TP, FP, TN, FN = test(Student, criterion = nn.CrossEntropyLoss())

In [76]:
print('Result of testing student model')
print('false positive (FP): {}, false negative (FN): {}, true positive (TP): {}, true negative (TN): {}'.format(FP, FN, TP, TN))
print(f'Test set: Average loss: {test_loss:.4f}, Accuracy: {accuracy:.2f}%).')
print('Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%' .format(P, R, F1))

Result of testing student model
false positive (FP): 8, false negative (FN): 454, true positive (TP): 8594, true negative (TN): 10944
Test set: Average loss: 52.2439, Accuracy: 97.69%).
Precision: 99.907%, Recall: 94.982%, F1-measure: 97.382%
