In [4]:
import pandas as pd
import numpy as np
import category_encoders as ce
import torch
from sklearn.preprocessing import MinMaxScaler
from denoising_diffusion_pytorch import Unet1D, GaussianDiffusion1D, Trainer1D, Dataset1D
import threading
import json

In [8]:
#%%




# Shared inference state. Every slot starts as None; load_model() assigns
# them, and infer() checks `diffusion` to decide whether loading is needed.
diffusion = trainer = scaler = encoder = data_cols = None
# NOTE(review): not referenced anywhere else in the visible notebook.
res_cached = list()

# Held by infer() for the whole sampling/decoding pass.
infer_mutex = threading.Lock()


def load_model(input, weights):
    """Fit the preprocessing objects on a CSV and restore a trained diffusion model.

    Reads ``input`` with pandas, drops the ``id`` column, one-hot encodes the
    ``state``, ``proto``, ``attack_cat`` and ``service`` columns, min-max
    scales the encoded frame and wraps it in a ``Dataset1D``. A
    ``Unet1D`` / ``GaussianDiffusion1D`` / ``Trainer1D`` stack is then built
    and ``Trainer1D.load(weights)`` is called on it.

    The module-level ``trainer``, ``diffusion``, ``scaler``, ``encoder`` and
    ``data_cols`` are assigned only after the weights have been loaded. If any
    step raises, they keep their previous values, so ``infer`` (which tests
    ``diffusion is None``) retries loading instead of sampling from a model
    whose weights were never restored.

    Args:
        input: Location of the CSV, in any form ``pd.read_csv`` accepts. The
            file must contain an ``id`` column and the four categorical
            columns listed above.
        weights: Passed unchanged to ``Trainer1D.load``.

    Returns:
        None. The results are published through the module-level globals.
    """
    global trainer
    global diffusion
    global scaler
    global encoder
    global data_cols

    frame = pd.read_csv(input).drop(columns = 'id')

    # One-hot encode the categorical columns; the fitted encoder is kept so
    # generated samples can be decoded back to category labels later.
    new_encoder = ce.OneHotEncoder(
        cols=['state','proto','attack_cat','service'],
        handle_unknown='return_nan',
        return_df=True,
        use_cat_names=True,
    )
    frame_encoded = new_encoder.fit_transform(frame)

    new_scaler = MinMaxScaler()
    normalised = new_scaler.fit_transform(frame_encoded.values)

    # Shape (rows, 1, features): a channel axis is inserted at position 1.
    samples = torch.unsqueeze(torch.Tensor(normalised), 1)
    n_rows, n_channels = samples.shape[0], samples.shape[1]
    # Append 3 zero positions along the last axis. Presumably this brings a
    # 205-column encoded frame up to seq_length = 208 below; TODO confirm
    # for other input files.
    samples = torch.concat([samples, torch.zeros(n_rows, n_channels, 3)], dim=2)

    model = Unet1D(
        dim = 16,
        dim_mults = (1, 2, 4, 8),
        channels = 1
        )

    new_diffusion = GaussianDiffusion1D(
        model,
        seq_length = 208,
        timesteps = 1000,
        objective = 'pred_v'
        )

    # The training hyper-parameters are required to construct the trainer;
    # this function only calls load() on it.
    new_trainer = Trainer1D(
        new_diffusion,
        dataset = Dataset1D(samples),
        train_batch_size = 1,
        train_lr = 8e-5,
        train_num_steps = 10000,         # total training steps
        gradient_accumulate_every = 2,    # gradient accumulation steps
        ema_decay = 0.995,                # exponential moving average decay
        amp = torch.cuda.is_available(),  # mixed precision only when CUDA is available
        )
    new_trainer.load(weights)

    # Publish only now that everything above succeeded.
    trainer = new_trainer
    diffusion = new_diffusion
    scaler = new_scaler
    encoder = new_encoder
    data_cols = frame_encoded.columns


def infer(size=100):
    """Sample ``size`` synthetic records from the diffusion model and decode them.

    The whole call runs while holding ``infer_mutex``. If the module-level
    ``diffusion`` is still ``None``, ``load_model`` is first called with the
    default CSV ``'UNSW_NB15_testing-set.csv'`` and weights
    ``'network-traffic'``.

    Sampled sequences have their last 3 positions dropped, are mapped back
    through ``scaler.inverse_transform``, have each one-hot column group
    snapped to a single 1 at its per-row maximum, and are decoded with
    ``encoder.inverse_transform``.

    Args:
        size: Batch size passed to ``diffusion.sample``.

    Returns:
        The DataFrame produced by ``encoder.inverse_transform``, with the
        columns in ``columns_to_convert`` rounded and cast to ``int``.
    """
    # Acquire mutex before running inference
    with infer_mutex:
        if diffusion is None:
            load_model('UNSW_NB15_testing-set.csv', 'network-traffic')
        sampled_seq = diffusion.sample(batch_size = size)
        # Remove the channel axis and the 3 trailing padding positions.
        trimmed = torch.squeeze(sampled_seq, 1)[:,0:-3]
        values = scaler.inverse_transform(trimmed.cpu())

        # Half-open [start, stop) column spans of the one-hot groups.
        # Presumably proto, service, state and attack_cat in that order;
        # verify against data_cols if the input schema changes.
        one_hot_spans = ((1, 134), (134, 147), (147, 156), (194, 204))
        row_idx = np.arange(values.shape[0])
        for start, stop in one_hot_spans:
            # Pick the winner before the span is overwritten.
            winners = np.argmax(values[:, start:stop], axis=1) + start
            values[:, start:stop] = 0
            values[row_idx, winners] = 1

        sampled_seq_df = pd.DataFrame(values)
        sampled_seq_df.columns = data_cols
        data_decoded = encoder.inverse_transform(sampled_seq_df)
        columns_to_convert = ['spkts', 'dpkts', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'swin', 'stcpb', 'dtcpb', 'dwin', 'smean', 'dmean', 'trans_depth',
                         'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login',
                          'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'label']

        data_decoded[columns_to_convert] = data_decoded[columns_to_convert].astype(float).round().astype(int)

        return data_decoded


def GDM_model(input,weights):
    """Load the model from scratch and return 100 decoded synthetic records.

    Self-contained variant of the load-then-sample pipeline: it fits its own
    encoder and scaler, builds its own diffusion model and trainer, and does
    not assign any module-level variable.

    Args:
        input: Location of the CSV, in any form ``pd.read_csv`` accepts. The
            file must contain an ``id`` column and the ``state``, ``proto``,
            ``attack_cat`` and ``service`` columns.
        weights: Passed unchanged to ``Trainer1D.load``.

    Returns:
        The DataFrame produced by ``encoder.inverse_transform`` for a batch
        of 100 samples, with the columns in ``columns_to_convert`` rounded
        and cast to ``int``.
    """

    ############################### Pre-processing ######################################
    data = pd.read_csv(input)
    data = data.drop(columns = 'id')
    # Create the one-hot encoder for the categorical columns.
    encoder=ce.OneHotEncoder(cols=['state','proto','attack_cat','service'],handle_unknown='return_nan',return_df=True,use_cat_names=True)
    # Fit and transform the data.
    data_encoded = encoder.fit_transform(data)
    scaler = MinMaxScaler()
    dataset1_norm = scaler.fit_transform(data_encoded.values)
    dataset1 = torch.Tensor(dataset1_norm)
    # Insert a channel axis at position 1.
    dataset1 = torch.unsqueeze(dataset1,1)
    shape0 = dataset1.shape[0]
    shape1 = dataset1.shape[1]
    # Append 3 zero positions along the last axis. Presumably this pads the
    # encoded width up to seq_length = 208 below; TODO confirm.
    dataset1 = torch.concat([dataset1,torch.zeros(shape0,shape1,3)],dim=2)
    ######################################################################################


    model = Unet1D(
        dim = 16,
        dim_mults = (1, 2, 4, 8),
        channels = 1
        )

    diffusion = GaussianDiffusion1D(
        model,
        seq_length = 208,
        timesteps = 1000,
        objective = 'pred_v'
        )

    dataset = Dataset1D(dataset1)

    # The training hyper-parameters are required to construct the trainer;
    # this function only calls load() on it.
    trainer = Trainer1D(
        diffusion,
        dataset = dataset,
        train_batch_size = 1,
        train_lr = 8e-5,
        train_num_steps = 10000,         # total training steps
        gradient_accumulate_every = 2,    # gradient accumulation steps
        ema_decay = 0.995,                # exponential moving average decay
        amp = False,                       # mixed precision disabled
        )
    trainer.load(weights)

    ####################################### Post processing ##############################################
    sampled_seq = diffusion.sample(batch_size = 100)
    # Remove the channel axis and the 3 trailing padding positions.
    sampled_seq_sq = torch.squeeze(sampled_seq, 1)[:,0:-3]
    a = scaler.inverse_transform(sampled_seq_sq.cpu())

    # Half-open [start, stop) column spans of the one-hot groups.
    # Presumably proto, service, state and attack_cat in that order;
    # verify against data_encoded.columns if the input schema changes.
    one_hot_spans = ((1, 134), (134, 147), (147, 156), (194, 204))
    row_idx = np.arange(a.shape[0])
    for start, stop in one_hot_spans:
        # Pick the winner before the span is overwritten.
        winners = np.argmax(a[:, start:stop], axis=1) + start
        a[:, start:stop] = 0
        a[row_idx, winners] = 1

    sampled_seq_df = pd.DataFrame(a)
    sampled_seq_df.columns = data_encoded.columns
    data_decoded = encoder.inverse_transform(sampled_seq_df)
    columns_to_convert = ['spkts', 'dpkts', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'swin', 'stcpb', 'dtcpb', 'dwin', 'smean', 'dmean', 'trans_depth',
                     'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login',
                      'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'label']

    data_decoded[columns_to_convert] = data_decoded[columns_to_convert].astype(float).round().astype(int)

    return data_decoded


In [2]:
# Fit the encoder/scaler and restore the trained weights into module state.
# The CSV path is bound to `input_csv` rather than `input`, so the builtin
# input() is not shadowed for the rest of the kernel session.
input_csv = 'UNSW_NB15_testing-set.csv'
weights = 'network-traffic'
load_model(input_csv, weights)

loading from version 2.1.1


In [11]:
# Draw a single synthetic record and display it.
infer(size=1)

sampling loop time step: 100%|██████████| 1000/1000 [00:35<00:00, 28.54it/s]


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1.385228,unas,-,INT,1,0,32921,76475,86426.347753,238,...,1,5,0,0,1,2,4,0,Exploits,1


In [3]:
# Draw a batch at infer()'s default size, spelled out explicitly.
res = infer(size=100)

sampling loop time step: 100%|██████████| 1000/1000 [00:39<00:00, 25.34it/s]


In [5]:
# Display the generated batch; the rendered table below is truncated.
res

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,2.186107,udp,dns,INT,1,0,7608,0,224710.763175,244,...,1,2,0,0,0,3,4,0,Normal,0
1,0.142956,udp,http,CON,1,0,28,73452,30012.220234,234,...,1,2,0,0,0,2,3,0,Fuzzers,1
2,0.843640,udp,-,INT,7,0,28,0,287947.178751,252,...,2,4,0,0,0,1,5,0,Reconnaissance,1
3,1.030422,tcp,http,FIN,1,0,7224,61691,0.000000,102,...,1,2,0,0,0,1,3,0,Exploits,1
4,0.000000,udp,-,INT,3,0,28,228643,105173.498708,253,...,1,4,0,0,0,2,4,0,DoS,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.649848,udp,dns,INT,28,41,28,1629,75124.085175,237,...,9,14,0,0,0,4,15,0,Generic,1
96,2.216400,tcp,radius,FIN,1,2,63871,0,0.000000,10,...,1,3,0,0,0,2,3,0,Normal,0
97,0.917668,udp,-,INT,21,61,28,54543,117435.008640,255,...,1,2,0,0,0,2,3,0,Exploits,1
98,0.171461,udp,dns,INT,97,7,28,0,117022.425287,252,...,5,14,0,0,0,3,28,0,Generic,1


In [5]:
# Write the generated records to res.json as a dict keyed by column name.
with open('res.json', mode='w') as f:
    json.dump(obj=res.to_dict(), fp=f)


In [8]:
# Dict form of the batch, with the default orientation spelled out.
d = res.to_dict(orient='dict')

In [9]:
# List the top-level keys of the dict (the decoded column names).
d.keys()

dict_keys(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'])