In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
# CICFlowMeter traffic CSVs, one per capture day. Only the base name of each
# path is used later to locate the matching "<basename>_train.npy" file.
data_files_list = [
    "data/ids2018_processed/{}_TrafficForML_CICFlowMeter.csv".format(capture_day)
    for capture_day in (
        "Wednesday-14-02-2018",
        "Thursday-15-02-2018",
        "Wednesday-21-02-2018",
        "Thursday-22-02-2018",
        "Friday-23-02-2018",
        "Friday-02-03-2018",
    )
]

# Root directory holding the prepared training arrays and the saved models.
save_dir = "data/ids2018_collaborative"

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


class AutoEncoder(nn.Module):
    """Fully connected autoencoder with dropout applied to its input.

    Layer widths are ``input_dim -> 2*latent_dim -> latent_dim`` for the
    encoder and ``latent_dim -> 2*latent_dim -> input_dim`` for the decoder.
    Every linear layer, including the last decoder layer, is followed by ReLU.

    NOTE(review): because the decoder ends in ReLU, reconstructions are never
    negative. ``train`` feeds this model StandardScaler-normalised inputs,
    which contain negative values -- confirm the final activation is intended.

    Args:
        input_dim: number of input (and reconstructed) features.
        latent_dim: width of the bottleneck layer.
        dropout_prob: dropout probability applied to the raw input.
    """

    def __init__(self, input_dim, latent_dim, dropout_prob = 0.1):
        super().__init__()

        hidden_dim = latent_dim * 2

        # Dropout acts on the input only; it is inactive in eval() mode.
        self.dropout = nn.Dropout(dropout_prob)

        # Linear layers are created in the order encoder -> decoder so that
        # seeded parameter initialisation and state_dict keys stay stable.
        self.encoder = nn.ModuleList([
            nn.Linear(in_features=input_dim, out_features=hidden_dim),
            nn.ReLU(),
            nn.Linear(in_features=hidden_dim, out_features=latent_dim),
            nn.ReLU(),
        ])
        self.decoder = nn.ModuleList([
            nn.Linear(in_features=latent_dim, out_features=hidden_dim),
            nn.ReLU(),
            nn.Linear(in_features=hidden_dim, out_features=input_dim),
            nn.ReLU(),
        ])

    def forward(self, x):
        """Return the reconstruction of ``x`` after dropout, encoding and decoding."""
        activations = self.dropout(x)
        # Run the encoder stack, then the decoder stack, layer by layer.
        for stack in (self.encoder, self.decoder):
            for module in stack:
                activations = module(activations)
        return activations

In [9]:
def _alpha_thresholds(losses, alpha, candidate_thresholds):
    """For each level in ``alpha`` return the largest candidate threshold whose FPR is still >= that level.

    ``losses`` are reconstruction losses of samples treated as negatives, so
    the fraction of losses above a threshold is used as the false-positive
    rate. ``alpha`` must be in descending order and ``candidate_thresholds``
    in ascending order. A level that no candidate satisfies keeps the value 0.
    """
    total_negative = len(losses)
    thresholds = [0] * len(alpha)
    last_index = len(alpha) - 1
    alpha_i = 0
    for threshold in candidate_thresholds:
        fpr = np.sum(losses > threshold) / total_negative
        # The FPR can fall through several alpha levels in a single step.
        # Advance past all of them (never beyond the last level) and carry
        # the last valid threshold forward so skipped levels are not left at 0.
        while alpha_i < last_index and fpr < alpha[alpha_i]:
            thresholds[alpha_i + 1] = thresholds[alpha_i]
            alpha_i += 1
        if fpr >= alpha[alpha_i]:
            thresholds[alpha_i] = threshold
    # Levels the candidate grid never reached: the FPR is still above them at
    # the largest candidate, so that candidate is the best available choice.
    for i in range(alpha_i + 1, len(alpha)):
        thresholds[i] = thresholds[alpha_i]
    return thresholds


def train(csv_path, latent_dim=6, learning_rate=0.015, epochs=500, random_state=None):
    """Train an autoencoder for one capture file and save model, scaler and thresholds.

    Loads ``<save_dir>/<basename>_train.npy`` where ``<basename>`` is the file
    name of ``csv_path`` without ``.csv``. Column 0 of the array is used as a
    group identifier and the remaining columns as features. The rows are split
    75/25 into training and validation sets, the features are standardised,
    and the autoencoder is trained full-batch with Adam on the MSE
    reconstruction loss. Validation losses are averaged per identifier and
    turned into one threshold per FPR level 1.0, 0.95, ..., 0.0.

    Writes ``<basename>_scaler.pkl``, ``<basename>_thresholds.pkl`` and
    ``<basename>.model`` (the state dict) into ``<save_dir>/model``, which is
    created if it does not exist.

    Args:
        csv_path: path of the source CSV; only its base name is used.
        latent_dim: bottleneck width of the autoencoder.
        learning_rate: Adam learning rate.
        epochs: number of full-batch training steps.
        random_state: optional seed for the split and for torch; ``None``
            keeps the unseeded behaviour.

    Returns:
        None.
    """
    basename = csv_path.split("/")[-1].replace('.csv', '')
    print("\n\n"+basename+"\n"+"="*50)

    # load the training data
    # NOTE: allow_pickle=True can execute code embedded in the file; only
    # load arrays produced by a trusted pipeline.
    Xy = np.load("{}/{}_train.npy".format(save_dir, basename), allow_pickle=True).astype(float)

    if random_state is not None:
        torch.manual_seed(random_state)

    # Hold out 25% of the rows for validation (train:valid = 3:1, i.e. 6:2).
    Xy_train, Xy_valid = train_test_split(Xy, test_size=0.25, random_state=random_state)
    X_train = Xy_train[:, 1:]
    valid_identifier, X_valid = Xy_valid[:, 0], Xy_valid[:, 1:]

    # normalize the input; the scaler is fitted on the training rows only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    # set the training environment; the input width follows the data
    model = AutoEncoder(X_train.shape[1], latent_dim)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    cost = nn.MSELoss()

    # Training (full batch: every step uses the whole training set)
    X_train = torch.FloatTensor(X_train)
    for epoch in tqdm(range(epochs)):
        optimizer.zero_grad()
        X_hat = model(X_train)
        loss = cost(X_train, X_hat)
        loss.backward()
        optimizer.step()
        if epoch%50 == 0:
            print(loss)

    # Validating: eval() disables dropout, no_grad() avoids building an
    # autograd graph over the whole validation set.
    X_valid = torch.FloatTensor(X_valid)
    model.eval()
    with torch.no_grad():
        X_hat = model(X_valid)
        valid_losses = ((X_valid-X_hat)**2).mean(dim=1).numpy()

    # Get mean loss for each identifier
    df = pd.DataFrame(data={'id': valid_identifier, 'loss': valid_losses})
    losses = df.groupby('id')['loss'].mean().values

    # Precompute the alpha thresholds on a fixed grid of candidate thresholds
    alpha = np.arange(1.0, -0.01, -0.05).tolist()
    alpha_thresholds = _alpha_thresholds(losses, alpha, np.arange(0.0, 2.0005, 0.001))

    print(alpha_thresholds)

    # Save the model, creating the output directory on first use
    os.makedirs("{}/model".format(save_dir), exist_ok=True)
    with open("{}/model/{}_scaler.pkl".format(save_dir, basename), 'wb') as f:
        pickle.dump(scaler, f)
    with open("{}/model/{}_thresholds.pkl".format(save_dir, basename), 'wb') as f:
        pickle.dump(alpha_thresholds, f)
    torch.save(model.state_dict(), "{}/model/{}.model".format(save_dir, basename))


In [10]:
# Fit and save one autoencoder (with its scaler and thresholds) per capture file.
for d in data_files_list:
    train(csv_path=d)



Wednesday-14-02-2018_TrafficForML_CICFlowMeter


  0%|          | 1/500 [00:00<06:10,  1.35it/s]

tensor(0.8295, grad_fn=<MeanBackward0>)


 10%|█         | 51/500 [00:24<03:33,  2.10it/s]

tensor(0.4449, grad_fn=<MeanBackward0>)


 20%|██        | 101/500 [00:49<03:13,  2.06it/s]

tensor(0.4143, grad_fn=<MeanBackward0>)


 30%|███       | 151/500 [01:14<02:59,  1.95it/s]

tensor(0.4066, grad_fn=<MeanBackward0>)


 40%|████      | 201/500 [01:41<02:33,  1.95it/s]

tensor(0.3978, grad_fn=<MeanBackward0>)


 50%|█████     | 251/500 [02:07<02:16,  1.82it/s]

tensor(0.3934, grad_fn=<MeanBackward0>)


 60%|██████    | 301/500 [02:34<01:41,  1.96it/s]

tensor(0.3936, grad_fn=<MeanBackward0>)


 70%|███████   | 351/500 [02:59<01:11,  2.07it/s]

tensor(0.3937, grad_fn=<MeanBackward0>)


 80%|████████  | 401/500 [03:22<00:48,  2.02it/s]

tensor(0.3905, grad_fn=<MeanBackward0>)


 90%|█████████ | 451/500 [03:48<00:25,  1.91it/s]

tensor(0.3897, grad_fn=<MeanBackward0>)


100%|██████████| 500/500 [04:14<00:00,  1.97it/s]


[0.061, 0.10300000000000001, 0.113, 0.122, 0.13, 0.137, 0.145, 0.152, 0.16, 0.168, 0.17500000000000002, 0.184, 0.195, 0.20600000000000002, 0.219, 0.23800000000000002, 0.263, 0.302, 0.378, 0.491, 2.0]


Thursday-15-02-2018_TrafficForML_CICFlowMeter


  0%|          | 1/500 [00:01<11:16,  1.36s/it]

tensor(0.8373, grad_fn=<MeanBackward0>)


 10%|█         | 51/500 [00:40<05:47,  1.29it/s]

tensor(0.4771, grad_fn=<MeanBackward0>)


 20%|██        | 101/500 [01:20<05:11,  1.28it/s]

tensor(0.3888, grad_fn=<MeanBackward0>)


 30%|███       | 151/500 [01:59<04:36,  1.26it/s]

tensor(0.3735, grad_fn=<MeanBackward0>)


 40%|████      | 201/500 [02:39<03:55,  1.27it/s]

tensor(0.3693, grad_fn=<MeanBackward0>)


 50%|█████     | 251/500 [03:19<03:24,  1.22it/s]

tensor(0.3703, grad_fn=<MeanBackward0>)


 60%|██████    | 301/500 [03:58<02:37,  1.27it/s]

tensor(0.3657, grad_fn=<MeanBackward0>)


 70%|███████   | 351/500 [04:38<01:56,  1.28it/s]

tensor(0.3669, grad_fn=<MeanBackward0>)


 80%|████████  | 401/500 [05:15<01:18,  1.27it/s]

tensor(0.3634, grad_fn=<MeanBackward0>)


 90%|█████████ | 451/500 [05:52<00:36,  1.33it/s]

tensor(0.3584, grad_fn=<MeanBackward0>)


100%|██████████| 500/500 [06:30<00:00,  1.28it/s]


[0.07100000000000001, 0.14200000000000002, 0.16, 0.17300000000000001, 0.184, 0.195, 0.20500000000000002, 0.216, 0.228, 0.23800000000000002, 0.251, 0.263, 0.278, 0.296, 0.316, 0.338, 0.364, 0.396, 0.442, 0.525, 2.0]


Wednesday-21-02-2018_TrafficForML_CICFlowMeter


  0%|          | 1/500 [00:00<03:30,  2.37it/s]

tensor(0.8438, grad_fn=<MeanBackward0>)


 10%|█         | 51/500 [00:14<01:55,  3.89it/s]

tensor(0.4967, grad_fn=<MeanBackward0>)


 20%|██        | 101/500 [00:27<01:57,  3.39it/s]

tensor(0.4691, grad_fn=<MeanBackward0>)


 30%|███       | 151/500 [00:40<01:26,  4.04it/s]

tensor(0.4631, grad_fn=<MeanBackward0>)


 40%|████      | 201/500 [00:53<01:16,  3.90it/s]

tensor(0.4613, grad_fn=<MeanBackward0>)


 50%|█████     | 251/500 [01:06<01:03,  3.92it/s]

tensor(0.4561, grad_fn=<MeanBackward0>)


 60%|██████    | 301/500 [01:19<00:53,  3.75it/s]

tensor(0.4535, grad_fn=<MeanBackward0>)


 70%|███████   | 351/500 [01:32<00:38,  3.90it/s]

tensor(0.4533, grad_fn=<MeanBackward0>)


 80%|████████  | 401/500 [01:46<00:29,  3.34it/s]

tensor(0.4545, grad_fn=<MeanBackward0>)


 90%|█████████ | 451/500 [02:00<00:12,  3.80it/s]

tensor(0.4491, grad_fn=<MeanBackward0>)


100%|██████████| 500/500 [02:13<00:00,  3.75it/s]


[0.004, 0.037, 0.043000000000000003, 0.048, 0.051000000000000004, 0.055, 0.058, 0.06, 0.063, 0.065, 0.068, 0.07100000000000001, 0.073, 0.076, 0.079, 0.083, 0.088, 0.094, 0.10200000000000001, 0.136, 2.0]


Thursday-22-02-2018_TrafficForML_CICFlowMeter


  0%|          | 1/500 [00:01<13:17,  1.60s/it]

tensor(0.8257, grad_fn=<MeanBackward0>)


 10%|█         | 51/500 [01:19<11:55,  1.59s/it]

tensor(0.4295, grad_fn=<MeanBackward0>)


 20%|██        | 101/500 [02:34<09:43,  1.46s/it]

tensor(0.3426, grad_fn=<MeanBackward0>)


 30%|███       | 151/500 [03:51<08:33,  1.47s/it]

tensor(0.3314, grad_fn=<MeanBackward0>)


 40%|████      | 201/500 [05:07<07:09,  1.44s/it]

tensor(0.3281, grad_fn=<MeanBackward0>)


 50%|█████     | 251/500 [06:21<06:11,  1.49s/it]

tensor(0.3244, grad_fn=<MeanBackward0>)


 60%|██████    | 301/500 [07:39<04:57,  1.50s/it]

tensor(0.3241, grad_fn=<MeanBackward0>)


 70%|███████   | 351/500 [08:55<03:58,  1.60s/it]

tensor(0.3086, grad_fn=<MeanBackward0>)


 80%|████████  | 401/500 [10:13<02:31,  1.53s/it]

tensor(0.3056, grad_fn=<MeanBackward0>)


 90%|█████████ | 451/500 [11:30<01:15,  1.54s/it]

tensor(0.3046, grad_fn=<MeanBackward0>)


100%|██████████| 500/500 [12:47<00:00,  1.53s/it]


[0.073, 0.11, 0.11800000000000001, 0.124, 0.129, 0.133, 0.137, 0.14100000000000001, 0.145, 0.149, 0.153, 0.158, 0.163, 0.168, 0.176, 0.186, 0.202, 0.241, 0.312, 0.381, 2.0]


Friday-23-02-2018_TrafficForML_CICFlowMeter


  0%|          | 1/500 [00:01<14:56,  1.80s/it]

tensor(0.8307, grad_fn=<MeanBackward0>)


 10%|█         | 51/500 [01:18<11:45,  1.57s/it]

tensor(0.5166, grad_fn=<MeanBackward0>)


 20%|██        | 101/500 [02:33<09:28,  1.43s/it]

tensor(0.4448, grad_fn=<MeanBackward0>)


 30%|███       | 151/500 [03:49<09:37,  1.65s/it]

tensor(0.4212, grad_fn=<MeanBackward0>)


 40%|████      | 201/500 [05:01<07:00,  1.41s/it]

tensor(0.3894, grad_fn=<MeanBackward0>)


 50%|█████     | 251/500 [06:14<05:52,  1.41s/it]

tensor(0.3834, grad_fn=<MeanBackward0>)


 60%|██████    | 301/500 [07:28<04:44,  1.43s/it]

tensor(0.3815, grad_fn=<MeanBackward0>)


 70%|███████   | 351/500 [08:41<03:29,  1.41s/it]

tensor(0.3782, grad_fn=<MeanBackward0>)


 80%|████████  | 401/500 [09:57<02:42,  1.64s/it]

tensor(0.3770, grad_fn=<MeanBackward0>)


 90%|█████████ | 451/500 [11:12<01:12,  1.48s/it]

tensor(0.3764, grad_fn=<MeanBackward0>)


100%|██████████| 500/500 [12:28<00:00,  1.50s/it]


[0.092, 0.176, 0.195, 0.20800000000000002, 0.22, 0.229, 0.23800000000000002, 0.248, 0.257, 0.265, 0.275, 0.28500000000000003, 0.294, 0.305, 0.317, 0.33, 0.34700000000000003, 0.369, 0.4, 0.468, 2.0]


Friday-02-03-2018_TrafficForML_CICFlowMeter


  0%|          | 1/500 [00:00<07:28,  1.11it/s]

tensor(0.8420, grad_fn=<MeanBackward0>)


 10%|█         | 51/500 [00:29<04:07,  1.82it/s]

tensor(0.5420, grad_fn=<MeanBackward0>)


 20%|██        | 101/500 [00:58<03:54,  1.70it/s]

tensor(0.4780, grad_fn=<MeanBackward0>)


 30%|███       | 151/500 [01:29<03:28,  1.67it/s]

tensor(0.4594, grad_fn=<MeanBackward0>)


 40%|████      | 201/500 [01:58<02:46,  1.79it/s]

tensor(0.4494, grad_fn=<MeanBackward0>)


 50%|█████     | 251/500 [02:28<02:41,  1.54it/s]

tensor(0.4414, grad_fn=<MeanBackward0>)


 60%|██████    | 301/500 [02:59<01:50,  1.79it/s]

tensor(0.4406, grad_fn=<MeanBackward0>)


 70%|███████   | 351/500 [03:28<01:32,  1.61it/s]

tensor(0.4204, grad_fn=<MeanBackward0>)


 80%|████████  | 401/500 [03:57<00:55,  1.78it/s]

tensor(0.4137, grad_fn=<MeanBackward0>)


 90%|█████████ | 451/500 [04:26<00:28,  1.71it/s]

tensor(0.4093, grad_fn=<MeanBackward0>)


100%|██████████| 500/500 [04:54<00:00,  1.70it/s]


[0.089, 0.164, 0.186, 0.202, 0.216, 0.229, 0.241, 0.254, 0.267, 0.279, 0.293, 0.307, 0.321, 0.336, 0.354, 0.377, 0.403, 0.446, 0.539, 0.922, 2.0]
