<a href="REF: https://github.com/GoogleCloudPlatform/df-ml-anomaly-detection/blob/master/README.md">Reference1<a/>  <br>
<a href="https://faker.readthedocs.io/en/master/">Reference2<a/> <br>
<a href="https://huggingface.co/docs/datasets/v1.11.0/processing.html#exporting-a-dataset-to-csv-json-parquet-or-to-python-objects">Reference3<a/>

In [1]:
import torch
import pandas as pd
from torch import nn
from IPython.display import clear_output
from tqdm import tqdm
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
batch_size = 64

In [32]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data.iloc[index, :]
        return sample
    
class CustomDataCollator(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, batch):
        batch = torch.tensor(batch, dtype=torch.float32)
        return batch

class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super().__init__()
        self.linear = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
            )
        self.mu_layer = nn.Linear(hidden_dim, latent_dim)
        self.sigma_layer = nn.Linear(hidden_dim, latent_dim)
        
    def forward(self, x):
        x = self.linear(x)
        mean = self.mu_layer(x)
        log_var = self.sigma_layer(x)
        return mean, log_var
    
class Decoder(nn.Module):
    def __init__(self, latent_dim, hidden_dim, output_dim):
        super().__init__()
        self.linear = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, output_dim),
            nn.ReLU(),
            )
        
    def forward(self, x):
        x = self.linear(x)
        return x

class VAEModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super().__init__()
        self.enc_layer = Encoder(input_dim, hidden_dim, latent_dim)
        self.dec_layer = Decoder(latent_dim, hidden_dim, input_dim)

    def forward(self, x):
        mean, log_var = self.enc_layer(x)
        z = self.reparameterize(mean, log_var)
        x = self.dec_layer(z)
        return x , mean, log_var
    
    def reparameterize(self, mean, log_var):
        eps = torch.randn_like(mean, requires_grad=False)
        z = mean + torch.exp(0.5 * log_var) * eps
        return z

def network_loss(x, x_hat, mean, log_var):
    r_loss = F.mse_loss(x_hat, x, reduction='mean')
    kl_loss = -0.5 * torch.sum(1 + log_var - mean.pow(2) - log_var.exp())
    score = r_loss + kl_loss
    return score

class DetectAnomaly:
    def __init__(self, loss):
        self.mean = torch.mean(torch.stack(loss)).item()
        self.std_dev = torch.mean(torch.stack(loss)).item()
        self.threshold()
        
    def threshold(self):
        self.threshold = self.mean + 3 * self.std_dev
        
    def chekcer(self, loss):
        if loss > self.threshold:
            return 1
        else:
            return 0

In [35]:
data = pd.read_json(
    path_or_buf ='/Users/hardey/Desktop/GITHUB/AnomalyDetectionPipeline/data/-2023-10-16T10:25:00-2023-10-16T10:26:00-00000-of-00001',
    orient='record',
    lines=True)
data.drop(columns=['subscriberId','dstIP','UniqueIPs','UniquePorts','NumRecords'], inplace=True)
dataset = CustomDataset(data=data)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, collate_fn=CustomDataCollator())

In [36]:
model = VAEModel(9, 20, 1)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
paramerter = sum(n.numel() for n in model.parameters())
print(f"Toal number of parameters: {paramerter}")

Toal number of parameters: 971


In [37]:
num_epochs = 20000
losses = []
model.train()
for epoch in tqdm(range(num_epochs), desc="Epochs", colour="green"):
    total_loss = 0
    for batch in dataloader:
        x_hat, mean, log_var = model(batch)
        loss = network_loss(batch, x_hat, mean, log_var)
        total_loss+=loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    epoch_loss = total_loss/len(dataloader)
    losses.append(epoch_loss)
    if epoch%1000==0:
        print(f"Epoch: {epoch} Loss: {epoch_loss:.4f}")

Epochs:   0%|[32m          [0m| 0/20000 [00:00<?, ?it/s]

Epochs:   0%|[32m          [0m| 1/20000 [00:01<8:14:05,  1.48s/it]

Epoch: 0 Loss: 126748.8359


Epochs:   0%|[32m          [0m| 33/20000 [00:42<7:23:21,  1.33s/it]

In [684]:
batch[:2]

tensor([[ 62.0000, 826.0000, 265.7000,  65.0000, 642.0000, 380.2000,   1.0000,
           9.0000,   5.6000],
        [ 21.0000, 778.0000, 305.4445,  18.0000, 610.0000, 279.3333,   0.0000,
           9.0000,   4.5000]])

In [685]:
x_hat[:2]

tensor([[ 9.3107e-01, -2.3180e+05, -8.6308e+01, -1.1722e+05,  4.7765e+00,
         -1.7863e+05,  2.4713e+05,  1.1046e+04, -3.7970e+01],
        [ 6.7227e-01, -3.0429e+01, -3.1107e-01, -1.5859e+01, -6.0594e-01,
         -2.3214e+01,  3.2139e+01,  7.7376e-01, -1.0839e-01]],
       grad_fn=<SliceBackward0>)

In [None]:
detect = DetectAnomaly(losses)
detect.chekcer(loss)

In [None]:
# python src/feature-pipeline/feature.py --bucket gs://electric-armor-395015-netlog-bucket --file_name_suffix ".json" --netlog_bq_table "electric-armor-395015.netlog_dataset.log" --input_file_pattern '/anomaly/*.avro' --temp_location 'gs://electric-armor-395015-netlog-bucket/temp'

In [None]:
# %%bash
# pip install --upgrade pip
# pip install datasets gcsfs torch polars

In [None]:
!docker build -t hardeybisey/netlog-event-generator:latest .
!docker push hardeybisey/netlog-event-generator:latest
!python main.py --runner=DataflowRunner --project=electric-armor-395015 --region=europe-west2 --temp_location=gs://electric-armor-395015-netlog-bucket/tmp --sdk_container_image=hardeybisey/netlog-event-generator --topic=projects/electric-armor-395015/topics/netlog-topic --qps=1000 --event_type=anomaly --sdk_location=container --pickle_library=cloudpickle

In [127]:
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

In [152]:
reader = DataFileReader(open("/Users/hardey/Downloads/raw_2023-10-16T10 34 20+00 00_f4cc9c.avro", "rb"), DatumReader())
for user in reader:
    break
print(user)
reader.close()

{'subscription_name': 'gsc-sub', 'message_id': '8886770784205506', 'publish_time': datetime.datetime(2023, 10, 16, 10, 34, 20, 118000, tzinfo=<avro.timezones.UTCTzinfo object at 0x7fb92b52ba60>), 'attributes': {}, 'data': b'{"subscriberId": "067b56b3-9cf2-4b20-a695-22e7d21d2198", "srcIP": "23.53.254.31", "srcPort": 2781, "dstIP": "10.104.26.160", "dstPort": 3806, "txBytes": 372, "rxBytes": 275, "startTime": "2023-10-16T10:34:19", "endTime": "2023-10-16T10:34:24", "tcpFlag": "PSH", "protocolName": "TCP", "protocolNumber": 6}'}
