In [1]:
import dgl
print(dgl.__version__)

1.1.2


In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math
import pickle
import joblib

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

## Dataset
Experiments are done with NF-CSE-CIC-IDS2018-v2.csv dataset:
* Netflow format computer network dataset
* Target encoder is used in features (Similar to label encoding except labels are correlated)
* Labelencoder is used in 'Attack' column (6 attack types)
* y = (["Attack", "Label"])
* Data is splitted into train and test, then graphs are created seperately

In [5]:
file_name = "NF-CSE-CIC-IDS2018-v2.csv"
data = pd.read_csv(file_name)

In [6]:
data.Label.value_counts()

0    16635567
1     2258141
Name: Label, dtype: int64

## Data Preprocessing



In [7]:
data.rename(columns=lambda x: x.strip(), inplace=True)
data['IPV4_SRC_ADDR'] = data["IPV4_SRC_ADDR"].apply(str)
data['L4_SRC_PORT'] = data["L4_SRC_PORT"].apply(str)
data['IPV4_DST_ADDR'] = data["IPV4_DST_ADDR"].apply(str)
data['L4_DST_PORT'] = data["L4_DST_PORT"].apply(str)

In [8]:
data.drop(columns=["L4_SRC_PORT", "L4_DST_PORT"], inplace=True)

In [9]:
data.Attack.unique()

array(['SSH-Bruteforce', 'Benign', 'DDoS attacks-LOIC-HTTP',
       'DDOS attack-HOIC', 'DoS attacks-Slowloris', 'DoS attacks-Hulk',
       'FTP-BruteForce', 'Infilteration', 'Bot', 'DoS attacks-GoldenEye',
       'Brute Force -Web', 'DoS attacks-SlowHTTPTest', 'SQL Injection',
       'DDOS attack-LOIC-UDP', 'Brute Force -XSS'], dtype=object)

In [10]:
data = data.groupby(by='Attack').sample(frac=0.1, random_state=13)

In [11]:
data.groupby(by="Attack").count()

Unnamed: 0_level_0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Benign,1663557,1663557,1663557,1663557,1663557,1663557,1663557,1663557,1663557,1663557,...,1663557,1663557,1663557,1663557,1663557,1663557,1663557,1663557,1663557,1663557
Bot,14310,14310,14310,14310,14310,14310,14310,14310,14310,14310,...,14310,14310,14310,14310,14310,14310,14310,14310,14310,14310
Brute Force -Web,214,214,214,214,214,214,214,214,214,214,...,214,214,214,214,214,214,214,214,214,214
Brute Force -XSS,93,93,93,93,93,93,93,93,93,93,...,93,93,93,93,93,93,93,93,93,93
DDOS attack-HOIC,108086,108086,108086,108086,108086,108086,108086,108086,108086,108086,...,108086,108086,108086,108086,108086,108086,108086,108086,108086,108086
DDOS attack-LOIC-UDP,211,211,211,211,211,211,211,211,211,211,...,211,211,211,211,211,211,211,211,211,211
DDoS attacks-LOIC-HTTP,30730,30730,30730,30730,30730,30730,30730,30730,30730,30730,...,30730,30730,30730,30730,30730,30730,30730,30730,30730,30730
DoS attacks-GoldenEye,2772,2772,2772,2772,2772,2772,2772,2772,2772,2772,...,2772,2772,2772,2772,2772,2772,2772,2772,2772,2772
DoS attacks-Hulk,43265,43265,43265,43265,43265,43265,43265,43265,43265,43265,...,43265,43265,43265,43265,43265,43265,43265,43265,43265,43265
DoS attacks-SlowHTTPTest,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,...,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412


In [12]:
X = data.drop(columns=["Attack", "Label"])
y = data[["Attack", "Label"]]

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=13, stratify=y)

In [13]:
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL',
                                  'CLIENT_TCP_FLAGS','SERVER_TCP_FLAGS','ICMP_TYPE',
                                  'ICMP_IPV4_TYPE','DNS_QUERY_ID','DNS_QUERY_TYPE',
                                  'FTP_COMMAND_RET_CODE'])
encoder.fit(X_train, y_train.Label)

# Transform on training set
X_train = encoder.transform(X_train)

# Transform on testing set
X_test = encoder.transform(X_test)

In [14]:
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

In [15]:
scaler = Normalizer()
cols_to_norm = list(set(list(X_train.iloc[:, 2:].columns))) # Ignore first two as the represents IP addresses
scaler.fit(X_train[cols_to_norm])

# Transform on training set
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])
X_train['h'] = X_train.iloc[:, 2:].values.tolist()

# Transform on testing set
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test['h'] = X_test.iloc[:, 2:].values.tolist()

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)


In [16]:
#train.to_parquet("train_raw.parquet")
#test.to_parquet("test_raw.parquet")

In [17]:
X_train

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,h
10660988,172.31.66.109,172.31.0.2,1.401126e-08,1.108397e-08,0.000078,1.237078e-06,0.000098,1.237078e-06,1.405050e-08,1.405050e-08,...,0.000000e+00,0.000000,0.000000,1.458031e-07,1.458029e-07,1.136291e-07,1.109630e-08,0.000074,1.478535e-07,"[1.4011259652278657e-08, 1.1083974799736557e-0..."
15147714,172.31.66.68,172.31.0.2,1.245158e-08,1.008341e-08,0.000079,1.099371e-06,0.000097,1.099371e-06,1.248645e-08,1.248645e-08,...,0.000000e+00,0.000000,0.000000,1.295728e-07,1.295727e-07,9.858040e-08,9.861101e-09,0.000066,1.313950e-07,"[1.245157764089544e-08, 1.0083409899455166e-08..."
9587672,172.31.64.28,172.31.0.2,1.552131e-08,1.227854e-08,0.000077,1.370403e-06,0.000099,1.370403e-06,1.556478e-08,1.556478e-08,...,0.000000e+00,0.000000,0.000000,1.615169e-07,1.615167e-07,1.164452e-07,1.229219e-08,0.000082,1.637882e-07,"[1.552131057247672e-08, 1.2278540225056637e-08..."
7746586,172.31.67.18,172.31.0.2,7.552795e-09,5.974837e-09,0.000045,6.668492e-07,0.000086,6.668492e-07,7.573948e-09,7.573948e-09,...,0.000000e+00,0.000000,0.000000,7.859541e-08,7.859535e-08,5.499144e-08,5.883835e-09,0.000040,7.970069e-08,"[7.55279530717906e-09, 5.974837018934741e-09, ..."
693958,172.31.67.31,172.31.0.2,3.657431e-09,2.893307e-09,0.000049,6.458418e-07,0.000115,6.458418e-07,3.667675e-09,3.667675e-09,...,0.000000e+00,0.000000,0.000000,3.805972e-08,3.805969e-08,2.821517e-08,2.896524e-09,0.000010,3.859495e-08,"[3.657431287477647e-09, 2.8933070422099713e-09..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1271255,185.129.148.190,172.31.64.93,8.552135e-09,3.898077e-10,0.000081,4.350629e-07,0.000095,4.785692e-07,6.745891e-09,1.603694e-08,...,4.350629e-08,0.000356,0.002784,5.127688e-09,5.127683e-09,8.532038e-09,8.532038e-09,0.000000,5.199798e-09,"[8.552134708109645e-09, 3.89807729179286e-10, ..."
3505485,109.6.179.36,172.31.64.16,3.649593e-08,1.663491e-09,0.000108,9.283077e-07,0.000063,7.426462e-07,1.483944e-09,2.087102e-09,...,0.000000e+00,0.011927,0.011927,2.188222e-08,2.188221e-08,3.641017e-08,3.641017e-08,0.000000,2.218995e-08,"[3.6495930771830466e-08, 1.6634906235704336e-0..."
15562780,172.31.64.33,172.31.0.2,9.708617e-09,7.680256e-09,0.000058,8.571904e-07,0.000111,8.571904e-07,9.735808e-09,9.735808e-09,...,0.000000e+00,0.000000,0.000000,1.010292e-07,1.010291e-07,7.283673e-08,7.563280e-09,0.000051,1.024499e-07,"[9.70861679362884e-09, 7.680256204757472e-09, ..."
14792808,66.119.212.118,172.31.66.7,1.012270e-08,4.613943e-10,0.000080,5.149604e-07,0.000096,3.604723e-07,4.115945e-10,1.431217e-09,...,5.149604e-08,0.000422,0.003296,6.069367e-09,6.069362e-09,1.009891e-08,1.009891e-08,0.000000,6.154719e-09,"[1.0122699203852631e-08, 4.613943213589721e-10..."


In [18]:
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(data["Attack"])

# Transform on training set
train["Attack"] = lab_enc.transform(train["Attack"])

# Transform on testing set
test["Attack"] = lab_enc.transform(test["Attack"])


In [19]:
joblib.dump(lab_enc,'gnn_label_encoder.pkl',compress=9)

['gnn_label_encoder.pkl']

## Graph Generation
* Multigraph with
    - Edge features
        - h : A list of features in the data
        - Label (0-1)
        - Attack
    - Node features
        - {1,.., 1} same length as h

In [20]:
# Training graph

train_g = nx.from_pandas_edgelist(train, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
            ["h", "Label", "Attack"], create_using=nx.MultiGraph())

train_g = train_g.to_directed()
train_g = dgl.from_networkx(train_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([train_g.number_of_nodes(),
train_g.edata['h'].shape[1]])
train_g.ndata['h'] = nfeat_weight

# Testing graph
test_g = nx.from_pandas_edgelist(test, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
            ["h", "Label", "Attack"], create_using=nx.MultiGraph())

test_g = test_g.to_directed()
test_g = dgl.from_networkx(test_g, edge_attrs=['h', 'Attack', 'Label'])
nfeat_weight = torch.ones([test_g.number_of_nodes(),
test_g.edata['h'].shape[1]])
test_g.ndata['h'] = nfeat_weight

In [21]:
dgl.save_graphs("train.graph", [train_g])
dgl.save_graphs("test.graph", [test_g])

# Self-Supervised Learning
### E-GraphSAGE

In [22]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
      super(SAGELayer, self).__init__()
      self.W_apply = nn.Linear(ndim_in + edims , ndim_out)
      self.activation = F.relu
      self.W_edge = nn.Linear(128 * 2, 256)
      self.reset_parameters()

    def reset_parameters(self):
      """
      Reset parameters whenever object created
      """
      gain = nn.init.calculate_gain('relu')
      nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
      """
      It sends the 'h' feature data from edges to nodes
      """
      return {'m':  edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
      """
      update_all : message aggregation
      applies a linear transformation
      concatenates node features with aggregated neighbor features
      then applies a non-linear activation function (ReLU)
      """
      with g_dgl.local_scope():
        g = g_dgl
        g.ndata['h'] = nfeats
        g.edata['h'] = efeats
        g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
        g.ndata['h'] = F.relu(self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

        # Compute edge embeddings
        u, v = g.edges()
        edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
        return g.ndata['h'], edge

In [23]:
class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim,  activation):
      super(SAGE, self).__init__()
      self.layers = nn.ModuleList()
      self.layers.append(SAGELayer(ndim_in, edim, 128, F.relu))

    def forward(self, g, nfeats, efeats, corrupt=False):
      """
      If corruption : permutate edge features
      Then send data into layers to find node&edge features
      """
      if corrupt:
        e_perm = torch.randperm(g.number_of_edges())
        efeats = efeats[e_perm]
      for i, layer in enumerate(self.layers):
        nfeats, e_feats = layer(g, nfeats, efeats)
      return nfeats.sum(1), e_feats.sum(1)


# Self-Supervised Learning
### Deep Graph Infomax (DGI)

In [24]:
class Discriminator(nn.Module):
    """
    The role of the discriminator is to distinguish between positive and negative node embeddings to
    maximize the mutual information between positive samples (observed) and negative samples (corrupted).
      - Mutual information provides a way to quantify how much knowledge about one variable can provide insights into another variable.

    The discriminator takes the positive&negative node embeddings and the summary vector as inputs
    The discriminator's task is to determine whether a given node embedding-summary pair is from a positive sample or a negative sample.
    It computes a score or logit for each input node embedding-summary pair using its learned parameters.
    """
    def __init__(self, n_hidden):
      super(Discriminator, self).__init__()
      self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
      self.reset_parameters()

    def uniform(self, size, tensor):
      bound = 1.0 / math.sqrt(size)
      if tensor is not None:
        tensor.data.uniform_(-bound, bound)

    def reset_parameters(self):
      size = self.weight.size(0)
      self.uniform(size, self.weight)

    def forward(self, features, summary):
      features = torch.matmul(features, torch.matmul(self.weight, summary))
      return features

In [25]:
class DGI(nn.Module):
    """
    Deep Graph Infomax

    Self-supervised graph representation learning technique that aims to learn meaningful node representations
    through maximizing mutual information between positive (observed) and negative (corrupted) samples.

    """
    def __init__(self, ndim_in, ndim_out, edim, activation):
      super(DGI, self).__init__()
      self.encoder = SAGE(ndim_in, ndim_out, edim,  F.relu)
      #self.discriminator = Discriminator(128)
      self.discriminator = Discriminator(256)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)
      self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
      positive = self.encoder(g, n_features, e_features, corrupt=False)
      negative = self.encoder(g, n_features, e_features, corrupt=True)

      positive = positive[1]
      negative = negative[1]

      summary = torch.sigmoid(positive.mean(dim=0))

      positive = self.discriminator(positive, summary)
      negative = self.discriminator(negative, summary)

      l1 = self.loss(positive, torch.ones_like(positive))
      l2 = self.loss(negative, torch.zeros_like(negative))

      return l1 + l2

## Training DGI
* Same hyperparameters and optimizer specified in the "Anomal-E".

In [26]:
ndim_in = train_g.ndata['h'].shape[1]
hidden_features = 128
ndim_out = 128
num_layers = 1
edim = train_g.edata['h'].shape[1]
learning_rate = 1e-3
epochs = 4000

In [27]:
dgi = DGI(ndim_in,
    ndim_out,
    edim,
    F.relu)

dgi_optimizer = torch.optim.Adam(dgi.parameters(),
                lr=1e-3,
                weight_decay=0.)

In [28]:
# Format node and edge features for E-GraphSAGE
train_g.ndata['h'] = torch.reshape(train_g.ndata['h'],
                                   (train_g.ndata['h'].shape[0], 1,
                                    train_g.ndata['h'].shape[1]))

train_g.edata['h'] = torch.reshape(train_g.edata['h'],
                                   (train_g.edata['h'].shape[0], 1,
                                    train_g.edata['h'].shape[1]))

In [29]:
# Convert to GPU
train_g = train_g

In [30]:
cnt_wait = 0
best = 1e9
best_t = 0
dur = []
node_features = train_g.ndata['h']
edge_features = train_g.edata['h']
"""
for epoch in range(epochs):
    dgi.train()
    if epoch >= 3:
        t0 = time.time()

    dgi_optimizer.zero_grad()
    loss = dgi(train_g, node_features, edge_features)
    loss.backward()
    dgi_optimizer.step()

    if loss < best:
        best = loss
        best_t = epoch
        cnt_wait = 0
        torch.save(dgi.state_dict(), 'best_dgi.pkl')
    else:
        cnt_wait += 1

  # if cnt_wait == patience:
  #     print('Early stopping!')
  #     break

    if epoch >= 3:
        dur.append(time.time() - t0)

    if epoch % 50 == 0:

        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
            "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur),
              loss.item(),
              train_g.num_edges() / np.mean(dur) / 1000))
"""

'\nfor epoch in range(epochs):\n    dgi.train()\n    if epoch >= 3:\n        t0 = time.time()\n\n    dgi_optimizer.zero_grad()\n    loss = dgi(train_g, node_features, edge_features)\n    loss.backward()\n    dgi_optimizer.step()\n\n    if loss < best:\n        best = loss\n        best_t = epoch\n        cnt_wait = 0\n        torch.save(dgi.state_dict(), \'best_dgi.pkl\')\n    else:\n        cnt_wait += 1\n\n  # if cnt_wait == patience:\n  #     print(\'Early stopping!\')\n  #     break\n\n    if epoch >= 3:\n        dur.append(time.time() - t0)\n\n    if epoch % 50 == 0:\n\n        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "\n            "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur),\n              loss.item(),\n              train_g.num_edges() / np.mean(dur) / 1000))\n'

## Loading trained DGI
* Same hyperparameters and optimizer specified in the "Anomal-E".

In [31]:
dgi.load_state_dict(torch.load('best_dgi.pkl'))

<All keys matched successfully>

In [32]:
dgi(train_g, node_features, edge_features)

tensor(5.9682, grad_fn=<AddBackward0>)

In [33]:
train_g.adj().indices()

tensor([[17305, 17305, 17305,  ..., 10093, 17957, 36373],
        [17087, 17087, 17087,  ..., 17297, 17165, 17118]])

In [34]:
node_features.shape

torch.Size([64237, 1, 39])

In [35]:
edge_features.shape

torch.Size([2641554, 1, 39])

## Edge Embeddings


* Training DGI 
* Seperate encoders are trained for training and testing graph
* After encoding train and test graphs, encodings are converted into dataframe

In [36]:
dgi.encoder

SAGE(
  (layers): ModuleList(
    (0): SAGELayer(
      (W_apply): Linear(in_features=78, out_features=128, bias=True)
      (W_edge): Linear(in_features=256, out_features=256, bias=True)
    )
  )
)

In [37]:
training_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_emb = training_emb.detach().cpu().numpy()

In [38]:
train_g.adj().indices()

tensor([[17305, 17305, 17305,  ..., 10093, 17957, 36373],
        [17087, 17087, 17087,  ..., 17297, 17165, 17118]])

In [39]:
test_g.ndata['h'] = torch.reshape(test_g.ndata['h'],
                                   (test_g.ndata['h'].shape[0], 1,
                                    test_g.ndata['h'].shape[1]))



test_g.edata['h'] = torch.reshape(test_g.edata['h'],
                                   (test_g.edata['h'].shape[0], 1,
                                    test_g.edata['h'].shape[1]))

In [40]:
# Convert to GPU
test_g = test_g

In [41]:
testing_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_emb = testing_emb.detach().cpu().numpy()

In [42]:
df_train = pd.DataFrame(training_emb, )
df_train["Attack"] = lab_enc.inverse_transform(
        train_g.edata['Attack'].detach().cpu().numpy())
df_train["Label"] = train_g.edata['Label'].detach().cpu().numpy()

df_test = pd.DataFrame(testing_emb, )
df_test["Attack"] = lab_enc.inverse_transform(
        test_g.edata['Attack'].detach().cpu().numpy())
df_test["Label"] = test_g.edata['Label'].detach().cpu().numpy()

In [43]:
df_train # Edge features, labels

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,Attack,Label
0,0.213101,0.084576,-0.100087,-0.085792,-0.504479,-0.286598,0.044959,-0.109970,-0.071820,-0.931623,...,-0.443378,-0.176202,-0.041914,-0.069238,0.232846,-0.652440,-0.520690,-0.528282,Benign,0
1,0.213101,0.084576,-0.100087,-0.085792,-0.504479,-0.286598,0.044959,-0.109970,-0.071820,-0.931623,...,-0.443378,-0.176202,-0.041914,-0.069238,0.232846,-0.652440,-0.520690,-0.528282,Benign,0
2,0.213101,0.084576,-0.100087,-0.085792,-0.504479,-0.286598,0.044959,-0.109970,-0.071820,-0.931623,...,-0.443378,-0.176202,-0.041914,-0.069238,0.232846,-0.652440,-0.520690,-0.528282,Benign,0
3,0.213101,0.084576,-0.100087,-0.085792,-0.504479,-0.286598,0.044959,-0.109970,-0.071820,-0.931623,...,-0.443378,-0.176202,-0.041914,-0.069238,0.232846,-0.652440,-0.520690,-0.528282,Benign,0
4,0.213101,0.084576,-0.100087,-0.085792,-0.504479,-0.286598,0.044959,-0.109970,-0.071820,-0.931623,...,-0.443378,-0.176202,-0.041914,-0.069238,0.232846,-0.652440,-0.520690,-0.528282,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2641549,0.238272,0.086273,-0.082979,-0.075177,-0.517827,-0.300409,0.021976,-0.122864,-0.048900,-0.925022,...,-0.459370,-0.149649,-0.044086,-0.073823,0.225230,-0.659248,-0.506145,-0.530789,Benign,0
2641550,0.254560,0.067924,-0.087581,-0.067362,-0.560228,-0.311366,0.002567,-0.094462,-0.053285,-0.910474,...,-0.451368,-0.077642,-0.046995,-0.065274,0.201917,-0.667047,-0.512167,-0.542340,Benign,0
2641551,0.254018,0.076960,-0.065283,-0.077068,-0.562148,-0.313146,-0.011261,-0.106118,-0.058118,-0.904931,...,-0.464070,-0.099481,-0.034150,-0.059855,0.233140,-0.666527,-0.509573,-0.542750,Benign,0
2641552,0.254432,0.075125,-0.074659,-0.076764,-0.565004,-0.303394,-0.004606,-0.099647,-0.059363,-0.909203,...,-0.454214,-0.091502,-0.042996,-0.059958,0.221726,-0.664430,-0.513076,-0.544610,Benign,0


In [44]:
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,Attack,Label
0,0.212190,0.084235,-0.100871,-0.085811,-0.504105,-0.286591,0.045796,-0.109332,-0.072740,-0.931824,...,-0.442794,-0.176718,-0.041757,-0.069147,0.232927,-0.652265,-0.521252,-0.528130,Benign,0
1,0.212190,0.084235,-0.100871,-0.085811,-0.504105,-0.286591,0.045796,-0.109332,-0.072740,-0.931824,...,-0.442794,-0.176718,-0.041757,-0.069147,0.232927,-0.652265,-0.521252,-0.528130,Benign,0
2,0.212190,0.084235,-0.100871,-0.085811,-0.504105,-0.286591,0.045796,-0.109332,-0.072740,-0.931824,...,-0.442794,-0.176718,-0.041757,-0.069147,0.232927,-0.652265,-0.521252,-0.528130,Benign,0
3,0.212190,0.084235,-0.100871,-0.085811,-0.504105,-0.286591,0.045796,-0.109332,-0.072740,-0.931824,...,-0.442794,-0.176718,-0.041757,-0.069147,0.232927,-0.652265,-0.521252,-0.528130,Benign,0
4,0.212190,0.084235,-0.100871,-0.085811,-0.504105,-0.286591,0.045796,-0.109332,-0.072740,-0.931824,...,-0.442794,-0.176718,-0.041757,-0.069147,0.232927,-0.652265,-0.521252,-0.528130,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1132126,0.238991,0.080186,-0.089852,-0.066152,-0.511611,-0.311789,0.026515,-0.120675,-0.042585,-0.925042,...,-0.460239,-0.139137,-0.044112,-0.079347,0.211982,-0.662015,-0.504174,-0.528458,Benign,0
1132127,0.254403,0.075197,-0.073904,-0.075690,-0.563776,-0.306023,-0.005386,-0.100197,-0.058240,-0.908402,...,-0.455886,-0.091104,-0.041516,-0.060288,0.221319,-0.665002,-0.511931,-0.543905,Benign,0
1132128,0.226226,0.082544,-0.094559,-0.076982,-0.505063,-0.296730,0.037320,-0.120199,-0.053299,-0.929006,...,-0.452945,-0.162573,-0.045611,-0.076051,0.224357,-0.657492,-0.512366,-0.527756,Benign,0
1132129,0.242786,0.084904,-0.082050,-0.070827,-0.520388,-0.306714,0.018430,-0.121979,-0.045771,-0.923595,...,-0.461895,-0.140351,-0.043430,-0.075043,0.219978,-0.661053,-0.503455,-0.531070,Benign,0


In [46]:
!pip install pyarrow
!pip install fastparquet

Defaulting to user installation because normal site-packages is not writeable
Collecting pyarrow
  Downloading pyarrow-16.1.0-cp39-cp39-win_amd64.whl (25.9 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-16.1.0
Defaulting to user installation because normal site-packages is not writeable
Collecting fastparquet
  Downloading fastparquet-2024.5.0-cp39-cp39-win_amd64.whl (672 kB)
Collecting pandas>=1.5.0
  Downloading pandas-2.2.2-cp39-cp39-win_amd64.whl (11.6 MB)
Collecting cramjam>=2.3
  Downloading cramjam-2.8.3-cp39-none-win_amd64.whl (1.6 MB)
Collecting tzdata>=2022.7
  Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp39-cp39-win_amd64.whl (15.8 MB)
Installing collected packages: tzdata, numpy, pandas, cramjam, fastparquet
Successfully installed cramjam-2.8.3 fastparquet-2024.5.0 numpy-1.26.4 pandas-2.2.2 tzdata-2024.1


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.5.0 requires daal==2021.4.0, which is not installed.
scipy 1.7.3 requires numpy<1.23.0,>=1.16.5, but you have numpy 1.26.4 which is incompatible.
numba 0.55.1 requires numpy<1.22,>=1.18, but you have numpy 1.26.4 which is incompatible.


In [47]:
df_train.columns = df_train.columns.astype(str)
df_train.to_parquet("train_embedded.parquet")


In [48]:
df_test.columns = df_test.columns.astype(str)
df_test.to_parquet("test_embedded.parquet")