In [42]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import socket
import struct



## Preparation

In [9]:
import pandas as pd
import os.path
# Id of the capture used by default in this notebook.
FILEID = 1
# Capture ids of interest; not referenced by any other cell visible in this notebook.
FILEIDS = [1, 3, 36, 39, 49, 52]
# URL of the labelled conn.log for FILEID. convert_to_csv builds its own URL per capture id.
LOGFILE = f"https://mcfp.felk.cvut.cz/publicDatasets/IoTDatasets/CTU-IoT-Malware-Capture-{FILEID}-1/bro/conn.log.labeled"
NUMERIC_COLUMNS = ['ts', 'orig_p', 'resp_p', "orig_bytes", "resp_bytes", "missed_bytes", "orig_pkts", 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'duration'] # columns coerced with pd.to_numeric after parsing

In [10]:
def convert_to_csv(FILEID):
    """Return the labelled conn.log of one capture as a DataFrame, caching it as CSV.

    On the first call for a capture the log is downloaded, parsed, stripped of
    the ``tunnel_parents`` column and written to ``csv/capture{FILEID}_1.csv``.
    Later calls read that cached file instead of downloading again.

    Args:
        FILEID: numeric id of the capture; it is interpolated into both the
            download URL and the cache file name.

    Returns:
        pd.DataFrame: the parsed log with every column listed in
        ``NUMERIC_COLUMNS`` coerced to a numeric dtype (unparseable values
        such as ``'-'`` become NaN).
    """
    csv_path = f'csv/capture{FILEID}_1.csv'
    if os.path.isfile(csv_path):
        # The cache was written with its index, so the first column is the index.
        df = pd.read_csv(csv_path, index_col=0)
    else:
        LOGFILE = f"https://mcfp.felk.cvut.cz/publicDatasets/IoTDatasets/CTU-IoT-Malware-Capture-{FILEID}-1/bro/conn.log.labeled"
        fieldsIN = ['ts', 'uid', 'orig_h', 'orig_p', 'resp_h', 'resp_p', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state',
                'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents', 'label', 'detailed_label']
        # Fields are split on either a tab or a run of three spaces.
        df = pd.read_csv(LOGFILE, sep="\x09|\x20\x20\x20", skiprows=10, skipfooter=2,
                        names=fieldsIN, header=None, engine='python')

        df = df.drop(['tunnel_parents'], axis=1)
        # to_csv does not create missing directories.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        # The index is written on purpose: readers of the raw CSV see it as 'Unnamed: 0'.
        df.to_csv(csv_path)
        print(df.shape)

    # Applied on both paths so the cached and the fresh frame have the same dtypes.
    for column in NUMERIC_COLUMNS:
        df[column] = pd.to_numeric(df[column], errors='coerce')
    return df


In [11]:
def convert_ipv4(addr):
    """Return the IPv4 address string ``addr`` as an unsigned 32-bit integer.

    The address is packed with ``socket.inet_aton`` and decoded as a single
    big-endian (network order) unsigned int.
    """
    packed = socket.inet_aton(addr)
    (as_int,) = struct.unpack("!I", packed)
    return as_int

def con_proto(proto):
    """Encode a protocol name as its position in ``['icmp', 'tcp', 'udp']``.

    Raises:
        ValueError: if ``proto`` is not one of the three known names.
    """
    return ['icmp', 'tcp', 'udp'].index(proto)

In [12]:
def norm_df(df, split_into=None, random_state=None):
    """Turn a raw connection-log frame into a purely numeric feature frame.

    Steps: replace the ``'-'`` placeholder with NaN, encode both host
    addresses as integers, encode the protocol as an integer, drop the columns
    that are not used as features, binarise ``label`` (1 for ``"Malicious"``,
    0 otherwise), drop rows with missing values and cast the byte counters to
    int.

    Args:
        df: raw frame holding the conn.log columns; a leading ``'Unnamed: 0'``
            index column (present when the cached CSV is read without
            ``index_col``) is removed if it exists.
        split_into: optional fraction in the open interval (0, 1). When given,
            a random sample of that fraction of the cleaned rows is returned;
            when ``None`` all rows are kept.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            subsample is reproducible.

    Returns:
        pd.DataFrame: cleaned frame with a fresh 0..n-1 index.
    """
    # Validate only when a fraction was supplied; None means "keep everything".
    if split_into is not None:
        assert 0 < split_into < 1, "Number must be percentage"

    df = df.replace('-', np.nan)
    df['orig_h'] = df['orig_h'].apply(convert_ipv4)
    df['resp_h'] = df['resp_h'].apply(convert_ipv4)
    df['proto'] = df['proto'].apply(con_proto)

    unused_columns = ['service', 'duration', 'missed_bytes', 'history', 'uid', 'conn_state',
                      'local_orig', 'local_resp', 'orig_ip_bytes', 'resp_ip_bytes',
                      'orig_pkts', 'resp_pkts', 'detailed_label', 'ts']
    df = df.drop(columns=unused_columns)
    # Only present when the frame was read back from a CSV that stored its index.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    df['label'] = (df['label'] == "Malicious").astype(int)
    df = df.dropna()
    df['resp_bytes'] = df['resp_bytes'].astype(int)
    df['orig_bytes'] = df['orig_bytes'].astype(int)

    if split_into is not None:
        # Sample the requested fraction directly rather than taking the first
        # of int(1 / split_into) chunks, which rounds e.g. 0.3 up to a third.
        df = df.sample(frac=split_into, random_state=random_state)
    return df.reset_index(drop=True)

In [13]:
# Make sure the cached CSV exists before reading it; convert_to_csv skips the
# download when the file is already on disk.
convert_to_csv(FILEID)
# Keep a random 10% of the cleaned capture to keep training times manageable.
SOURCEDF = norm_df(pd.read_csv(f'csv/capture{FILEID}_1.csv'), 0.1)
print(f"{SOURCEDF.shape}\n")
print(f'{SOURCEDF.dtypes}\n')
print(f"data categories = [Malicious: {SOURCEDF[SOURCEDF['label'] == 1].shape[0]}, Benign: {SOURCEDF[SOURCEDF['label'] == 0].shape[0]}]")
SOURCEDF.head()


(21079, 8)

orig_h        int64
orig_p        int64
resp_h        int64
resp_p        int64
proto         int64
orig_bytes    int64
resp_bytes    int64
label         int64
dtype: object

data categories = [Malicious: 18872, Benign: 2207]


Unnamed: 0,orig_h,orig_p,resp_h,resp_p,proto,orig_bytes,resp_bytes,label
0,3232261223,53796,1443272024,8080,1,0,0,1
1,3232261223,45689,1573296037,8080,1,0,0,1
2,3232261223,50078,3127279976,8080,1,0,0,1
3,3232261223,45357,2958977667,9527,1,0,0,1
4,3232261223,56888,2345474255,8080,1,0,0,1


## Visualise Data

In [14]:
def viz(df):
    """Show one normalised histogram per feature, split by class.

    For every column except ``label`` the malicious rows (``label == 1``) and
    the benign rows (``label == 0``) are drawn on the same axes, each
    normalised to a density so the two classes are comparable despite their
    different sizes.

    Args:
        df: frame with a binary ``label`` column plus numeric feature columns.
    """
    malicious = df[df["label"] == 1]
    benign = df[df["label"] == 0]
    for column in df.columns:
        # The target itself is not a feature; plotting it against itself is noise.
        if column == "label":
            continue
        plt.hist(malicious[column], color='blue', label='Malicious', alpha=0.7, density=True)
        plt.hist(benign[column], color='red', label='Benign', alpha=0.7, density=True)
        plt.title(column)
        plt.ylabel('Density')
        plt.xlabel(column)
        plt.legend()
        plt.show()

# viz(SOURCEDF)

## Training using ANN
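Tutorial followed for the neural network: [Implementing an artificial neural network classifier in Python from scratch](https://www.analyticsvidhya.com/blog/2021/10/implementing-artificial-neural-networkclassification-in-python-from-scratch/)

Alternative walkthrough: [Visualizing artificial neural networks with just one line of code](https://towardsdatascience.com/visualizing-artificial-neural-networks-anns-with-just-one-line-of-code-b4233607209e)

For one-hot encoding of categorical features, see: [one-hot encoding with scikit-learn](https://datagy.io/sklearn-one-hot-encode/)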

In [15]:
from sklearn.preprocessing import StandardScaler
# used to sample the data in a way, that the dataset is equally distributed between categories
from imblearn.over_sampling import RandomOverSampler


In [16]:
def split(df, random_state=None):
    """Shuffle ``df`` and cut it into 60% train, 20% validation and 20% test.

    Args:
        df: frame to partition.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            shuffle is reproducible.

    Returns:
        tuple: ``(train, valid, test)`` DataFrames that together contain every
        row of ``df`` exactly once.
    """
    shuffled = df.sample(frac=1, random_state=random_state)
    train_end = int(0.6 * len(shuffled))
    valid_end = int(0.8 * len(shuffled))
    # Positional slicing avoids np.split, whose DataFrame support is deprecated.
    train = shuffled.iloc[:train_end]
    valid = shuffled.iloc[train_end:valid_end]
    test = shuffled.iloc[valid_end:]
    return train, valid, test

In [17]:
def scale(df: pd.DataFrame, oversample = False, scaler = None):
    """Standardise the features of ``df`` and optionally balance the classes.

    Args:
        df: frame with a ``label`` column; every other column is a feature.
        oversample: when true, the rows are resampled with
            ``RandomOverSampler`` after scaling.
        scaler: optional, already fitted transformer exposing ``transform``.
            When given it is applied as-is, which lets validation/test data be
            standardised with the statistics learned on the training data.
            When ``None`` a new ``StandardScaler`` is fitted on ``df`` itself.

    Returns:
        tuple: ``(data, X, y)`` where ``X`` is the scaled feature matrix,
        ``y`` the labels (resampled if requested) and ``data`` is ``X`` with
        ``y`` appended as the last column.
    """
    X = df.drop(['label'], axis=1)
    y = df['label']

    if scaler is None:
        X = StandardScaler().fit_transform(X)
    else:
        X = scaler.transform(X)

    if oversample:
        X, y = RandomOverSampler().fit_resample(X, y)

    # Convert explicitly: y may be a pandas Series, which has no reshape().
    label_column = np.asarray(y).reshape(-1, 1)
    data = np.hstack((X, label_column))
    return data, X, y

In [18]:
# Shuffle the cleaned capture and partition it into train / validation / test.
train, valid, test = split(SOURCEDF)

# Only the training partition is oversampled; validation and test keep their
# natural class balance.
# NOTE(review): scale() fits a fresh StandardScaler on every call, so each
# partition is standardised with its own mean/variance rather than the
# training statistics.
train, X_train, y_train = scale(train, oversample=True)
valid, X_valid, y_valid = scale(valid, oversample=False)
test, X_test, y_test = scale(test, oversample=False)

# kNN

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [20]:
# k-nearest-neighbours classifier that votes over the 3 closest training points.
knn_model = KNeighborsClassifier(n_neighbors=3)
# Fit on the scaled, oversampled training features and labels.
knn_model.fit(X_train, y_train)

In [21]:
# Predict labels for the held-out test features with the fitted kNN model.
y_pred = knn_model.predict(X_test)

In [22]:
# Per-class precision / recall / F1 of the kNN predictions on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.88      0.91       477
           1       0.99      0.99      0.99      3739

    accuracy                           0.98      4216
   macro avg       0.96      0.94      0.95      4216
weighted avg       0.98      0.98      0.98      4216



# Naive Bayes

In [23]:
from sklearn.naive_bayes import GaussianNB

In [24]:
# Gaussian naive Bayes with default settings.
nb_model = GaussianNB()
# Fit on the scaled, oversampled training features and labels.
nb_model.fit(X_train, y_train)

In [25]:
# Predict labels for the test features; overwrites y_pred from the previous model.
y_pred = nb_model.predict(X_test)

In [26]:
# Per-class precision / recall / F1 of the naive Bayes predictions on the test split.
# NOTE(review): in the recorded output class 1 has recall 0.00 and class 0 recall 1.00,
# i.e. every test sample was predicted as class 0 - worth investigating.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.11      1.00      0.20       477
           1       0.00      0.00      0.00      3739

    accuracy                           0.11      4216
   macro avg       0.06      0.50      0.10      4216
weighted avg       0.01      0.11      0.02      4216



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
# Logistic regression with default hyper-parameters, fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
lg_model = LogisticRegression().fit(X_train, y_train)

In [29]:
# Predict on the test features with the fitted logistic regression model ...
y_pred = lg_model.predict(X_test)
# ... and print per-class precision / recall / F1 against the true test labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89       477
           1       0.98      0.99      0.99      3739

    accuracy                           0.98      4216
   macro avg       0.95      0.92      0.94      4216
weighted avg       0.97      0.98      0.97      4216



# SVM

In [30]:
from sklearn.svm import SVC

In [31]:
# Support vector classifier with default hyper-parameters, fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
svm_model = SVC().fit(X_train, y_train)

In [32]:
# Predict on the test features with the fitted SVM ...
y_pred = svm_model.predict(X_test)
# ... and print per-class precision / recall / F1 against the true test labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92       477
           1       0.98      1.00      0.99      3739

    accuracy                           0.98      4216
   macro avg       0.99      0.92      0.95      4216
weighted avg       0.98      0.98      0.98      4216



# Neural Net

In [33]:
import tensorflow as tf

2023-06-16 19:47:02.843721: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-16 19:47:02.887916: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-16 19:47:02.888590: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [43]:
def plot_nn(history):
    """Plot the training curves of a fitted Keras model side by side.

    The left panel shows training and validation loss per epoch, the right
    panel training and validation accuracy per epoch.

    Args:
        history: object returned by ``Model.fit`` whose ``history`` dict
            holds the ``loss``, ``val_loss``, ``accuracy`` and
            ``val_accuracy`` series.
    """
    # The keyword is ``figsize``; ``fig_size`` is rejected as an unknown argument.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

    ax1.plot(history.history['loss'], label='loss')
    ax1.plot(history.history['val_loss'], label='val_loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Binary Crossentropy')
    ax1.legend()
    ax1.grid(True)

    ax2.plot(history.history['accuracy'], label='accuracy')
    ax2.plot(history.history['val_accuracy'], label='val_accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.legend()
    ax2.grid(True)

    plt.show()




In [40]:
def nn_train(X_train, y_train, X_valid, y_valid, epochs: int, nodes: int, dropout_prob: float, lr: float, batch_size: int):
    """Train a small fully connected binary classifier.

    The network is two hidden ReLU layers of ``nodes`` units, each followed by
    dropout, and a single sigmoid output unit. It is trained with Adam on
    binary cross-entropy and reports accuracy as an extra metric.

    Args:
        X_train: 2-D training feature matrix; its second dimension defines
            the input width of the network.
        y_train: training labels.
        X_valid, y_valid: validation data passed to ``fit`` as
            ``validation_data`` (used instead of ``validation_split``).
        epochs (int): number of passes over the training data; must be < 1000.
        nodes (int): units in each hidden dense layer; must be <= 64.
        dropout_prob (float): dropout rate in [0, 1).
        lr (float): learning rate of the Adam optimizer.
        batch_size (int): mini-batch size.

    Returns:
        tuple: ``(model, history)`` - the fitted ``tf.keras`` model and the
        ``History`` object returned by ``fit``.

    Raises:
        AssertionError: if ``epochs``, ``nodes`` or ``dropout_prob`` is out of range.
    """
    assert epochs < 1000, "Epochs too big"
    # Inclusive bound so that a 64-unit layer is accepted.
    assert nodes <= 64, "Nodes too big"
    assert 0 <= dropout_prob < 1, "Dropout Prob must be probability"

    # Derive the input width from the data instead of hard-coding 7 features.
    n_features = X_train.shape[1]
    nn_model = tf.keras.Sequential([
        tf.keras.layers.Dense(nodes, activation='relu', input_shape=(n_features,)),
        # randomly disable a fraction of the units at each training step
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(nodes, activation='relu'),
        tf.keras.layers.Dropout(dropout_prob),
        # sigmoid squashes the output into (0, 1), read as the probability of class 1
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='binary_crossentropy', metrics=['accuracy'])
    history = nn_model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_valid, y_valid), verbose=0)
    return nn_model, history


In [44]:
# Exhaustive search over a small hyper-parameter grid; the model with the
# lowest validation loss is kept in least_loss_model.
least_val_loss = float('inf')
least_loss_model = None
epochs = 100
for nodes in [16, 32, 64]:
    for dropout_prob in [0, 0.2]:
        for lr in [0.1, 0.005, 0.001]:
            for batch_size in [32, 64, 128]:
                print(f"{nodes} nodes, dropout_prob {dropout_prob}, lr {lr}, batch size {batch_size}")
                model, history = nn_train(X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, epochs=epochs, nodes=nodes, dropout_prob=dropout_prob, lr=lr, batch_size=batch_size)
                plot_nn(history=history)
                # The model is compiled with an accuracy metric, so evaluate()
                # returns [loss, accuracy]; only the loss is compared.
                val_loss = model.evaluate(X_valid, y_valid)[0]
                if val_loss < least_val_loss:
                    least_val_loss = val_loss
                    least_loss_model = model


16 nodes, dropout_prob 0, lr 0.1, batch size 32
