In [115]:
import matplotlib.pylab as plt
import pandas as pd
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.svm import SVC
import numpy as np
import hdbscan
import os.path
from seaborn import scatterplot

from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import socket
import struct



## Preparation

In [116]:
import pandas as pd
import os.path
FILEID = 1
FILEIDS = [1, 3, 36, 39, 49, 52]
LOGFILE = f"https://mcfp.felk.cvut.cz/publicDatasets/IoTDatasets/CTU-IoT-Malware-Capture-{FILEID}-1/bro/conn.log.labeled"
NUMERIC_COLUMNS = ['ts', 'orig_p', 'resp_p', "orig_bytes", "resp_bytes", "missed_bytes", "orig_pkts", 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'duration'] # integer columns

In [117]:
def convert_to_csv(FILEID):
    if not os.path.isfile(f'csv/capture{FILEID}_1.csv'):
        LOGFILE = f"https://mcfp.felk.cvut.cz/publicDatasets/IoTDatasets/CTU-IoT-Malware-Capture-{FILEID}-1/bro/conn.log.labeled"
        fieldsIN = ['ts', 'uid', 'orig_h', 'orig_p', 'resp_h', 'resp_p', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state',
                'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents', 'label', 'detailed_label']
        df = pd.read_csv(LOGFILE, sep="\x09|\x20\x20\x20", skiprows=10, skipfooter=2,
                        names=fieldsIN, header=None, engine='python')
                                
        df = df.drop(['tunnel_parents'], axis=1)
        df.to_csv(f'csv/capture{FILEID}_1.csv')
        print(df.shape)
        for column in NUMERIC_COLUMNS:
            df[column] = pd.to_numeric(df[column], errors='coerce')
        return df


In [149]:
def convert_ipv4(addr):
    return struct.unpack("!I", socket.inet_aton(addr))[0]

def con_proto(proto):
    protos = ['icmp', 'tcp', 'udp']
    return protos.index(proto)

In [150]:
def norm_df(df, rows=None):
    df = df.replace('-', np.nan)
    df['orig_h'] = df['orig_h'].apply(convert_ipv4)
    df['resp_h'] = df['resp_h'].apply(convert_ipv4)
    # df = pd.get_dummies(df, columns=['proto'], dtype=int)
    df['proto'] = df['proto'].apply(con_proto)
    # print(dummies.head())
    df = df.drop(['Unnamed: 0','service', 'duration', 'missed_bytes', 'history', 'uid', 'conn_state', 'local_orig', 'local_resp', 'orig_ip_bytes', 'resp_ip_bytes', 'orig_pkts', 'resp_pkts', 'detailed_label', 'ts'], axis=1)
    df['label'] = (df['label'] == "Malicious").astype(int)
    # df['resp_bytes'] = df['resp_bytes'].apply(to_int)
    df = df.dropna()
    df['resp_bytes'] = df['resp_bytes'].astype(int)
    df['orig_bytes'] = df['orig_bytes'].astype(int)
    df = df.reset_index(drop=True)
    if (rows): return df[:rows]
    else: return df

In [151]:
SOURCEDF = norm_df(pd.read_csv('csv/capture1_1.csv'), 500000)
print(f"{SOURCEDF.shape}\n")
print(f'{SOURCEDF.dtypes}\n')
print(f"data categories = [Malicious: {SOURCEDF[SOURCEDF['label'] == 1].shape[0]}, Benign: {SOURCEDF[SOURCEDF['label'] == 0].shape[0]}]")
SOURCEDF.head()


(210787, 8)

orig_h        int64
orig_p        int64
resp_h        int64
resp_p        int64
proto         int64
orig_bytes    int64
resp_bytes    int64
label         int64
dtype: object

data categories = [Malicious: 187771, Benign: 23016]


Unnamed: 0,orig_h,orig_p,resp_h,resp_p,proto,orig_bytes,resp_bytes,label
0,3232261223,60905,2209273747,23,1,0,0,1
1,3232261223,34243,2466726347,49560,1,0,0,0
2,3232261223,52259,3250416306,23,1,0,0,1
3,3232261223,42207,178903906,8080,1,0,0,1
4,3232261223,40459,3564025020,23,1,0,0,1


## Visualise Data

In [121]:
def viz(df):
    for label in df.columns:
        plt.hist(df[df["label"] == 1][label], color='blue', label='Malicious', alpha=0.7, density=1)
        plt.hist(df[df["label"] == 0][label], color='red', label='Benign', alpha=0.7, density=1)
        plt.title(label)
        plt.ylabel('?')
        plt.xlabel(label)
        plt.legend()
        plt.show()

# viz(SOURCEDF)

## Training using ANN
link: [here](https://www.analyticsvidhya.com/blog/2021/10/implementing-artificial-neural-networkclassification-in-python-from-scratch/)

or [here](https://towardsdatascience.com/visualizing-artificial-neural-networks-anns-with-just-one-line-of-code-b4233607209e)

this for encoding: [encoding](https://datagy.io/sklearn-one-hot-encode/)

In [122]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
# used to sample the data in a way, that the dataset is equally distributed between categories
from imblearn.over_sampling import RandomOverSampler


In [192]:
def split(df):
    train, valid, test = np.split(df.sample(frac=1), [int(0.6*len(df)), int(0.8*len(df))])
    return train, valid, test

In [193]:
def scale(df: pd.DataFrame, oversample = False):
    X = df.drop(['label'], axis=1)
    y = df['label']

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    if oversample:
        ros = RandomOverSampler()
        X, y = ros.fit_resample(X, y)

    data = np.hstack((X, np.reshape(y, (-1, 1))))
    return data, X, y

In [194]:
train, valid, test = split(SOURCEDF)
train, X_train, y_train = scale(train, True) 
valid, X_valid, y_valid = scale(valid, False) 
test, X_test, y_test = scale(test, False) 

# kNN

In [195]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [196]:
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train, y_train)

In [197]:
y_pred = knn_model.predict(X_test)

In [198]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96      4412
           1       1.00      0.99      1.00     37746

    accuracy                           0.99     42158
   macro avg       0.97      0.99      0.98     42158
weighted avg       0.99      0.99      0.99     42158

