<a href="https://colab.research.google.com/github/jwhwan9/colab/blob/main/Machine_Learning_For_IoRT_Security.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Machine Learning for IoRT Security of Robotic Control Systems**
---
### Author: Thierno Gueye
### Description: This notebook provides a detailed analysis of the application of machine learning techniques to securing the control systems of robotic devices in an Internet of Robotic Things (IoRT) network.
---


In [None]:
# Importing relevant python libraries for Exploratory Data Analysis

import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read by path
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# helper code for extracting the tar file from source
import tarfile

from os import mkdir
from os.path import isdir

def extract_tar(source, dest):
    """Extract a gzip-compressed tar archive into a destination directory.

    Parameters
    ----------
    source : str
        Path of the archive. Only paths ending in 'tar.gz' are extracted;
        for any other path nothing is extracted.
    dest : str
        Directory to extract into. It is created, together with any missing
        parent directories, when it does not exist yet.

    Raises
    ------
    OSError
        If `dest` cannot be created or `source` cannot be opened.
    tarfile.TarError
        If `source` is not a readable gzip-compressed tar archive.

    Notes
    -----
    `extractall` writes members to the paths stored in the archive, so this
    helper should only be used with archives from a trusted source.
    """
    # local import: the file-level `mkdir` cannot create nested directories
    from os import makedirs

    # create destination dir (and missing parents) if it does not exist
    makedirs(dest, exist_ok=True)

    if source.endswith('tar.gz'):
        # the context manager closes the archive even when extraction fails
        with tarfile.open(source, 'r:gz') as tar:
            tar.extractall(dest)

# NOTE: This is a temporary source path due to storage limitations of Google Drive. The data can always be obtained from the
# official repository and transferred to Google Drive, e.g. with the MultCloud free drive transfer service.
data_file = '/content/drive/MyDrive/Projects - Freelance/Colab Notebooks/Thierno Gueye/Code Files/iot_23_datasets_small.tar.gz'
extracted = '/content/sample_data/extracted_files'    # a temporary extraction location

# Unpack the archive from Google Drive into the temporary extraction location
extract_tar(data_file, extracted)

# Data Pre-Processing

In [None]:
# Loading the labeled connection logs (conn.log.labeled) of the selected captures
# into one DataFrame per capture.

malware_dataset_ids = [1,17,20,21,3,33,34,35,36,39,42,43,44,48,49,52,60,7,8,9]
benign_dataset_ids = [7,5,4]
data_columns = ['ts',
              'uid',
              'id.orig_h',
              'id.orig_p',
              'id.resp_h',
              'id.resp_p',
              'proto',
              'service',
              'duration',
              'orig_bytes',
              'resp_bytes',
              'conn_state',
              'local_orig',
              'local_resp',
              'missed_bytes',
              'history',
              'orig_pkts',
              'orig_ip_bytes',
              'resp_pkts',
              'resp_ip_bytes',
              'label']
dataset_dict = {}

# Folder holding the individual captures; relative to the notebook's working directory.
SCENARIO_ROOT = "./sample_data/extracted_files/opt/Malware-Project/BigDataset/IoTScenarios"
# NOTE(review): if a log carries fewer than SKIP_ROWS leading metadata lines, the
# first data records are skipped as well - confirm against the files.
SKIP_ROWS = 10       # leading lines skipped in every log file
MAX_ROWS = 100000    # upper bound on the number of rows read per capture


def _read_conn_log(filepath):
    """Read one labeled connection log into a DataFrame with `data_columns` as header.

    `header=None` together with `names` keeps the first row after the skipped
    lines as data; with pandas' default header inference that row would be
    consumed as column names and silently lost.

    Rows whose first field starts with '#' (presumably the '#close' footer of the
    log - TODO confirm) are removed. The last row is therefore only discarded
    when it really is such a marker, not when `nrows` cut the read short and the
    last row is a regular record.
    """
    frame = pd.read_table(
        filepath_or_buffer=filepath,
        skiprows=SKIP_ROWS,
        nrows=MAX_ROWS,
        header=None,
        names=data_columns,
    )
    marker_rows = frame['ts'].astype(str).str.startswith('#')
    return frame.loc[~marker_rows].copy()


# Malware captures are stored under the keys df1 .. dfN
for position, capture_id in enumerate(malware_dataset_ids, start=1):
    filepath = f"{SCENARIO_ROOT}/CTU-IoT-Malware-Capture-{capture_id}-1/bro/conn.log.labeled"
    dataset_dict[f"df{position}"] = _read_conn_log(filepath)


# Adding the benign datasets; their keys continue right after the malware ones
first_benign_position = len(malware_dataset_ids) + 1
for position, capture_id in enumerate(benign_dataset_ids, start=first_benign_position):
    # capture 7 keeps its log in an additional 'Somfy-01' sub-folder
    log_dir = "Somfy-01/bro" if capture_id == 7 else "bro"
    filepath = f"{SCENARIO_ROOT}/CTU-Honeypot-Capture-{capture_id}-1/{log_dir}/conn.log.labeled"
    dataset_dict[f"df{position}"] = _read_conn_log(filepath)


# Also expose every frame as a notebook-level variable (df1, df2, ...), as the
# original cell did; `dataset_dict` remains the primary container.
globals().update(dataset_dict)


In [None]:
# Sanity check: list the top level of the extraction directory
! ls './sample_data/extracted_files/'

opt


In [None]:
# Stack the per-capture frames into a single DataFrame.
# ignore_index=True gives every row a unique index label; without it each
# capture's own row numbers are kept, so the combined index contains duplicates.
dataset = pd.concat(list(dataset_dict.values()), ignore_index=True)
dataset.head(5)

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label
0,1525879832.01624,CDe43c1PtgynajGI6,192.168.100.103,60905.0,131.174.215.147,23.0,tcp,-,2.998796,0,...,S0,-,-,0.0,S,3.0,180.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
1,1525879832.024985,CJaDcG3MZzvf1YVYI4,192.168.100.103,44301.0,91.42.47.63,23.0,tcp,-,-,-,...,S0,-,-,0.0,S,1.0,60.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
2,1525879832.044975,CMBrup3BLXivSp4Avc,192.168.100.103,50244.0,120.210.108.200,23.0,tcp,-,-,-,...,S0,-,-,0.0,S,1.0,60.0,0.0,0.0,(empty) Malicious PartOfAHorizontalPortScan
3,1525879833.016171,CfHl9r3XMYtDQRrHnh,192.168.100.103,34243.0,147.7.65.203,49560.0,tcp,-,2.998804,0,...,S0,-,-,0.0,S,3.0,180.0,0.0,0.0,(empty) Benign -
4,1525879833.044906,C7USrA15nFVkniMqC5,192.168.100.103,34840.0,145.164.35.6,21288.0,tcp,-,-,-,...,S0,-,-,0.0,S,1.0,60.0,0.0,0.0,(empty) Benign -


In [None]:
# (number of rows, number of columns) of the combined dataset
dataset.shape

(1446621, 21)

In [None]:
# Now observing how often each label value occurs among the records in the dataset
dataset['label'].value_counts()

PartOfAHorizontalPortScan     825939
Okiru                         262690
Benign                        199756
DDoS                          138777
C&C                            15100
Attack                          3915
C&C-HeartBeat                    349
C&C-FileDownload                  43
C&C-Torii                         30
FileDownload                      13
C&C-HeartBeat-FileDownload         8
C&C-Mirai                          1
Name: label, dtype: int64

# Data Cleansing

### Since the dataset contains some messy data, we clean it up by re-labeling values and trimming unnecessary features, e.g. ts, uid, etc.

In [None]:
# Renaming improperly labeled classes.
#
# A raw 'label' value consists of three whitespace-separated fields, e.g.
# '(empty)   Malicious   PartOfAHorizontalPortScan' or '-   Benign   -'.
# Parsing the fields covers every combination, instead of relying on a
# hand-maintained list of the exact raw strings.

def _clean_label(raw_label):
    """Reduce a raw three-field label to its class name.

    Benign records (in any capitalisation) become 'Benign'; malicious records
    keep only their last field (e.g. 'Okiru'). Values that do not have the
    three-field shape - including labels that are already clean - are returned
    unchanged, so the cell can be re-run safely.
    """
    fields = str(raw_label).split()
    if len(fields) != 3:
        return raw_label
    _, verdict, detail = fields
    if verdict.lower() == 'benign':
        return 'Benign'
    if verdict == 'Malicious' and detail != '-':
        return detail
    # unknown combination: leave it visible rather than guessing a class
    return raw_label


dataset['label'] = dataset['label'].map(_clean_label)

# Viewing the cleaner output
dataset['label'].value_counts()

PartOfAHorizontalPortScan     825939
Okiru                         262690
Benign                        199756
DDoS                          138777
C&C                            15100
Attack                          3915
C&C-HeartBeat                    349
C&C-FileDownload                  43
C&C-Torii                         30
FileDownload                      13
C&C-HeartBeat-FileDownload         8
C&C-Mirai                          1
Name: label, dtype: int64

Based on domain understanding of network analysis, certain features are identified as irrelevant to the next stage of analysis and machine learning. These include: the timestamp feature 'ts'; the unique identifier 'uid', which is a random sequence that makes each record unique; 'id.orig_h' and 'id.resp_h', which are just the IP addresses of the originating and responding hosts; 'local_orig' and 'local_resp', which contain only the '-' value representing no information; and 'history', which only contains irrelevant data, so it can be dropped.


In [None]:
# Removing the features identified as irrelevant for the analysis
columns_to_drop = ['ts','uid','id.orig_h','id.resp_h','local_orig','local_resp','history']
dataset = dataset.drop(columns=columns_to_drop)
dataset.head()

Unnamed: 0,id.orig_p,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label
0,60905.0,23.0,tcp,-,2.998796,0,0,S0,0.0,3.0,180.0,0.0,0.0,PartOfAHorizontalPortScan
1,44301.0,23.0,tcp,-,-,-,-,S0,0.0,1.0,60.0,0.0,0.0,PartOfAHorizontalPortScan
2,50244.0,23.0,tcp,-,-,-,-,S0,0.0,1.0,60.0,0.0,0.0,PartOfAHorizontalPortScan
3,34243.0,49560.0,tcp,-,2.998804,0,0,S0,0.0,3.0,180.0,0.0,0.0,Benign
4,34840.0,21288.0,tcp,-,-,-,-,S0,0.0,1.0,60.0,0.0,0.0,Benign


In [None]:
# Inspecting the distinct values of the 'service' feature; note the '-' placeholder among them
dataset['service'].unique()

array(['-', 'http', 'dns', 'irc', 'ssh', 'dhcp', 'ssl'], dtype=object)

In [None]:
# Marking the '-' placeholder in the 'service' feature as a missing value (NaN)
# NOTE(review): 'duration', 'orig_bytes' and 'resp_bytes' also show '-' in the
# preview and are left untouched here - confirm whether they need the same treatment.
is_placeholder = dataset['service'] == '-'
dataset['service'] = dataset['service'].mask(is_placeholder, np.nan)
dataset.head()

Unnamed: 0,id.orig_p,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label
0,60905.0,23.0,tcp,,2.998796,0,0,S0,0.0,3.0,180.0,0.0,0.0,PartOfAHorizontalPortScan
1,44301.0,23.0,tcp,,-,-,-,S0,0.0,1.0,60.0,0.0,0.0,PartOfAHorizontalPortScan
2,50244.0,23.0,tcp,,-,-,-,S0,0.0,1.0,60.0,0.0,0.0,PartOfAHorizontalPortScan
3,34243.0,49560.0,tcp,,2.998804,0,0,S0,0.0,3.0,180.0,0.0,0.0,Benign
4,34840.0,21288.0,tcp,,-,-,-,S0,0.0,1.0,60.0,0.0,0.0,Benign


In [None]:
# Re-checking the distinct 'service' values after the replacement
dataset['service'].unique()

array([nan, 'http', 'dns', 'irc', 'ssh', 'dhcp', 'ssl'], dtype=object)

In [None]:
# Finally, we count the missing values (NaNs) per column
# NOTE(review): '-' placeholders are plain strings and are not counted as missing here
dataset.isna().sum()

id.orig_p              0
id.resp_p              0
proto                  0
service          1434755
duration               0
orig_bytes             0
resp_bytes             0
conn_state             0
missed_bytes           0
orig_pkts              0
orig_ip_bytes          0
resp_pkts              0
resp_ip_bytes          0
label                  0
dtype: int64

Since the SERVICE feature is missing for the vast majority of observations (1,434,755 of 1,446,621, i.e. about 99.18%), it is dropped as well.

In [None]:
# Removing the almost entirely empty SERVICE feature
dataset = dataset.drop(columns='service')

In [None]:
# Let us check the distinct protocols ('proto') used in the network connections
dataset['proto'].unique()

array(['tcp', 'udp', 'icmp'], dtype=object)

In [None]:
# Also, we observe the distinct connection states ('conn_state') of the network connections
dataset['conn_state'].unique()

array(['S0', 'REJ', 'SF', 'OTH', 'RSTOS0', 'RSTR', 'S2', 'RSTRH', 'RSTO',
       'S1', 'SHR', 'SH', 'S3'], dtype=object)

In [None]:
# Protocol and connection state are categorical, so each of their values is
# turned into its own dummy (indicator) column
categorical_features = ['proto', 'conn_state']
dataset = pd.get_dummies(dataset, columns=categorical_features)
dataset.head()

Unnamed: 0,id.orig_p,id.resp_p,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,...,conn_state_RSTOS0,conn_state_RSTR,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR
0,60905.0,23.0,2.998796,0,0,0.0,3.0,180.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
1,44301.0,23.0,-,-,-,0.0,1.0,60.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
2,50244.0,23.0,-,-,-,0.0,1.0,60.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
3,34243.0,49560.0,2.998804,0,0,0.0,3.0,180.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
4,34840.0,21288.0,-,-,-,0.0,1.0,60.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0


In [None]:
# Saving the DataFrame as a comma-separated values (CSV) file for easy sharing and future work
# NOTE(review): the index is written as the first, unnamed column - confirm that
# downstream readers expect it.
clean_csv_path = "/content/drive/MyDrive/Projects - Freelance/Colab Notebooks/Thierno Gueye/Code Files/IoT-23 Updated Clean Dataset.csv"
dataset.to_csv(clean_csv_path)

In [None]:
# Remove the temporary extraction directory and everything in it to save storage space
! rm -rf '/content/sample_data/extracted_files'