In [1]:
# Author: Kaylani Bochie
# github.com/kaylani2
# kaylani AT gta DOT ufrj DOT br

import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
from scipy.io import arff




In [2]:
###############################################################################
## Define constants
###############################################################################
# Random state for reproducibility
STATE = 0
np.random.seed (STATE)
## Hard to not go over 80 columns
IOT_DIRECTORY = '../../../../datasets/cardiff/IoT-Arff-Datasets/'
IOT_ATTACK_TYPE_FILENAME = 'AttackTypeClassification.arff'
FILE_NAME = IOT_DIRECTORY + IOT_ATTACK_TYPE_FILENAME


In [4]:

###############################################################################
## Load dataset
###############################################################################
pd.set_option ('display.max_rows', None)
pd.set_option ('display.max_columns', 5)
data = arff.loadarff (FILE_NAME)
df = pd.DataFrame (data [0])
print ('Dataframe shape (lines, collumns):', df.shape, '\n')
print ('First 5 entries:\n', df [:5], '\n')

## Fraction dataframe for quicker testing (copying code is hard)
#df = df.sample (frac = 0.1, replace = True, random_state = STATE)
#print ('Using fractured dataframe.')

### Decode byte strings into ordinary strings:
print ('Decoding byte strings into ordinary strings.')
strings = df.select_dtypes ( [np.object])
strings = strings.stack ().str.decode ('utf-8').unstack ()
for column in strings:
    df [column] = strings [column]
print ('Done.\n')




Dataframe shape (lines, collumns): (220785, 136) 

First 5 entries:
    packet_id     num  ...  class_is_malicious  class_attack_type
0  1230361.0    98.0  ...                b'0'             b'N/A'
1  1131523.0    66.0  ...                b'0'             b'N/A'
2    63682.0  1294.0  ...                b'0'             b'N/A'
3    64788.0    95.0  ...                b'0'             b'N/A'
4    78512.0    60.0  ...                b'0'             b'N/A'

[5 rows x 136 columns] 

Decoding byte strings into ordinary strings.
Done.



In [5]:
###############################################################################
## Display generic (dataset independent) information
###############################################################################
print ('Dataframe shape (lines, collumns):', df.shape, '\n')
print ('First 5 entries:\n', df [:5], '\n')
#print ('Dataframe attributes:\n', df.keys (), '\n')
df.info (verbose = False) # Make it true to find individual atribute types
#print (df.describe ()) # Brief statistical description on NUMERICAL atributes

print ('Dataframe contains NaN values:', df.isnull ().values.any ())
nanColumns = [i for i in df.columns if df [i].isnull ().any ()]
print ('Number of NaN columns:', len (nanColumns))
#print ('NaN columns:', nanColumns, '\n')




Dataframe shape (lines, collumns): (220785, 136) 

First 5 entries:
    packet_id     num  ...  class_is_malicious  class_attack_type
0  1230361.0    98.0  ...                   0                N/A
1  1131523.0    66.0  ...                   0                N/A
2    63682.0  1294.0  ...                   0                N/A
3    64788.0    95.0  ...                   0                N/A
4    78512.0    60.0  ...                   0                N/A

[5 rows x 136 columns] 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220785 entries, 0 to 220784
Columns: 136 entries, packet_id to class_attack_type
dtypes: float64(112), object(24)
memory usage: 229.1+ MB
Dataframe contains NaN values: True
Number of NaN columns: 94


In [6]:
###############################################################################
## Display specific (dataset dependent) information
###############################################################################
print ('Label types:', df ['class_attack_type'].unique ())
print ('Label distribution:\n', df ['class_attack_type'].value_counts ())




Label types: ['N/A' 'DoS' 'iot-toolkit' 'MITM' 'Scanning']
Label distribution:
 N/A            110390
Scanning        51479
DoS             41236
MITM             9085
iot-toolkit      8595
Name: class_attack_type, dtype: int64


In [7]:
###############################################################################
## Data pre-processing
###############################################################################
df.replace (['NaN', 'NaT'], np.nan, inplace = True)
df.replace ('?', np.nan, inplace = True)
df.replace ('Infinity', np.nan, inplace = True)



In [8]:
## Remove NaN values
print ('Column | NaN values')
print (df.isnull ().sum ())
### K: 150k samples seems to be a fine cutting point for this dataset
print ('Removing attributes with more than half NaN and inf values.')
df = df.dropna (axis = 'columns', thresh = 150000)
print ('Dataframe contains NaN values:', df.isnull ().values.any ())
print ('Column | NaN values (after dropping columns)')
print (df.isnull ().sum ())
### K: This leaves us with the following attributes to encode:
### Attribute            NaN values
#   ip.hdr_len           7597
#   ip.dsfield.dscp      7597
#   ip.dsfield.ecn       7597
#   ip.len               7597
#   ip.flags             7597
#   ip.frag_offset       7597
#   ip.ttl               7597
#   ip.proto             7597
#   ip.checksum.status   7597git log --all --graph --decorate
### K: Options: Remove these samples or handle them later.
### K: Removing them for now.
print ('Removing samples with NaN values (not a lot of these).')
df = df.dropna (axis = 'rows', thresh = df.shape [1])
print ('Column | NaN values (after dropping rows)')
print (df.isnull ().sum ())
print ('Dataframe contains NaN values:', df.isnull ().values.any ())

### K: We probably want to remove attributes that have only one sampled value.
print ('Removing attributes that have only one sampled value.')
print ('Column | # of different values')
nUniques = df.nunique ()
for column, nUnique in zip (df.columns, nUniques):
  if (nUnique <= 7):
    print (column, df [column].unique ())
  else:
    print (column, nUnique)

  if (nUnique == 1): # Only one value: DROP.
    df.drop (axis = 'columns', columns = column, inplace = True)


df.info (verbose = False)
### K: dtypes: float64 (27), int64 (1), object (5)
#print (df.columns.to_series ().groupby (df.dtypes).groups, '\n\n')
print ('Objects:', list (df.select_dtypes ( ['object']).columns), '\n')
### K: Objects: [
# 'ip.flags.df', {0, 1}
# 'ip.flags.mf', {0, 1}
# 'packet_type', {in, out}
# LABELS:
# 'class_device_type', {AmazonEcho, BelkinCam, Hive, SmartThings,
#                       Lifx, TPLinkCam, TPLinkPlug, AP, Firewall, unknown}
# 'class_is_malicious' {0, 1}
#]

### K: Look into each attribute to define the best encoding strategy.
### K: NOTE: packet_type and class_device_type are labels for different
### applications, not attributes. They must not be used to aid classification.
print ('Dropping class_device_type and class_is_malicious.')
print ('These are labels for other scenarios.')
df.drop (axis = 'columns', columns = 'class_device_type', inplace = True)
df.drop (axis = 'columns', columns = 'class_is_malicious', inplace = True)

### K: NOTE: ip.flags.df and ip.flags.mf only have numerical values, but have
### been loaded as objects because (probably) of missing values, so we can
### just convert them instead of treating them as categorical.
print ('ip.flags.df and ip.flags.mf have been incorrectly read as objects.')
print ('Converting them to numeric.')
df ['ip.flags.df'] = pd.to_numeric (df ['ip.flags.df'])
df ['ip.flags.mf'] = pd.to_numeric (df ['ip.flags.mf'])
print ('Objects:', list (df.select_dtypes ( ['object']).columns), '\n')



###############################################################################
## Encode Label
###############################################################################
print ('Enconding label.')
print ('Label types before conversion:', df ['class_attack_type'].unique ())
#df ['class_attack_type'] = df ['class_attack_type'].replace ('N/A', 0)
#df ['class_attack_type'] = df ['class_attack_type'].replace ('DoS', 1)
#df ['class_attack_type'] = df ['class_attack_type'].replace ('iot-toolkit', 2)
#df ['class_attack_type'] = df ['class_attack_type'].replace ('MITM', 3)
#df ['class_attack_type'] = df ['class_attack_type'].replace ('Scanning', 4)
print ('Label types after conversion:', df ['class_attack_type'].unique ())




Column | NaN values
packet_id                            0
num                                  0
len                                  0
caplen                               0
timestamp                            0
frame.encap_type                     0
frame.offset_shift                   0
frame.time_epoch                     0
frame.time_delta                     0
frame.time_delta_displayed           0
frame.time_relative                  0
frame.number                         0
frame.len                            0
frame.cap_len                        0
frame.marked                         0
frame.ignored                        0
eth.lg                               0
eth.ig                               0
ip.version                        7597
ip.hdr_len                        7597
ip.dsfield.dscp                   7597
ip.dsfield.ecn                    7597
ip.src                          220785
ip.dst                          220785
ip.len                            7597
ip.fl

eth.lg [0.]
eth.ig [0.]
ip.version ['4']
ip.hdr_len [20. 24.]
ip.dsfield.dscp [ 0.  4. 48. 32.  6. 46.  5.]
ip.dsfield.ecn [0.]
ip.len 512
ip.flags [40.  0. 21. 20.  1.]
ip.flags.rb ['0']
ip.flags.df ['1' '0']
ip.flags.mf ['0' '1']
ip.frag_offset 190
ip.ttl 61
ip.proto [ 1.  6. 17.  2.]
ip.checksum.status [2.]
packet_type ['out' 'in']
class_device_type ['AmazonEcho' 'BelkinCam' 'Hive' 'SmartThings' 'Lifx' 'TPLinkCam'
 'TPLinkPlug']
class_is_malicious ['0' '1']
class_attack_type ['N/A' 'DoS' 'iot-toolkit' 'MITM' 'Scanning']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 213188 entries, 0 to 220784
Columns: 25 entries, packet_id to class_attack_type
dtypes: float64(19), object(6)
memory usage: 42.3+ MB
Objects: ['ip.flags.df', 'ip.flags.mf', 'packet_type', 'class_device_type', 'class_is_malicious', 'class_attack_type'] 

Dropping class_device_type and class_is_malicious.
These are labels for other scenarios.
ip.flags.df and ip.flags.mf have been incorrectly read as objects.
Converting

In [19]:
###############################################################################
## Handle categorical attributes
###############################################################################
print ('\nHandling categorical attributes (label encoding).')
from sklearn.preprocessing import LabelEncoder
myLabelEncoder = LabelEncoder ()
df ['packet_type'] = myLabelEncoder.fit_transform (df ['packet_type'])

### TODO: onehotencoder ta dando nan na saida, ajeitar isso ai
#from sklearn.preprocessing import OneHotEncoder
#enc = OneHotEncoder (handle_unknown = 'error')
#enc_df = pd.DataFrame (enc.fit_transform (df [ ['packet_type']]).toarray ())
#df = df.join (enc_df)
#df.drop (axis = 'columns', columns = 'packet_type', inplace = True)
#
#### K: NOTE: This transformed the dataframe in a way that the last column is
#### no longer the target. We have to fix that:
#cols_at_end = ['class_attack_type']
#df = df [ [c for c in df if c not in cols_at_end]
#        + [c for c in cols_at_end if c in df]]

print ('Label types:', df ['class_attack_type'].unique ())
print ('Label distribution:\n', df ['class_attack_type'].value_counts ())

print(df.columns)
###############################################################################
## Convert dataframe to a numpy array
###############################################################################
print ('\nConverting dataframe to numpy array.')
X = df.iloc [:, :-1].values
y = df.iloc [:, -1].values


print(X.shape)


Handling categorical attributes (label encoding).
Label types: ['N/A' 'DoS' 'iot-toolkit' 'MITM' 'Scanning']
Label distribution:
 N/A            110390
Scanning        50956
DoS             41236
iot-toolkit      8433
MITM             2173
Name: class_attack_type, dtype: int64
Index(['packet_id', 'num', 'len', 'caplen', 'timestamp', 'frame.time_epoch',
       'frame.time_delta', 'frame.time_delta_displayed', 'frame.time_relative',
       'frame.number', 'frame.len', 'frame.cap_len', 'ip.hdr_len',
       'ip.dsfield.dscp', 'ip.len', 'ip.flags', 'ip.flags.df', 'ip.flags.mf',
       'ip.frag_offset', 'ip.ttl', 'ip.proto', 'packet_type',
       'class_attack_type'],
      dtype='object')

Converting dataframe to numpy array.
(213188, 22)


In [28]:
###############################################################################
## Split dataset into train and test sets
###############################################################################
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 4/10,
                                                     random_state = STATE)
print ('X_train shape:', X_train.shape)
print ('y_train shape:', y_train.shape)
print ('X_test shape:', X_test.shape)
print ('y_test shape:', y_test.shape)


###############################################################################
## Apply normalization
###############################################################################
print ('Applying normalization (standard)')
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler ()
scaler.fit (X_train)
#print ('Mean before scalling:', scaler.mean_)
X_train = scaler.transform (X_train)
scaler.fit (X_train)
#print ('Mean after scalling:', scaler.mean_)

scaler.fit (X_test)
X_test = scaler.transform (X_test)

#### K: One hot encode the output.
#import keras.utils
#from keras.utils import to_categorical
numberOfClasses = df ['class_attack_type'].value_counts ()
#y_train = keras.utils.to_categorical (y_train, numberOfClasses)
#y_test = keras.utils.to_categorical (y_test, numberOfClasses)

print('\n\nDebugging number of samples')
numberOfClasses


X_train shape: (127912, 22)
y_train shape: (127912,)
X_test shape: (85276, 22)
y_test shape: (85276,)
Applying normalization (standard)


N/A            110390
Scanning        50956
DoS             41236
iot-toolkit      8433
MITM             2173
Name: class_attack_type, dtype: int64

In [29]:
###############################################################################
## Create learning model (Naive Bayes)
###############################################################################
print ('Creating learning model.')
# training a Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.metrics import f1_score, classification_report, multilabel_confusion_matrix
gnb = GaussianNB ()
gnb.fit (X_train, y_train)
gnb_predictions = gnb.predict (X_test)

# accuracy on X_test
accuracy = gnb.score (X_test, y_test)
print ('acc:', accuracy)

# creating a confusion matrix
print (confusion_matrix (y_test, gnb_predictions) )
print ('\n\n')
#print (multilabel_confusion_matrix (y_test, gnb_predictions) )
print ('\n\n')
print (classification_report (y_test, gnb_predictions, target_names = df ['class_attack_type'].unique (), digits = 3))
print ('\n\n')
print ('precision score', precision_score (y_test, gnb_predictions, average = 'macro') )
print ('recall score', recall_score (y_test, gnb_predictions, average = 'macro') )
print ('f1 score', f1_score (y_test, gnb_predictions, average = 'macro') )

sys.exit ()



Creating learning model.
acc: 0.6179464327595103
[[ 8693     0  8006     0     0]
 [    0     0   816     0     0]
 [    0     0 44003     0     0]
 [    0     0 20435     0     0]
 [    0     0  3323     0     0]]








  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         N/A      1.000     0.521     0.685     16699
         DoS      0.000     0.000     0.000       816
 iot-toolkit      0.575     1.000     0.730     44003
        MITM      0.000     0.000     0.000     20435
    Scanning      0.000     0.000     0.000      3323

    accuracy                          0.618     85276
   macro avg      0.315     0.304     0.283     85276
weighted avg      0.492     0.618     0.511     85276






  _warn_prf(average, modifier, msg_start, len(result))


precision score 0.314915842941645
recall score 0.3041140188035212
f1 score 0.2829046451496244


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [15]:
###############################################################################
## Analyze results
###############################################################################
from sklearn.metrics import confusion_matrix, classification_report
### K: NOTE: Only look at test results when publishing...
# gnb.predict outputs one hot encoding, our test is label encoded....
y_pred = gnb.predict (X_test)
#print ('y_pred shape:', y_pred.shape)
#print ('y_test shape:', y_test.shape)
#print (y_pred [:50])
y_pred = y_pred.round ()
#print (y_pred [:50])
#print (confusion_matrix (y_test, y_pred))
print (classification_report (y_test, y_pred, digits = 3))
scoreArray = gnb.evaluate (X_test, y_test, verbose = True)
print ('Test loss:', scoreArray [0])
print ('Test accuracy:', scoreArray [1])

import matplotlib.pyplot as plt

plt.plot (history.history ['categorical_accuracy'])
plt.plot (history.history ['val_categorical_accuracy'])
plt.title ('gnb accuracy')
plt.ylabel ('Categorical Accuracy')
plt.xlabel ('Epoch')
plt.legend (['Train', 'Validation'], loc = 'upper left')
plt.show ()

plt.plot (history.history ['accuracy'])
plt.plot (history.history ['val_accuracy'])
plt.title ('gnb accuracy')
plt.ylabel ('Accuracy')
plt.xlabel ('Epoch')
plt.legend (['Train', 'Validation'], loc = 'upper left')
plt.show ()

plt.plot (history.history ['loss'])
plt.plot (history.history ['val_loss'])
plt.title ('gnb loss')
plt.ylabel ('Loss')
plt.xlabel ('Epoch')
plt.legend (['Train', 'Validation'], loc = 'upper left')
plt.show ()



sys.exit ()

TypeError: ufunc 'rint' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''