NSL-KDD Network Intrusion Detection
===

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

random_seed = 17

## Auxiliary Functions

In [2]:
# Show the Model Classification Report 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def show_clf_result(model_name, actual, prediction):
    
    acc = accuracy_score(actual, prediction)
    cm = confusion_matrix(actual, prediction)
    report = classification_report(actual, prediction)
    
    print("%s Classification Report" %model_name, "\n")
    print('Test Accuracy : %s' %acc, "\n")
    print("Confusion Matrix\n", cm, "\n")
    print('Classification Metrics\n', report)

## Data Loading

In [3]:
train = pd.read_csv('NSL_KDD-master/KDDTrain+.csv', header = None)
test = pd.read_csv('NSL_KDD-master/KDDTest+.csv', header = None)
data = pd.concat([train, test], axis = 0)


# Drop the last column which is unknown
data = data.drop(42, axis = 1)

# Assign the column name
columns_name = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 
                'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_hot_login',
                'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
                'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 
                'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
                'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack_type']
data.columns = columns_name

print('Train Data : ', train.shape)
print('Test Data :', test.shape)
print("Full Data : ", data.shape)
data.head()

Train Data :  (125973, 43)
Test Data : (22543, 43)
Full Data :  (148516, 42)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


## Create the Label Column
- Using attack diction to map to four attack type lable : DoS, Probe, R2L, U2R 

In [4]:
data['attack_type'].unique()

array(['normal', 'neptune', 'warezclient', 'ipsweep', 'portsweep',
       'teardrop', 'nmap', 'satan', 'smurf', 'pod', 'back', 'guess_passwd',
       'ftp_write', 'multihop', 'rootkit', 'buffer_overflow', 'imap',
       'warezmaster', 'phf', 'land', 'loadmodule', 'spy', 'perl', 'saint',
       'mscan', 'apache2', 'snmpgetattack', 'processtable', 'httptunnel',
       'ps', 'snmpguess', 'mailbomb', 'named', 'sendmail', 'xterm', 'worm',
       'xlock', 'xsnoop', 'sqlattack', 'udpstorm'], dtype=object)

In [5]:
attack_dict = {
    'normal': 'normal',
    
    'back': 'DoS',
    'land': 'DoS',
    'neptune': 'DoS',
    'pod': 'DoS',
    'smurf': 'DoS',
    'teardrop': 'DoS',
    'mailbomb': 'DoS',
    'apache2': 'DoS',
    'processtable': 'DoS',
    'udpstorm': 'DoS',
    
    'ipsweep': 'Probe',
    'nmap': 'Probe',
    'portsweep': 'Probe',
    'satan': 'Probe',
    'mscan': 'Probe',
    'saint': 'Probe',

    'ftp_write': 'R2L',
    'guess_passwd': 'R2L',
    'imap': 'R2L',
    'multihop': 'R2L',
    'phf': 'R2L',
    'spy': 'R2L',
    'warezclient': 'R2L',
    'warezmaster': 'R2L',
    'sendmail': 'R2L',
    'named': 'R2L',
    'snmpgetattack': 'R2L',
    'snmpguess': 'R2L',
    'xlock': 'R2L',
    'xsnoop': 'R2L',
    'worm': 'R2L',
    
    'buffer_overflow': 'U2R',
    'loadmodule': 'U2R',
    'perl': 'U2R',
    'rootkit': 'U2R',
    'httptunnel': 'U2R',
    'ps': 'U2R',    
    'sqlattack': 'U2R',
    'xterm': 'U2R'
}

data['label'] = data['attack_type'].map(attack_dict)
data = data.drop(['attack_type'], axis = 1)
print(data['label'].value_counts())

normal    77053
DoS       53385
Probe     14077
R2L        3749
U2R         252
Name: label, dtype: int64


In [6]:
label_idx = [40]
nominal_idx = [1, 2, 3]
binary_idx = [6, 11, 13, 14, 20, 21]
numeric_idx = list(set(range(41)).difference(label_idx).difference(nominal_idx).difference(binary_idx))

print(data.shape)
print(len(label_idx) + len(nominal_idx) + len(binary_idx) + len(numeric_idx))

(148516, 42)
41


## Check Missing Value & Constant Variable

In [7]:
# Check Missing Value
data is not None and not data.empty and not data.isnull().values.any()

# Check Constant Variable
for col in data.columns:
    if len(data[col].unique()) == 1:
        print("Remove Constant Variable : ", col)
        data = data.drop(col, axis=1)
print(data.shape)

Remove Constant Variable :  num_outbound_cmds
(148516, 41)


## Encoding with Categorical Variables
- One Hot Encoding
- Binary Encoding (yet)

In [8]:
data_num_ohe = pd.get_dummies(data.iloc[:, :-1])

data_ohe = pd.concat([data_num_ohe, data.iloc[:, -1]], axis = 1)
print('Full Data Shape after OHE : ', data_ohe.shape)

Full Data Shape after OHE :  (148516, 122)


In [9]:
data_ohe.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,normal
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,normal
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,DoS
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal


## Feature Selection Using BoostARoota

In [17]:
#!pip install boostaroota

from boostaroota import BoostARoota

label_dict = {
    'normal' : 0,
    'DoS' : 1,
    'Probe' : 2,
    'R2L' : 3,
    'U2R' : 4} 

X = data_ohe.iloc[:, :-1]
y = data_ohe.iloc[:, -1].map(label_dict)

br = BoostARoota(metric='mlogloss', iters = 4, max_rounds = 10)
br.fit(X, y)
br.keep_vars_

Round:  1  iteration:  1
Round:  1  iteration:  2
Round:  1  iteration:  3
Round:  1  iteration:  4
Round:  2  iteration:  1
Round:  2  iteration:  2
Round:  2  iteration:  3
Round:  2  iteration:  4
BoostARoota ran successfully! Algorithm went through  2  rounds.


0                   duration
1                  src_bytes
2                  dst_bytes
3                       land
4             wrong_fragment
5                     urgent
6                        hot
7          num_failed_logins
8                  logged_in
9            num_compromised
10                root_shell
11              su_attempted
12                  num_root
13        num_file_creations
14                num_shells
15          num_access_files
16              is_hot_login
17            is_guest_login
18                     count
19                 srv_count
20               serror_rate
21           srv_serror_rate
22               rerror_rate
23           srv_rerror_rate
24             same_srv_rate
25             diff_srv_rate
26        srv_diff_host_rate
27            dst_host_count
28        dst_host_srv_count
29    dst_host_same_srv_rate
               ...          
38         protocol_type_tcp
39         protocol_type_udp
40               service_X11
42            

In [18]:
new_data_ohe = br.transform(X)
print(new_data_ohe.shape)
new_data_ohe.head()

(148516, 68)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,service_urp_i,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


## Splitting Training, Testing Data

In [20]:
X_train, y_train = new_data_ohe.iloc[:train.shape[0], :-1], data_ohe.iloc[:train.shape[0], -1]
X_test, y_test = new_data_ohe.iloc[train.shape[0]:, :-1], data_ohe.iloc[train.shape[0]:, -1]

print('Training : ', X_train.shape, ' Testing : ', X_test.shape)

Training :  (125973, 67)  Testing :  (22543, 67)


In [21]:
X_train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,service_time,service_urp_i,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S3,flag_SF
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [22]:
y_train.value_counts()

normal    67343
DoS       45927
Probe     11656
R2L         995
U2R          52
Name: label, dtype: int64

## First Trial : Random Forest, XGBoost, CatBoost
Without tuning hyperparameter
- Random Forest Test Acc : 0.76
- XGBoost Test Acc :

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

kfold = StratifiedKFold(n_splits = 10)

### Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(random_state = random_seed)
model_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=17, verbose=0, warm_start=False)

In [24]:
predict_rf = model_rf.predict(X_test)
show_clf_result('Random Forest', y_test, predict_rf)

Random Forest Classification Report 

Test Accuracy : 0.747105531651 

Confusion Matrix
 [[5777   33    0    0 1648]
 [ 166 1482    0    0  773]
 [   0    3  128    0 2623]
 [   0    0    4    8  188]
 [  70  191    2    0 9447]] 

Classification Metrics
              precision    recall  f1-score   support

        DoS       0.96      0.77      0.86      7458
      Probe       0.87      0.61      0.72      2421
        R2L       0.96      0.05      0.09      2754
        U2R       1.00      0.04      0.08       200
     normal       0.64      0.97      0.77      9710

avg / total       0.81      0.75      0.71     22543



### XGBoost

In [26]:
from xgboost import XGBClassifier

model_xgb = XGBClassifier(objective = 'multi:softmax',
                          n_jobs = -1,
                          random_state = random_seed)
model_xgb.fit(X_train, y_train,
              eval_set = [(X_train, y_train), (X_test, y_test)], 
              eval_metric='merror',
              early_stopping_rounds = 5,
              verbose=True)

[0]	validation_0-merror:0.029887	validation_1-merror:0.27259
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.029268	validation_1-merror:0.267356
[2]	validation_0-merror:0.024862	validation_1-merror:0.271392
[3]	validation_0-merror:0.022933	validation_1-merror:0.271836
[4]	validation_0-merror:0.023172	validation_1-merror:0.271836
[5]	validation_0-merror:0.02314	validation_1-merror:0.271126
[6]	validation_0-merror:0.023346	validation_1-merror:0.270949
Stopping. Best iteration:
[1]	validation_0-merror:0.029268	validation_1-merror:0.267356



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='multi:softprob',
       random_state=17, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [27]:
predict_xgb = model_xgb.predict(X_test)
show_clf_result('XGBoost', y_test, predict_xgb)

XGBoost Classification Report 

Test Accuracy : 0.729051146697 

Confusion Matrix
 [[5355   76    0    0 2027]
 [ 337 1478    0    0  606]
 [   0    4  162    1 2587]
 [ 107    0    3   10   80]
 [  58  219    1    2 9430]] 

Classification Metrics
              precision    recall  f1-score   support

        DoS       0.91      0.72      0.80      7458
      Probe       0.83      0.61      0.70      2421
        R2L       0.98      0.06      0.11      2754
        U2R       0.77      0.05      0.09       200
     normal       0.64      0.97      0.77      9710

avg / total       0.79      0.73      0.69     22543



In [None]:
import xgboost as xgb

label_dict = {
    'normal' : 0,
    'DoS' : 1,
    'Probe' : 2,
    'R2L' : 3,
    'U2R' : 4} 

train_xgb = xgb.DMatrix(X_train, y_train.map(label_dict))
test_xgb = xgb.DMatrix(X_test, y_test.map(label_dict))

watchlist  = [(test_xgb, 'test'), (train_xgb, 'train')]

In [None]:
params = {
    'objective': 'multi:softmax',
    'num_class' : 5,
}
num_rounds = 10

model_xgb = xgb.train(params, train_xgb, num_rounds, watchlist,
                early_stopping_rounds = 5)

In [None]:
predict_xgb = model_xgb.predict(test_xgb)

label_iv_dict = {
    0 : 'normal',
    1 : 'DoS',
    2 : 'Probe',
    3 : 'R2L',
    4 : 'U2R'}
predict_xgb = np.vectorize(label_iv_dict.get)(predict_xgb)

show_clf_result('XGBoost', y_test, predict_xgb)

## GBDT

In [28]:
from sklearn.ensemble import GradientBoostingClassifier

model_gbdt = GradientBoostingClassifier(random_state = random_seed, verbose = 1)
model_gbdt.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1      142028.0629            3.08m
         2      117303.4288            2.91m
         3       98525.4997            2.84m
         4       83517.6391            2.78m
         5       71333.2791            2.74m
         6       61270.9264            2.69m
         7       52851.7826            2.64m
         8       45877.1042            2.59m
         9       39930.5474            2.55m
        10       35011.5807            2.51m
        20       11055.6757            2.17m
        30        4912.1926            1.89m
        40 8554578293079598.0000            1.60m
        50 8554578293084744.0000            1.30m
        60 8554578293084301.0000            1.02m
        70 8554578293084064.0000           46.50s
        80 8554578293083884.0000           30.48s
        90 8554578293083747.0000           15.19s
       100 8554578293083615.0000            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=17, subsample=1.0, verbose=1,
              warm_start=False)

In [29]:
predict_gbdt = model_gbdt.predict(X_test)
show_clf_result('Random Forest', y_test, predict_gbdt)

Random Forest Classification Report 

Test Accuracy : 0.771946945837 

Confusion Matrix
 [[5978   27    0    0 1453]
 [ 166 1673   19    0  563]
 [   0    8  305    2 2439]
 [   0    6    5   12  177]
 [  72  201    1    2 9434]] 

Classification Metrics
              precision    recall  f1-score   support

        DoS       0.96      0.80      0.87      7458
      Probe       0.87      0.69      0.77      2421
        R2L       0.92      0.11      0.20      2754
        U2R       0.75      0.06      0.11       200
     normal       0.67      0.97      0.79      9710

avg / total       0.82      0.77      0.74     22543



## Ensemble Method : Major Voting

In [30]:
from sklearn.ensemble import VotingClassifier

model_mv = VotingClassifier(estimators=[('rf', model_rf),
                                        ('gbdt', model_gbdt),
                                        ('xgb', model_xgb)], voting='soft', n_jobs = -1)

model_mv = model_mv.fit(X_train, y_train)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x1125ff908>>
Traceback (most recent call last):
  File "/Users/kennyhsieh/anaconda3/lib/python3.6/site-packages/xgboost/core.py", line 368, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


      Iter       Train Loss   Remaining Time 
         1      142028.0629            4.51m
         2      117303.4288            4.28m
         3       98525.4997            4.10m
         4       83517.6391            3.87m
         5       71333.2791            3.86m
         6       61270.9264            3.93m
         7       52851.7826            3.87m
         8       45877.1042            3.80m
         9       39930.5474            3.78m
        10       35011.5807            3.70m
        20       11055.6757            2.94m
        30        4912.1926            2.52m
        40 8554578293079598.0000            2.07m
        50 8554578293084744.0000            1.66m
        60 8554578293084301.0000            1.29m
        70 8554578293084064.0000           56.59s
        80 8554578293083884.0000           36.59s
        90 8554578293083747.0000           17.78s
       100 8554578293083615.0000            0.00s


In [31]:
predict_mv = model_mv.predict(X_test)
show_clf_result('Major Voting', y_test, predict_mv)

Major Voting Classification Report 

Test Accuracy : 0.768353812713 

Confusion Matrix
 [[6092   22    0    0 1344]
 [ 166 1494   12    0  749]
 [   0    4  287    2 2461]
 [   0    1    3    7  189]
 [  70  197    1    1 9441]] 

Classification Metrics
              precision    recall  f1-score   support

        DoS       0.96      0.82      0.88      7458
      Probe       0.87      0.62      0.72      2421
        R2L       0.95      0.10      0.19      2754
        U2R       0.70      0.04      0.07       200
     normal       0.67      0.97      0.79      9710

avg / total       0.82      0.77      0.73     22543



## Ensemble Method : Stacking

## Trivial

In [None]:
# Reverse Mapping
#inv_attack_dict = {v: k for k, v in attack_dict.items()}
#train['label'] = train['label'].map(inv_attack_dict)