NSL-KDD Network Intrusion Detection
===

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

random_seed = 17

## Auxiliary Functions

In [3]:
# Show the Model Classification Report 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def show_clf_result(model_name, actual, prediction):
    
    acc = accuracy_score(actual, prediction)
    cm = confusion_matrix(actual, prediction)
    report = classification_report(actual, prediction)
    
    print("%s Classification Report" %model_name, "\n")
    print('Test Accuracy : %s' %acc, "\n")
    print("Confusion Matrix\n", cm, "\n")
    print('Classification Metrics\n', report)

## Data Loading

In [4]:
train = pd.read_csv('NSL_KDD-master/KDDTrain+.csv', header = None)
test = pd.read_csv('NSL_KDD-master/KDDTest+.csv', header = None)
data = pd.concat([train, test], axis = 0)


# Drop the last column which is unknown
data = data.drop(42, axis = 1)

# Assign the column name
columns_name = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 
                'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_hot_login',
                'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
                'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 
                'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
                'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack_type']
data.columns = columns_name

print('Train Data : ', train.shape)
print('Test Data :', test.shape)
print("Full Data : ", data.shape)
data.head()

Train Data :  (125973, 43)
Test Data : (22543, 43)
Full Data :  (148516, 42)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


## Create the Label Column
- Using attack diction to map to four attack type lable : DoS, Probe, R2L, U2R 

In [5]:
data['attack_type'].unique()

array(['normal', 'neptune', 'warezclient', 'ipsweep', 'portsweep',
       'teardrop', 'nmap', 'satan', 'smurf', 'pod', 'back', 'guess_passwd',
       'ftp_write', 'multihop', 'rootkit', 'buffer_overflow', 'imap',
       'warezmaster', 'phf', 'land', 'loadmodule', 'spy', 'perl', 'saint',
       'mscan', 'apache2', 'snmpgetattack', 'processtable', 'httptunnel',
       'ps', 'snmpguess', 'mailbomb', 'named', 'sendmail', 'xterm', 'worm',
       'xlock', 'xsnoop', 'sqlattack', 'udpstorm'], dtype=object)

In [6]:
attack_dict = {
    'normal': 'normal',
    
    'back': 'DoS',
    'land': 'DoS',
    'neptune': 'DoS',
    'pod': 'DoS',
    'smurf': 'DoS',
    'teardrop': 'DoS',
    'mailbomb': 'DoS',
    'apache2': 'DoS',
    'processtable': 'DoS',
    'udpstorm': 'DoS',
    
    'ipsweep': 'Probe',
    'nmap': 'Probe',
    'portsweep': 'Probe',
    'satan': 'Probe',
    'mscan': 'Probe',
    'saint': 'Probe',

    'ftp_write': 'R2L',
    'guess_passwd': 'R2L',
    'imap': 'R2L',
    'multihop': 'R2L',
    'phf': 'R2L',
    'spy': 'R2L',
    'warezclient': 'R2L',
    'warezmaster': 'R2L',
    'sendmail': 'R2L',
    'named': 'R2L',
    'snmpgetattack': 'R2L',
    'snmpguess': 'R2L',
    'xlock': 'R2L',
    'xsnoop': 'R2L',
    'worm': 'R2L',
    
    'buffer_overflow': 'U2R',
    'loadmodule': 'U2R',
    'perl': 'U2R',
    'rootkit': 'U2R',
    'httptunnel': 'U2R',
    'ps': 'U2R',    
    'sqlattack': 'U2R',
    'xterm': 'U2R'
}

data['label'] = data['attack_type'].map(attack_dict)
data = data.drop(['attack_type'], axis = 1)
print(data['label'].value_counts())

normal    77053
DoS       53385
Probe     14077
R2L        3749
U2R         252
Name: label, dtype: int64


In [7]:
label_idx = [40]
nominal_idx = [1, 2, 3]
binary_idx = [6, 11, 13, 14, 20, 21]
numeric_idx = list(set(range(41)).difference(label_idx).difference(nominal_idx).difference(binary_idx))

print(data.shape)
print(len(label_idx) + len(nominal_idx) + len(binary_idx) + len(numeric_idx))

(148516, 42)
41


## Check Missing Value & Constant Variable

In [8]:
# Check Missing Value
data is not None and not data.empty and not data.isnull().values.any()

# Check Constant Variable
for col in data.columns:
    if len(data[col].unique()) == 1:
        print("Remove Constant Variable : ", col)
        data = data.drop(col, axis=1)
print(data.shape)

Remove Constant Variable :  num_outbound_cmds
(148516, 41)


## Encoding with Categorical Variables
- One Hot Encoding
- Binary Encoding (yet)

In [9]:
data_num_ohe = pd.get_dummies(data.iloc[:, :-1])

data_ohe = pd.concat([data_num_ohe, data.iloc[:, -1]], axis = 1)
print('Full Data Shape after OHE : ', data_ohe.shape)

Full Data Shape after OHE :  (148516, 122)


In [10]:
data_ohe.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,normal
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,normal
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,DoS
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal


## Splitting Training, Testing Data

In [11]:
X_train, y_train = data_ohe.iloc[:train.shape[0], :-1], data_ohe.iloc[:train.shape[0], -1]
X_test, y_test = data_ohe.iloc[train.shape[0]:, :-1], data_ohe.iloc[train.shape[0]:, -1]

print('Training : ', X_train.shape, ' Testing : ', X_test.shape)

Training :  (125973, 121)  Testing :  (22543, 121)


In [12]:
X_train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


## First Trial : Random Forest, XGBoost, GBDT
Without tuning hyperparameter
- Random Forest Test Acc : 0.76
- XGBoost Test Acc :

In [24]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits = 3, random_state = random_seed)

### Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(random_state = random_seed)
model_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=17, verbose=0, warm_start=False)

In [42]:
predict_rf = model_rf.predict(X_test)
show_clf_result('Random Forest', y_test, predict_rf)

Random Forest Classification Report 

Test Accuracy : 0.764361442576 

Confusion Matrix
 [[6023  288    0    0 1147]
 [ 161 1618    0    0  642]
 [   0    3  133    2 2616]
 [   0    5    3    7  185]
 [  69  191    0    0 9450]] 

Classification Metrics
              precision    recall  f1-score   support

        DoS       0.96      0.81      0.88      7458
      Probe       0.77      0.67      0.71      2421
        R2L       0.98      0.05      0.09      2754
        U2R       0.78      0.04      0.07       200
     normal       0.67      0.97      0.80      9710

avg / total       0.82      0.76      0.72     22543



### XGBoost

In [46]:
from xgboost import XGBClassifier

model_xgb = XGBClassifier(objective = 'multi:softmax',
                          n_jobs = -1,
                          random_state = random_seed)
model_xgb.fit(X_train, y_train,
              eval_set = [(X_train, y_train), (X_test, y_test)], 
              eval_metric = 'merror',
              early_stopping_rounds = 10,
              verbose = True)

[0]	validation_0-merror:0.029887	validation_1-merror:0.27259
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 10 rounds.
[1]	validation_0-merror:0.029268	validation_1-merror:0.267356
[2]	validation_0-merror:0.024862	validation_1-merror:0.271392
[3]	validation_0-merror:0.022933	validation_1-merror:0.271836
[4]	validation_0-merror:0.023172	validation_1-merror:0.271836
[5]	validation_0-merror:0.02314	validation_1-merror:0.271126
[6]	validation_0-merror:0.023346	validation_1-merror:0.270949
[7]	validation_0-merror:0.020965	validation_1-merror:0.270683
[8]	validation_0-merror:0.020631	validation_1-merror:0.266735
[9]	validation_0-merror:0.020362	validation_1-merror:0.266646
[10]	validation_0-merror:0.020528	validation_1-merror:0.266646
[11]	validation_0-merror:0.020528	validation_1-merror:0.266469
[12]	validation_0-merror:0.020409	validation_1-merror:0.267134
[13]	validation_0-merror:0.013

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='multi:softprob',
       random_state=17, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [47]:
predict_xgb = model_xgb.predict(X_test)
show_clf_result('XGBoost', y_test, predict_xgb)

XGBoost Classification Report 

Test Accuracy : 0.771547708823 

Confusion Matrix
 [[6188   61    0    0 1209]
 [ 166 1472   45    0  738]
 [   0    4  285    2 2463]
 [   0    2    2    4  192]
 [  67  197    1    1 9444]] 

Classification Metrics
              precision    recall  f1-score   support

        DoS       0.96      0.83      0.89      7458
      Probe       0.85      0.61      0.71      2421
        R2L       0.86      0.10      0.18      2754
        U2R       0.57      0.02      0.04       200
     normal       0.67      0.97      0.80      9710

avg / total       0.81      0.77      0.74     22543



## GBDT

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

model_gbdt = GradientBoostingClassifier(random_state = random_seed, verbose = 1)
model_gbdt.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1      142028.0629            4.06m
         2      117303.4288            3.98m
         3       98525.4997            4.08m
         4       83517.6391            4.13m
         5       71333.2791            4.14m
         6       61270.9264            4.08m
         7       52851.7826            3.97m
         8       45877.1042            3.87m
         9       39930.5474            3.88m
        10       35011.5807            3.84m
        20       11052.8459            3.39m
        30        4861.1494            3.20m
        40        4593.1640            2.69m
        50        3819.5501            2.26m
        60        3375.6090            1.82m
        70        3084.0521            1.35m
        80        2884.6771           52.15s
        90        2693.2860           25.93s
       100        2587.9867            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=17, subsample=1.0, verbose=1,
              warm_start=False)

In [34]:
predict_gbdt = model_gbdt.predict(X_test)
show_clf_result('Gradient Boosting Decision Tree', y_test, predict_gbdt)

Gradient Boosting Decision Tree Classification Report 

Test Accuracy : 0.767732777359 

Confusion Matrix
 [[5969   41    0    0 1448]
 [ 106 1617  218    0  480]
 [   0    8  289    2 2455]
 [   0    7    7   11  175]
 [  70  216    1    2 9421]] 

Classification Metrics
              precision    recall  f1-score   support

        DoS       0.97      0.80      0.88      7458
      Probe       0.86      0.67      0.75      2421
        R2L       0.56      0.10      0.18      2754
        U2R       0.73      0.06      0.10       200
     normal       0.67      0.97      0.80      9710

avg / total       0.78      0.77      0.74     22543



## Ensemble Method : Major Voting

In [48]:
from sklearn.ensemble import VotingClassifier

model_mv = VotingClassifier(estimators=[('rf', model_rf),
                                        ('gbdt', model_gbdt),
                                        ('xgb', model_xgb)], voting='soft', n_jobs = -1)

model_mv = model_mv.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1      142028.0629            6.31m
         2      117303.4288            5.58m
         3       98525.4997            5.62m
         4       83517.6391            5.51m
         5       71333.2791            5.36m
         6       61270.9264            5.24m
         7       52851.7826            5.13m
         8       45877.1042            5.02m
         9       39930.5474            4.87m
        10       35011.5807            4.85m
        20       11052.8459            4.15m
        30        4861.1494            3.64m
        40        4593.1640            3.03m
        50        3819.5501            2.45m
        60        3375.6090            1.90m
        70        3084.0521            1.38m
        80        2884.6771           53.89s
        90        2693.2860           26.39s
       100        2587.9867            0.00s


In [49]:
predict_mv = model_mv.predict(X_test)
show_clf_result('Major Voting', y_test, predict_mv)

Major Voting Classification Report 

Test Accuracy : 0.772878498869 

Confusion Matrix
 [[6163   53    0    0 1242]
 [ 165 1532    8    0  716]
 [   0    4  277    2 2471]
 [   0    1    3    7  189]
 [  68  197    0    1 9444]] 

Classification Metrics
              precision    recall  f1-score   support

        DoS       0.96      0.83      0.89      7458
      Probe       0.86      0.63      0.73      2421
        R2L       0.96      0.10      0.18      2754
        U2R       0.70      0.04      0.07       200
     normal       0.67      0.97      0.79      9710

avg / total       0.82      0.77      0.74     22543



## Ensemble Method : Stacking

## Trivial

In [None]:
# Reverse Mapping
#inv_attack_dict = {v: k for k, v in attack_dict.items()}
#train['label'] = train['label'].map(inv_attack_dict)