NSL-KDD Network Intrusion Detection
===

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

random_seed = 17

## Auxiliary Functions

In [2]:
# Show the Model Classification Report 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def show_clf_result(model_name, actual, prediction):
    
    acc = accuracy_score(actual, prediction)
    cm = confusion_matrix(actual, prediction)
    report = classification_report(actual, prediction)
    
    print("%s Classification Report" %model_name, "\n")
    print('Test Accuracy : %s' %acc, "\n")
    print("Confusion Matrix\n", cm, "\n")
    print('Classification Metrics\n', report)

## Data Loading

In [3]:
train = pd.read_csv('NSL_KDD-master/KDDTrain+.csv', header = None)
test = pd.read_csv('NSL_KDD-master/KDDTest+.csv', header = None)
data = pd.concat([train, test], axis = 0)


# Drop the last column which is unknown
data = data.drop(42, axis = 1)

# Assign the column name
columns_name = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 
                'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_hot_login',
                'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
                'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 
                'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
                'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack_type']
data.columns = columns_name

print('Train Data : ', train.shape)
print('Test Data :', test.shape)
print("Full Data : ", data.shape)
data.head()

Train Data :  (125973, 43)
Test Data : (22543, 43)
Full Data :  (148516, 42)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


## Create the Label Column
- Using attack diction to map to four attack type lable : DoS, Probe, R2L, U2R 

In [4]:
data['attack_type'].unique()

array(['normal', 'neptune', 'warezclient', 'ipsweep', 'portsweep',
       'teardrop', 'nmap', 'satan', 'smurf', 'pod', 'back', 'guess_passwd',
       'ftp_write', 'multihop', 'rootkit', 'buffer_overflow', 'imap',
       'warezmaster', 'phf', 'land', 'loadmodule', 'spy', 'perl', 'saint',
       'mscan', 'apache2', 'snmpgetattack', 'processtable', 'httptunnel',
       'ps', 'snmpguess', 'mailbomb', 'named', 'sendmail', 'xterm', 'worm',
       'xlock', 'xsnoop', 'sqlattack', 'udpstorm'], dtype=object)

In [5]:
attack_dict = {
    'normal': 'normal',
    
    'back': 'DoS',
    'land': 'DoS',
    'neptune': 'DoS',
    'pod': 'DoS',
    'smurf': 'DoS',
    'teardrop': 'DoS',
    'mailbomb': 'DoS',
    'apache2': 'DoS',
    'processtable': 'DoS',
    'udpstorm': 'DoS',
    
    'ipsweep': 'Probe',
    'nmap': 'Probe',
    'portsweep': 'Probe',
    'satan': 'Probe',
    'mscan': 'Probe',
    'saint': 'Probe',

    'ftp_write': 'R2L',
    'guess_passwd': 'R2L',
    'imap': 'R2L',
    'multihop': 'R2L',
    'phf': 'R2L',
    'spy': 'R2L',
    'warezclient': 'R2L',
    'warezmaster': 'R2L',
    'sendmail': 'R2L',
    'named': 'R2L',
    'snmpgetattack': 'R2L',
    'snmpguess': 'R2L',
    'xlock': 'R2L',
    'xsnoop': 'R2L',
    'worm': 'R2L',
    
    'buffer_overflow': 'U2R',
    'loadmodule': 'U2R',
    'perl': 'U2R',
    'rootkit': 'U2R',
    'httptunnel': 'U2R',
    'ps': 'U2R',    
    'sqlattack': 'U2R',
    'xterm': 'U2R'
}

data['label'] = data['attack_type'].map(attack_dict)
data = data.drop(['attack_type'], axis = 1)
print(data['label'].value_counts())

normal    77053
DoS       53385
Probe     14077
R2L        3749
U2R         252
Name: label, dtype: int64


In [6]:
label_idx = [40]
nominal_idx = [1, 2, 3]
binary_idx = [6, 11, 13, 14, 20, 21]
numeric_idx = list(set(range(41)).difference(label_idx).difference(nominal_idx).difference(binary_idx))

print(data.shape)
print(len(label_idx) + len(nominal_idx) + len(binary_idx) + len(numeric_idx))

(148516, 42)
41


## Check Missing Value & Constant Variable

In [7]:
# Check Missing Value
data is not None and not data.empty and not data.isnull().values.any()

# Check Constant Variable
for col in data.columns:
    if len(data[col].unique()) == 1:
        print("Remove Constant Variable : ", col)
        data = data.drop(col, axis=1)
print(data.shape)

Remove Constant Variable :  num_outbound_cmds
(148516, 41)


## Encoding with Categorical Variables
- One Hot Encoding
- Binary Encoding (yet)

In [8]:
data_num_ohe = pd.get_dummies(data.iloc[:, :-1])

data_ohe = pd.concat([data_num_ohe, data.iloc[:, -1]], axis = 1)
print('Full Data Shape after OHE : ', data_ohe.shape)

Full Data Shape after OHE :  (148516, 122)


In [9]:
data_ohe.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,normal
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,normal
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,DoS
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,normal


## Splitting Training, Testing Data

In [24]:
X_train, y_train = data_ohe.iloc[:train.shape[0], :-1], data_ohe.iloc[:train.shape[0], -1]
X_test, y_test = data_ohe.iloc[train.shape[0]:, :-1], data_ohe.iloc[train.shape[0]:, -1]

print('Training : ', X_train.shape, ' Testing : ', X_test.shape)

Training :  (125973, 121)  Testing :  (22543, 121)


In [25]:
X_train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


## First Trial : Random Forest, XGBoost, CatBoost
Without tuning hyperparameter
- Random Forest Test Acc : 0.76
- XGBoost Test Acc :

In [26]:
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

kfold = StratifiedKFold(n_splits = 10)

### Random Forest

In [35]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(random_state = random_seed)
model_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=17, verbose=0, warm_start=False)

In [36]:
predict_rf = model_rf.predict(X_test)
show_clf_result('Random Forest', y_test, predict_rf)

Random Forest Classification Report 

Test Accuracy : 0.764361442576 

Confusion Matrix
 [[6023  288    0    0 1147]
 [ 161 1618    0    0  642]
 [   0    3  133    2 2616]
 [   0    5    3    7  185]
 [  69  191    0    0 9450]] 

Classification Metrics
              precision    recall  f1-score   support

        DoS       0.96      0.81      0.88      7458
      Probe       0.77      0.67      0.71      2421
        R2L       0.98      0.05      0.09      2754
        U2R       0.78      0.04      0.07       200
     normal       0.67      0.97      0.80      9710

avg / total       0.82      0.76      0.72     22543



### XGBoost

In [None]:
from xgboost import XGBClassifier

model_xgb = XGBClassifier(objective = 'multi:softmax',
                          n_jobs = -1,
                          random_state = random_seed)
model_xgb.fit(X_train, y_train,
              eval_set = [(X_train, y_train), (X_test, y_test)], 
              eval_metric='merror',
              early_stopping_rounds = 10,
              verbose=True)

In [69]:
predict_xgb = model_xgb.predict(X_test)
show_clf_result('XGBoost', y_test, predict_xgb)

AttributeError: 'DataFrame' object has no attribute 'feature_names'

In [66]:
import xgboost as xgb

label_dict = {
    'normal' : 0,
    'DoS' : 1,
    'Probe' : 2,
    'R2L' : 3,
    'U2R' : 4} 

train_xgb = xgb.DMatrix(X_train, y_train.map(label_dict))
test_xgb = xgb.DMatrix(X_test, y_test.map(label_dict))

watchlist  = [(test_xgb, 'test'), (train_xgb, 'train')]

In [67]:
params = {
    'objective': 'multi:softmax',
    'num_class' : 5,
}
num_rounds = 10

model_xgb = xgb.train(params, train_xgb, num_rounds, watchlist,
                early_stopping_rounds = 5)

[0]	test-merror:0.254758	train-merror:0.005581
Multiple eval metrics have been passed: 'train-merror' will be used for early stopping.

Will train until train-merror hasn't improved in 5 rounds.
[1]	test-merror:0.255556	train-merror:0.004295
[2]	test-merror:0.250854	train-merror:0.003961
[3]	test-merror:0.235639	train-merror:0.003096
[4]	test-merror:0.239276	train-merror:0.002604
[5]	test-merror:0.239232	train-merror:0.002596
[6]	test-merror:0.241006	train-merror:0.002421
[7]	test-merror:0.247438	train-merror:0.002247
[8]	test-merror:0.230803	train-merror:0.002
[9]	test-merror:0.230981	train-merror:0.00185


In [68]:
predict_xgb = model_xgb.predict(test_xgb)

label_iv_dict = {
    0 : 'normal',
    1 : 'DoS',
    2 : 'Probe',
    3 : 'R2L',
    4 : 'U2R'}
predict_xgb = np.vectorize(label_iv_dict.get)(predict_xgb)

show_clf_result('XGBoost', y_test, predict_xgb)

XGBoost Classification Report 

Test Accuracy : 0.769019207736 

Confusion Matrix
 [[6138  223    0    0 1097]
 [ 164 1579   33    0  645]
 [   0  129  176    1 2448]
 [   0    2    2    6  190]
 [  67  204    1    1 9437]] 

Classification Metrics
              precision    recall  f1-score   support

        DoS       0.96      0.82      0.89      7458
      Probe       0.74      0.65      0.69      2421
        R2L       0.83      0.06      0.12      2754
        U2R       0.75      0.03      0.06       200
     normal       0.68      0.97      0.80      9710

avg / total       0.80      0.77      0.73     22543



## GBDT

In [42]:
from sklearn.ensemble import GradientBoostingClassifier

model_gbdt = GradientBoostingClassifier(random_state = random_seed, verbose = 1)
model_gbdt.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1      142028.0629            4.28m
         2      117303.4288            4.14m
         3       98525.4997            4.09m
         4       83517.6391            4.14m
         5       71333.2791            4.25m
         6       61270.9264            4.76m
         7       52851.7826            4.65m
         8       45877.1042            4.53m
         9       39930.5474            4.41m
        10       35011.5807            4.30m
        20       11052.8459            3.58m
        30        4861.1494            3.18m
        40        4593.1640            2.63m
        50        3819.5501            2.19m
        60        3375.6090            1.70m
        70        3084.0521            1.25m
        80        2884.6771           50.48s
        90        2693.2860           25.08s
       100        2587.9867            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=17, subsample=1.0, verbose=1,
              warm_start=False)

In [43]:
predict_gbdt = model_gbdt.predict(X_test)
show_clf_result('Random Forest', y_test, predict_gbdt)

Random Forest Classification Report 

Test Accuracy : 0.767732777359 

Confusion Matrix
 [[5969   41    0    0 1448]
 [ 106 1617  218    0  480]
 [   0    8  289    2 2455]
 [   0    7    7   11  175]
 [  70  216    1    2 9421]] 

Classification Metrics
              precision    recall  f1-score   support

        DoS       0.97      0.80      0.88      7458
      Probe       0.86      0.67      0.75      2421
        R2L       0.56      0.10      0.18      2754
        U2R       0.73      0.06      0.10       200
     normal       0.67      0.97      0.80      9710

avg / total       0.78      0.77      0.74     22543



## Ensemble Method : Major Voting

['DoS' 'DoS' 'normal' ..., 'DoS' 'normal' 'Probe']


In [44]:
from sklearn.ensemble import VotingClassifier

model_mv = VotingClassifier(estimators=[('rf', model_rf),
                                        ('gbdt', model_gbdt),
                                        ('xgb', model_xgb)], voting='hard', n_jobs = -1)

model_mv = model_mv.fit(X_train, y_train)

TypeError: Cannot clone object '<xgboost.core.Booster object at 0x1a48cbda90>' (type <class 'xgboost.core.Booster'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' methods.

## Ensemble Method : Stacking

## Trivial

In [None]:
# Reverse Mapping
#inv_attack_dict = {v: k for k, v in attack_dict.items()}
#train['label'] = train['label'].map(inv_attack_dict)