In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import seaborn as sns


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file whose
# name contains the substring 'csv'.
for dirname, _, filenames in os.walk('/kaggle/input'):
    csv_names = (filename for filename in filenames if 'csv' in filename)
    for filename in csv_names:
        print(os.path.join(dirname, filename))

/kaggle/input/unsw-nb15/UNSW-NB15_4.csv
/kaggle/input/unsw-nb15/NUSW-NB15_features.csv
/kaggle/input/unsw-nb15/UNSW-NB15_1.csv
/kaggle/input/unsw-nb15/UNSW_NB15_training-set.csv
/kaggle/input/unsw-nb15/UNSW_NB15_testing-set.csv
/kaggle/input/unsw-nb15/UNSW-NB15_2.csv
/kaggle/input/unsw-nb15/UNSW-NB15_LIST_EVENTS.csv
/kaggle/input/unsw-nb15/UNSW-NB15_3.csv


# Utils

In [2]:
def input_train_test(root='../input/unsw-nb15/'):
    """Load the UNSW-NB15 train/test CSV files and drop non-feature columns.

    Parameters
    ----------
    root : str, optional
        Directory containing ``UNSW_NB15_training-set.csv`` and
        ``UNSW_NB15_testing-set.csv``. A trailing path separator is optional.
        Defaults to the Kaggle input location used by this notebook.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` with the ``attack_cat`` and ``id`` columns removed
        wherever they were present.
    """
    # os.path.join works whether or not `root` ends with a separator.
    train = pd.read_csv(os.path.join(root, 'UNSW_NB15_training-set.csv'))
    test = pd.read_csv(os.path.join(root, 'UNSW_NB15_testing-set.csv'))

    # A "training-set" file with exactly this many rows is taken as the sign
    # that the two files are reversed, in which case the frames are swapped.
    reversed_train_rows = 82332
    if train.shape[0] == reversed_train_rows:
        print("Train and test sets are reversed here. Fixing them.")
        train, test = test, train

    drop_columns = ['attack_cat', 'id']
    cleaned = []
    for df in (train, test):
        present = [col for col in drop_columns if col in df.columns]
        for col in present:
            print('Dropping '+col)
        # Build a new frame instead of mutating the loaded one in place.
        cleaned.append(df.drop(columns=present))
    train, test = cleaned
    return train, test

def get_cat_columns(train):
    """Return the names of the columns of ``train`` whose dtype equals ``'object'``.

    Names are listed in the frame's own column order.
    """
    return [name for name in train.columns if train[name].dtype == 'object']
    
def label_encode(train, test):
    """Integer-encode the object-dtype columns of ``train`` in both frames.

    For each column reported by ``get_cat_columns(train)``, one ``LabelEncoder``
    is fitted on the string form of the train and test values combined, and the
    column is then overwritten with its encoded values in both frames.

    Both frames are modified in place and returned as ``(train, test)``.
    """
    def as_strings(frame, name):
        # String form of one column, as a plain Python list.
        return list(frame[name].astype(str).values)

    for name in get_cat_columns(train):
        encoder = LabelEncoder()
        encoder.fit(as_strings(train, name) + as_strings(test, name))
        train[name] = encoder.transform(as_strings(train, name))
        test[name] = encoder.transform(as_strings(test, name))
    return train, test

def feature_engineer(df):
    """Collapse ``state``, ``service`` and ``proto`` values onto fixed allow-lists.

    Values of ``state`` and ``service`` that are not on their allow-list become
    ``'others'``. For ``proto``, ``igmp``/``icmp``/``rtp`` are first merged into
    ``'igmp_icmp_rtp'``, and any value still off the allow-list becomes
    ``'others'``.

    ``df`` is modified in place and also returned.
    """
    allowed_values = {
        'state': ['FIN', 'INT', 'CON', 'REQ', 'RST'],
        'service': ['-', 'dns', 'http', 'smtp', 'ftp-data', 'ftp', 'ssh', 'pop3'],
    }
    for column, allowed in allowed_values.items():
        off_list = ~df[column].isin(allowed)
        df.loc[off_list, column] = 'others'

    # Merge first: the merged label is on the allow-list checked right after.
    to_merge = df['proto'].isin(['igmp', 'icmp', 'rtp'])
    df.loc[to_merge, 'proto'] = 'igmp_icmp_rtp'
    off_list = ~df['proto'].isin(['tcp', 'udp', 'arp', 'ospf', 'igmp_icmp_rtp'])
    df.loc[off_list, 'proto'] = 'others'
    return df

def get_train_test(train, test, label_encoding=False, scaler=None):
    """Split features from the ``label`` column and preprocess both partitions.

    Steps, in order:
      1. Separate ``label`` from the remaining columns of each frame.
      2. Apply ``feature_engineer`` to both feature frames.
      3. If ``scaler`` is given, fit it on the non-object train columns and
         apply the fitted transform to the same columns of the test frame.
      4. Encode object columns, either with ``label_encode`` or with
         ``pd.get_dummies``; in the one-hot case only the columns present in
         both frames are kept.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain a ``label`` column.
    label_encoding : bool, optional
        ``True`` to integer-encode object columns, ``False`` (default) to
        one-hot encode them.
    scaler : object or None, optional
        Object exposing ``fit_transform`` and ``transform``; skipped if ``None``.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``; both feature frames have the
        same columns in the same order.
    """
    x_train, y_train = train.drop(['label'], axis=1), train['label']
    x_test, y_test = test.drop(['label'], axis=1), test['label']

    x_train, x_test = feature_engineer(x_train), feature_engineer(x_test)

    categorical_columns = get_cat_columns(x_train)
    non_categorical_columns = [x for x in x_train.columns if x not in categorical_columns]
    if scaler is not None:
        # Fit on train only; test is transformed with the train statistics.
        x_train[non_categorical_columns] = scaler.fit_transform(x_train[non_categorical_columns])
        x_test[non_categorical_columns] = scaler.transform(x_test[non_categorical_columns])

    if label_encoding:
        x_train, x_test = label_encode(x_train, x_test)
        features = list(x_train.columns)
    else:
        x_train = pd.get_dummies(x_train)
        x_test = pd.get_dummies(x_test)
        print("Column mismatch {0}, {1}".format(set(x_train.columns)- set(x_test.columns),  set(x_test.columns)- set(x_train.columns)))
        # Keep the shared columns in train order. Iterating a set intersection
        # would give an order that can change from one interpreter run to the
        # next, making the feature layout non-reproducible.
        shared_with_test = set(x_test.columns)
        features = [col for col in x_train.columns if col in shared_with_test]
    print(f"Number of features {len(features)}")
    x_train = x_train[features]
    x_test = x_test[features]

    return x_train, y_train, x_test, y_test

def show_feature_importance(importance, columns):
    """Tabulate feature importances as percentages of their total, largest first.

    Each entry of ``columns`` is paired with the matching entry of
    ``importance``. The importances are rescaled so they sum to 100, and the
    rows are returned sorted by descending importance with their original row
    labels.
    """
    table = pd.DataFrame(zip(columns, importance), columns=['Feature', 'Importance'])
    one_percent = table['Importance'].sum() * 0.01
    table['Importance'] = table['Importance'] / one_percent
    ranked = table.sort_values(by="Importance", ascending=False)
    return ranked

# Prepare data

In [3]:
# Load both partitions with the 'attack_cat' and 'id' columns already dropped,
# then record which of the remaining train columns have object dtype.
train, test = input_train_test()
# NOTE(review): `categorical_columns` is not referenced by the later cells
# visible here — confirm whether it is still needed.
categorical_columns = get_cat_columns(train)

Train and test sets are reversed here. Fixing them.
Dropping attack_cat
Dropping id
Dropping attack_cat
Dropping id


In [4]:
# Settings shared by the cells below.
folds = 10
seed = 1
num_round = 2000
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

# Label-encode the object columns and pass a StandardScaler for the others.
X, Y, x_test, y_test = get_train_test(
    train,
    test,
    label_encoding=True,
    scaler=StandardScaler(),
)

# Collects one column of importances per experiment, keyed by experiment name.
importance_dict = dict(feature=X.columns)

Number of features 42


# On train data

In [5]:
# Fit a single forest on the whole training partition and record its feature
# importances. Use the shared `seed` instead of a separate literal so the
# random state is set in one place.
clf = RandomForestClassifier(random_state=seed)
clf.fit(X, Y)
feature_importance = clf.feature_importances_
importance_dict['train'] =  feature_importance
# show_feature_importance(feature_importance, X.columns)

## Ten-fold cross validation

In [6]:
feature_importances = []

# Fit one forest per stratified fold on that fold's training split and collect
# its feature importances; the validation split is not used here.
for tr_idx, val_idx in tqdm(kf.split(X, Y), total=folds):
    # kf.split yields positional indices, so select rows by position for both
    # X and Y rather than relying on Y's index labels matching positions.
    x_train, y_train = X.iloc[tr_idx], Y.iloc[tr_idx]
    # Seed every forest so the averaged importances are reproducible.
    clf = RandomForestClassifier(random_state=seed)
    clf.fit(x_train, y_train)

    feature_importances.append(clf.feature_importances_)

# Average the per-fold importances feature by feature.
feature_importance = np.mean(feature_importances, axis=0)
importance_dict['train_10_fold'] =  feature_importance
# show_feature_importance(feature_importance, X.columns)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




# Combined data (train+test)

In [7]:
# Stack the train and test partitions row-wise. The original row labels are
# kept, so labels may repeat; select rows by position (.iloc) downstream.
total_x = pd.concat([X, x_test])
total_y = pd.concat([Y, y_test])

In [8]:
# Fit a single forest on the combined train+test data and record its feature
# importances. The forest is seeded so the result is reproducible across runs.
clf = RandomForestClassifier(random_state=seed)
clf.fit(total_x, total_y)
feature_importance = clf.feature_importances_
importance_dict['combined'] =  feature_importance
# show_feature_importance(feature_importance, X.columns)

## Ten-fold cross validation

In [9]:
feature_importances = []

# Fit one forest per stratified fold of the combined data on that fold's
# training split and collect its feature importances.
for tr_idx, val_idx in tqdm(kf.split(total_x, total_y), total=folds):
    x_train, y_train = total_x.iloc[tr_idx], total_y.iloc[tr_idx]
    # Seed every forest so the averaged importances are reproducible.
    clf = RandomForestClassifier(random_state=seed)
    clf.fit(x_train, y_train)

    feature_importances.append(clf.feature_importances_)

# Average the per-fold importances feature by feature.
feature_importance = np.mean(feature_importances, axis=0)
importance_dict['combined_10_fold'] =  feature_importance
# show_feature_importance(feature_importance, X.columns)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [10]:
# One row per feature, one column per experiment recorded in importance_dict.
importance_df = pd.DataFrame(importance_dict)

# Every column except the feature names holds importances.
score_columns = [col for col in importance_df.columns if col != 'feature']

# Rescale each experiment's importances so the column sums to 100.
for col in score_columns:
    importance_df[col] = importance_df[col]*100/importance_df[col].sum()

# Row-wise average across the experiments, then display ranked by the
# ten-fold training result.
importance_df['mean'] = importance_df[score_columns].mean(axis=1)
importance_df.sort_values('train_10_fold', ascending=False)

Unnamed: 0,feature,train,train_10_fold,combined,combined_10_fold,mean
9,sttl,14.372745,12.665923,7.763551,11.780516,11.645684
31,ct_state_ttl,20.692945,9.821032,11.229277,10.700906,13.11104
12,dload,3.155873,8.699663,7.728671,5.406754,6.24774
10,dttl,5.039761,7.191795,0.462771,4.033134,4.181865
8,rate,9.161695,4.893206,3.26911,4.192147,5.37904
11,sload,1.354376,4.354479,4.465606,5.067883,3.810586
27,dmean,3.712603,4.337488,3.355333,3.001092,3.601629
25,ackdat,0.753237,3.792346,3.859893,2.794008,2.799871
6,sbytes,3.29207,3.618872,5.533858,4.075151,4.129988
40,ct_srv_dst,4.902709,3.168857,2.634713,3.791126,3.624351


In [11]:
# Write the importance table to "importance.csv" in the working directory,
# without the row index.
importance_df.to_csv("importance.csv", index=False)