# Feature Importance 

## Setup: imports and helper functions

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
# import lightgbm as lgb
# from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn import preprocessing 
import pandas as pd
from sklearn import preprocessing 

In [2]:
# 1. Reading Train and test dataset.
# 2. Check if dataset is reversed.
# 3. Drop 'id', and 'attack_cat' columns.
def input_train_test(train_path='UNSW_NB15_training-set.csv',
                     test_path='UNSW_NB15_testing-set.csv'):
    """Load the two CSV splits, undo a swapped split and drop unused columns.

    The file read from ``train_path`` is expected to be the larger split. If
    it has fewer rows than the file read from ``test_path``, the two frames
    are exchanged. The 'id' and 'attack_cat' columns are then removed from
    BOTH frames, whether or not a swap was needed (previously they were only
    removed when the frames had been swapped).

    Parameters
    ----------
    train_path : str, optional
        CSV file to read as the training split.
    test_path : str, optional
        CSV file to read as the testing split.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` with 'id' and 'attack_cat' removed.

    Raises
    ------
    KeyError
        If 'id' or 'attack_cat' is missing from either file.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Compare row counts only; comparing the full shape tuples would fall
    # back to the column count whenever both files have the same row count.
    if len(train) < len(test):
        # Reversing the dataset
        train, test = test, train
        status = "Train and Test sets are reversed, Corrected Shape:"
    else:
        # Message kept verbatim: it is printed when no swap was necessary.
        status = "The dataset, is already reversed"

    # Dropping the columns -- done unconditionally so the returned frames
    # have the same schema on both paths.
    drop_columns = ["id", "attack_cat"]
    train = train.drop(columns=drop_columns)
    test = test.drop(columns=drop_columns)

    print("Dropped 'id', and 'attack_cat' columns")
    print(status)
    print("Train shape: ", train.shape)
    print("Test shape: ", test.shape)
    return train, test

In [3]:
# identifying object columns and appending them inside cat_cols[]
def get_cat_cols(train):
    """Return the names of the columns of ``train`` whose dtype is ``object``.

    The names are returned as a list, in the order in which they appear in
    ``train.columns``. An empty list is returned when no column qualifies.
    """
    return [name for name in train.columns if train[name].dtype == 'object']

In [4]:
# Label encode string values/categorical values.
def label_encode(train, test):
    """Integer-encode the object-dtype columns of ``train`` in both frames.

    For every column reported by ``get_cat_cols(train)`` a fresh
    ``LabelEncoder`` is fitted on the string-cast values of that column taken
    from ``train`` followed by ``test``. The column is then overwritten with
    its encoded values in both frames.

    Both frames are modified in place and are also returned as
    ``(train, test)``.
    """
    def as_str_list(frame, column):
        # String-cast values of one column as a plain Python list.
        return list(frame[column].astype(str).values)

    for column in get_cat_cols(train):
        encoder = preprocessing.LabelEncoder()
        # Fit on the union of both splits so every value seen in either
        # frame has a code.
        encoder.fit(as_str_list(train, column) + as_str_list(test, column))
        train[column] = encoder.transform(as_str_list(train, column))
        test[column] = encoder.transform(as_str_list(test, column))
    return train, test

In [5]:
def feature_engineer(df):
    """Collapse the 'state', 'service' and 'proto' columns into coarser buckets.

    * 'state'   -- any value outside FIN/INT/CON/REQ/RST becomes 'others'.
    * 'service' -- any value outside the kept list below becomes 'others'.
    * 'proto'   -- igmp/icmp/rtp are merged into 'igmp_icmp_rtp'; afterwards
      anything outside tcp/udp/arp/ospf/igmp_icmp_rtp becomes 'others'.

    ``df`` is modified in place and the same object is returned.
    """
    kept_states = ['FIN', 'INT', 'CON', 'REQ', 'RST']
    kept_services = ['-', 'dns', 'http', 'smtp', 'ftp-data', 'ftp', 'ssh', 'pop3']
    merged_protos = ['igmp', 'icmp', 'rtp']
    kept_protos = ['tcp', 'udp', 'arp', 'ospf', 'igmp_icmp_rtp']

    rare_state = ~df['state'].isin(kept_states)
    df.loc[rare_state, 'state'] = 'others'

    rare_service = ~df['service'].isin(kept_services)
    df.loc[rare_service, 'service'] = 'others'

    # Order matters: merge first, so the merged label is already present
    # when everything outside the kept list is swept into 'others'.
    to_merge = df['proto'].isin(merged_protos)
    df.loc[to_merge, 'proto'] = 'igmp_icmp_rtp'

    rare_proto = ~df['proto'].isin(kept_protos)
    df.loc[rare_proto, 'proto'] = 'others'
    return df

In [6]:
def get_train_test(train, test, label_encoding=False, scaler=None):
    """Split both frames into features/target and encode the features.

    Steps, in order:

    1. Separate the 'label' column (target) from the remaining columns.
    2. Apply ``feature_engineer`` to both feature frames.
    3. If ``scaler`` is given, fit it on the non-object columns of the
       training features and apply the fitted transform to both splits.
    4. Encode the object columns: with ``label_encode`` when
       ``label_encoding`` is true, otherwise with ``pd.get_dummies`` applied
       to each split separately, keeping only the columns present in both.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing a 'label' column plus the feature columns.
    label_encoding : bool, optional
        Use integer label encoding instead of one-hot encoding.
    scaler : object or None, optional
        Object exposing ``fit_transform`` / ``transform``; skipped if None.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``; both feature frames have the
        same columns in the same order.
    """
    x_train, y_train = train.drop(['label'], axis=1), train['label']
    x_test, y_test = test.drop(['label'], axis=1), test['label']
    x_train, x_test = feature_engineer(x_train), feature_engineer(x_test)

    cat_cols = get_cat_cols(x_train)
    non_cat_cols = [x for x in x_train.columns if x not in cat_cols]

    if scaler is not None:
        # Fit on the training split only, then reuse the fitted scaler.
        x_train[non_cat_cols] = scaler.fit_transform(x_train[non_cat_cols])
        x_test[non_cat_cols] = scaler.transform(x_test[non_cat_cols])

    if label_encoding:
        x_train, x_test = label_encode(x_train, x_test)
        features = x_train.columns
    else:
        x_train = pd.get_dummies(x_train)
        x_test = pd.get_dummies(x_test)
        train_columns, test_columns = set(x_train.columns), set(x_test.columns)
        print("Column mismatch {0}, {1}".format(train_columns - test_columns, test_columns - train_columns))
        # Keep the shared columns in x_train's order. Building the list from
        # a set intersection gave an arbitrary order that could change from
        # one interpreter run to the next, which made the column layout (and
        # anything seeded downstream) non-reproducible.
        features = [col for col in x_train.columns if col in test_columns]
    print(f"Number of features {len(features)}")
    x_train = x_train[features]
    x_test = x_test[features]

    return x_train, y_train, x_test, y_test

In [7]:
def show_feature_importance(importance, columns):
    """Tabulate feature importances as percentages, largest first.

    ``columns`` and ``importance`` are paired element-wise into a two-column
    frame ('Feature', 'Importance'). The importances are rescaled so that
    they sum to 100, and the rows are returned sorted by descending
    importance (the original positional index is kept).
    """
    pairs = zip(columns, importance)
    table = pd.DataFrame(pairs, columns=['Feature', 'Importance'])

    # Dividing by one hundredth of the total turns each share into a percent.
    one_percent = table['Importance'].sum() * 0.01
    table['Importance'] = table['Importance'] / one_percent

    return table.sort_values(by="Importance", ascending=False)

## Preparing the data 

In [8]:
# Read both CSV splits; the helper exchanges them when the file read as
# "train" turns out to be the smaller of the two.
train, test = input_train_test()

Dropped 'id', and 'attack_cat' columns
Train and Test sets are reversed, Corrected Shape:
Train shape:  (175341, 43)
Test shape:  (82332, 43)


In [9]:
# Names of the object-dtype columns of the loaded training frame.
# NOTE(review): `cat_cols` is not referenced again in the cells visible here
# (get_train_test recomputes it internally) -- confirm it is still needed.
cat_cols = get_cat_cols(train)

In [10]:
# Run configuration.
folds = 10
seed = 1
# NOTE(review): `num_round` and `kf` are not used in the cells visible here --
# confirm they are consumed further down the notebook.
num_round = 2000
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

# Build the model inputs: scaled non-object columns, label-encoded object columns.
X, Y, x_test, y_test = get_train_test(
    train,
    test,
    label_encoding=True,
    scaler=StandardScaler(),
)

# Accumulates importance vectors next to the feature names they refer to.
importance_dict = {"feature": X.columns}

Number of features 42


In [14]:
# Fit a random forest on the full training matrix and keep its per-feature
# importances. `seed` (defined in the configuration cell, value 1) replaces
# the previously hard-coded random_state=1 so the whole notebook is driven by
# a single seed; the fitted model is unchanged.
clf = RandomForestClassifier(random_state=seed)
clf.fit(X, Y)
feature_importance = clf.feature_importances_
importance_dict['train'] = feature_importance

Unnamed: 0,Feature,Importance
9,sttl,16.255094
31,ct_state_ttl,8.961909
10,dttl,7.311621
27,dmean,4.991935
12,dload,4.567464
8,rate,4.253581
40,ct_srv_dst,3.435446
11,sload,3.310175
23,tcprtt,3.202787
7,dbytes,3.18363


In [None]:
# Display the importances as percentages of the total, largest first.
show_feature_importance(feature_importance, X.columns)

Unnamed: 0,Feature,Importance
9,sttl,16.255094
31,ct_state_ttl,8.961909
10,dttl,7.311621
27,dmean,4.991935
12,dload,4.567464
8,rate,4.253581
40,ct_srv_dst,3.435446
11,sload,3.310175
23,tcprtt,3.202787
7,dbytes,3.18363
