# Feature Importance

## (1) Importing libraries

In [2]:
# Data manipulation and visualization
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Scikit-learn libraries.
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Gradient boosting framework(tree based learning algorithms).
import lightgbm as lgb
# For creating progress meters.
from tqdm import tqdm_notebook as tqdm

# Misc imports
import warnings
import os
warnings.simplefilter(action='ignore', category=FutureWarning)


## (2) Defining Functions

In [3]:
def get_filenames(path):
    """Walk a directory tree and print every file whose name contains 'csv'.

    Args:
        path (str): root directory to walk recursively.
    """
    for dirname, _, filenames in os.walk(path):
        # Keep only names containing the substring 'csv', in the order os.walk yields them.
        csv_paths = [os.path.join(dirname, name) for name in filenames if 'csv' in name]
        for csv_path in csv_paths:
            print(csv_path)
                
                
def get_train_test(root="../Dataset/"):
    """Load the UNSW-NB15 training and testing CSV files and tidy them up.

    (1) Reads 'UNSW_NB15_training-set.csv' and 'UNSW_NB15_testing-set.csv' from `root`.
    (2) Swaps the two frames when the file named "training" has fewer rows
        than the file named "testing".
    (3) Drops the 'attack_cat' and 'id' columns from both frames when present.

    Args:
        root (str, optional): directory holding the two CSV files. A trailing
            separator is not required. Defaults to "../Dataset/".

    Returns:
       train, test: dataframes
    """
    train = pd.read_csv(os.path.join(root, 'UNSW_NB15_training-set.csv'))
    test = pd.read_csv(os.path.join(root, 'UNSW_NB15_testing-set.csv'))

    cols_to_drop = ['attack_cat', 'id']

    # Compare row counts explicitly; comparing the .shape tuples would fall back
    # to the column count whenever both files have the same number of rows.
    if len(train) < len(test):
        print("Training and testing sets are reveresed. Correcting..")
        train, test = test, train
        print(f"✅ Corrected training shape:\t {train.shape}\n✅ Corrected testing shape:\t {test.shape}\n")

    cleaned = []
    for df in (train, test):
        present = [col for col in cols_to_drop if col in df.columns]
        for col in present:
            print(f"❌ Dropped:\t {col}")
        # Non-mutating drop: returns a new frame instead of editing in place.
        cleaned.append(df.drop(columns=present))
    train, test = cleaned
    return train, test

def get_categorical_columns(train):
    """Collect the names of all columns whose dtype is 'object'.

    Args:
        train (dataframe): dataframe to inspect.

    Returns:
        list: column names, in dataframe order, whose dtype equals 'object'.
    """
    return [name for name in train.columns if train[name].dtype == 'object']


def label_encode(train, test):
    """Replace every object-dtype column (as found in `train`) with integer codes.

    One LabelEncoder per column is fitted on the string values of both frames
    together, so any value appearing in either frame receives a code.
    Both frames are modified in place and also returned.

    Args:
        train (dataframe): training inputs; decides which columns are encoded.
        test (dataframe): testing inputs; must contain the same columns.

    Returns:
        train, test: label encoded dataframes
    """
    for column in get_categorical_columns(train):
        # Convert once and reuse the string values for both fit and transform.
        train_values = list(train[column].astype(str).values)
        test_values = list(test[column].astype(str).values)

        encoder = LabelEncoder()
        encoder.fit(train_values + test_values)

        train[column] = encoder.transform(train_values)
        test[column] = encoder.transform(test_values)
    return train, test

def feature_engineer(df):
    """Collapse rare values of 'state', 'service' and 'proto' into shared buckets.

    The dataframe is modified in place and returned.

    Args:
        df (dataframe): frame with 'state', 'service' and 'proto' columns.

    Returns:
        dataframe: the same frame with the three columns regrouped.
    """
    kept_states = ['FIN', 'INT', 'CON', 'REQ', 'RST']
    kept_services = ['-', 'dns', 'http', 'smtp', 'ftp-data', 'ftp', 'ssh', 'pop3']
    merged_protos = ['igmp', 'icmp', 'rtp']
    kept_protos = ['tcp', 'udp','arp', 'ospf', 'igmp_icmp_rtp']

    # Any state outside the allow-list becomes 'others'.
    rare_state = ~df['state'].isin(kept_states)
    df.loc[rare_state, 'state'] = 'others'

    # Any service outside the allow-list becomes 'others'.
    rare_service = ~df['service'].isin(kept_services)
    df.loc[rare_service, 'service'] = 'others'

    # Fold 'igmp', 'icmp' and 'rtp' into one protocol first, so the merged
    # name survives the allow-list check that follows.
    to_merge = df['proto'].isin(merged_protos)
    df.loc[to_merge, 'proto'] = 'igmp_icmp_rtp'

    rare_proto = ~df['proto'].isin(kept_protos)
    df.loc[rare_proto, 'proto'] = 'others'
    return df


def get_input_output(train, test, label_encoding=False, scaler=None):
    """Split both frames into inputs and targets, then preprocess the inputs.

    The 'label' column is separated as the target, feature_engineer is applied
    to both input frames, the non-object columns are optionally scaled, and the
    object columns are either label encoded or one-hot encoded.

    Args:
        train (dataframe): training data containing a 'label' column.
        test (dataframe): testing data containing a 'label' column.
        label_encoding (bool, optional): if True apply label_encode, otherwise
            one-hot encode with pd.get_dummies. Defaults to False.
        scaler (object, optional): scaler exposing fit_transform/transform
            (e.g. StandardScaler()). It is fitted on the training inputs and
            then applied to the test inputs. Defaults to None (no scaling).

    Returns:
        x_train, y_train, x_test, y_test: preprocessed inputs and their targets,
        in that order. Both input frames share the same columns in the same order.
    """
    x_train, y_train = train.drop(['label'], axis=1), train['label']
    x_test, y_test = test.drop(['label'], axis=1), test['label']

    x_train, x_test = feature_engineer(x_train), feature_engineer(x_test)

    # Object-dtype columns of the training inputs are treated as categorical;
    # everything else is numeric and eligible for scaling.
    categorical_columns = get_categorical_columns(x_train)
    non_categorical_columns = [col for col in x_train.columns if col not in categorical_columns]

    if scaler is not None:
        x_train[non_categorical_columns] = scaler.fit_transform(x_train[non_categorical_columns])
        x_test[non_categorical_columns] = scaler.transform(x_test[non_categorical_columns])

    if label_encoding:
        x_train, x_test = label_encode(x_train, x_test)
        features = x_train.columns
    else:
        x_train = pd.get_dummies(x_train)
        x_test = pd.get_dummies(x_test)
        # Keep only dummy columns present in both frames, in training-column
        # order. A plain set intersection would give an arbitrary column order
        # that can change between interpreter runs.
        shared_columns = set(x_test.columns)
        features = [col for col in x_train.columns if col in shared_columns]

    print(f"Number of features {len(features)}")
    x_train = x_train[features]
    x_test = x_test[features]

    return x_train, y_train, x_test, y_test


def display_feature_importance(importance, columns):
    """Build a table of feature importances expressed as percentages.

    Args:
        importance (iterable of float): one importance value per feature.
        columns (iterable of str): feature names, paired positionally with `importance`.

    Returns:
        dataframe: 'Feature' and 'Importance' columns, with importances rescaled
        to sum to 100 and rows sorted from most to least important.
    """
    table = pd.DataFrame(list(zip(columns, importance)), columns=['Feature', 'Importance'])
    # Dividing by one percent of the total turns raw scores into percentages.
    one_percent = table['Importance'].sum()*0.01
    table['Importance'] = table['Importance'] / one_percent
    return table.sort_values(by="Importance", ascending=False)

## (3) Data preparation

In [4]:
# Load both CSV splits (swapped when the "training" file is the smaller one),
# with the 'attack_cat' and 'id' columns removed.
train, test = get_train_test()
# Names of the object-dtype columns in the training frame.
categorical_columns = get_categorical_columns(train)

Training and testing sets are reveresed. Correcting..
✅ Corrected training shape:	 (175341, 45)
✅ Corrected testing shape:	 (82332, 45)

❌ Dropped:	 attack_cat
❌ Dropped:	 id
❌ Dropped:	 attack_cat
❌ Dropped:	 id


In [5]:
# Cross-validation settings shared by the cells below.
folds, seed = 10, 1
kf = StratifiedKFold(n_splits=folds, random_state=seed, shuffle=True)

# Label-encode the categorical columns and standardise the numeric ones.
X, Y, x_test, y_test = get_input_output(train, test, label_encoding=True, scaler=StandardScaler())

# Collects one importance array per experiment, keyed by experiment name.
importance_dict = {"feature": X.columns}

Number of features 42


In [6]:
# Display the dictionary; at this point it only holds the feature names.
importance_dict

{'feature': Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
        'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
        'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
        'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
        'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
        'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
        'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
        'ct_srv_dst', 'is_sm_ips_ports'],
       dtype='object')}

## (4) Training Data (using Random Forest Classifier)

In [7]:
# Fit one forest on the full training set. The shared `seed` is used instead of a
# literal so every model in the notebook is controlled by the same setting.
clf = RandomForestClassifier(random_state=seed)
clf.fit(X, Y)
feature_importance = clf.feature_importances_
importance_dict['train'] = feature_importance

### Ten-fold Cross Validation

In [8]:
feature_importances = []

# Fit one forest per fold on that fold's training part and collect its importances.
# The validation indices are not needed because no predictions are scored here.
for tr_idx, _val_idx in tqdm(kf.split(X, Y), total=folds):
    # kf.split yields positional indices, so select rows with .iloc on both objects.
    x_train, y_train = X.iloc[tr_idx], Y.iloc[tr_idx]
    # Seeded so that re-running the notebook reproduces the same importances.
    clf = RandomForestClassifier(random_state=seed)
    clf.fit(x_train, y_train)

    feature_importances.append(clf.feature_importances_)

# Average the per-fold importances feature by feature.
feature_importance = np.mean(feature_importances, axis=0)
importance_dict['train_10_fold'] = feature_importance
importance_dict

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for tr_idx, val_idx in tqdm(kf.split(X, Y), total=folds):


  0%|          | 0/10 [00:00<?, ?it/s]

{'feature': Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
        'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
        'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
        'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
        'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
        'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
        'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
        'ct_srv_dst', 'is_sm_ips_ports'],
       dtype='object'),
 'train': array([2.86258647e-02, 1.38794819e-02, 4.56995035e-03, 1.85914358e-02,
        9.04958442e-03, 2.32241003e-02, 3.06772470e-02, 3.18363016e-02,
        4.25358099e-02, 1.62550945e-01, 7.31162119e-02, 3.31017499e-02,
        4.56746370e-02, 7.71580994e-03, 5.81202549e-03, 2.87987003e-02,
        2.28465204e-02, 2.02777156e-02, 1.47097081e-02, 5.69893653e-03,
        7.60287536e-03, 7.77447642e-03, 7.57

## (5) Testing and Training Data (using Random Forest Classifier)

In [9]:
# Stack training and testing rows into one dataset. ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating the row labels of both frames.
x_total = pd.concat([X, x_test], ignore_index=True)
y_total = pd.concat([Y, y_test], ignore_index=True)

In [10]:
# Fit one forest on the combined training + testing data.
# Seeded so that re-running the notebook reproduces the same importances.
clf = RandomForestClassifier(random_state=seed)
clf.fit(x_total, y_total)
feature_importance = clf.feature_importances_
importance_dict['combined'] = feature_importance
importance_dict

{'feature': Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
        'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
        'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
        'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
        'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
        'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
        'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
        'ct_srv_dst', 'is_sm_ips_ports'],
       dtype='object'),
 'train': array([2.86258647e-02, 1.38794819e-02, 4.56995035e-03, 1.85914358e-02,
        9.04958442e-03, 2.32241003e-02, 3.06772470e-02, 3.18363016e-02,
        4.25358099e-02, 1.62550945e-01, 7.31162119e-02, 3.31017499e-02,
        4.56746370e-02, 7.71580994e-03, 5.81202549e-03, 2.87987003e-02,
        2.28465204e-02, 2.02777156e-02, 1.47097081e-02, 5.69893653e-03,
        7.60287536e-03, 7.77447642e-03, 7.57

### Ten-fold Cross Validation

In [11]:
feature_importances = []

# Fit one forest per fold on the combined data and collect its importances.
# The validation indices are not needed because no predictions are scored here.
for tr_idx, _val_idx in tqdm(kf.split(x_total, y_total), total=folds):
    x_train, y_train = x_total.iloc[tr_idx], y_total.iloc[tr_idx]
    # Seeded so that re-running the notebook reproduces the same importances.
    clf = RandomForestClassifier(random_state=seed)
    clf.fit(x_train, y_train)

    feature_importances.append(clf.feature_importances_)

# Average the per-fold importances feature by feature.
feature_importance = np.mean(feature_importances, axis=0)
importance_dict['combined_10_fold'] = feature_importance
importance_dict

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for tr_idx, val_idx in tqdm(kf.split(x_total, y_total), total=folds):


  0%|          | 0/10 [00:00<?, ?it/s]

{'feature': Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
        'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
        'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
        'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
        'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
        'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
        'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
        'ct_srv_dst', 'is_sm_ips_ports'],
       dtype='object'),
 'train': array([2.86258647e-02, 1.38794819e-02, 4.56995035e-03, 1.85914358e-02,
        9.04958442e-03, 2.32241003e-02, 3.06772470e-02, 3.18363016e-02,
        4.25358099e-02, 1.62550945e-01, 7.31162119e-02, 3.31017499e-02,
        4.56746370e-02, 7.71580994e-03, 5.81202549e-03, 2.87987003e-02,
        2.28465204e-02, 2.02777156e-02, 1.47097081e-02, 5.69893653e-03,
        7.60287536e-03, 7.77447642e-03, 7.57

In [12]:
importance_df = pd.DataFrame(importance_dict)

# Every column except the feature names holds raw importances; rescale each to percent.
score_columns = [name for name in importance_df.columns if name != 'feature']
for name in score_columns:
    raw_scores = importance_df[name]
    importance_df[name] = raw_scores*100/raw_scores.sum()

# Row-wise average across the experiments, then rank by the 10-fold training result.
importance_df['mean'] = importance_df[score_columns].mean(axis=1)
importance_df = importance_df.sort_values('train_10_fold', ascending=False)
importance_df

Unnamed: 0,feature,train,train_10_fold,combined,combined_10_fold,mean
9,sttl,16.255094,16.898919,16.283185,11.433526,15.217681
31,ct_state_ttl,8.961909,11.689666,7.486178,11.26959,9.851836
12,dload,4.567464,6.57954,3.938003,5.517444,5.150613
10,dttl,7.311621,4.989468,4.812165,4.316437,5.357423
8,rate,4.253581,4.978529,4.456069,4.473814,4.540498
27,dmean,4.991935,4.257137,2.797483,2.470074,3.629157
11,sload,3.310175,3.711741,5.334926,4.99517,4.338003
23,tcprtt,3.202787,3.202635,2.662689,2.823354,2.972866
6,sbytes,3.067725,2.957175,4.166208,4.279369,3.617619
25,ackdat,1.805762,2.834778,1.650304,2.537491,2.207084


## (6) Saving the file

In [13]:
# Write the table to CSV without the index column, then display it.
importance_df.to_csv("feature_importance.csv", index=False)
importance_df

Unnamed: 0,feature,train,train_10_fold,combined,combined_10_fold,mean
9,sttl,16.255094,16.898919,16.283185,11.433526,15.217681
31,ct_state_ttl,8.961909,11.689666,7.486178,11.26959,9.851836
12,dload,4.567464,6.57954,3.938003,5.517444,5.150613
10,dttl,7.311621,4.989468,4.812165,4.316437,5.357423
8,rate,4.253581,4.978529,4.456069,4.473814,4.540498
27,dmean,4.991935,4.257137,2.797483,2.470074,3.629157
11,sload,3.310175,3.711741,5.334926,4.99517,4.338003
23,tcprtt,3.202787,3.202635,2.662689,2.823354,2.972866
6,sbytes,3.067725,2.957175,4.166208,4.279369,3.617619
25,ackdat,1.805762,2.834778,1.650304,2.537491,2.207084


In [14]:
# Reload the exported table and keep seven decimal places.
importance_df_round = pd.read_csv("feature_importance.csv", index_col=False).round(7)
importance_df_round

Unnamed: 0,feature,train,train_10_fold,combined,combined_10_fold,mean
0,sttl,16.255094,16.898919,16.283185,11.433526,15.217681
1,ct_state_ttl,8.961908,11.689666,7.486178,11.26959,9.851835
2,dload,4.567464,6.57954,3.938003,5.517444,5.150613
3,dttl,7.311621,4.989468,4.812165,4.316437,5.357423
4,rate,4.253581,4.978529,4.456069,4.473814,4.540498
5,dmean,4.991935,4.257137,2.797483,2.470074,3.629157
6,sload,3.310175,3.711741,5.334926,4.995171,4.338003
7,tcprtt,3.202787,3.202635,2.662689,2.823354,2.972866
8,sbytes,3.067725,2.957175,4.166208,4.279369,3.617619
9,ackdat,1.805762,2.834778,1.650304,2.537491,2.207084


In [15]:
# Save the rounded table to its own CSV, without the index column.
importance_df_round.to_csv("feature_importance_rounded.csv", index=False)