In [1]:
import os

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    mean_squared_error,
    r2_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the raw CSV, then separate the prediction target from the feature columns.
df = pd.read_csv('dataset/UNSW_NB15_training-set.csv')

# Target column.
y = df['label']

# Features: everything except the target and two further columns.
# Presumably 'attack_cat' is a finer-grained form of the label and 'id' a row
# identifier; confirm against the dataset documentation.
X = df.drop(columns=['label', 'attack_cat', 'id'])

In [4]:
# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old spelling. Try the current keyword first and fall back
# to the legacy one so this cell runs on both old and new installations.
try:
    _one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    _one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Low-cardinality categoricals: fill gaps with the literal "missing", then
# one-hot encode into a dense array; categories unseen at fit time are ignored.
categorical_transformer_low = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoding", _one_hot_encoder),
    ]
)

# High-cardinality categoricals: same imputation, but one integer code per
# category instead of one column per category.
categorical_transformer_high = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        # 'OrdinalEncoder' raises a ValueError when it encounters an unknown value.
        # See https://github.com/scikit-learn/scikit-learn/pull/13423
        ("encoding", OrdinalEncoder()),
    ]
)

def get_card_split(df, cols, n=11):
    """
    Split categorical columns into two groups by cardinality (number of unique values).

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame from which the cardinality of the columns is calculated.
    cols : list-like
        Labels of the categorical columns to split. Any list-like is accepted
        (list, tuple, pandas Index, ...).
    n : int, optional (default=11)
        Cardinality threshold used to split the columns.

    Returns
    -------
    card_low : pandas.Index
        Columns with cardinality <= n
    card_high : pandas.Index
        Columns with cardinality > n
    """
    # Normalise to an Index so that boolean-mask selection below works for any
    # list-like input; a plain Python list cannot be indexed with a mask.
    cols = pd.Index(cols)
    # Positional boolean mask, in the same order as `cols`.
    is_high = (df[cols].nunique() > n).to_numpy(dtype=bool)
    card_high = cols[is_high]
    card_low = cols[~is_high]
    return card_low, card_high


In [5]:
def transformData(X):
    """
    Fit the preprocessing pipelines on ``X`` and return the transformed features.

    Columns are routed by dtype: numeric columns go to ``numeric_transformer``;
    object columns are divided by ``get_card_split`` (default threshold) between
    ``categorical_transformer_low`` and ``categorical_transformer_high``.
    Columns of any other dtype are not listed in any transformer, so the
    ColumnTransformer's default ``remainder`` setting applies to them.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table to preprocess.

    Returns
    -------
    The result of ``ColumnTransformer.fit_transform(X)``.
    """
    num_cols = X.select_dtypes(include=[np.number]).columns
    cat_cols = X.select_dtypes(include=["object"]).columns
    low_card_cols, high_card_cols = get_card_split(X, cat_cols)

    column_steps = [
        ("numeric", numeric_transformer, num_cols),
        ("categorical_low", categorical_transformer_low, low_card_cols),
        ("categorical_high", categorical_transformer_high, high_card_cols),
    ]
    preprocessor = ColumnTransformer(transformers=column_steps)

    # Fitting and transforming happen on the same frame in a single call.
    return preprocessor.fit_transform(X)

In [6]:
# Baseline: mean hold-out accuracy of a decision tree trained on all features,
# averaged over 10 different 90/10 train/test splits.
# NOTE(review): the preprocessing is fit on every row before the split, so the
# imputation/scaling statistics also see the test rows.
acc = 0
X_trans = transformData(X)
for i in range(10):
    seed = 314 * i
    X_train, X_test, y_train, y_test = train_test_split(
        X_trans, y, test_size=.1, random_state=seed
    )
    # The tree permutes features at every split, so an unseeded classifier gives
    # slightly different scores on each run. Seeding it makes the printed value
    # reproducible after Restart & Run All (it may differ marginally from
    # figures produced before the seed was added).
    clf = tree.DecisionTreeClassifier(random_state=seed)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc += accuracy_score(y_test, y_pred, normalize=True)
print(acc/10)


0.9677677920816128


In [7]:
# Leave-one-feature-out ablation: for every feature column, drop it, redo the
# preprocessing, and record the mean hold-out accuracy over the same 10 seeded
# splits used for the baseline. A lower accuracy means the dropped feature
# mattered more.
abl_anal = []
for colname in X.columns:
    # `drop` returns a new frame, so X itself is never modified.
    X_cp = X.drop(columns=[colname])
    X_trans = transformData(X_cp)

    acc = 0
    # A distinct loop variable avoids shadowing the outer iteration.
    for split in range(10):
        seed = 314 * split
        X_train, X_test, y_train, y_test = train_test_split(
            X_trans, y, test_size=.1, random_state=seed
        )
        # Seed the tree as well: it permutes features at every split, so an
        # unseeded classifier adds run-to-run noise to the comparison.
        clf = tree.DecisionTreeClassifier(random_state=seed)
        clf = clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc += accuracy_score(y_test, y_pred, normalize=True)
    acc = acc / 10
    abl_anal.append({'colname': colname, 'acc': acc})

In [8]:
# Collect the per-feature ablation records into a table and display it sorted
# by ascending accuracy (largest drop first). The sorted view is only
# displayed; `abl_df` itself keeps the original column order.
abl_df = pd.DataFrame.from_records(data=abl_anal)
abl_df.sort_values('acc', ascending=True)

Unnamed: 0,colname,acc
35,ct_dst_src_ltm,0.942215
40,ct_srv_dst,0.965254
6,sbytes,0.965946
30,ct_srv_src,0.966043
2,service,0.966116
39,ct_src_ltm,0.967148
9,sttl,0.967209
32,ct_dst_ltm,0.967416
7,dbytes,0.967671
25,ackdat,0.967756


In [9]:
# Persist the ablation table (unsorted, with the row index as the first column).
# The input data lives under 'dataset/', so nothing earlier guarantees that
# 'data/' exists; create it first so the long ablation run is not lost to an
# OSError at the very last step.
os.makedirs('data', exist_ok=True)
abl_df.to_csv('data/ablation.csv')