In [16]:
import pandas as pd 
import numpy as np
from sklearn import tree 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    roc_auc_score,
    f1_score,
    r2_score,
    mean_squared_error,
    precision_score,
)

In [17]:
import warnings
warnings.filterwarnings('ignore')

In [18]:
df = pd.read_csv('dataset/UNSW_NB15_training-set.csv') 
X = df.drop(['label','attack_cat','id'], axis=1)
y= df['label']

In [19]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)

categorical_transformer_low = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoding", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)

categorical_transformer_high = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        # 'OrdianlEncoder' Raise a ValueError when encounters an unknown value. Check https://github.com/scikit-learn/scikit-learn/pull/13423
        ("encoding", OrdinalEncoder()),
    ]
)

def get_card_split(df, cols, n=11):
    """
    Splits categorical columns into 2 lists based on cardinality (i.e # of unique values)
    Parameters
    ----------
    df : Pandas DataFrame
        DataFrame from which the cardinality of the columns is calculated.
    cols : list-like
        Categorical columns to list
    n : int, optional (default=11)
        The value of 'n' will be used to split columns.
    Returns
    -------
    card_low : list-like
        Columns with cardinality < n
    card_high : list-like
        Columns with cardinality >= n
    """
    cond = df[cols].nunique() > n
    card_high = cols[cond]
    card_low = cols[~cond]
    return card_low, card_high


In [20]:
def transformData(X):
    numeric_features = X.select_dtypes(include=[np.number]).columns
    categorical_features = X.select_dtypes(include=["object"]).columns

    categorical_low, categorical_high = get_card_split(
        X, categorical_features
    )

    ct = ColumnTransformer(
        transformers=[
            ("numeric", numeric_transformer, numeric_features),
            ("categorical_low", categorical_transformer_low, categorical_low),
            ("categorical_high", categorical_transformer_high, categorical_high),
        ]
    ) 

    return ct.fit_transform(X)

In [21]:
acc = 0
X_trans = transformData(X)
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X_trans, y,test_size=.1, random_state=(314*i)) 
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train) 
    y_pred = clf.predict(X_test)
    acc += accuracy_score(y_test, y_pred, normalize=True)
print(acc/10)


0.9679985426281272


In [22]:
abl_anal = []
for i in range(X.shape[1]): 
    X_cp = X 
    colname = X_cp.columns[i]
    X_cp = X_cp.drop(X_cp.columns[i], axis=1)
    acc = 0
    acc1 = 0
    auc = 0
    prec = 0
    bas = 0
    r2s = 0
    mse = 0

    # print(f'original size:{X.shape[1]} dropped size:{X_cp.shape[1]}')
    X_trans = transformData(X_cp)
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(X_trans, y,test_size=.1, random_state=(314*i)) 
        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(X_train, y_train) 
        y_pred = clf.predict(X_test)
        acc += accuracy_score(y_test, y_pred, normalize=True)
        acc1 += f1_score(y_test, y_pred,)
        auc += roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        prec += precision_score(y_test, y_pred)
        bas += balanced_accuracy_score(y_test, y_pred)
        r2s += r2_score(y_test, y_pred)
        mse += mean_squared_error(y_test, y_pred)
    acc=acc/10
    acc1=acc/10
    auc=auc/10
    prec = prec/10
    bas = bas/10
    r2s = r2s/10
    mse = mse/10
    # print({'colname': colname, 'acc': acc})
    abl_anal.append({'colname': colname, 'acc': acc, 'f1 score': acc1, 'ROC auc: ': auc, 'Precision Score: ': prec, 'Balanced Accuracy Score: ': bas, 'R2 Score: ': r2s, 'mean_squared_error: ': mse})
# print(abl_anal)

In [23]:
abl_df = pd.DataFrame.from_records(abl_anal) 
abl_df.sort_values(by=['acc'])

Unnamed: 0,colname,acc,f1 score,ROC auc:,Precision Score:,Balanced Accuracy Score:,R2 Score:,mean_squared_error:
35,ct_dst_src_ltm,0.942191,0.094219,0.942157,0.948354,0.941703,0.766356,0.057809
40,ct_srv_dst,0.9654,0.09654,0.965456,0.968721,0.965053,0.860168,0.0346
2,service,0.966068,0.096607,0.965735,0.968863,0.965682,0.862858,0.033932
6,sbytes,0.966687,0.096669,0.966432,0.96991,0.966369,0.865357,0.033313
30,ct_srv_src,0.966821,0.096682,0.966817,0.970193,0.966512,0.865903,0.033179
39,ct_src_ltm,0.966893,0.096689,0.966811,0.970715,0.966643,0.866195,0.033107
7,dbytes,0.967501,0.09675,0.967211,0.970453,0.967157,0.868657,0.032499
32,ct_dst_ltm,0.967501,0.09675,0.967261,0.97054,0.967175,0.868645,0.032499
9,sttl,0.967525,0.096752,0.967238,0.970266,0.967174,0.868748,0.032475
26,smean,0.967658,0.096766,0.96737,0.970574,0.967317,0.869282,0.032342


In [24]:
abl_df.to_csv('data/ablation.csv')

In [25]:
from sklearn import ensemble

X_trans = transformData(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_trans, y, test_size=.1, random_state=(314))
clf_rf = ensemble.RandomForestClassifier()
clf_rf = clf_rf.fit(X_train, y_train) 




In [26]:
columns_list = []
for col in X.columns:
    columns_list.append(col) 
columns_list[32]

'ct_dst_ltm'

In [27]:
rf_importances = clf_rf.feature_importances_ 
importance_and_features = list(zip(columns_list, rf_importances)) 
importance_and_features.sort(key=lambda a: a[1], reverse=True) 
importance_and_features[0:14]

[('ct_dst_ltm', 0.08777335030644823),
 ('sbytes', 0.07554670986262815),
 ('trans_depth', 0.07106958384818324),
 ('state', 0.053518412814627395),
 ('tcprtt', 0.05049241092430272),
 ('rate', 0.045400695643371375),
 ('ct_state_ttl', 0.04127264749102655),
 ('ct_ftp_cmd', 0.04048627025768656),
 ('spkts', 0.035742176364791586),
 ('dpkts', 0.03377020191025366),
 ('dbytes', 0.03165500472586889),
 ('dur', 0.030434068197999806),
 ('dmean', 0.029322059916309588),
 ('sloss', 0.029225990581237215)]