In [73]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

In [74]:
df = pd.read_csv("UNSW_NB15_training-set.csv")

In [75]:
df.describe()

Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
count,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,...,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0
mean,87671.0,1.359389,20.298664,18.969591,8844.844,14928.92,95406.19,179.546997,79.609567,73454030.0,...,5.383538,4.206255,8.729881,0.014948,0.014948,0.133066,6.955789,9.100758,0.015752,0.680622
std,50616.731112,6.480249,136.887597,110.258271,174765.6,143654.2,165401.0,102.940011,110.506863,188357400.0,...,8.047104,5.783585,10.956186,0.126048,0.126048,0.701208,8.321493,10.756952,0.124516,0.466237
min,1.0,0.0,1.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,43836.0,8e-06,2.0,0.0,114.0,0.0,32.78614,62.0,0.0,13053.34,...,1.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0
50%,87671.0,0.001582,2.0,2.0,430.0,164.0,3225.807,254.0,29.0,879674.8,...,1.0,1.0,3.0,0.0,0.0,0.0,3.0,4.0,0.0,1.0
75%,131506.0,0.668069,12.0,10.0,1418.0,1102.0,125000.0,254.0,252.0,88888890.0,...,5.0,3.0,12.0,0.0,0.0,0.0,9.0,12.0,0.0,1.0
max,175341.0,59.999989,9616.0,10974.0,12965230.0,14655550.0,1000000.0,255.0,254.0,5988000000.0,...,51.0,46.0,65.0,4.0,4.0,30.0,60.0,62.0,1.0,1.0


In [76]:
def preprocess(df: pd.DataFrame):
    # drop doops
    # drop nulls
    # labelencode
    # normalize
    df = df.copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.dropna(inplace=True)
    
    X, Y = df.drop('label', axis=1), df['label']
    lc = LabelEncoder()
    sc = StandardScaler()

    for col in X.columns:
        if X[col].dtype == object:
            X[col] = lc.fit_transform(X[col])

    df = pd.DataFrame(sc.fit_transform(X), columns=X.columns, index=X.index)
    df['label'] = Y
    return df

In [77]:
def feature_selection(df, threshold):
    # remove features that have multi collinearity above thold.
    df = df.copy(deep=True)

    corr_matrix = df.drop('label', axis=1).corr().abs()

    upr_triangle = np.triu(corr_matrix, k=1)

    return [col for i, col in enumerate(corr_matrix.columns) if any(upr_triangle[i, :] > threshold)]

In [78]:
def sampling(df):
    df = df.copy(deep=True)
    X, Y = df.drop('label', axis=1), df['label']

    smote = SMOTE(random_state=42, k_neighbors=6)
    X, Y = smote.fit_resample(X, Y)

    df = pd.DataFrame(X, columns=X.columns, index=X.index)
    df['label'] = Y

    return df

In [79]:
def pca_transform(df, pca):
    df = df.copy(deep=True)

    X, Y = df.drop('label', axis=1), df['label']

    trX = pca.transform(X)

    df = pd.DataFrame(trX, columns=[f'PC{i+1}' for i in range(trX.shape[1])], index=X.index)
    df['label'] = Y
    return df

In [80]:
df = preprocess(df)

In [81]:
df

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,-1.732041,-0.191029,0.151809,-0.702307,-0.409218,-0.104456,-0.135769,-0.049134,-0.102726,-0.576371,...,-0.554373,-0.705529,-0.118590,-0.118590,-0.189768,-0.715714,-0.753074,-0.126508,0.848024,0
1,-1.732021,-0.109485,0.151809,-0.702307,-0.409218,-0.046014,0.172599,-0.046410,0.188544,-0.576345,...,-0.554373,-0.614256,-0.118590,-0.118590,-0.189768,-0.715714,-0.288257,-0.126508,0.848024,0
2,-1.732001,0.040699,0.151809,-0.702307,-0.409218,-0.089845,-0.026933,-0.048527,-0.012133,-0.576734,...,-0.554373,-0.522983,-0.118590,-0.118590,-0.189768,-0.595543,-0.288257,-0.126508,0.848024,0
3,-1.731982,0.049729,0.151809,0.599130,-0.409218,-0.060624,-0.063212,-0.047016,-0.098563,-0.576737,...,-0.554373,-0.522983,7.814915,7.814915,-0.189768,-0.595543,-0.753074,-0.126508,0.848024,0
4,-1.731962,-0.140417,0.151809,-0.702307,-0.409218,-0.075235,-0.117630,-0.047554,-0.102057,-0.576617,...,-0.554373,2.854115,-0.118590,-0.118590,-0.189768,-0.595543,2.779535,-0.126508,0.848024,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,1.731962,-0.209773,0.420235,0.165317,0.742936,-0.133677,-0.172047,-0.049958,-0.103923,0.094951,...,1.520470,1.393748,-0.118590,-0.118590,-0.189768,2.048221,1.385084,-0.126508,0.210364,1
175337,1.731982,-0.131728,0.151809,-0.702307,-0.409218,-0.075235,-0.099490,-0.047062,-0.101459,-0.576616,...,-0.554373,-0.614256,-0.118590,-0.118590,-0.189768,-0.715714,-0.753074,-0.126508,2.123345,1
175338,1.732001,-0.209773,0.420235,0.165317,0.742936,-0.133677,-0.172047,-0.049958,-0.103923,0.094951,...,-0.208566,0.389746,-0.118590,-0.118590,-0.189768,-0.475371,0.269523,-0.126508,0.210364,1
175339,1.732021,-0.209773,0.420235,0.165317,0.742936,-0.133677,-0.172047,-0.049958,-0.103923,0.094951,...,1.693374,1.941386,-0.118590,-0.118590,-0.189768,2.769248,1.942865,-0.126508,0.210364,1


In [82]:
high_corr_features = feature_selection(df, 0.8)

In [83]:
high_corr_features

['spkts',
 'dpkts',
 'sbytes',
 'dbytes',
 'dttl',
 'sinpkt',
 'swin',
 'tcprtt',
 'ct_srv_src',
 'ct_dst_ltm',
 'ct_src_dport_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'is_ftp_login']

In [84]:
df.drop(columns=high_corr_features, inplace=True)

In [85]:
df = sampling(df)

In [86]:
df

Unnamed: 0,id,dur,proto,service,state,rate,sttl,sload,dload,sloss,...,trans_depth,response_body_len,ct_state_ttl,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,-1.732041,-0.191029,0.151809,-0.702307,-0.409218,-0.576371,0.703839,-0.389897,-0.273700,-0.075040,...,-0.136415,-0.039557,-1.366486,-0.118590,-0.189768,-0.715714,-0.753074,-0.126508,0.848024,0
1,-1.732021,-0.109485,0.151809,-0.702307,-0.409218,-0.576345,-1.141901,-0.389928,-0.069233,-0.044739,...,-0.136415,-0.039557,-0.318711,-0.118590,-0.189768,-0.715714,-0.288257,-0.126508,0.848024,0
2,-1.732001,0.040699,0.151809,-0.702307,-0.409218,-0.576734,-1.141901,-0.389964,-0.252044,-0.059890,...,-0.136415,-0.039557,-0.318711,-0.118590,-0.189768,-0.595543,-0.288257,-0.126508,0.848024,0
3,-1.731982,0.049729,0.151809,0.599130,-0.409218,-0.576737,-1.141901,-0.389958,-0.275821,-0.059890,...,-0.136415,-0.039557,-0.318711,7.814915,-0.189768,-0.595543,-0.753074,-0.126508,0.848024,0
4,-1.731962,-0.140417,0.151809,-0.702307,-0.409218,-0.576617,0.723268,-0.389927,-0.275561,-0.044739,...,-0.136415,-0.039557,-0.318711,-0.118590,-0.189768,-0.595543,2.779535,-0.126508,0.848024,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238677,-1.260766,0.749012,0.151809,-0.702307,-0.409218,-0.576665,-1.443048,-0.389945,-0.226154,0.031013,...,-0.136415,-0.039557,-1.366486,-0.118590,-0.189768,-0.549885,-0.506255,-0.126508,0.848024,0
238678,-0.861817,-0.128238,0.151809,-0.702307,-0.409218,-0.576642,0.723268,-0.389932,-0.275763,-0.044739,...,-0.136415,-0.039557,-0.318711,-0.118590,-0.189768,-0.715714,-0.284188,-0.126508,0.848024,0
238679,0.277653,-0.175018,0.151809,-0.702307,-2.713526,-0.576631,-1.141901,-0.389817,-0.276576,-0.044739,...,-0.136415,-0.039557,1.776839,-0.118590,-0.189768,-0.259692,-0.103546,-0.126508,0.848024,0
238680,-1.094044,0.032858,0.151809,0.599130,-0.409218,-0.576345,-1.443048,-0.389881,-0.268151,0.091614,...,-0.136415,-0.039557,-1.366486,7.814915,-0.189768,-0.235029,-0.693022,-0.126508,0.848024,0


In [88]:
pca = PCA(n_components=6).fit(df.drop('label', axis=1))
df = pca_transform(df, pca)

In [89]:
df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,label
0,1.063921,-0.341721,-0.151694,-0.171319,-0.957681,-0.620338,0
1,2.764198,-1.227245,0.063388,0.530284,1.151130,-0.435360,0
2,3.000743,0.808575,0.712509,0.826814,0.854572,-0.434460,0
3,1.671491,-0.000399,0.468493,-0.005991,-1.147238,1.315119,0
4,0.603319,1.068508,-0.833377,-0.158404,-0.214104,-1.474197,0
...,...,...,...,...,...,...,...
238677,3.837874,-0.114789,0.977723,1.593083,1.456441,0.356893,0
238678,1.459664,1.905376,-0.601915,-1.018789,-0.018435,-1.618371,0
238679,1.093596,1.930366,-0.408889,-1.341430,-0.879407,-1.175609,0
238680,1.566409,-0.538057,0.114896,-0.256579,-1.379304,1.344228,0


In [90]:
from sklearn.model_selection import train_test_split

In [91]:
X, Y = df.drop('label', axis=1), df['label']

In [92]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)

In [93]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report

In [95]:
sample_model = LogisticRegression()
sample_model.fit(x_train, y_train)
sample_model_preds = sample_model.predict(x_test)

print(classification_report(y_test, sample_model_preds))

              precision    recall  f1-score   support

           0       0.92      0.87      0.90     23839
           1       0.88      0.93      0.90     23898

    accuracy                           0.90     47737
   macro avg       0.90      0.90      0.90     47737
weighted avg       0.90      0.90      0.90     47737



In [96]:
models = [LogisticRegression(), KNeighborsClassifier(), SVC(), DecisionTreeClassifier(), RandomForestClassifier(), GradientBoostingClassifier(), AdaBoostClassifier()]

In [99]:
for model in models:
    print(type(model).__name__)
    score = cross_val_score(model, x_train, y_train, cv=2, scoring='accuracy').mean()
    print(score)

LogisticRegression
0.8989028322749906
KNeighborsClassifier
0.9825970818046419
SVC
0.9542381334274608
DecisionTreeClassifier
0.9732226568623492
RandomForestClassifier
0.9842834338552177
GradientBoostingClassifier
0.9506978436113043
AdaBoostClassifier
0.920003147667086


In [None]:
# X = df.drop('label', axis=1)
# y = df['label']

# Define base models
model1 = LogisticRegression()
model2 = KNeighborsClassifier()
model3 = SVC()
model4 = DecisionTreeClassifier()
model5 = RandomForestClassifier()
model6 = AdaBoostClassifier()
model7 = GradientBoostingClassifier()

# Define ensemble model
ensemble = VotingClassifier(estimators=[('lr', model1), ('knn', model2), ('svc', model3), ('dt', model4), ('rf', model5), ('ab', model6), ('gb', model7)], voting='hard')

# Define hyperparameters to tune
params = {
    'lr__C': [1], 
    'rf__n_estimators': [10], 
    'svc__C': [1], 
    'knn__n_neighbors': [3], 
    'dt__max_depth': [None],
    'ab__n_estimators': [5],
    'gb__n_estimators': [5],
    'gb__learning_rate': [0.1]
}

# Perform grid search
grid = GridSearchCV(estimator=ensemble, param_grid=params, cv=5)
grid.fit(x_test, y_test)

# Print best parameters
print(grid.best_params_)

In [102]:
chosen_model_1 = KNeighborsClassifier(n_neighbors=3)
chosen_model_2 = RandomForestClassifier(n_estimators=3)
chosen_model_3 = DecisionTreeClassifier(max_depth=None)
chosen_model_4 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=5)

# Define ensemble model
start_time = time.time()

ensemble = VotingClassifier(estimators=[('knn', chosen_model_1), ('rf', chosen_model_2), ('dt', chosen_model_3), ('gb', chosen_model_4)], voting='hard')
ensemble.fit(x_train, y_train)
ensemble_preds = ensemble.predict(x_test)

end_time = time.time()

print(f"\nExecution took {end_time - start_time} seconds.")
print("\nEnsemble Model Performance:")
print(classification_report(y_test, ensemble_preds))


Execution took 5.896521806716919 seconds.

Ensemble Model Performance:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     23839
           1       0.99      0.98      0.99     23898

    accuracy                           0.99     47737
   macro avg       0.99      0.99      0.99     47737
weighted avg       0.99      0.99      0.99     47737

