In [28]:
from data_loader import safe_agg, load_data, combine_datasets, prepare_dataset, allign_columns
import pandas as pd
import numpy as np

In [29]:
aggregations = {
    'syn_flag_counts':          safe_agg(pd.Series.sum), 
    'rst_flag_counts':          safe_agg(pd.Series.sum),
    'ack_flag_counts':          safe_agg(pd.Series.sum), 
    'packets_count':            safe_agg(pd.Series.sum),
    'fwd_packets_count':        safe_agg(pd.Series.sum),
    'bwd_packets_count':        safe_agg(pd.Series.sum),
    'dst_port':                 safe_agg(pd.Series.nunique), 
    'src_port':                 safe_agg(pd.Series.nunique),
    'src_ip':                   safe_agg(pd.Series.nunique),
    'dst_ip':                   safe_agg(pd.Series.nunique),

    'bytes_rate': safe_agg(pd.Series.mean), 
    'requests_rate': safe_agg(pd.Series.count),
    'psh_flag_counts': safe_agg(pd.Series.sum),

    'calculated_bwd_avg_segment_size': safe_agg(pd.Series.mean),
    'bwd_payload_bytes_mean':          safe_agg(pd.Series.sum),
    'bwd_init_win_bytes':             safe_agg(pd.Series.mean),
    'subflow_bwd_bytes':              safe_agg(pd.Series.sum),
    'bwd_total_payload_bytes':        safe_agg(pd.Series.sum),
    'fwd_payload_bytes_min':          safe_agg(pd.Series.min),
    'bwd_payload_bytes_max':          safe_agg(pd.Series.max),
    
    'packet_IAT_std':  safe_agg(pd.Series.std),
    'packet_IAT_min':  safe_agg(pd.Series.min),
    'packet_IAT_total': safe_agg(pd.Series.sum),
}

In [30]:
ddos =  load_data(f"./BCCC-CIC-IDS-2017/ddos_loit.csv", "malign")
benign = load_data(f"./BCCC-CIC-IDS-2017/friday_benign.csv", "benign")

combined = combine_datasets(ddos, benign)

combined = prepare_dataset(combined, aggregations, include_labels=True, filter_subnet=True)

In [31]:
set1 = prepare_dataset(ddos, aggregations, include_labels=True, filter_subnet=True)
set2 = prepare_dataset(benign, aggregations, include_labels=True, filter_subnet=True)

set1, set2 = allign_columns(set1, set2)

print(set1.shape)
print(set2.shape)

(1182, 34)
(28972, 34)


In [33]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

def score_model(anomaly_score, y):
    print("\nClassification Report:\n", classification_report(y, anomaly_score, labels=[1, -1], target_names=["Normal", "Anomaly"]))

def find_best_contamination_for_isolation_forest(X, y):
    best_f1, best_c = 0, None
    for c in np.arange(0.01, 0.2, 0.01):
        model = IsolationForest(contamination=c, random_state=42)
        anomaly_score = model.fit_predict(X)

        _, _, f1, _ = precision_recall_fscore_support(y, anomaly_score, average='binary')

        if f1 > best_f1:
            best_f1 = f1
            best_c = c
    
    return best_c, best_f1

X = combined
y = combined['label']

c, f1 = find_best_contamination_for_isolation_forest(X, y)
model = IsolationForest(contamination=c, random_state=42)
anomaly_score = model.fit_predict(X)
score_model(anomaly_score, y)


Classification Report:
               precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     27889
     Anomaly       0.88      0.94      0.91      1083

    accuracy                           0.99     28972
   macro avg       0.94      0.97      0.95     28972
weighted avg       0.99      0.99      0.99     28972

