In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.fft import fft
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix

pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.expand_frame_repr', False)  # Prevents column wrapping in Jupyter Notebook
pd.set_option('display.width', 1000)  # Adjusts the display width to fit more columns

In [101]:
def load_data(file_path, label):
    df = pd.read_csv(file_path)
    df['label'] = label
    df['requests_rate'] = 1
    df = df[(df['dst_ip'].str.startswith('192.168')) & (df['protocol'] == 'TCP')]
    return df

def load_and_prepare_sets(benign_file_name, malign_file_name, aggregations, include_labels=False):
    benign_data = load_data(f'./BCCC-CIC-IDS-2017/{benign_file_name}', 'benign')
    ddos_data = load_data(f'./BCCC-CIC-IDS-2017/{malign_file_name}', 'malign')

    benign_data['datetime'] = pd.to_datetime(benign_data['timestamp'], format='mixed')
    benign_data.set_index('datetime', inplace=True)
    benign_data.sort_index(inplace=True)

    ddos_data['datetime'] = pd.to_datetime(ddos_data['timestamp'])
    ddos_data.set_index('datetime', inplace=True)
    ddos_data.sort_index(inplace=True)

    bening_subnet = benign_data[(benign_data['dst_ip'].str.startswith('192.168')) & (benign_data['protocol'] == 'TCP')]
    ddos_subnet = ddos_data[ddos_data['dst_ip'].str.startswith('192.168')]

    if include_labels:
        aggregations['label'] = lambda x: 0 if x.eq('benign').all() else 1
        
    bening_subnet_agg = bening_subnet.resample('1s').agg(aggregations).rename(columns={
        'dst_port': 'unique_dst_ports',
        'src_ip': 'unique_src_ips'
    })

    ddos_subnet_agg = ddos_subnet.resample('1s').agg(aggregations).rename(columns={
        'dst_port': 'unique_dst_ports',
        'src_ip': 'unique_src_ips'
    })

    if include_labels:
        aggregations.pop('label', None)

    combined_data = pd.concat([ddos_subnet, bening_subnet], ignore_index=True)
    combined_data['datetime'] = pd.to_datetime(combined_data['timestamp'], format='mixed')
    combined_data.set_index('datetime', inplace=True)
    combined_data.sort_index(inplace=True)

    aggregations['label'] = lambda x: -1 if x.eq('benign').all() else 1
    combined_data_agg = combined_data.resample('1s').agg(aggregations).rename(columns={
        'dst_port': 'unique_dst_ports',
        'src_ip': 'unique_src_ips'
    })
    aggregations.pop('label', None)

    features = list(bening_subnet_agg.keys())
    return bening_subnet_agg, ddos_subnet_agg, combined_data_agg, features

def score_model(y_pred, y):
    print("\nClassification Report:\n", classification_report(y, y_pred, target_names=["Normal", "Anomaly"]))

In [102]:
aggregations = {
    'syn_flag_counts': 'sum',         # Total SYN packets in 1s
    'rst_flag_counts': 'sum',         # Total RST packets in 1s
    'ack_flag_counts': 'sum',         # Total ACK packets in 1s
    #'duration': lambda x: x.mean() if not x.empty else 0,               # Avg duration of connections
    'packets_count': 'sum',           # Total packets per second
    'fwd_packets_count': 'sum',       # Total forward packets
    'bwd_packets_count': 'sum',       # Total backward packets
    'dst_port': 'nunique',            # Unique destination ports per second (Port Scan)
    'src_ip': 'nunique',              # Unique source IPs per second
    'bytes_rate': lambda x: x.mean() if not x.empty else 0,             # Avg bytes per second
    'requests_rate': 'count',          # Number of requests per aggregation period (1s by default)
}

bening_subnet_agg, ddos_subnet_agg, combined_data_agg, features = load_and_prepare_sets("friday_benign.csv", "ddos_loit.csv", aggregations)

In [103]:
from sklearn.metrics import precision_recall_fscore_support

def find_best_contamination_for_isolation_forest(X, y):
    best_f1, best_c = 0, None
    for c in np.arange(0.01, 0.2, 0.01):
        model = IsolationForest(contamination=c, random_state=42)
        anomaly_score = model.fit_predict(X)

        _, _, f1, _ = precision_recall_fscore_support(np.where(y == 0, 1, -1), anomaly_score, average='binary')

        if f1 > best_f1:
            best_f1 = f1
            best_c = c
    
    return best_c, best_f1

In [112]:
from sklearn.ensemble import IsolationForest

X = combined_data_agg[features]
y = combined_data_agg['label'].replace({'benign': -1, 'malign': 1})

#c, f1 = find_best_contamination_for_isolation_forest(X, y)


model = IsolationForest(contamination=0.05, random_state=42)
anomaly_score = model.fit_predict(X)
score_model(anomaly_score, y)

print(f"F1: {f1}")


Classification Report:
               precision    recall  f1-score   support

      Normal       0.30      0.02      0.03     27889
     Anomaly       0.00      0.06      0.00      1083

    accuracy                           0.02     28972
   macro avg       0.15      0.04      0.02     28972
weighted avg       0.29      0.02      0.03     28972

F1: 0


In [63]:
import pandas as pd
import numpy as np
import ipaddress

df_train = load_data("./BCCC-CIC-IDS-2017/monday_benign.csv", 'benign')
df_benign = load_data("./BCCC-CIC-IDS-2017/friday_benign.csv", 'benign')
df_ddos = load_data("./BCCC-CIC-IDS-2017/ddos_loit.csv", 'malign')

def categorize_ip(ip):
    try:
        ip_obj = ipaddress.ip_address(ip)
        if ip_obj.is_private:
            return "subnet"
        else:
            return "internet"
    except ValueError:  # Handle invalid IPs
        return "unknown"

def categorize_port(port):
    if port in [80, 443]:
        return "web"
    elif port == 53:
        return "dns"
    elif port in [25, 110, 143]:
        return "mail"
    elif port in [22, 3389]:
        return "remote_access"
    elif port in [3306, 5432, 1433]:
        return "database"
    else:
        return "other"
    
def prepare_data(df):
    df['src_ip_category'] = df['src_ip'].apply(categorize_ip)
    df['dst_ip_category'] = df['dst_ip'].apply(categorize_ip)

    df['src_port_category'] = df['src_port'].apply(categorize_port)
    df['dst_port_category'] = df['dst_port'].apply(categorize_port)

    ### Step 3: One-Hot Encode Protocol, IP Categories, and Port Categories
    df = pd.get_dummies(df, columns=['protocol', 'src_ip_category', 'dst_ip_category', 
                                    'src_port_category', 'dst_port_category'], drop_first=True, dtype=int)

    ### Step 4: Drop Original Categorical Columns
    df.drop(columns=['src_ip', 'dst_ip', 'src_port', 'dst_port'], inplace=True)

    df['datetime'] = pd.to_datetime(df['timestamp'], format='mixed')
    df.set_index('datetime', inplace=True)
    df.sort_index(inplace=True)

    df.replace({'benign': -1, 'malign': 1}, inplace=True)

    return df.drop(columns=['flow_id', 'timestamp'])


print(df_benign.shape)

df = prepare_data(pd.concat([df_benign, df_ddos]))
df_train = prepare_data(df_train)

(52904, 123)


  df.replace({'benign': -1, 'malign': 1}, inplace=True)
  df.replace({'benign': -1, 'malign': 1}, inplace=True)


In [64]:
print(df_benign.shape)
print(df_ddos.shape)
print(df.shape)

(52904, 123)
(94535, 123)
(147439, 124)


In [65]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

test = df.drop(columns=['label'])

def calculate_vif(test):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = test.columns

    vif = []
    print(f"Total: {range(test.shape[1])}")
    for i in range(test.shape[1]):
        vif.append(variance_inflation_factor(test.values, i))
        print(i)

    vif_data["VIF"] = vif
    return vif_data

vif_df = calculate_vif(test)
print(vif_df)

Total: range(0, 123)
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72


  return 1 - self.ssr/self.centered_tss


73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113


  return 1 - self.ssr/self.centered_tss


114
115
116
117
118
119
120
121
122
                             Feature           VIF
0                           duration  6.793976e+05
1                      packets_count  7.402062e+10
2                  fwd_packets_count  4.688100e+10
3                  bwd_packets_count  1.154173e+10
4                total_payload_bytes  7.510881e+10
..                               ...           ...
118            src_port_category_web  3.134922e+04
119            dst_port_category_dns  1.503103e+00
120          dst_port_category_other  1.681033e+04
121  dst_port_category_remote_access  5.294180e+02
122            dst_port_category_web  1.722585e+04

[123 rows x 2 columns]


In [66]:
vif_df

Unnamed: 0,Feature,VIF
0,duration,6.793976e+05
1,packets_count,7.402062e+10
2,fwd_packets_count,4.688100e+10
3,bwd_packets_count,1.154173e+10
4,total_payload_bytes,7.510881e+10
...,...,...
118,src_port_category_web,3.134922e+04
119,dst_port_category_dns,1.503103e+00
120,dst_port_category_other,1.681033e+04
121,dst_port_category_remote_access,5.294180e+02


In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Split features and target variable
X = df.drop(columns=['label'])  # Replace 'target' with your actual target column
y = df['label']

# Standardize the features for models like KNN & Lasso
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training & test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [68]:
print(f"There are {X_train.shape[1]} features")

# Apply LassoCV to select numerical features
lasso = LassoCV(cv=5, random_state=41).fit(X_train, y_train)
selector = SelectFromModel(lasso, prefit=True, threshold="mean")

# Transform dataset with selected features
X_train_lasso = selector.transform(X_train)
X_test_lasso = selector.transform(X_test)

print(f"Selected {X_train_lasso.shape[1]} numerical features using LassoCV.")

# Train KNN with selected numerical features
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_lasso, y_train)

# Evaluate KNN model
y_pred_knn = knn.predict(X_test_lasso)
knn_accuracy = accuracy_score(y_test, y_pred_knn)

print(np.allclose(y_pred_knn, y_test))

print(f"KNN Accuracy after Lasso feature selection: {knn_accuracy:.4f}")

There are 123 features


  model = cd_fast.enet_coordinate_descent(


Selected 3 numerical features using LassoCV.
False
KNN Accuracy after Lasso feature selection: 0.9998


datetime
2017-07-07 07:59:50.315195    1
2017-07-07 07:59:50.316273    1
2017-07-07 08:00:35.337052    1
2017-07-07 08:00:35.338671    1
2017-07-07 08:00:35.342358    1
                             ..
2017-07-07 16:02:38.575222    1
2017-07-07 16:02:39.565876    1
2017-07-07 16:02:39.567285    1
2017-07-07 16:02:40.805791    1
2017-07-07 16:02:41.005391    1
Name: requests_rate, Length: 147439, dtype: int64

In [69]:
# Get boolean mask of selected features
selected_mask = selector.get_support()

# Get feature names from original dataframe
selected_features = np.array(df.drop(columns=['label']).columns)[selected_mask]

print("Selected Features for KNN:", selected_features)

Selected Features for KNN: ['dst_port_category_other' 'dst_port_category_remote_access'
 'dst_port_category_web']


In [70]:

vif_df = calculate_vif(test[selected_features])
print(vif_df)

Total: range(0, 3)
0
1
2
                           Feature  VIF
0          dst_port_category_other  1.0
1  dst_port_category_remote_access  1.0
2            dst_port_category_web  1.0


In [80]:
score_model(y_pred_knn, y_test)


Classification Report:
               precision    recall  f1-score   support

      Normal       1.00      1.00      1.00     10522
     Anomaly       1.00      1.00      1.00     18966

    accuracy                           1.00     29488
   macro avg       1.00      1.00      1.00     29488
weighted avg       1.00      1.00      1.00     29488



In [73]:
import pandas as pd

agg_funcs = {
    # 'duration': lambda x: x.max() if not x.empty else 0,
    # 'packets_count': lambda x: x.sum() if not x.empty else 0,
    # 'fwd_packets_count': lambda x: x.sum() if not x.empty else 0,
    # 'fwd_total_payload_bytes': lambda x: x.sum() if not x.empty else 0,
    # 'payload_bytes_mean': lambda x: x.mean() if not x.empty else 0,
    # 'payload_bytes_std': lambda x: x.mean() if not x.empty else 0,
    # 'fwd_payload_bytes_mean': lambda x: x.mean() if not x.empty else 0,
    # 'fwd_payload_bytes_std': lambda x: x.mean() if not x.empty else 0,
    # 'fwd_payload_bytes_variance': lambda x: x.mean() if not x.empty else 0,
    # 'bwd_payload_bytes_mean': lambda x: x.mean() if not x.empty else 0,
    # 'bwd_payload_bytes_std': lambda x: x.mean() if not x.empty else 0,
    # 'bwd_payload_bytes_variance': lambda x: x.mean() if not x.empty else 0,
    # 'max_header_bytes': lambda x: x.mean() if not x.empty else 0,
    # 'min_header_bytes': lambda x: x.mean() if not x.empty else 0,
    # 'fwd_min_header_bytes': lambda x: x.mean() if not x.empty else 0,
    # 'fwd_std_header_bytes': lambda x: x.mean() if not x.empty else 0,
    # 'bwd_mean_header_bytes': lambda x: x.mean() if not x.empty else 0,
    # 'bwd_std_header_bytes': lambda x: x.mean() if not x.empty else 0,
    # 'fwd_avg_segment_size': lambda x: x.mean() if not x.empty else 0,
    # 'bwd_avg_segment_size': lambda x: x.mean() if not x.empty else 0,
    # 'avg_segment_size': lambda x: x.mean() if not x.empty else 0,
    # 'fwd_init_win_bytes': lambda x: x.mean() if not x.empty else 0,
    # 'bwd_init_win_bytes': lambda x: x.mean() if not x.empty else 0,
    # 'active_min': lambda x: x.max() if not x.empty else 0,
    # 'active_max': lambda x: x.max() if not x.empty else 0,
    # 'idle_min': lambda x: x.max() if not x.empty else 0,
    # 'idle_std': lambda x: x.mean() if not x.empty else 0,
    # 'bwd_bytes_rate': lambda x: x.sum() if not x.empty else 0,
    # 'bwd_packets_rate': lambda x: x.sum() if not x.empty else 0,
    # 'fwd_packets_rate': lambda x: x.sum() if not x.empty else 0,
    # 'down_up_rate': lambda x: x.mean() if not x.empty else 0,
    # 'avg_fwd_bytes_per_bulk': lambda x: x.mean() if not x.empty else 0,
    # 'avg_fwd_packets_per_bulk': lambda x: x.mean() if not x.empty else 0,
    # 'avg_fwd_bulk_rate': lambda x: x.mean() if not x.empty else 0,
    # 'avg_bwd_packets_bulk_rate': lambda x: x.mean() if not x.empty else 0,
    # 'avg_bwd_bulk_rate': lambda x: x.mean() if not x.empty else 0,
    # 'fwd_bulk_state_count': lambda x: x.sum() if not x.empty else 0,
    # 'fwd_bulk_total_size': lambda x: x.sum() if not x.empty else 0,
    # 'bwd_bulk_state_count': lambda x: x.sum() if not x.empty else 0,
    # 'bwd_bulk_total_size': lambda x: x.sum() if not x.empty else 0,
    # 'ack_flag_counts': lambda x: x.sum() if not x.empty else 0,
    # 'rst_flag_counts': lambda x: x.sum() if not x.empty else 0,
    # 'fwd_fin_flag_counts': lambda x: x.sum() if not x.empty else 0,
    # 'fwd_psh_flag_counts': lambda x: x.sum() if not x.empty else 0,
    # 'fwd_syn_flag_counts': lambda x: x.sum() if not x.empty else 0,
    # 'fwd_ack_flag_counts': lambda x: x.sum() if not x.empty else 0,
    # 'fwd_rst_flag_counts': lambda x: x.sum() if not x.empty else 0,
    # 'bwd_fin_flag_counts': lambda x: x.sum() if not x.empty else 0,
    # 'bwd_ack_flag_counts': lambda x: x.sum() if not x.empty else 0,
    # 'fwd_packets_IAT_std': lambda x: x.mean() if not x.empty else 0,
    # 'fwd_packets_IAT_max': lambda x: x.max() if not x.empty else 0,
    # 'bwd_packets_IAT_mean': lambda x: x.mean() if not x.empty else 0,
    # 'bwd_packets_IAT_max': lambda x: x.max() if not x.empty else 0,
    # 'protocol_UDP': lambda x: x.mode().iloc[0] if not x.empty else 0,
    # 'src_ip_category_subnet': lambda x: x.mode().iloc[0] if not x.empty else 0,
    # 'dst_ip_category_subnet': lambda x: x.mode().iloc[0] if not x.empty else 0,
    # 'src_port_category_dns': lambda x: x.mode().iloc[0] if not x.empty else 0,
    # 'src_port_category_other': lambda x: x.mode().iloc[0] if not x.empty else 0,
    # 'src_port_category_remote_access': lambda x: x.mode().iloc[0] if not x.empty else 0,
    # 'src_port_category_web': lambda x: x.mode().iloc[0] if not x.empty else 0,
    # 'dst_port_category_dns': lambda x: x.mode().iloc[0] if not x.empty else 0,
    # 'dst_port_category_other': lambda x: x.mode().iloc[0] if not x.empty else 0,
    # 'dst_port_category_remote_access': lambda x: x.mode().iloc[0] if not x.empty else 0,
    # 'dst_port_category_web': lambda x: x.mode().iloc[0] if not x.empty else 0,
    'syn_flag_counts': 'sum',         # Total SYN packets in 1s
    'rst_flag_counts': 'sum',         # Total RST packets in 1s
    'ack_flag_counts': 'sum',         # Total ACK packets in 1s
    #'duration': lambda x: x.mean() if not x.empty else 0,               # Avg duration of connections
    'packets_count': 'sum',           # Total packets per second
    'fwd_packets_count': 'sum',       # Total forward packets
    'bytes_rate': lambda x: x.mean() if not x.empty else 0,             # Avg bytes per second
    'requests_rate': 'count',          # Number of requests per aggregation period (1s by default)
    'label': lambda x: x.max() if not x.empty else -1,
}

# Resample and aggregate
df_resampled = df.resample('1s').agg(agg_funcs)
train_resampled = df_train.resample('1s').agg(agg_funcs)

In [74]:
df_resampled

Unnamed: 0_level_0,syn_flag_counts,rst_flag_counts,ack_flag_counts,packets_count,fwd_packets_count,bytes_rate,requests_rate,label
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-07-07 07:59:50,0,2,687,687,453,96.412162,2,-1
2017-07-07 07:59:51,0,0,0,0,0,0.000000,0,-1
2017-07-07 07:59:52,0,0,0,0,0,0.000000,0,-1
2017-07-07 07:59:53,0,0,0,0,0,0.000000,0,-1
2017-07-07 07:59:54,0,0,0,0,0,0.000000,0,-1
...,...,...,...,...,...,...,...,...
2017-07-07 16:02:37,0,0,1,1,0,0.000000,1,-1
2017-07-07 16:02:38,8,0,111,115,58,10961.071985,2,-1
2017-07-07 16:02:39,4,0,86,88,42,3779.565881,2,-1
2017-07-07 16:02:40,0,0,3,3,0,0.000000,1,-1


In [75]:
model = IsolationForest(contamination=0.01, random_state=42)
y_pred = model.fit_predict(df_resampled.drop(columns=['label']))
score_model(y_pred, df_resampled['label'])



Classification Report:
               precision    recall  f1-score   support

      Normal       0.26      0.00      0.01     27889
     Anomaly       0.03      0.81      0.06      1083

    accuracy                           0.03     28972
   macro avg       0.15      0.40      0.03     28972
weighted avg       0.25      0.03      0.01     28972

