In [15]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression


In [34]:
import os

Dataset_Directory = '/home/jbenyam/threatdetection/dataset/csv' # Directory where the dataset is stored

# List all files in the directory and subdirectories
def get_all_csv_files(directory):
    csv_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                csv_files.append(os.path.relpath(os.path.join(root, file), start=directory))
    return csv_files

df_sets = get_all_csv_files(Dataset_Directory)
df_sets.sort() # Sort the files

print(f"Total CSV files found: {len(df_sets)}")
print("Files:", df_sets)

# Split the data set into training 80% and test 20%
train_set = df_sets[:int(len(df_sets)*0.8)]
test_set = df_sets[int(len(df_sets)*0.8):]

print(f"Number of training files: {len(train_set)}")
print(f"Number of test files: {len(test_set)}")

# If you need full paths for loading data later
# train_set_full_path = [os.path.join(Dataset_Directory, file) for file in train_set]
# test_set_full_path = [os.path.join(Dataset_Directory, file) for file in test_set]

#print("Train files full paths:", train_set_full_path)
#print("Test files full paths:", test_set_full_path)


Total CSV files found: 287
Files: ['Backdoor_Malware/Backdoor_Malware.pcap.csv', 'Benign_Final/BenignTraffic.pcap.csv', 'Benign_Final/BenignTraffic1.pcap.csv', 'Benign_Final/BenignTraffic2.pcap.csv', 'Benign_Final/BenignTraffic3.pcap.csv', 'BrowserHijacking/BrowserHijacking.pcap.csv', 'CommandInjection/CommandInjection.pcap.csv', 'DDoS-HTTP_Flood/DDoS-HTTP_Flood-.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood1.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood10.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood11.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood12.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood13.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood14.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood15.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood16.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood17.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood18.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood19.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood2.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Fl

In [31]:
print(train_set)
print(test_set)


['Backdoor_Malware/Backdoor_Malware.pcap.csv', 'Benign_Final/BenignTraffic.pcap.csv', 'Benign_Final/BenignTraffic1.pcap.csv', 'Benign_Final/BenignTraffic2.pcap.csv', 'Benign_Final/BenignTraffic3.pcap.csv', 'BrowserHijacking/BrowserHijacking.pcap.csv', 'CommandInjection/CommandInjection.pcap.csv', 'DDoS-HTTP_Flood/DDoS-HTTP_Flood-.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood1.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood10.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood11.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood12.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood13.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood14.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood15.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood16.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood17.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood18.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood19.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood2.pcap.csv', 'DDoS-ICMP_Flood/DDoS-ICMP_Flood20.pcap.csv', 'DDoS-ICMP_Flood/

In [35]:
# X_columns: list of columns to use as features
# Y_column: column to predict
# Model: model to use (logistic regression)

X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
    'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
    'ece_flag_number', 'cwr_flag_number', 'ack_count',
    'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
    'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
    'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight',
]
y_column = 'label'



#### Scaling Features & Fitting the Scaler

In [36]:
# Importing feature scaler to help normalize feature values
# The scales of the features are different, so we need to normalize them to have mean 0 and standard deviation of 1

from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = StandardScaler

# Fiting the scaler
for train_file in tqdm(train_set):
    # Construct the full path to the CSV file
    full_path = os.path.join(Dataset_Directory, train_file)
    
    # Read the CSV file
    df = pd.read_csv(full_path)
    
    # Check if the columns exist in the DataFrame
    if all(col in df.columns for col in X_columns):
        # Fit the scaler on the specified columns
        scaler.fit(df[X_columns])
        # Perform other operations as needed (e.g., transforming data)
    else:
        print(f"Warning: Not all specified columns are present in {train_file}")


  0%|          | 0/229 [00:00<?, ?it/s]



  1%|          | 2/229 [00:00<01:09,  3.27it/s]



  1%|▏         | 3/229 [00:01<01:23,  2.69it/s]



  2%|▏         | 5/229 [00:01<01:04,  3.50it/s]



  4%|▍         | 9/229 [00:01<00:32,  6.69it/s]



  4%|▍         | 10/229 [00:02<00:39,  5.55it/s]



  5%|▍         | 11/229 [00:02<00:43,  4.97it/s]



  5%|▌         | 12/229 [00:02<00:45,  4.72it/s]



  6%|▌         | 13/229 [00:02<00:47,  4.57it/s]



  6%|▌         | 14/229 [00:03<00:48,  4.44it/s]



  7%|▋         | 15/229 [00:03<00:49,  4.36it/s]



  7%|▋         | 16/229 [00:03<00:51,  4.15it/s]



  7%|▋         | 17/229 [00:03<00:51,  4.14it/s]



  8%|▊         | 18/229 [00:04<00:51,  4.12it/s]



  8%|▊         | 19/229 [00:04<00:51,  4.12it/s]



  9%|▊         | 20/229 [00:04<00:50,  4.10it/s]



  9%|▉         | 21/229 [00:04<00:51,  4.08it/s]



 10%|▉         | 22/229 [00:05<00:51,  4.05it/s]



 10%|█         | 23/229 [00:05<00:51,  4.03it/s]



 10%|█         | 24/229 [00:05<00:50,  4.04it/s]



 11%|█         | 25/229 [00:05<00:51,  3.96it/s]



 11%|█▏        | 26/229 [00:06<00:50,  4.05it/s]



 12%|█▏        | 27/229 [00:06<00:50,  4.02it/s]



 12%|█▏        | 28/229 [00:06<00:49,  4.04it/s]



 13%|█▎        | 29/229 [00:06<00:48,  4.11it/s]



 13%|█▎        | 30/229 [00:07<00:48,  4.10it/s]



 14%|█▎        | 31/229 [00:07<00:48,  4.06it/s]



 14%|█▍        | 32/229 [00:07<00:48,  4.02it/s]



 14%|█▍        | 33/229 [00:07<00:48,  4.04it/s]



 15%|█▍        | 34/229 [00:08<00:50,  3.88it/s]



 17%|█▋        | 39/229 [00:08<00:21,  8.78it/s]



 21%|██        | 47/229 [00:08<00:10, 18.13it/s]



 24%|██▍       | 55/229 [00:09<00:06, 26.16it/s]



 26%|██▌       | 59/229 [00:10<00:19,  8.82it/s]



 27%|██▋       | 62/229 [00:10<00:25,  6.49it/s]



 28%|██▊       | 64/229 [00:11<00:25,  6.39it/s]



 29%|██▉       | 66/229 [00:11<00:29,  5.55it/s]



 30%|██▉       | 68/229 [00:12<00:31,  5.06it/s]



 30%|███       | 69/229 [00:12<00:33,  4.80it/s]



 31%|███       | 70/229 [00:12<00:34,  4.62it/s]



 31%|███       | 71/229 [00:13<00:35,  4.47it/s]



 31%|███▏      | 72/229 [00:13<00:36,  4.30it/s]



 32%|███▏      | 73/229 [00:13<00:37,  4.17it/s]



 32%|███▏      | 74/229 [00:13<00:38,  4.07it/s]



 33%|███▎      | 75/229 [00:14<00:38,  4.02it/s]



 33%|███▎      | 76/229 [00:14<00:38,  4.01it/s]



 34%|███▎      | 77/229 [00:14<00:38,  3.96it/s]



 34%|███▍      | 78/229 [00:14<00:38,  3.94it/s]



 35%|███▍      | 80/229 [00:15<00:32,  4.62it/s]



 35%|███▌      | 81/229 [00:15<00:33,  4.39it/s]



 36%|███▌      | 82/229 [00:15<00:34,  4.28it/s]



 36%|███▌      | 83/229 [00:15<00:34,  4.21it/s]



 37%|███▋      | 84/229 [00:16<00:34,  4.20it/s]



 37%|███▋      | 85/229 [00:16<00:37,  3.80it/s]



 38%|███▊      | 86/229 [00:16<00:38,  3.69it/s]



 38%|███▊      | 87/229 [00:17<00:37,  3.79it/s]



 38%|███▊      | 88/229 [00:17<00:37,  3.78it/s]



 39%|███▉      | 89/229 [00:17<00:36,  3.83it/s]



 39%|███▉      | 90/229 [00:17<00:36,  3.85it/s]



 40%|███▉      | 91/229 [00:18<00:35,  3.85it/s]



 40%|████      | 92/229 [00:18<00:35,  3.84it/s]



 41%|████      | 93/229 [00:18<00:35,  3.80it/s]



 41%|████▏     | 95/229 [00:19<00:29,  4.53it/s]



 42%|████▏     | 96/229 [00:19<00:30,  4.36it/s]



 43%|████▎     | 98/229 [00:19<00:28,  4.54it/s]



 43%|████▎     | 99/229 [00:19<00:30,  4.33it/s]



 44%|████▎     | 100/229 [00:20<00:30,  4.21it/s]



 44%|████▍     | 101/229 [00:20<00:31,  4.08it/s]



 45%|████▍     | 102/229 [00:20<00:32,  3.91it/s]



 45%|████▍     | 103/229 [00:21<00:33,  3.73it/s]



 46%|████▌     | 105/229 [00:21<00:25,  4.77it/s]



 46%|████▋     | 106/229 [00:21<00:27,  4.51it/s]



 47%|████▋     | 107/229 [00:21<00:28,  4.31it/s]



 47%|████▋     | 108/229 [00:22<00:28,  4.18it/s]



 48%|████▊     | 110/229 [00:22<00:25,  4.72it/s]



 48%|████▊     | 111/229 [00:22<00:26,  4.43it/s]



 49%|████▉     | 112/229 [00:23<00:28,  4.13it/s]



 49%|████▉     | 113/229 [00:23<00:29,  3.88it/s]



 50%|████▉     | 114/229 [00:23<00:29,  3.91it/s]



 50%|█████     | 115/229 [00:23<00:29,  3.86it/s]



 51%|█████     | 116/229 [00:24<00:29,  3.81it/s]



 51%|█████     | 117/229 [00:24<00:30,  3.71it/s]



 52%|█████▏    | 118/229 [00:24<00:29,  3.76it/s]



 52%|█████▏    | 119/229 [00:24<00:29,  3.72it/s]



 52%|█████▏    | 120/229 [00:25<00:29,  3.72it/s]



 53%|█████▎    | 121/229 [00:25<00:29,  3.72it/s]



 53%|█████▎    | 122/229 [00:25<00:28,  3.80it/s]



 54%|█████▎    | 123/229 [00:25<00:27,  3.85it/s]



 54%|█████▍    | 124/229 [00:26<00:27,  3.86it/s]



 55%|█████▍    | 125/229 [00:26<00:26,  3.86it/s]



 55%|█████▌    | 126/229 [00:26<00:26,  3.88it/s]



 55%|█████▌    | 127/229 [00:27<00:26,  3.88it/s]



 56%|█████▋    | 129/229 [00:27<00:22,  4.54it/s]



 57%|█████▋    | 131/229 [00:27<00:21,  4.52it/s]



 58%|█████▊    | 132/229 [00:28<00:21,  4.52it/s]



 58%|█████▊    | 133/229 [00:28<00:23,  4.15it/s]



 59%|█████▊    | 134/229 [00:28<00:27,  3.43it/s]



 59%|█████▉    | 135/229 [00:29<00:27,  3.38it/s]



 59%|█████▉    | 136/229 [00:29<00:26,  3.47it/s]



 60%|█████▉    | 137/229 [00:29<00:30,  3.06it/s]



 60%|██████    | 138/229 [00:29<00:27,  3.29it/s]



 61%|██████    | 139/229 [00:30<00:26,  3.41it/s]



 61%|██████    | 140/229 [00:30<00:24,  3.57it/s]



 62%|██████▏   | 141/229 [00:30<00:23,  3.72it/s]



 62%|██████▏   | 142/229 [00:30<00:22,  3.84it/s]



 62%|██████▏   | 143/229 [00:31<00:22,  3.90it/s]



 63%|██████▎   | 144/229 [00:31<00:21,  3.92it/s]



 63%|██████▎   | 145/229 [00:31<00:21,  3.90it/s]



 64%|██████▍   | 146/229 [00:32<00:21,  3.84it/s]



 64%|██████▍   | 147/229 [00:32<00:21,  3.87it/s]



 65%|██████▍   | 148/229 [00:32<00:20,  3.86it/s]



 65%|██████▌   | 149/229 [00:32<00:20,  3.90it/s]



 66%|██████▌   | 151/229 [00:33<00:16,  4.70it/s]



 66%|██████▋   | 152/229 [00:33<00:17,  4.36it/s]



 67%|██████▋   | 153/229 [00:33<00:18,  4.13it/s]



 67%|██████▋   | 154/229 [00:33<00:18,  4.03it/s]



 68%|██████▊   | 155/229 [00:34<00:19,  3.75it/s]



 68%|██████▊   | 156/229 [00:34<00:19,  3.71it/s]



 69%|██████▉   | 158/229 [00:34<00:17,  4.09it/s]



 70%|███████   | 161/229 [00:35<00:08,  7.81it/s]



 71%|███████   | 163/229 [00:35<00:12,  5.40it/s]



 72%|███████▏  | 164/229 [00:35<00:12,  5.04it/s]



 72%|███████▏  | 165/229 [00:36<00:13,  4.68it/s]



 72%|███████▏  | 166/229 [00:36<00:14,  4.34it/s]



 73%|███████▎  | 167/229 [00:36<00:14,  4.19it/s]



 74%|███████▍  | 169/229 [00:37<00:13,  4.34it/s]



 74%|███████▍  | 170/229 [00:37<00:13,  4.23it/s]



 75%|███████▍  | 171/229 [00:37<00:13,  4.15it/s]



 76%|███████▌  | 173/229 [00:38<00:11,  4.79it/s]



 76%|███████▌  | 174/229 [00:38<00:12,  4.46it/s]



 76%|███████▋  | 175/229 [00:38<00:12,  4.29it/s]



 77%|███████▋  | 176/229 [00:38<00:12,  4.18it/s]



 77%|███████▋  | 177/229 [00:39<00:12,  4.14it/s]



 78%|███████▊  | 178/229 [00:39<00:12,  4.14it/s]



 79%|███████▊  | 180/229 [00:39<00:10,  4.79it/s]



 79%|███████▉  | 181/229 [00:39<00:09,  5.01it/s]



 79%|███████▉  | 182/229 [00:40<00:10,  4.38it/s]



 80%|███████▉  | 183/229 [00:40<00:12,  3.75it/s]



 80%|████████  | 184/229 [00:40<00:13,  3.38it/s]



 81%|████████  | 185/229 [00:41<00:13,  3.22it/s]



 81%|████████  | 186/229 [00:41<00:13,  3.17it/s]



 82%|████████▏ | 187/229 [00:41<00:13,  3.07it/s]



 83%|████████▎ | 189/229 [00:42<00:10,  3.98it/s]



 83%|████████▎ | 191/229 [00:42<00:07,  5.06it/s]



 84%|████████▍ | 193/229 [00:42<00:06,  5.44it/s]



 85%|████████▍ | 194/229 [00:43<00:07,  4.92it/s]



 85%|████████▌ | 195/229 [00:43<00:07,  4.39it/s]



 87%|████████▋ | 200/229 [00:43<00:03,  9.42it/s]



 91%|█████████▏| 209/229 [00:44<00:00, 20.17it/s]



 93%|█████████▎| 212/229 [00:44<00:01, 13.82it/s]



 95%|█████████▌| 218/229 [00:44<00:00, 18.23it/s]



 98%|█████████▊| 224/229 [00:44<00:00, 21.75it/s]



100%|██████████| 229/229 [00:45<00:00,  5.08it/s]






#### Model Classification

In [38]:
ML_models = [LogisticRegression(n_jobs=-1),] # List of ML models used for classification
ML_names = ['Logistic Regression',] # Model names for reference

# Training the models
for train_set in tqdm(train_set): # Loops through the training sets files
    d = pd.read_csv(Dataset_Directory + train_set) # Reads the dataset in Pandas
    d[X_columns] = scaler.transform(d[X_columns]) # Scales features in the dataset
    for model in (ML_models): 
        model.fit(d[X_columns], d[y_column]) # Trains the model on the scaled features and labels
    del d # Deletes the dataset to free up memory


  0%|          | 0/42 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/home/jbenyam/threatdetection/dataset/csvB'

#### Evaluating Model Performance

In [9]:
y_test = [] #  Initializes a list to store the true labels for the test sets.
preds = {i:[] for i in range(len(ML_models))} # Initializes a dictionary to store the predictions of the models for the test sets.
for test_set in tqdm(test_sets): # Loops through the test sets files
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set) # Reads the dataset in Pandas
    d_test[X_columns] = scaler.transform(d_test[X_columns]) # Scales features in the dataset
    
    y_test += list(d_test[y_column].values) # Appends the true labels to the y_test list
    
    for i in range(len(ML_models)): # Loops through the models
        model = ML_models[i] # Selects the model
        y_pred = list(model.predict(d_test[X_columns])) # Predicts the labels of the test set
        preds[i] = preds[i] + y_pred # Appends the predictions to the dictionary


0it [00:00, ?it/s]


#### Calculating Metrics

In [11]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
for k,v in preds.items(): # Loops through the predictions
    y_pred = v # Retrieves the predictions for the model
    print(f"##### {ML_names[k]} (34 classes) #####") # Prints the model name 
    print('accuracy_score: ', accuracy_score(y_pred, y_test)) # Prints the accuracy score
    print('recall_score: ', recall_score(y_pred, y_test, average='macro')) # Prints the recall score
    print('precision_score: ', precision_score(y_pred, y_test, average='macro')) # Prints the precision score
    print('f1_score: ', f1_score(y_pred, y_test, average='macro')) # Prints the F1 score
    print()
    print()
    print()


##### Logistic Regression (34 classes) #####
accuracy_score:  nan
recall_score:  nan
precision_score:  nan
f1_score:  nan





#### Mapping Classes (7 classes)

##### A dictionary to map the original class labels to simplified class labels for a 7-class classification.

In [12]:
dict_11classes = {} # Initializes a dictionary to store the predictions of the models for the test sets.
dict_11classes['DDoS-ACK_Fragmentation'] = 'DDoS'
dict_11classes['DDoS-HTTP_Flood'] = 'DDoS'
dict_11classes['DDoS-ICMP_Flood'] = 'DDoS'
dict_11classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
dict_11classes['DDoS-PSHACK_FLOOD'] = 'DDoS'
dict_11classes['DDoS-RSTFINFFLOOD'] = 'DDoS'
dict_11classes['DDoS-SlowLoris'] = 'DDoS'
dict_11classes['DDoS-SYN_Flood'] = 'DDoS'
dict_11classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
dict_11classes['DDoS-TCP_Flood'] = 'DDoS'
dict_11classes['DDoS-UDP_Flood'] = 'DDoS'


### Outputing Metrics from Classification Report

In [14]:
from sklearn.metrics import classification_report
for k,v in preds.items(): # Maps the true labels to the 11 classes
    y_pred = v
    y_test_simple = [dict_11classes[y] for y in y_test] # Maps the true labels to the 11 labels
    y_pred_simple = [dict_11classes[y] for y in y_pred] # Maps the predicted labels to the 11 labels
    print(f"##### {ML_names[k]} (11 classes) #####")
    print(classification_report(y_test_simple, y_pred_simple))
    print()
    print()
    print()


##### Logistic Regression (11 classes) #####


ValueError: max() arg is an empty sequence