In [1]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the three raw CICIDS2017 CSV captures used in this notebook
ddos_df = pd.read_csv('./datasets/cicids2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
portscan_df = pd.read_csv('./datasets/cicids2017/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv')
dos_hulk_df = pd.read_csv('./datasets/cicids2017/Wednesday-workingHours.pcap_ISCX.csv')

# Report (rows, columns) of each frame as a quick sanity check on the load
for title, frame in (("DDoS:", ddos_df), ("PortScan:", portscan_df), ("DoS Hulk:", dos_hulk_df)):
    print(title, frame.shape)

DDoS: (225745, 79)
PortScan: (286467, 79)
DoS Hulk: (692703, 79)


In [3]:
# Remove leading and trailing whitespace from column names on each frame *before*
# concatenating, so identically named columns line up instead of being split into
# separate, NaN-padded columns if the files differ in header whitespace
frames = [
    frame.rename(columns=lambda name: name.strip())
    for frame in (ddos_df, portscan_df, dos_hulk_df)
]

# Combine three files into a single DataFrame; ignore_index=True gives every row a
# unique label instead of repeating each file's own 0..n-1 index
df = pd.concat(frames, ignore_index=True)

# Keep only rows with desired labels
df = df[df['Label'].isin(['BENIGN', 'DoS Hulk', 'PortScan', 'DDoS'])]

print("Shape after merging:", df.shape)

Shape after merging: (1183316, 79)


In [4]:
# Flag every row that repeats an earlier one (all columns equal), then keep the rest
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat]

print("Shape after removing duplicates:", df.shape)

Shape after removing duplicates: (1019035, 79)


In [5]:
# Encode the label column using LabelEncoder and persist the name -> code mapping
label_col = 'Label'

# Guard against re-running this cell: encoding an already-encoded column would
# yield a meaningless {0: 0, 1: 1, ...} mapping and clobber the saved mapping file
if pd.api.types.is_numeric_dtype(df[label_col]):
    raise RuntimeError(
        f"Column '{label_col}' is already numeric; re-run the notebook from the "
        "data-loading cell before encoding labels again."
    )

label_encoder = LabelEncoder()
df[label_col] = label_encoder.fit_transform(df[label_col])

# Create a dictionary that maps original label names to encoded integers
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping: ", label_mapping)

# json cannot serialize NumPy integers, so convert the codes to plain Python ints
label_mapping_file = {k: int(v) for k, v in label_mapping.items()}

# Save label mapping for future work; create the output directory first so a
# fresh checkout does not fail with FileNotFoundError
os.makedirs("processed_data", exist_ok=True)
with open("processed_data/label_mapping.json", "w") as f:
    json.dump(label_mapping_file, f)

Label Mapping:  {'BENIGN': 0, 'DDoS': 1, 'DoS Hulk': 2, 'PortScan': 3}


In [6]:
# Count the +inf / -inf entries across the whole frame
inf_count = np.isinf(df).sum().sum()
print("Number of infinity values: ", inf_count)

# Turn infinite values (inf, -inf) into NaN so they count as missing values
df = df.replace([np.inf, -np.inf], np.nan)

Number of infinity values:  1237


In [7]:
# Count the missing cells before imputation
missing_before = df.isna().sum().sum()
print("Number of missing values: ", missing_before)

# Fill each column's NaN entries with that column's median
column_medians = df.median()
df = df.fillna(column_medians)

missing_after = df.isna().sum().sum()
print("Remaining missing values:", missing_after)

Number of missing values:  1316
Remaining missing values: 0


In [8]:
# Write the preprocessed frame to disk as CSV, omitting the row index
output_csv = "processed_data/preprocessed_dataset.csv"
df.to_csv(output_csv, index=False)
print("Preprocessed data saved to preprocessed_dataset.csv")

Preprocessed data saved to preprocessed_dataset.csv
