# Data Cleaning
The following subprocess are included in this program:
1. **Feature Selection**
2. **Data Cleaning**
3. **Normalization**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

### 1. Feature Selection

In [2]:
# Paths to the csv files
file_paths = {
    "portscan": "rawdata/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "DDOS": "rawdata/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "webattacks": "rawdata/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "infiltration": "rawdata/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv"
}

# These are the features selected for trainig the model out all the 79 columns
custom_features = [
    "Flow Duration", "Total Fwd Packets", "Total Backward Packets",
    "Fwd Packet Length Mean", "Bwd Packet Length Mean",
    "Flow Bytes/s", "Flow Packets/s", "Fwd IAT Total", "Bwd IAT Total",
    "SYN Flag Count", "FIN Flag Count", "RST Flag Count", "ACK Flag Count", 
    "PSH Flag Count", "URG Flag Count",
    "Average Packet Size", "Init_Win_bytes_forward", "Init_Win_bytes_backward",
    "Active Mean", "Idle Mean"
]

# The lables given in the file are strings thus we have to convert them to numeric values
custom_labels = {
    "BENIGN": 0,
    "PortScan": 1,
    "DDoS": 2,
    "Web Attack � Brute Force": 3,
    "Web Attack � XSS": 4,
    "Web Attack � Sql Injection": 5,
}

df_list = []

### 2. Data Cleaning

In [3]:
for attactype, file_path in file_paths.items():
    print(f"Processing {attactype} file...")
    df = pd.read_csv(file_path)

    # Striping the extra space in the column names
    df.columns = df.columns.str.strip()

    # Serching for the availabel features in the data set
    available_features = [i for i in custom_features if i in df.columns]
    df = df[available_features + ["Label"]]

    # Maping the numeric values to the labels
    df["Label"] = df["Label"].map(custom_labels)

    df.replace([np.inf, -np.inf], np.nan, inplace=True) #replace all +infinity, -infinity with NAN
    df.dropna(inplace=True) #dropping all the rows containin NAN values
    df.drop_duplicates(inplace=True) #dropping all the duplicate rows
    df.reset_index(drop=True, inplace=True)#reseting the index

    df_list.append(df)#appending the data frame to empty list
    
print("Completed initial processing.")

Processing portscan file...
Processing DDOS file...
Processing webattacks file...
Processing infiltration file...
Completed initial processing.


### 3. Normalization

In [4]:
# Merging all datasets
final_df = pd.concat(df_list, ignore_index=True)

# Normalize numeric values using minmaxscaler
scaler = MinMaxScaler()
final_df[custom_features] = scaler.fit_transform(final_df[custom_features])
final_df[custom_features] = np.clip(final_df[custom_features], 0, 1)
final_df["Label"] = final_df["Label"].astype(int)
final_df.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packet Length Mean,Bwd Packet Length Mean,Flow Bytes/s,Flow Packets/s,Fwd IAT Total,Bwd IAT Total,SYN Flag Count,...,RST Flag Count,ACK Flag Count,PSH Flag Count,URG Flag Count,Average Packet Size,Init_Win_bytes_forward,Init_Win_bytes_backward,Active Mean,Idle Mean,Label
0,0.01055296,0.000199,0.000163,0.015051,0.027247,0.111972,0.333345,0.010553,0.002647258,0.0,...,0.0,0.0,1.0,0.0,0.031802,0.445572,0.003723,0.0,0.0,0
1,0.01099472,0.000199,0.000163,0.015051,0.027247,0.111972,0.333344,0.010995,0.003028575,0.0,...,0.0,0.0,1.0,0.0,0.031802,0.445572,0.003723,0.0,0.0,0
2,1.441667e-06,0.0,4e-06,0.0,0.0,0.111969,0.335417,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.00444,0.003723,0.0,0.0,0
3,0.01086251,0.000199,0.000155,0.015412,0.027231,0.111972,0.333344,0.010862,0.002890425,0.0,...,0.0,0.0,1.0,0.0,0.031702,0.445572,0.003723,0.0,0.0,0
4,7.5e-07,0.0,7e-06,0.0,0.0,0.111969,0.339827,0.0,4.083333e-07,0.0,...,0.0,1.0,0.0,1.0,0.0,0.003723,0.00444,0.0,0.0,0


In [5]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 696723 entries, 0 to 696722
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Flow Duration            696723 non-null  float64
 1   Total Fwd Packets        696723 non-null  float64
 2   Total Backward Packets   696723 non-null  float64
 3   Fwd Packet Length Mean   696723 non-null  float64
 4   Bwd Packet Length Mean   696723 non-null  float64
 5   Flow Bytes/s             696723 non-null  float64
 6   Flow Packets/s           696723 non-null  float64
 7   Fwd IAT Total            696723 non-null  float64
 8   Bwd IAT Total            696723 non-null  float64
 9   SYN Flag Count           696723 non-null  float64
 10  FIN Flag Count           696723 non-null  float64
 11  RST Flag Count           696723 non-null  float64
 12  ACK Flag Count           696723 non-null  float64
 13  PSH Flag Count           696723 non-null  float64
 14  URG 

In [6]:
#converting the final data frame to csv file
final_df.to_csv("CleanedData.csv", index=False)

In [7]:
final_df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packet Length Mean,Bwd Packet Length Mean,Flow Bytes/s,Flow Packets/s,Fwd IAT Total,Bwd IAT Total,SYN Flag Count,...,RST Flag Count,ACK Flag Count,PSH Flag Count,URG Flag Count,Average Packet Size,Init_Win_bytes_forward,Init_Win_bytes_backward,Active Mean,Idle Mean,Label
0,1.055296e-02,0.000199,0.000163,0.015051,0.027247,0.111972,0.333345,1.055285e-02,2.647258e-03,0.0,...,0.0,0.0,1.0,0.0,0.031802,0.445572,0.003723,0.0,0.0,0
1,1.099472e-02,0.000199,0.000163,0.015051,0.027247,0.111972,0.333344,1.099461e-02,3.028575e-03,0.0,...,0.0,0.0,1.0,0.0,0.031802,0.445572,0.003723,0.0,0.0,0
2,1.441667e-06,0.000000,0.000004,0.000000,0.000000,0.111969,0.335417,0.000000e+00,0.000000e+00,0.0,...,0.0,1.0,0.0,1.0,0.000000,0.004440,0.003723,0.0,0.0,0
3,1.086251e-02,0.000199,0.000155,0.015412,0.027231,0.111972,0.333344,1.086240e-02,2.890425e-03,0.0,...,0.0,0.0,1.0,0.0,0.031702,0.445572,0.003723,0.0,0.0,0
4,7.500000e-07,0.000000,0.000007,0.000000,0.000000,0.111969,0.339827,0.000000e+00,4.083333e-07,0.0,...,0.0,1.0,0.0,1.0,0.000000,0.003723,0.004440,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696718,4.924525e-03,0.000005,0.000000,0.000000,0.000000,0.111969,0.333334,4.924417e-03,0.000000e+00,0.0,...,0.0,1.0,0.0,1.0,0.000000,0.003647,0.000000,0.0,0.0,0
696719,9.900008e-03,0.000005,0.000000,0.000000,0.000000,0.111969,0.333334,9.899900e-03,0.000000e+00,0.0,...,0.0,1.0,0.0,1.0,0.000000,0.003647,0.000000,0.0,0.0,0
696720,1.916667e-07,0.000000,0.000033,0.001390,0.001034,0.114543,0.500000,0.000000e+00,5.833333e-08,0.0,...,0.0,1.0,0.0,0.0,0.001855,0.000015,0.000015,0.0,0.0,0
696721,2.666667e-07,0.000045,0.000000,0.054898,0.000000,0.165594,0.421053,1.583333e-07,0.000000e+00,0.0,...,0.0,0.0,0.0,0.0,0.073272,0.000000,0.000000,0.0,0.0,0
