In [45]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [11]:
def load_dataset():
    ds = []
    for root, dirs, files in os.walk('CIC dataset/'):
        for file in files:            
            ds.append(pd.read_csv(os.path.join(root,file)))
    # for i, data in enumerate(ds, start=1):
    #     rows, cols = data.shape
    #     print(f'ds{i} -> {rows} rows, {cols} columns')
    return ds

In [13]:
def preprocess_data(datas):    
    #merging data
    data = pd.concat(datas, axis = 0, ignore_index = True)
    print(f"Before duplicate removal{data.shape}")

    #remove leading or trailing whitespace from col names
    col_names = {col: col.strip() for col in data.columns}
    data.rename(columns = col_names, inplace = True)
    
    #duplicate rows removal
    data = data.drop_duplicates(keep = 'first')
    
    #duplicate columns removal
    columns = data.columns
    identical_columns =[]
    list_control = columns.copy().tolist()
    for col1 in columns:
        for col2 in columns:
            if(col1!=col2):
                if(data[col1].equals(data[col2])):
                    if(col2 not in identical_columns and col2 in list_control):
                        identical_columns.append(col2)
                        if col1 in list_control:
                            list_control.remove(col1)
                        if col2 in list_control: 
                            list_control.remove(col2)
                    elif(col2 in identical_columns and col2 in list_control):
                        if col2 in list_control: 
                            list_control.remove(col2)    
    for col in identical_columns:
        data.drop(columns = col, inplace = True)
                    
    print(f"After duplicate removal{data.shape}")
    
    # Treating infinite values
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    #removing rows with missing values
    missing_rows = data.isna().any(axis=1).sum()
    print(f'\nTotal rows with missing values: {missing_rows}')
    data = data.dropna()
    nan_count = data.isnull().sum().sum()
    print(f"Total NaN values: {nan_count}")
    

    
    print(f"After missing value rows' removal{data.shape}")
    
    #splitting data
    target = data['Label']
    features = data.drop('Label',axis = 1)

    
    return features,target

In [15]:
def encode_data(data):
    encoder = LabelEncoder()
    for col in data.columns:
        if(data[col].dtype == 'object'):
            data[col] = encoder.fit_transform(data[col])
    nan_count = data.isnull().sum().sum()
    print(f"Total NaN values: {nan_count}")
    return ds

In [17]:
def standardize_data(data):
    scaler = StandardScaler()
    nan_count = data.isnull().sum().sum()
    print(f"Total NaN values: {nan_count}")
    return scaler.fit_transform(data)

In [19]:
def oversample(data,target):
    sampler = RandomOverSampler(sampling_strategy = 'minority')
    new_data,new_target = sampler.fit_resample(data,target)
    return new_data, new_target

In [21]:
def dimensionality_reduction(data):    
    pca = PCA(n_components = 10)
    data = pca.fit_transform(data)
    return data

In [55]:
def decision_tree(features, labels, test_x, test_y):
    classifier = tree.DecisionTreeClassifier()
    classifier.fit(features,labels)
    # tree.plot_tree(classifier, max_depth = 3)
    predictions = classifier.predict(test_x)
    accuracy = accuracy_score(test_y, predictions)
    report = classification_report(test_y, predictions) 
    print(f"Accuracy = {accuracy*100:.2f}%")
    print(f"Report = \n{report}")

In [57]:
def extra_tree(features, labels, test_x, test_y):
    classifier = ExtraTreesClassifier(n_estimators = 100)
    classifier.fit(features,labels)
    predictions = classifier.predict(test_x)
    accuracy = accuracy_score(test_y, predictions)
    report = classification_report(test_y, predictions,zero_division=1) 
    print(f"Accuracy = {accuracy*100:.2f}%")
    print(f"Report = \n{report}")

In [59]:
def random_forest(features,labels,test_x,test_y):
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(features,labels)
    y_pred = model.predict(test_x)
    accuracy = accuracy_score(test_y, y_pred)
    report = classification_report(test_y, y_pred) 
    print(f"Accuracy = {accuracy*100:.2f}%")
    print(f"Report = \n{report}")

In [27]:
ds = load_dataset()

In [28]:
ds,target = preprocess_data(ds)

Before duplicate removal(2830743, 79)
After duplicate removal(2522362, 67)

Total rows with missing values: 1564
Total NaN values: 0
After missing value rows' removal(2520798, 67)


In [29]:
#convert categorical values to numerical
ds = encode_data(ds)


Total NaN values: 0


In [30]:
ds = standardize_data(ds)

Total NaN values: 0


In [31]:
ds,target = oversample(ds,target)
print("DS shape:", ds.shape)

DS shape: (4615844, 66)


In [32]:
ds = dimensionality_reduction(ds)
print("DS shape:", ds.shape)
ds = np.column_stack((ds,target))
print(ds)

DS shape: (4615844, 10)
[[-9.391995175263197 -1.4885123629789496 0.30591912204321814 ...
  0.3093207000626895 1.2006419055097592 'BENIGN']
 [-9.446858281547547 -1.3939894617846957 0.3475871107197592 ...
  0.07153114987566539 0.12992402758719468 'BENIGN']
 [-9.451897619559245 -1.403749290937708 0.34841416293072475 ...
  0.07866966713530138 0.17455249954161928 'BENIGN']
 ...
 [9.417947254178088 0.5190888851129569 -3.748364016238078 ...
  -0.09917274001990037 1.3176586807763795 'Heartbleed']
 [10.206014542010283 -0.32260596183209206 -0.35242576488970706 ...
  -0.16447065370414807 0.37230536998253216 'Heartbleed']
 [10.206014542010283 -0.32260596183209206 -0.35242576488970706 ...
  -0.16447065370414807 0.37230536998253216 'Heartbleed']]


In [34]:
train_data, test_data = train_test_split(ds, test_size = 0.2)
# test_data = np.delete(test_data, 10,1)
x_train = train_data[:,:-1]
y_train = train_data[:,-1]
x_test = test_data[:,:-1]
y_test = test_data[:,-1]

In [62]:
decision_tree(x_train,y_train, x_test, y_test)

Accuracy = 99.81%
Report = 
                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00    419448
                       Bot       0.62      0.64      0.63       377
                      DDoS       1.00      1.00      1.00     25466
             DoS GoldenEye       0.97      0.96      0.96      2058
                  DoS Hulk       1.00      1.00      1.00     34573
          DoS Slowhttptest       0.98      0.99      0.98      1115
             DoS slowloris       0.99      0.98      0.99      1043
               FTP-Patator       1.00      1.00      1.00      1220
                Heartbleed       1.00      1.00      1.00    418522
              Infiltration       0.50      0.43      0.46         7
                  PortScan       0.98      0.98      0.98     18208
               SSH-Patator       0.97      0.98      0.97       684
  Web Attack � Brute Force       0.72      0.73      0.73       307
Web Attack � Sql In

In [63]:
extra_tree(x_train,y_train, x_test, y_test)

Accuracy = 99.87%
Report = 
                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00    419448
                       Bot       0.77      0.62      0.69       377
                      DDoS       1.00      1.00      1.00     25466
             DoS GoldenEye       1.00      0.98      0.99      2058
                  DoS Hulk       1.00      1.00      1.00     34573
          DoS Slowhttptest       0.99      0.99      0.99      1115
             DoS slowloris       0.99      0.99      0.99      1043
               FTP-Patator       1.00      1.00      1.00      1220
                Heartbleed       1.00      1.00      1.00    418522
              Infiltration       0.83      0.71      0.77         7
                  PortScan       0.99      0.99      0.99     18208
               SSH-Patator       0.97      0.98      0.97       684
  Web Attack � Brute Force       0.75      0.76      0.76       307
Web Attack � Sql In

In [61]:
random_forest(x_train,y_train, x_test, y_test)

Accuracy = 99.87%
Report = 
                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00    419448
                       Bot       0.79      0.58      0.67       377
                      DDoS       1.00      1.00      1.00     25466
             DoS GoldenEye       1.00      0.98      0.99      2058
                  DoS Hulk       1.00      1.00      1.00     34573
          DoS Slowhttptest       0.99      0.99      0.99      1115
             DoS slowloris       1.00      0.99      0.99      1043
               FTP-Patator       1.00      1.00      1.00      1220
                Heartbleed       1.00      1.00      1.00    418522
              Infiltration       0.67      0.29      0.40         7
                  PortScan       0.99      0.99      0.99     18208
               SSH-Patator       0.98      0.98      0.98       684
  Web Attack � Brute Force       0.75      0.77      0.76       307
Web Attack � Sql In