In [53]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
def load_dataset():
    ds = []
    for root, dirs, files in os.walk('CIC dataset/'):
        for file in files:            
            ds.append(pd.read_csv(os.path.join(root,file)))
    # for i, data in enumerate(ds, start=1):
    #     rows, cols = data.shape
    #     print(f'ds{i} -> {rows} rows, {cols} columns')
    return ds

In [3]:
def preprocess_data(datas):    
    #merging data
    data = pd.concat(datas, axis = 0, ignore_index = True)
    print(f"Before duplicate removal{data.shape}")

    #remove leading or trailing whitespace from col names
    col_names = {col: col.strip() for col in data.columns}
    data.rename(columns = col_names, inplace = True)
    
    #duplicate rows removal
    data = data.drop_duplicates(keep = 'first')
    
    #duplicate columns removal
    columns = data.columns
    identical_columns =[]
    list_control = columns.copy().tolist()
    for col1 in columns:
        for col2 in columns:
            if(col1!=col2):
                if(data[col1].equals(data[col2])):
                    if(col2 not in identical_columns and col2 in list_control):
                        identical_columns.append(col2)
                        if col1 in list_control:
                            list_control.remove(col1)
                        if col2 in list_control: 
                            list_control.remove(col2)
                    elif(col2 in identical_columns and col2 in list_control):
                        if col2 in list_control: 
                            list_control.remove(col2)    
    for col in identical_columns:
        data.drop(columns = col, inplace = True)
                    
    print(f"After duplicate removal{data.shape}")
    
    # Treating infinite values
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    #removing rows with missing values
    missing_rows = data.isna().any(axis=1).sum()
    print(f'\nTotal rows with missing values: {missing_rows}')
    data = data.dropna()
    nan_count = data.isnull().sum().sum()
    print(f"Total NaN values: {nan_count}")
    

    
    print(f"After missing value rows' removal{data.shape}")
    
    #splitting data
    target = data['Label']
    features = data.drop('Label',axis = 1)

    
    return features,target

In [4]:
def encode_data(data):
    encoder = LabelEncoder()
    for col in data.columns:
        if(data[col].dtype == 'object'):
            data[col] = encoder.fit_transform(data[col])
    nan_count = data.isnull().sum().sum()
    print(f"Total NaN values: {nan_count}")
    return ds

In [5]:
def standardize_data(data):
    scaler = StandardScaler()
    nan_count = data.isnull().sum().sum()
    print(f"Total NaN values: {nan_count}")
    return scaler.fit_transform(data)

In [6]:
def oversample(data,target):
    sampler = RandomOverSampler(sampling_strategy = 'minority')
    new_data,new_target = sampler.fit_resample(data,target)
    return new_data, new_target

In [7]:
def dimensionality_reduction(data):    
    pca = PCA(n_components = 10)
    data = pca.fit_transform(data)
    return data

In [45]:
def decision_tree(features, labels, test_x, test_y):
    classifier = tree.DecisionTreeClassifier()
    classifier.fit(features,labels)
    # tree.plot_tree(classifier, max_depth = 3)
    predictions = classifier.predict(test_x)
    accuracy = accuracy_score(test_y, predictions)
    report = classification_report(test_y, predictions) 
    print(f"Accuracy = {accuracy*100}")
    print(f"Report = \n{report}")

In [63]:
def extra_tree(features, labels, test_x, test_y):
    classifier = ExtraTreesClassifier(n_estimators = 100)
    classifier.fit(features,labels)
    predictions = classifier.predict(test_x)
    accuracy = accuracy_score(test_y, predictions)
    report = classification_report(test_y, predictions) 
    print(f"Accuracy = {accuracy*100}")
    print(f"Report = \n{report}")

In [9]:
ds = load_dataset()

In [10]:
ds,target = preprocess_data(ds)

Before duplicate removal(2830743, 79)
After duplicate removal(2522362, 67)

Total rows with missing values: 1564
Total NaN values: 0
After missing value rows' removal(2520798, 67)


In [16]:
#convert categorical values to numerical
ds = encode_data(ds)


Total NaN values: 0


In [20]:
ds = standardize_data(ds)

Total NaN values: 0


In [21]:
ds,target = oversample(ds,target)
print("DS shape:", ds.shape)

DS shape: (4615844, 66)


In [22]:
ds = dimensionality_reduction(ds)
print("DS shape:", ds.shape)
ds = np.column_stack((ds,target))
print(ds)

DS shape: (4615844, 10)
[[-9.391445791325527 -1.4884928546516967 0.3021858419694866 ...
  0.30749211012599464 1.199261440242548 'BENIGN']
 [-9.446310825895623 -1.3939967465666345 0.34395760069340714 ...
  0.0698971937030779 0.13214277166781757 'BENIGN']
 [-9.451350528077601 -1.40375439712791 0.3447465993256427 ...
  0.07696767021108439 0.17669485395849222 'BENIGN']
 ...
 [10.145300262028423 -0.8109020888652607 1.5127354942850078 ...
  -0.2633571118269411 -0.17667607445016498 'Heartbleed']
 [9.569992903159497 -0.8676613326192604 1.7725376304831866 ...
  -0.2884609398349668 -0.35859377394563013 'Heartbleed']
 [9.933300218740728 -0.7859763913912146 1.6949860063520286 ...
  -0.17846949223997627 -0.38752692967888597 'Heartbleed']]


In [23]:
train_data, test_data = train_test_split(ds, test_size = 0.2)
# test_data = np.delete(test_data, 10,1)
x_train = train_data[:,:-1]
y_train = train_data[:,-1]
x_test = test_data[:,:-1]
y_test = test_data[:,-1]

In [47]:
decision_tree(x_train,y_train, x_test, y_test)

Accuracy = 99.82397589173814
Report = 
                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00    419627
                       Bot       0.63      0.68      0.65       368
                      DDoS       1.00      1.00      1.00     25303
             DoS GoldenEye       0.97      0.97      0.97      2064
                  DoS Hulk       1.00      1.00      1.00     34670
          DoS Slowhttptest       0.98      0.98      0.98      1016
             DoS slowloris       0.99      0.97      0.98      1064
               FTP-Patator       1.00      1.00      1.00      1192
                Heartbleed       1.00      1.00      1.00    418781
              Infiltration       0.33      0.09      0.14        11
                  PortScan       0.98      0.98      0.98     18023
               SSH-Patator       0.97      0.97      0.97       632
  Web Attack � Brute Force       0.74      0.71      0.72       293
Web Atta

In [65]:
extra_tree(x_train,y_train, x_test, y_test)

TypeError: got an unexpected keyword argument 'zero_division'