In [11]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler

In [12]:
def load_dataset(path='CIC dataset/'):
    """Load every CSV file found under ``path`` into its own DataFrame.

    The directory tree is walked recursively. Only files whose name ends in
    ``.csv`` (case-insensitive) are read, so stray files such as editor
    checkpoints or OS metadata are skipped instead of being parsed as data.
    Directories and files are visited in sorted order so the returned list
    has the same order on every run and platform.

    Parameters
    ----------
    path : str, optional
        Root directory to search. Defaults to ``'CIC dataset/'``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per CSV file, in traversal order. The list is empty when
        ``path`` does not exist or contains no CSV file.
    """
    ds = []
    for root, dirs, files in os.walk(path):
        # os.walk yields entries in arbitrary order; sorting `dirs` in place
        # also fixes the order in which sub-directories are descended into.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith('.csv'):
                ds.append(pd.read_csv(os.path.join(root, file)))
    return ds

In [86]:
def _find_duplicate_columns(data):
    """Return the names of columns whose contents repeat an earlier column.

    Columns are compared with ``Series.equals``. The first column of each
    group of identical columns is kept; every later member is reported, in
    the order it is encountered.
    """
    columns = data.columns.tolist()
    duplicates = []
    flagged = set()
    for position, keeper in enumerate(columns):
        if keeper in flagged:
            # Already a copy of an earlier column; everything equal to it was
            # flagged together with it, so there is nothing new to find.
            continue
        # Only look forward: each unordered pair is compared at most once.
        for candidate in columns[position + 1:]:
            if candidate in flagged:
                continue
            if data[keeper].equals(data[candidate]):
                duplicates.append(candidate)
                flagged.add(candidate)
    return duplicates


def preprocess_data(datas):
    """Merge the raw frames, clean them, and split features from the label.

    Steps, in order:

    1. Concatenate all frames row-wise with a fresh index.
    2. Strip leading/trailing whitespace from the column names.
    3. Drop duplicate rows, keeping the first occurrence.
    4. Drop columns whose contents are identical to an earlier column.
    5. Replace +/-inf with NaN and drop every row containing NaN.
    6. Split off the ``'Label'`` column as the target.

    Progress figures (shapes and missing-value counts) are printed along
    the way.

    Parameters
    ----------
    datas : list of pandas.DataFrame
        Frames to merge; passed directly to ``pd.concat``.

    Returns
    -------
    features : pandas.DataFrame
        All remaining columns except ``'Label'``.
    target : pandas.Series
        The ``'Label'`` column, sharing its index with ``features``.

    Raises
    ------
    KeyError
        If no column is named ``'Label'`` after whitespace stripping.
    """
    #merging data
    data = pd.concat(datas, axis=0, ignore_index=True)
    print(f"Before duplicate removal{data.shape}")

    #remove leading or trailing whitespace from col names
    data.columns = [col.strip() for col in data.columns]

    #duplicate rows removal
    data = data.drop_duplicates(keep='first')

    #duplicate columns removal
    identical_columns = _find_duplicate_columns(data)
    data = data.drop(columns=identical_columns)

    print(f"After duplicate removal{data.shape}")

    # Treating infinite values: turn them into NaN so that the dropna below
    # removes them together with genuinely missing entries.
    data = data.replace([np.inf, -np.inf], np.nan)

    #removing rows with missing values
    missing_rows = data.isna().any(axis=1).sum()
    print(f'\nTotal rows with missing values: {missing_rows}')
    data = data.dropna()
    nan_count = data.isnull().sum().sum()
    print(f"Total NaN values: {nan_count}")

    print(f"After missing value rows' removal{data.shape}")

    #splitting data
    target = data['Label']
    features = data.drop('Label', axis=1)

    return features, target

In [88]:
def encode_data(data):
    """Label-encode every text column of ``data`` and return the frame.

    Each column with ``object`` or string dtype is replaced by the result of
    ``LabelEncoder().fit_transform`` on that column; the encoder is re-fitted
    per column. Other columns are left untouched. The total NaN count after
    encoding is printed.

    The frame is modified in place and the same object is returned, so the
    caller's reference sees the encoded columns as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose text columns should be converted to integer codes.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with text columns encoded.
    """
    encoder = LabelEncoder()
    for col in data.columns:
        dtype = data[col].dtype
        # Newer pandas versions may store text in a dedicated string dtype
        # rather than `object`; treat both as categorical text.
        if dtype == 'object' or pd.api.types.is_string_dtype(dtype):
            data[col] = encoder.fit_transform(data[col])
    nan_count = data.isnull().sum().sum()
    print(f"Total NaN values: {nan_count}")
    # Return the argument, not the notebook-level `ds` global.
    return data

In [90]:
def standardize_data(data):
    """Report the NaN count of ``data`` and return it scaled.

    The total number of missing values is printed first; the data is then
    passed through a freshly created ``StandardScaler`` and the result of
    its ``fit_transform`` is returned.
    """
    total_missing = data.isnull().sum().sum()
    print(f"Total NaN values: {total_missing}")
    return StandardScaler().fit_transform(data)

In [92]:
def oversample(data, target, sampling_strategy='minority', random_state=42):
    """Randomly oversample ``data``/``target`` with ``RandomOverSampler``.

    Parameters
    ----------
    data : array-like
        Feature matrix, forwarded to ``fit_resample``.
    target : array-like
        Class labels aligned with the rows of ``data``.
    sampling_strategy : optional
        Forwarded to ``RandomOverSampler``. Defaults to ``'minority'``, the
        value this function previously hard-coded.
    random_state : int or None, optional
        Seed forwarded to ``RandomOverSampler`` so that repeated runs draw
        the same samples. Pass ``None`` for an unseeded sampler.

    Returns
    -------
    tuple
        ``(new_data, new_target)`` as returned by ``fit_resample``.
    """
    sampler = RandomOverSampler(
        sampling_strategy=sampling_strategy,
        random_state=random_state,
    )
    new_data, new_target = sampler.fit_resample(data, target)
    return new_data, new_target

In [94]:
def dimensionality_reduction(data, n_components=10, random_state=42):
    """Project ``data`` onto its leading principal components.

    Parameters
    ----------
    data : array-like
        Input matrix, forwarded to ``PCA.fit_transform``.
    n_components : optional
        Forwarded to ``PCA``. Defaults to ``10``, the value this function
        previously hard-coded.
    random_state : int or None, optional
        Seed forwarded to ``PCA``; it only matters when the solver PCA
        selects is a randomized one, in which case it makes the projection
        repeatable across runs.

    Returns
    -------
    The value returned by ``PCA.fit_transform`` for ``data``.
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    return pca.fit_transform(data)

In [96]:
# Read the raw dataset files into a list of DataFrames (one per file).
ds = load_dataset()

In [97]:
# Merge the loaded frames, drop duplicate rows/columns and rows containing
# inf/NaN, then split the 'Label' column off as `target`.
# From here on `ds` is a single features DataFrame, no longer a list.
ds,target = preprocess_data(ds)

Before duplicate removal(2830743, 79)
After duplicate removal(2522362, 67)

Total rows with missing values: 1564
Total NaN values: 0
After missing value rows' removal(2520798, 67)


In [98]:
# Convert categorical (text) feature columns to numerical codes.
# Only the features are passed in; `target` is not encoded by this step.
ds = encode_data(ds)


Total NaN values: 0


In [99]:
# Scale the features with StandardScaler; `ds` is rebound to the scaler's
# output (presumably a NumPy array rather than a DataFrame - confirm).
ds = standardize_data(ds)

Total NaN values: 0


In [100]:
# Randomly oversample the data (see `oversample`) and rebind both the
# features and the labels to the resampled versions.
ds,target = oversample(ds,target)
# Sanity check: the row count should have grown, the column count should not.
print("DS shape:", ds.shape)

DS shape: (4615844, 66)


In [101]:
# Reduce the feature matrix with PCA (see `dimensionality_reduction`).
ds = dimensionality_reduction(ds)
print("DS shape:", ds.shape)
# Append the labels as the last column. NOTE(review): the labels are strings,
# so the stacked array mixes floats and text (see the printed output); it
# presumably ends up with object dtype - confirm this is intended.
ds = np.column_stack((ds,target))
# Prints NumPy's abbreviated view of the stacked array (first/last rows only).
print(ds)

DS shape: (4615844, 10)
[[-9.391921378760898 -1.4883970670263669 0.30493449192343525 ...
  0.308196144176769 1.2019175605294312 'BENIGN']
 [-9.446783439909952 -1.3938679943699754 0.34661147170050716 ...
  0.07074958490174506 0.12941947086699673 'BENIGN']
 [-9.451822676845193 -1.4036276280504745 0.34743192784465843 ...
  0.07784218922063342 0.17410161288864634 'BENIGN']
 ...
 [10.14514157106182 -0.808184803707598 1.5108635165132378 ...
  -0.2629479478411217 -0.17647008859623287 'Heartbleed']
 [10.14514157106182 -0.808184803707598 1.5108635165132378 ...
  -0.2629479478411217 -0.17647008859623287 'Heartbleed']
 [6.564105103767575 -0.9692230556128578 2.647245341865279 ...
  1.6110263372755547 -0.7795092655115883 'Heartbleed']]
