In [152]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler

In [154]:
def load_dataset():
    ds = []
    for root, dirs, files in os.walk('CIC dataset/'):
        for file in files:            
            ds.append(pd.read_csv(os.path.join(root,file)))
    # for i, data in enumerate(ds, start=1):
    #     rows, cols = data.shape
    #     print(f'ds{i} -> {rows} rows, {cols} columns')
    return ds

In [213]:
def preprocess_data(datas):    
    #merging data
    data = pd.concat(datas, axis = 0, ignore_index = True)
    print(f"Before duplicate removal{data.shape}")

    #remove leading or trailing whitespace from col names
    col_names = {col: col.strip() for col in data.columns}
    data.rename(columns = col_names, inplace = True)
    
    #duplicate rows removal
    data = data.drop_duplicates(keep = 'first')
    
    #duplicate columns removal
    columns = data.columns
    identical_columns =[]
    list_control = columns.copy().tolist()
    for col1 in columns:
        for col2 in columns:
            if(col1!=col2):
                if(data[col1].equals(data[col2])):
                    if(col2 not in identical_columns and col2 in list_control):
                        identical_columns.append(col2)
                        if col1 in list_control:
                            list_control.remove(col1)
                        if col2 in list_control: 
                            list_control.remove(col2)
                    elif(col2 in identical_columns and col2 in list_control):
                        if col2 in list_control: 
                            list_control.remove(col2)    
    for col in identical_columns:
        data.drop(columns = col, inplace = True)
                    
    print(f"After duplicate removal{data.shape}")

    #removing rows with missing values
    missing_rows = data.isna().any(axis=1).sum()
    print(f'\nTotal rows with missing values: {missing_rows}')
    data = data.dropna()
    nan_count = data.isnull().sum().sum()
    print(f"Total NaN values: {nan_count}")
    
    # Treating infinite values
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    print(f"After missing value rows' removal{data.shape}")
    
    #splitting data
    target = data['Label']
    features = data.drop('Label',axis = 1)

    
    return features,target

In [158]:
def encode_data(data):
    encoder = LabelEncoder()
    for col in data.columns:
        if(data[col].dtype == 'object'):
            data[col] = encoder.fit_transform(data[col])
    return ds

In [160]:
def standardize_data(data):
    scaler = StandardScaler()
    return scaler.fit_transform(data)

In [162]:
def oversample(data,target):
    sampler = RandomOverSampler(sampling_strategy = 'minority')
    new_data,new_target = sampler.fit_resample(data,target)
    return new_data, new_target

In [188]:
def dimensionality_reduction(data):    
    pca = PCA(n_components = 10)
    data = pca.fit_transform(data)

In [217]:
ds = load_dataset()

In [218]:
ds,target = preprocess_data(ds)

Before duplicate removal(2830743, 79)
After duplicate removal(2522362, 67)

Total rows with missing values: 353
Total NaN values: 0
After missing value rows' removal(2522009, 67)


In [221]:
#convert categorical values to numerical
ds = encode_data(ds)


In [223]:
ds = standardize_data(ds)

In [225]:
ds,target = oversample(ds,target)

In [227]:
ds = dimensionality_reduction(ds)
ds = np.column_stack((ds,target))
print(ds)

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values