In [None]:
# Azure setup
# Install from a notebook cell, or from the console after `conda activate <env>`:
# !conda install -c conda-forge --y imbalanced-learn
# !conda install -c anaconda --y seaborn

import torch
import numpy as np
import pandas as pd
import sys
import os

import glob
from torchvision.transforms import ToTensor

import matplotlib.pyplot as plt

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder #, OneHotEncoder

from pprint import pprint
import time



In [None]:
# Disabled Google Colab setup (mount Drive and change into the project folder).
# It is kept inside a string literal, so none of it is executed.
'''
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd '/content/drive/My Drive/1_MalmoUni/AdvML/Project'
'''

In [None]:
def clean_column_name(column):
    """Normalise a raw CSV header into a lower-case, underscore-separated name.

    Leading/trailing spaces are trimmed, then every '/' and every remaining
    space is replaced by '_' and the result is lower-cased.
    """
    normalised = column.strip(' ')
    for separator in ('/', ' '):
        normalised = normalised.replace(separator, '_')
    return normalised.lower()

In [None]:
# DATA_DIR  = os.path.join(os.path.abspath("."), "datasets\MachineLearningCSV\MachineLearningCVE")
DATA_DIR  = os.path.join(os.path.abspath("."), "datasets/MachineLearningCSV/MachineLearningCVE")


# Read all the .csv files.
# glob returns paths in arbitrary (filesystem-dependent) order; sorting makes the
# row order of the concatenated frame reproducible from run to run.
filenames = sorted(glob.glob(os.path.join(DATA_DIR,  '*.csv')))
if not filenames:
    # Fail early with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No .csv files found in " + DATA_DIR)
datasets = [pd.read_csv(filename) for filename in filenames]

# Remove white spaces and rename the columns
# (loop variable is not called `dataset` so it cannot be confused with the final frame).
for frame in datasets:
    frame.columns = [clean_column_name(column) for column in frame.columns]

# Concatenate the datasets into one frame with a fresh 0..n-1 index
dataset = pd.concat(datasets, axis=0, ignore_index=True)
#dataset.drop(labels=['fwd_header_length.1'], axis= 1, inplace=True)

In [None]:
# Print pandas' summary report for the concatenated frame.
dataset.info()

In [None]:
# Preview the first five rows.
dataset.head(5)

In [None]:
# Number of rows per label before any cleaning.
dataset.label.value_counts()

In [None]:
# keep=False flags every member of a duplicate group, so this counts all rows
# that have at least one identical copy (not just the extra copies).
print('The total number of rows duplicates are: ',dataset.duplicated(keep=False).sum())   

In [None]:
# Rows that repeat an earlier row (the first occurrence of each group is not included).
duplicate = dataset.loc[dataset.duplicated()]
print('Duplicated rows by labels are: ')
# Label distribution of those repeated rows.
duplicate['label'].value_counts()

In [None]:
# Report whether any row repeats an earlier one.
print('Datase duplicates :', dataset.duplicated().any())
lenght_data = len(dataset)
# Remove duplicate rows in place and renumber the index.
# NOTE(review): keep=False discards every member of a duplicate group, including
# its first occurrence -- confirm this is intended rather than keep='first'.
dataset.drop_duplicates(inplace=True, keep=False, ignore_index=True)
# Number of rows removed by the call above.
dups_count = lenght_data-len(dataset)

print('count of duplicate values dropped: ', dups_count)

In [None]:
# Number of rows per label after duplicate removal.
dataset.label.value_counts()

In [None]:
def print_nans(label='NAN ', frame=None):
    """Print each column that contains missing values, with its missing count.

    Parameters
    ----------
    label : str
        Text inserted into every printed line to name the kind of value being
        reported (e.g. 'NAN ' or 'Inf ').
    frame : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``dataset`` is used
        (looked up at call time), which preserves the original behaviour.

    Returns
    -------
    None
        Output is printed; columns without missing values are skipped.
    """
    if frame is None:
        frame = dataset

    missing_per_column = frame.isna().sum()

    # Keep only the columns that actually have missing entries.
    for column_name, missing in missing_per_column[missing_per_column > 0].items():
        print(column_name, ' has ' + label + ' values :', missing)

# Report the columns that currently contain NaN values.
print_nans()  

lenght_data = len(dataset)
# Drop, in place, every row that has at least one NaN.
dataset.dropna(axis=0, inplace=True, how="any")
# Number of rows removed by the NaN filter (the variable name dups_count is
# reused from the duplicate-removal cell; it does not count duplicates here).
dups_count = lenght_data-len(dataset)

print('count of NANs values dropped: ', dups_count)

In [None]:
# Check whether every feature value (label column excluded) is finite.
print('are all values finite: ',np.all(np.isfinite(dataset.drop(['label'], axis=1))))
# Replace infinite values with NaN (in place) so they can be handled by dropna later.
dataset.replace([-np.inf, np.inf], np.nan, inplace=True)
# List the columns that now contain NaN; assuming the NaN rows were already
# dropped by the preceding cell, these are the former +/-inf entries.
print_nans(label='Inf ')  

In [None]:
# Label distribution of the rows where both rate columns are NaN.
dataset[(dataset['flow_bytes_s'].isna()) & (dataset['flow_packets_s'].isna())].label.value_counts()

In [None]:
lenght_data = len(dataset)

# Remove the rows that held infinite values (they were converted to NaN beforehand);
# the frame is modified in place.
dataset.dropna(axis=0, how='any', inplace=True)

# Number of rows removed by the call above (the name dups_count is reused; no
# duplicates are involved here).
dups_count = lenght_data-len(dataset)

print('count of infinte values dropped: ', dups_count)

In [None]:
# Number of rows per label after the NaN / infinite rows were removed.
dataset.label.value_counts()

In [None]:
# Bucket the column names by their dtype.
dataset_datatypes = dataset.columns.to_series().groupby(dataset.dtypes).groups
# Re-key the buckets by the dtype's name so the summary prints readably.
datatypes_info = dict(
    (dtype.name, columns) for dtype, columns in dataset_datatypes.items()
)

pprint(datatypes_info)
# Summary statistics of the object-typed columns, one row per column.
pprint(dataset.describe(include=[object]).T)

In [None]:
# Coerce the two rate columns to a numeric dtype, column by column.
dataset[['flow_bytes_s', 'flow_packets_s']] = dataset[['flow_bytes_s', 'flow_packets_s']].apply(pd.to_numeric)

In [None]:
# Target column and feature matrix (everything except the label).
Y = dataset['label']
X = dataset.drop('label', axis=1)

# Distinct label values and how many of them there are.
class_labels = Y.unique()
num_classes = Y.nunique()

print("shape of X: ",X.shape)
print("number of labels of y: ", num_classes)
print("Class labels: ", class_labels)

In [None]:
# Disabled experiment: impute values equal to -1 (treated as "missing") with the
# column mean. Kept inside a string literal, so none of it is executed.
'''

from sklearn.impute import SimpleImputer

mean_imp = SimpleImputer(missing_values=-1, strategy='mean')
cat_imp = SimpleImputer(missing_values=-1, strategy='most_frequent')
for c in X.columns:
  X[c] = mean_imp.fit_transform(X[[c]]).ravel()
  
'''


In [None]:
# Names of the feature columns.
X.columns

In [None]:
print('X shape: ', X.shape)
# Stratified split so both parts keep the label proportions of Y; the seed is
# fixed for repeatability. No test_size is passed, so scikit-learn's default
# split fraction applies.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, stratify=Y)

print("\nafter spliting the data:")
print("X training data shape:", X_train.shape)
print("x test data shape:", X_test.shape)

print("Y training data shape:", Y_train.shape)
print("Y test data shape:", Y_test.shape)

In [None]:
le = LabelEncoder()       # Encode target labels with value between 0 and n_classes-1

# Fit the encoder on the training labels only, then reuse the same mapping for
# the test labels (fit_transform on train, transform on test).
# NOTE: despite the "_binary" suffix these are integer class codes, one per label.
Y_train_binary = le.fit_transform(Y_train)
Y_test_binary = le.transform(Y_test)

# le.classes_ holds the original label for each class; pairing it with
# range(len(le.classes_)) yields a {label: integer code} dictionary.
labels_dict = dict(zip(le.classes_, range(len(le.classes_))))

pprint(labels_dict)

import pickle

# Persist the mapping for later cells / notebooks. The context manager closes
# the file even if pickle.dump raises.
with open("datasets/labels_dict_file.pkl", "wb") as _file:
    pickle.dump(labels_dict, _file)

In [None]:
# Training-set row count per original (string) label.
df=pd.DataFrame(Y_train)
df.value_counts()

In [None]:
# Training-set row count per encoded label.
df=pd.DataFrame(Y_train_binary)
df.value_counts()

In [None]:
# Show the current working directory via a shell escape.
# NOTE(review): `pwd` is a POSIX command; os.getcwd() would be the portable equivalent.
!pwd

In [None]:
# CPU count reported by the OS; the same call is passed as n_jobs to the resamplers in later cells.
os.cpu_count()

In [None]:
# Standardise the features: the scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split.
scaler = StandardScaler()

# NOTE: X_train / X_test are rebound to the scaler's output, so re-running this
# cell would scale the already-scaled data.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Hard-coded reference count; presumably the BENIGN row count of the training
# split -- TODO confirm against Y_train_binary.
majority_feature_values= 1526628  

# Target sample count for every encoded class except BENIGN: 15% of the
# reference count above.
dict_strategy = {
    k: int(15/100 * majority_feature_values)
    for k in labels_dict.values()
    if k != labels_dict['BENIGN']
}

dict_strategy

In [None]:
# Disabled earlier variant of the oversampling-strategy cell (keyed by label
# name instead of encoded value). Kept inside a string literal, so it never runs.
'''
majority_feature_values= 1526628  

dict_strategy=labels_dict.copy()

for k in dict_strategy.keys():
    dict_strategy[k]= int(15/100 * majority_feature_values)

dict_strategy.pop('BENIGN')

dict_strategy
'''

In [None]:

from imblearn.over_sampling import KMeansSMOTE

# Oversample the training split with KMeansSMOTE, using the per-class target
# counts from dict_strategy and every CPU reported by the OS.
k_smt = KMeansSMOTE(random_state=42, sampling_strategy=dict_strategy, k_neighbors=7,kmeans_estimator=20, n_jobs= os.cpu_count())

# NOTE(review): x_sampled / y_sampled are not read by any later cell in this
# notebook; the ADASYN output is what gets saved and undersampled.
x_sampled, y_sampled = k_smt.fit_resample(X_train, Y_train_binary)

In [None]:

from imblearn.over_sampling import ADASYN

# Oversample the training split with ADASYN, using the per-class target counts
# from dict_strategy and every CPU reported by the OS.
adasyn_sampl = ADASYN(random_state=42, sampling_strategy=dict_strategy, n_neighbors=7, n_jobs= os.cpu_count())

# Resampled features and encoded labels; these feed the save and undersampling cells.
x_adasyn_sampled, y_adasyn_sampled = adasyn_sampl.fit_resample(X_train, Y_train_binary)

In [None]:
# Row count per encoded label after ADASYN oversampling.
df=pd.DataFrame(y_adasyn_sampled)
df.value_counts() 

In [None]:
# Write the ADASYN-resampled labels and features to comma-separated text files.
np.savetxt("datasets/y_train_78_features_ADASYN_sampled.csv", y_adasyn_sampled, delimiter=",")
np.savetxt("datasets/x_train_78_features_ADASYN_sampled.csv", x_adasyn_sampled, delimiter=",")

In [None]:
# Hard-coded reference count; presumably the BENIGN row count of the training
# split -- TODO confirm.
majority_feature_values= 1526628  

# Only the BENIGN class gets a target: 20% of the reference count above.
dict_strategy_undersample = {
    labels_dict['BENIGN']: int(20/100 * majority_feature_values)
}
dict_strategy_undersample

In [None]:
from imblearn.under_sampling import InstanceHardnessThreshold

# Undersample with InstanceHardnessThreshold (15-fold CV, all CPUs), using the
# BENIGN target count from dict_strategy_undersample.
iht = InstanceHardnessThreshold(random_state=42, sampling_strategy=dict_strategy_undersample, cv=15, n_jobs=os.cpu_count())

# Input is the ADASYN-oversampled training data.
x_undersampled_iht, y_undersampled_iht = iht.fit_resample(x_adasyn_sampled, y_adasyn_sampled)

In [None]:
# Row count per encoded label after the undersampling pass.
df=pd.DataFrame(y_undersampled_iht)
df.value_counts()

In [None]:
import pickle
from numpy import genfromtxt

# Restore the {label: code} mapping saved earlier. The context manager makes
# sure the file handle is closed (the bare open() it replaces never was).
# pickle is only safe here because the file is produced by this notebook.
with open("datasets/labels_dict_file.pkl", "rb") as _file:
    labels_dict = pickle.load(_file)

# NOTE(review): these two CSVs are written by a cell further down in this
# notebook, so on a fresh top-to-bottom run they must already exist from a
# previous session. Loading also replaces any in-memory x/y_undersampled_iht.
x_undersampled_iht = genfromtxt("datasets/x_train_78_features_ADASYN_undersampled_iht.csv", delimiter=',', autostrip=True)
y_undersampled_iht = genfromtxt("datasets/y_train_78_features_ADASYN_undersampled_iht.csv",dtype=np.uint8, delimiter=',', autostrip=True)

print('x: ',x_undersampled_iht.shape)
print('y: ',y_undersampled_iht.shape)


In [None]:
# Display the reloaded label array.
y_undersampled_iht

In [None]:
# Same strategy as the earlier undersampling cell, rebuilt from the reloaded
# labels_dict. The reference count is hard-coded -- TODO confirm its origin.
majority_feature_values= 1526628  

# Only the BENIGN class gets a target: 20% of the reference count above.
dict_strategy_undersample = {
    labels_dict['BENIGN']: int(20/100 * majority_feature_values)
}
dict_strategy_undersample

In [None]:
from imblearn.under_sampling import InstanceHardnessThreshold
import time


# Same undersampler as before but with 10-fold CV.
iht = InstanceHardnessThreshold(random_state=42, sampling_strategy=dict_strategy_undersample, cv=10, n_jobs=os.cpu_count())
# Apply the undersampler four times in a row; each pass takes the previous
# pass's output as its input, so this cell is not idempotent.
for i in range(4):
    x_undersampled_iht, y_undersampled_iht = iht.fit_resample(x_undersampled_iht, y_undersampled_iht)

    print('runned instances:', i)
    # NOTE(review): 5-second pause between passes; its purpose is not evident from the code.
    time.sleep(5)



In [None]:
# Row count per encoded label after the repeated undersampling passes.
df=pd.DataFrame(y_undersampled_iht) # 1261368
df.value_counts()

In [None]:
# Show the container types of the undersampled features and labels.
print(type(x_undersampled_iht))
type(y_undersampled_iht)


In [None]:
# Write the undersampled labels and features to comma-separated text files.
# These are the same paths the earlier reload cell reads from.
np.savetxt("datasets/y_train_78_features_ADASYN_undersampled_iht.csv", y_undersampled_iht, delimiter=",")
np.savetxt("datasets/x_train_78_features_ADASYN_undersampled_iht.csv", x_undersampled_iht, delimiter=",")

In [None]:
# Wrap the feature array in a DataFrame to inspect its column index.
df=pd.DataFrame(x_undersampled_iht)
df.columns

In [None]:
# Wrap the label array in a DataFrame to inspect its column index.
df=pd.DataFrame(y_undersampled_iht)
df.columns

In [None]:
# Show the container types of the test labels and test features.
print(type(Y_test_binary))
type(X_test)

In [None]:
# Write the encoded test labels and the scaled test features to comma-separated text files.
np.savetxt("datasets/y_test_binary_78_features.csv", Y_test_binary, delimiter=",")
np.savetxt("datasets/x_test_78_features.csv", X_test, delimiter=",")