# I - Import Libraries

In [1]:
import pandas as pd
import numpy as np
import cupy as cp
import math
import time

In [2]:
# Clear GPU memory left over from earlier work in this kernel.
# Order matters: run the Python garbage collector first so that unreachable
# objects (and any CUDA tensors they still hold) are freed, and only then ask
# PyTorch to release its unused cached CUDA blocks. Calling empty_cache()
# first cannot release memory that is still held by uncollected garbage.
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

0

In [3]:
# Show GPU list: one (device index, device properties) tuple per CUDA device
# that torch reports, printed as a single list.
gpu_inventory = []
for gpu_index in range(torch.cuda.device_count()):
    gpu_inventory.append((gpu_index, torch.cuda.get_device_properties(gpu_index)))
print(gpu_inventory)

[(0, _CudaDeviceProperties(name='NVIDIA GeForce RTX 2060 SUPER', major=7, minor=5, total_memory=7974MB, multi_processor_count=34))]


# II - Load Dataset

In [4]:
# Read the combined CSV into the working DataFrame `df`.
# low_memory=False is passed through to pandas unchanged.
df = pd.read_csv(
    filepath_or_buffer=r"../../capstone_data/archive/combined_file.csv",
    low_memory=False,
)
# Optional: uncomment to work on a reproducible 90% random sample instead of the full file
#df= df.sample(frac=0.90, random_state=40)

# III - Data Preprocessing
### A. Drop Columns

In [5]:
# Columns that are removed from the frame before any further preprocessing
columns_to_drop = [
    'detailed-label',
    'uid',
    'service',
    'tunnel_parents',
    'local_resp',
    'local_orig',
    'missed_bytes',
]
# drop(labels, axis=1) removes the listed columns and returns a new frame
df = df.drop(columns_to_drop, axis=1)

### B. Fix Null Values

In [6]:
# '-' is treated as the missing-value marker.
# In these three columns a cell that is exactly '-' becomes the string '0'
# (it is converted to a number further down).
columns_to_zero = ['duration', 'orig_bytes', 'resp_bytes']
for col in columns_to_zero:
    df[col] = df[col].replace(to_replace='-', value='0')

# In 'history' a cell that is exactly '-' becomes the placeholder 'Unk'
df['history'] = df['history'].replace(to_replace='-', value='Unk')

# Identify the numerical features and convert each one to a numeric dtype.
# pd.to_numeric is called with its default error handling, so a value that
# cannot be parsed raises instead of being silently turned into NaN.
numeric_columns = [
    'duration',
    'orig_bytes',
    'resp_bytes',
    'orig_pkts',
    'resp_pkts',
    'orig_ip_bytes',
    'resp_ip_bytes',
]
for col in numeric_columns:
    df[col] = df[col].pipe(pd.to_numeric)



### C. Discretize TimeStamp Col to hourly

In [7]:
# Derive an 'hour' feature (0-23) from 'ts', interpreting 'ts' as seconds
# since the Unix epoch, then discard the raw timestamp column.
timestamps = pd.to_datetime(df['ts'], unit='s')
df['hour'] = timestamps.dt.hour
del timestamps  # temporary only; do not keep a second copy of the column alive
df = df.drop('ts', axis=1)

### D. Convert Target Variable to Binary

In [8]:
# Reclassify 'label' as a binary integer target:
# 1 when the label text contains 'Malicious', 0 for every other label.
df['label'] = (
    df['label']
    .str.contains(pat='Malicious')
    .astype(int)
)

### E. Fix Class Imbalance
#### E1. Undersampling

In [9]:
# Balance the dataset by randomly undersampling whichever class is larger.
# The original cell assumed label 1 (malicious) is always the majority; if it
# were the smaller class, sampling without replacement would be asked for more
# rows than exist and fail. Picking the majority by size gives the same result
# when label 1 is the larger class and still works when it is not.
from sklearn.utils import resample,shuffle

df_malicious = df[df['label']==1]
df_benign = df[df['label']==0]
if len(df_malicious) >= len(df_benign):
    df_majority, df_minority = df_malicious, df_benign
else:
    df_majority, df_minority = df_benign, df_malicious

df_majority_undersampled = resample(df_majority,                 # undersample majority class
                                    replace=False,               # sample without replacement
                                    n_samples=len(df_minority),  # to match minority class
                                    random_state=40)             # reproducible results

# Combine classes. Rows are NOT shuffled here: all undersampled-majority rows
# come first, followed by all minority rows.
df = pd.concat([df_majority_undersampled, df_minority])

#### E2. SMOTE

### Verify Transformations

In [10]:
# Display the first five rows as a quick check of the transformations so far
df.head(n=5)

Unnamed: 0,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,duration,orig_bytes,resp_bytes,conn_state,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label,hour
20727743,192.168.100.111,56407.0,181.53.62.185,23.0,tcp,0.0,0,0,S0,S,1.0,40.0,0.0,0.0,1,19
5932810,192.168.1.196,29446.0,209.97.190.136,80.0,tcp,2.114547,0,0,RSTOS0,I,2.0,80.0,0.0,0.0,1,4
16371171,192.168.1.195,6944.0,162.248.88.215,62336.0,tcp,0.0,0,0,OTH,C,0.0,0.0,0.0,0.0,1,22
24291095,192.168.100.111,9418.0,208.66.190.248,81.0,tcp,0.0,0,0,S0,S,1.0,40.0,0.0,0.0,1,6
17642410,192.168.1.195,26749.0,162.248.88.215,62336.0,tcp,0.0,0,0,OTH,C,0.0,0.0,0.0,0.0,1,22


In [11]:
# Number of distinct responder addresses; dropna=False keeps a missing value
# counted as one distinct entry, matching len(Series.unique())
df['id.resp_h'].nunique(dropna=False)

8913785

### F. Label-Encode or One-hot-Encode categorical Columns
#### F1. Label Encoding

In [12]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the target column: learn the classes present in 'label',
# then map every row to its class index.
# NOTE(review): 'label' already holds integers 0/1 from step D, so this is
# presumably a no-op as long as both classes are present -- confirm.
le = LabelEncoder()
df['label'] = le.fit(df['label']).transform(df['label'])

#### F2. One-Hot-Encoding
OHE (one-hot encoding) - Could not be used due to excessive memory allocation.
Dummy encoding - Could not be used due to excessive memory allocation.
Label-Encoder - Being used because it is memory-efficient. However, it cannot be used with algorithms that interpret integers as having a meaningful order or magnitude (Logistic Regression / Neural Network / SVM / KNN / Hierarchical Clustering / PCA).

In [13]:
from sklearn.preprocessing import OneHotEncoder #yielded shape: (15804762, 8605399) 980TB
#pd.get_dummies needs 285GB. 


# Categorical features that are replaced by integer codes in this cell
categorical_columns = ['id.orig_h',
                       'id.orig_p', 
                       'id.resp_h', 
                       'id.resp_p', 
                       'proto', 
                       'conn_state', 
                       'history']

# --- Alternative 1 (disabled): dense one-hot encoding, too large to allocate ---
# encoder = OneHotEncoder(sparse_output=False)
# encoded_data = encoder.fit_transform(df[categorical_columns])
# encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_columns))
# df_final = pd.concat([df.drop(categorical_columns, axis=1), encoded_df], axis=1)

# --- Alternative 2 (disabled): pandas dummy encoding, too large to allocate ---
#df = pd.get_dummies(df, columns = categorical_columns)


# --- Alternative 3 (disabled): feature hashing into a fixed number of sparse columns ---
# from sklearn.feature_extraction import FeatureHasher
# import scipy.sparse as sp
# hasher = FeatureHasher(n_features=50, input_type='string')
# # Hash each categorical column
# hashed_features = []
# for column in categorical_columns:
#     hashed_feature = hasher.transform(df[column].astype(str))
#     hashed_features.append(hashed_feature)
# # Combine the hashed features horizontally
# combined_features = sp.hstack(hashed_features, format='csr')
# # Now `combined_features` is a sparse matrix representation of your dataset's categorical features
# print(combined_features.shape)
# print(combined_features.toarray())

# Use label encoding for now.
# Each column gets its OWN fitted encoder, stored in `label_encoders` under the
# original column name. Refitting one shared encoder (as before) discarded every
# mapping except the last, so the integer codes could not be decoded with
# inverse_transform or applied consistently to new data.
# Note: each stored encoder keeps that column's array of distinct values alive,
# which costs memory for very high-cardinality columns.
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df[column + '_encoded'] = le.fit_transform(df[column])
    label_encoders[column] = le
    # The raw column is no longer needed once its encoded twin exists
    df = df.drop(column,axis=1)
# After the loop `le` is the encoder fitted on the last column ('history'),
# which is the state the previous version of this cell left behind.

In [14]:
# First five rows after the categorical columns were replaced by *_encoded columns
df.head(n=5)

Unnamed: 0,duration,orig_bytes,resp_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label,hour,id.orig_h_encoded,id.orig_p_encoded,id.resp_h_encoded,id.resp_p_encoded,proto_encoded,conn_state_encoded,history_encoded
20727743,0.0,0,0,1.0,40.0,0.0,0.0,1,19,8940,56407,4286499,23,1,6,40
5932810,2.114547,0,0,2.0,80.0,0.0,0.0,1,4,8933,29446,5578569,79,1,3,38
16371171,0.0,0,0,0.0,0.0,0.0,0.0,1,22,8932,6944,3303093,62237,1,0,2
24291095,0.0,0,0,1.0,40.0,0.0,0.0,1,6,8940,9418,5535325,80,1,6,40
17642410,0.0,0,0,0.0,0.0,0.0,0.0,1,22,8932,26749,3303093,62237,1,0,2


In [16]:
## Save the cleaned dataset to CSV; index=False leaves the row index out of the file
df.to_csv(
    path_or_buf=r"../../capstone_data/archive/compiled_clean_data_unscaled_numerics_and_labelencondedcats.csv",
    index=False,
)

### G. Scale Numerical Columns

In [None]:
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
import numpy as np
import time


# Plot the distribution of 'duration' as a normalised histogram with a
# Gaussian KDE curve on top, then write the figure to a PNG.
fig, ax = plt.subplots()

# Histogram (density=True normalises the bars so they share a y-axis with the KDE)
counts, bins, patches = ax.hist(df['duration'], density=True, edgecolor='black', alpha=0.5)

# KDE evaluated at 300 evenly spaced points across the histogram's bin range
kde = gaussian_kde(df['duration'])
kde_x = np.linspace(bins.min(), bins.max(), 300)
kde_y = kde(kde_x)
ax.plot(kde_x, kde_y, color='red')  # KDE line in red for visibility

# Adding titles and labels (y-axis is a density because density=True above)
ax.set_title('Histogram of duration')
ax.set_xlabel('Duration')
ax.set_ylabel('Density')

# Save the figure with high dpi BEFORE showing it: once plt.show() has
# displayed the figure, a later plt.savefig() writes a blank image.
fig.savefig('duration_histogram.png', dpi=300)

# Show the plot
plt.show()

In [None]:
import seaborn as sns

# Histogram of 'orig_bytes' with a KDE overlay, saved as a PNG.
ax = sns.histplot(df['orig_bytes'], kde=True)
ax.set_title('Histogram of orig_bytes')
ax.set_xlabel('orig_bytes')
ax.set_ylabel('Frequency')

# Save BEFORE plt.show(): once the figure has been shown, a later
# plt.savefig() writes a blank image.
ax.figure.savefig('orig_bytes_histogram.png', dpi=300)
plt.show()

In [None]:
# Histogram of 'resp_bytes' with a KDE overlay, saved as a PNG.
ax = sns.histplot(df['resp_bytes'], kde=True)
ax.set_title('Histogram of resp_bytes')
ax.set_xlabel('resp_bytes')
ax.set_ylabel('Frequency')

# Save BEFORE plt.show(): once the figure has been shown, a later
# plt.savefig() writes a blank image.
ax.figure.savefig('resp_bytes_histogram.png', dpi=300)
plt.show()

In [None]:
import seaborn as sns

# Histogram of 'orig_pkts' with a KDE overlay, saved as a PNG.
ax = sns.histplot(df['orig_pkts'], kde=True)
ax.set_title('Histogram of orig_pkts')
ax.set_xlabel('orig_pkts')
ax.set_ylabel('Frequency')

# Save BEFORE plt.show(): once the figure has been shown, a later
# plt.savefig() writes a blank image.
ax.figure.savefig('orig_pkts_histogram.png', dpi=300)
plt.show()

In [None]:
# Histogram of 'resp_pkts' with a KDE overlay, saved as a PNG.
ax = sns.histplot(df['resp_pkts'], kde=True)
ax.set_title('Histogram of resp_pkts')
ax.set_xlabel('resp_pkts')
ax.set_ylabel('Frequency')

# Save BEFORE plt.show(): once the figure has been shown, a later
# plt.savefig() writes a blank image.
ax.figure.savefig('resp_pkts_histogram.png', dpi=300)
plt.show()

In [None]:
# Histogram of 'orig_ip_bytes' with a KDE overlay, saved as a PNG.
ax = sns.histplot(df['orig_ip_bytes'], kde=True)
ax.set_title('Histogram of orig_ip_bytes')
ax.set_xlabel('orig_ip_bytes')
ax.set_ylabel('Frequency')

# Save BEFORE plt.show(): once the figure has been shown, a later
# plt.savefig() writes a blank image.
ax.figure.savefig('orig_ip_bytes_histogram.png', dpi=300)
plt.show()

In [None]:
# Histogram of 'resp_ip_bytes' with a KDE overlay, saved as a PNG.
ax = sns.histplot(df['resp_ip_bytes'], kde=True)
ax.set_title('Histogram of resp_ip_bytes')
ax.set_xlabel('resp_ip_bytes')
ax.set_ylabel('Frequency')

# Save BEFORE plt.show(): once the figure has been shown, a later
# plt.savefig() writes a blank image.
ax.figure.savefig('resp_ip_bytes_histogram.png', dpi=300)
plt.show()

In [None]:
# Convert the pandas DataFrame into a cuDF DataFrame.
# cudf is imported here because no earlier visible cell imports it (only cupy
# is imported, as `cp`), so on a fresh kernel this line raised a NameError.
import cudf

gdf = cudf.from_pandas(df)

# Transfer SMOTE Code HERE
### Data Without Scaling

SMOTE balances classes

In [None]:
%%time

# Using SMOTE: takes 41 mins to run on CPU (author's measurement).
# NOTE(review): every statement after these two imports is commented out, so
# this cell currently only performs the imports. The disabled code refers to
# `seed`, `X`, `y`, `X_train`, `y_train`, `X_test` and `y_test`, none of which
# are defined in the cells visible here -- define them (e.g. via a train/test
# split) before re-enabling it.
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

# ros = RandomOverSampler(random_state=seed)
# X, y = ros.fit_resample(X, y)


# #execute smote
# smote = SMOTE(random_state=seed)
# x_train_pd, y_train_pd = smote.fit_resample(X_train.to_pandas(),
#                           y_train.to_pandas())


# #convert back to cudf
# X_train = cudf.DataFrame.from_pandas(pd.DataFrame(x_train_pd, columns=X_train.columns))
# y_train = cudf.Series(y_train_pd)


#save smote dataset as parquet
# x_train_pd.to_parquet(r"../../capstone_data/archive/SMOTE_train_test_split/X_train_smote.parquet", compression='snappy')
# y_train_df = pd.DataFrame(y_train_pd)
# y_train_df.to_parquet(r"../../capstone_data/archive/SMOTE_train_test_split/y_train_smote.parquet", compression='snappy')
# X_test.to_parquet(r"../../capstone_data/archive/SMOTE_train_test_split/X_test_smote.parquet", compression='snappy')
# y_test.to_parquet(r"../../capstone_data/archive/SMOTE_train_test_split/y_test_smote.parquet", compression='snappy')

### Data With Scaling