## CSC-180 Project 2
##### By: Derek Chen, Nicolas Gueliemo, Xai Yang, Katrina Yu

### Imports & Provided Functions from Lab

In [16]:
from collections.abc import Sequence
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import random
from matplotlib.pyplot import figure, show
from sklearn.model_selection import train_test_split
import os
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import optimizers


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        print('CLASS')
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        print('REG')
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

# Function to plot results
def plot(results):
    fig, ax = plt.subplots(figsize=(10, 6))
    
    for activation in results['Activation'].unique():
        subset = results[results['Activation'] == activation]
        ax.plot(subset['Neurons'].astype(str), subset['RMSE'], marker='o', label=activation)

    ax.set_title('Model Performance by Activation Function')
    ax.set_xlabel('Layer Configuration (Neurons)')
    ax.set_ylabel('RMSE')
    ax.legend(title='Activation Function')
    plt.xticks(rotation=45)
    plt.grid()
    plt.tight_layout()
    plt.show()

def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    plt.plot(t['pred'].tolist(), label='Prediction', color='orange')
    plt.plot(t['y'].tolist(), label='Expected', color='blue')
    plt.ylabel('output')
    plt.legend()
    plt.show()
    
# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


### Step 1: Data Processing
#### By: Nicolas Gugliemo
#### Includes:
- Read from CSV Files
- Create Data frames
- Clean Data to drop dups

In [17]:
import os
import pandas as pd
import numpy as np

#Set paths and set option to print entire tables
path = "./data/"
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#Set paths for CSV
features_path = os.path.join(path,"NUSW-NB15_features.csv")
train_path = os.path.join(path,"UNSW_NB15_training-set.csv")
test_path = os.path.join(path,"UNSW_NB15_test-set.csv")

#Read the CSVs 
features_df = pd.read_csv(features_path,encoding='ISO-8859-1')
train_df = pd.read_csv(train_path,encoding='ISO-8859-1')
test_df = pd.read_csv(test_path,encoding='ISO-8859-1')

#Fix ID's Name
train_df.rename(columns={'ï»¿id': 'id'}, inplace=True)
test_df.rename(columns={'ï»¿id': 'id'}, inplace=True)

#Replace '-' (missing value) with NA
train_df.replace('-', pd.NA, inplace=True)
test_df.replace('-', pd.NA, inplace=True)
  
#Drop missing values
features_df = features_df.dropna()
train_df = train_df.dropna()
test_df = test_df.dropna()

##TO-DO, DROP UNIQUE ROWS
train_df[0:10]


Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,62,252,2740.179,3358.62207,1,3,152.876547,90.235726,259.080172,4991.784669,255,1107119177,1047442890,255,0.0,0.0,0.0,52,64,0,0,1,1,2,1,1,3,1,1,0,2,1,0,Normal,0
11,12,2.093085,tcp,smtp,FIN,62,28,56329,2212,42.520967,62,252,211825.1,8152.559082,28,8,34.312868,75.092445,3253.278833,106.113453,255,1824722662,860716719,255,0.13114,0.052852,0.078288,909,79,0,0,2,1,1,1,1,2,0,0,0,1,1,0,Normal,0
15,16,2e-06,udp,snmp,INT,2,0,138,0,500000.0013,254,0,276000000.0,0.0,0,0,0.002,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,69,0,0,0,1,2,1,1,1,4,0,0,0,2,1,0,Normal,0
17,18,0.393556,tcp,http,FIN,10,8,860,1096,43.195886,62,252,15733.47,19494.04883,2,2,43.728444,47.669145,2124.837873,71.805867,255,3882971404,3084071099,255,0.120313,0.059201,0.061112,86,137,1,103,3,1,1,1,1,2,0,0,1,1,3,0,Normal,0
21,22,0.338017,tcp,http,FIN,10,6,998,268,44.376468,254,252,21277.04,5301.508789,2,1,35.218111,56.579801,1928.55071,82.022258,255,2665974075,3521361798,255,0.154433,0.055109,0.099324,100,45,1,0,6,1,1,1,1,1,0,0,1,2,3,0,Normal,0
22,23,0.964656,tcp,ftp,CON,14,12,690,950,25.915974,62,252,5315.885,7223.300293,5,6,74.204308,59.700727,3869.615403,65.75518,255,996917599,59266788,255,0.112429,0.052589,0.05984,49,79,0,0,1,3,1,1,1,2,1,1,0,2,1,0,Normal,0
28,29,0.546425,tcp,http,FIN,10,8,1168,1040,31.111315,254,252,15401.93,13322.96289,2,2,60.713889,71.65757,3133.151417,109.403391,255,1648692252,4159782394,255,0.08489,0.043864,0.041026,117,130,1,109,6,1,1,1,1,2,0,0,1,1,8,0,Normal,0
30,31,0.725996,tcp,http,FIN,10,16,822,12220,34.435451,62,252,8154.315,126248.625,2,5,80.666222,43.607801,5370.273866,3485.812422,255,1602967781,379821754,255,0.105602,0.037093,0.068509,82,764,1,5532,6,1,1,1,1,2,0,0,1,2,1,0,Normal,0
31,32,0.274605,tcp,http,FIN,10,6,1228,268,54.623913,62,252,32220.83,6525.736816,2,1,27.216333,44.842199,1456.62599,63.563766,255,621256643,3708416778,255,0.100635,0.050384,0.050251,123,45,1,0,5,1,1,1,1,4,0,0,1,1,8,0,Normal,0
32,33,0.337456,tcp,http,FIN,10,8,800,2260,50.37694,62,252,17068.89,46892.03906,2,2,37.495111,41.27,1942.887532,76.181,255,4112395778,844999239,255,0.091221,0.029011,0.06221,80,283,1,704,3,1,1,1,1,1,0,0,1,1,8,0,Normal,0


### Encode categorical features and normalize numeric features.


In [18]:
# Test the expected categorical columns to one hot encoding 
pd.get_dummies(train_df, columns=['proto', 'service', 'state']).head()
pd.get_dummies(test_df, columns=['proto', 'service', 'state']).head()

# Send the expected numeric and categorical columns to normalization. Send target to text_index.  
NumericColumns = ['dur','spkts','dpkts','sbytes','dbytes','rate','sttl','dttl','sload','dload','sloss','dloss','sinpkt','dinpkt','sjit','djit','swin','stcpb','dtcpb','tcprtt',
               'synack','ackdat','smean','dmean','trans_depth','response_body_len','ct_srv_src','ct_state_ttl','ct_dst_ltm','ct_dst_sport_ltm','ct_dst_src_ltm','is_ftp_login','ct_ftp_cmd',
               'ct_flw_http_mthd','ct_src_ltm','ct_srv_dst','is_sm_ips_ports']
CategoricalColumns = ['proto', 'service', 'state']
for i in NumericColumns:
    encode_numeric_zscore(train_df,i)
    encode_numeric_zscore(test_df,i)
for i in CategoricalColumns:
    encode_text_dummy(train_df,i)
    encode_text_dummy(test_df,i)
encode_text_index(train_df,'attack_cat')
encode_text_index(test_df,'attack_cat')

train_df[0:10]


Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label,proto-tcp,proto-udp,service-dhcp,service-dns,service-ftp,service-ftp-data,service-http,service-irc,service-pop3,service-radius,service-smtp,service-snmp,service-ssh,service-ssl,state-CON,state-FIN,state-INT,state-REQ,state-RST
3,4,0.174677,-0.04803,-0.04458,-0.055932,-0.077782,-0.624123,-1.042527,1.554587,-0.610101,-0.216534,-0.061567,-0.055844,0.465346,0.190108,-0.125318,0.921565,1.185734,0.159327,0.116305,255,-0.548899,-0.511111,-0.542046,-0.358947,-0.230851,-0.197312,-0.05778,-0.931741,-0.340489,-0.811222,1,-0.890285,-0.790888,5.266767,5.266767,-0.278273,-0.855604,-0.927297,,6,0,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False
11,12,0.266079,0.216764,0.07668,0.169685,-0.069496,-0.623956,-1.042527,1.554587,-0.607521,-0.211838,0.228984,0.022395,-0.036952,0.131492,0.015802,-0.156252,1.185734,0.698403,-0.024568,255,1.387128,0.976391,1.689012,3.682398,-0.171035,-0.197312,-0.05778,-0.857352,-0.340489,-0.911291,1,-0.890285,-0.865254,-0.175733,-0.175733,-0.278273,-0.955804,-0.927297,,6,0,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False
15,16,-0.1989,-0.100988,-0.135525,-0.057916,-0.082207,2.287049,0.822646,-0.717582,2.795841,-0.219823,-0.072328,-0.102787,-0.182312,-0.15917,-0.137528,-0.179661,-0.843349,-0.67236,-0.67392,0,-0.548899,-0.511111,-0.542046,-0.278781,-0.486063,-0.197312,-0.05778,-0.931741,0.851546,-0.911291,1,-0.890285,-0.716523,-0.175733,-0.175733,-0.278273,-0.855604,-0.927297,,6,0,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False
17,18,-0.111472,-0.058621,-0.074895,-0.054992,-0.075909,-0.623952,-1.042527,1.554587,-0.609941,-0.20073,-0.050806,-0.071491,0.002937,0.025344,-0.037382,-0.16382,1.185734,2.244595,1.652802,255,1.227288,1.155081,1.199529,-0.198614,0.060251,0.689287,-0.056479,-0.782963,-0.340489,-0.911291,1,-0.890285,-0.865254,-0.175733,-0.175733,0.719265,-0.955804,-0.778749,,6,0,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
21,22,-0.12381,-0.058621,-0.090052,-0.054433,-0.080667,-0.623945,0.822646,1.554587,-0.609873,-0.214631,-0.050806,-0.087139,-0.033117,0.059835,-0.046634,-0.161566,1.185734,1.330366,1.982709,255,1.731003,1.039913,2.288498,-0.132594,-0.306617,0.689287,-0.05778,-0.559796,-0.340489,-0.911291,1,-0.890285,-0.93962,-0.175733,-0.175733,0.719265,-0.855604,-0.778749,,6,0,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
22,23,0.015398,-0.037438,-0.04458,-0.05568,-0.076748,-0.624052,-1.042527,1.554587,-0.61007,-0.212748,-0.018522,-0.008901,0.132049,0.071915,0.044851,-0.165155,1.185734,0.076542,-0.629207,255,1.110896,0.968989,1.16328,-0.373094,-0.171035,-0.197312,-0.05778,-0.931741,2.043582,-0.911291,1,-0.890285,-0.865254,5.266767,5.266767,-0.278273,-0.855604,-0.927297,,6,0,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False
28,29,-0.077512,-0.058621,-0.074895,-0.053744,-0.076231,-0.624022,0.822646,1.554587,-0.609945,-0.206774,-0.050806,-0.071491,0.074896,0.118197,0.01014,-0.155526,1.185734,0.566166,2.464354,255,0.704337,0.723427,0.627117,-0.052428,0.032337,0.689287,-0.056403,-0.559796,-0.340489,-0.911291,1,-0.890285,-0.865254,-0.175733,-0.175733,0.719265,-0.955804,-0.407378,,6,0,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
30,31,-0.03762,-0.058621,-0.014265,-0.055146,-0.011986,-0.624003,-1.042527,1.554587,-0.610034,-0.096168,-0.050806,-0.024548,0.159425,0.009624,0.115578,0.589336,1.185734,0.531817,-0.38737,255,1.010109,0.532859,1.41033,-0.217477,2.560532,0.689287,0.012099,-0.559796,-0.340489,-0.911291,1,-0.890285,-0.865254,-0.175733,-0.175733,0.719265,-0.855604,-0.927297,,6,0,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
31,32,-0.137897,-0.058621,-0.090052,-0.053501,-0.080667,-0.623885,-1.042527,1.554587,-0.609737,-0.213431,-0.050806,-0.087139,-0.067017,0.014402,-0.068876,-0.165638,1.185734,-0.205661,2.123829,255,0.936781,0.90693,0.890011,-0.024133,-0.306617,0.689287,-0.05778,-0.634185,-0.340489,-0.911291,1,-0.890285,-0.716523,-0.175733,-0.175733,0.719265,-0.955804,-0.407378,,6,0,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
32,33,-0.123935,-0.058621,-0.074895,-0.055235,-0.06922,-0.62391,-1.042527,1.554587,-0.609924,-0.173894,-0.050806,-0.071491,-0.023471,0.000575,-0.045958,-0.162855,1.185734,2.416942,-0.036425,255,0.797801,0.305394,1.23082,-0.226908,0.642453,0.689287,-0.048887,-0.782963,-0.340489,-0.911291,1,-0.890285,-0.93962,-0.175733,-0.175733,0.719265,-0.955804,-0.407378,,6,0,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False


### Create First Model

In [19]:
train_categories = set(train_df['attack_cat'].unique())
test_categories = set(test_df['attack_cat'].unique())

unique_to_train = train_categories - test_categories
unique_to_test = test_categories - train_categories

print("Unique categories in Train:", unique_to_train)
print("Unique categories in Test:", unique_to_test)

# Drop unique categories in the train dataset
train_df = train_df[~train_df['attack_cat'].isin(unique_to_train)]

# Drop unique categories in the test dataset
test_df = test_df[~test_df['attack_cat'].isin(unique_to_test)]

print("Updated Train categories:", train_df['attack_cat'].nunique())
print("Updated Test categories:", test_df['attack_cat'].nunique())


Unique categories in Train: {8}
Unique categories in Test: set()
Updated Train categories: 8
Updated Test categories: 8


In [31]:
x,y = to_xy(train_df,"attack_cat")
x_test,y_test = to_xy(test_df,"attack_cat")

print("Train X shape:", x.shape)
print("Train Y shape:", y.shape)
print("Test X shape:", x_test.shape)
print("Test Y shape:", y_test.shape)


model = Sequential()
model.add(Dense(100, input_dim=x.shape[1], activation='relu')) # More neurons
model.add(Dense(50, activation='relu')) # Additional layer
model.add(Dense(25, activation='relu'))
model.add(Dense(y.shape[1], activation='softmax')) # Output layer

adam = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(loss='categorical_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')  
checkpointer = ModelCheckpoint(filepath="dnn/best_weights.keras", verbose=0, save_best_only=True) # save best model
model.fit(x, y, validation_data=(x_test,y_test), callbacks=[monitor], verbose=2, epochs=1000) 
model.load_weights('dnn/best_weights.keras') # load weights from best model 
#model.fit(x,y,verbose=2,epochs=10)

# Make predictions on the test set
pred = model.predict(x_test)

# Convert predicted and true labels to class indices
predicted_classes = np.argmax(pred, axis=1)
true_classes = np.argmax(y_test, axis=1)

# Evaluate the accuracy score
score = metrics.accuracy_score(true_classes, predicted_classes)
print("Accuracy score: {}".format(score))

# Print classification report
print(metrics.classification_report(true_classes, predicted_classes))



CLASS
CLASS
Train X shape: (81059, 60)
Train Y shape: (81059, 8)
Test X shape: (35179, 60)
Test Y shape: (35179, 8)
Epoch 1/1000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


2534/2534 - 4s - 1ms/step - loss: 1.5163 - val_loss: 2.7555
Epoch 2/1000
2534/2534 - 2s - 970us/step - loss: 1.3159 - val_loss: 2.9595
Epoch 3/1000
2534/2534 - 2s - 951us/step - loss: 1.3069 - val_loss: 2.9616
Epoch 4/1000
2534/2534 - 2s - 947us/step - loss: 1.3054 - val_loss: 2.9761
Epoch 5/1000
2534/2534 - 2s - 931us/step - loss: 1.3052 - val_loss: 2.9734
Epoch 6/1000
2534/2534 - 2s - 957us/step - loss: 1.3051 - val_loss: 2.9840
Epoch 6: early stopping


BadZipFile: File is not a zip file