## CSC-180 Project 2
##### By: Derek Chen, Nicolas Gueliemo, Xai Yang, Katrina Yu

### Imports & Provided Functions from Lab

In [281]:
from collections.abc import Sequence
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import random
from matplotlib.pyplot import figure, show
from sklearn.model_selection import train_test_split
import os
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import optimizers


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        print('CLASS')
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        print('REG')
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

# Function to plot results
def plot(results):
    fig, ax = plt.subplots(figsize=(10, 6))
    
    for activation in results['Activation'].unique():
        subset = results[results['Activation'] == activation]
        ax.plot(subset['Neurons'].astype(str), subset['RMSE'], marker='o', label=activation)

    ax.set_title('Model Performance by Activation Function')
    ax.set_xlabel('Layer Configuration (Neurons)')
    ax.set_ylabel('RMSE')
    ax.legend(title='Activation Function')
    plt.xticks(rotation=45)
    plt.grid()
    plt.tight_layout()
    plt.show()

def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    plt.plot(t['pred'].tolist(), label='Prediction', color='orange')
    plt.plot(t['y'].tolist(), label='Expected', color='blue')
    plt.ylabel('output')
    plt.legend()
    plt.show()
    
# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


### Step 1: Data Processing
#### By: Nicolas Gugliemo
#### Includes:
- Read from CSV Files
- Create Data frames
- Clean Data to drop dups

In [282]:
import os
import pandas as pd
import numpy as np

#Set paths and set option to print entire tables
path = "./data/"
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#Set paths for CSV
features_path = os.path.join(path,"NUSW-NB15_features.csv")
train_path = os.path.join(path,"UNSW_NB15_training-set.csv")
test_path = os.path.join(path,"UNSW_NB15_test-set.csv")

#Read the CSVs 
features_df = pd.read_csv(features_path,encoding='ISO-8859-1')
train_df = pd.read_csv(train_path,encoding='ISO-8859-1')
test_df = pd.read_csv(test_path,encoding='ISO-8859-1')

#Fix ID's Name
train_df.rename(columns={'ï»¿id': 'id'}, inplace=True)
test_df.rename(columns={'ï»¿id': 'id'}, inplace=True)

#Replace '-' (missing value) with NA
train_df.replace('-', pd.NA, inplace=True)
test_df.replace('-', pd.NA, inplace=True)
  
#Drop missing values
features_df = features_df.dropna()
train_df = train_df.dropna()
test_df = test_df.dropna()

train_df['attack_cat'].replace('Generic', 'Normal', inplace=True)
test_df['attack_cat'].replace('Generic', 'Normal', inplace=True)
##TO-DO, DROP UNIQUE ROWS
train_df[0:10]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['attack_cat'].replace('Generic', 'Normal', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['attack_cat'].replace('Generic', 'Normal', inplace=True)


Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,62,252,2740.179,3358.62207,1,3,152.876547,90.235726,259.080172,4991.784669,255,1107119177,1047442890,255,0.0,0.0,0.0,52,64,0,0,1,1,2,1,1,3,1,1,0,2,1,0,Normal,0
11,12,2.093085,tcp,smtp,FIN,62,28,56329,2212,42.520967,62,252,211825.1,8152.559082,28,8,34.312868,75.092445,3253.278833,106.113453,255,1824722662,860716719,255,0.13114,0.052852,0.078288,909,79,0,0,2,1,1,1,1,2,0,0,0,1,1,0,Normal,0
15,16,2e-06,udp,snmp,INT,2,0,138,0,500000.0013,254,0,276000000.0,0.0,0,0,0.002,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,69,0,0,0,1,2,1,1,1,4,0,0,0,2,1,0,Normal,0
17,18,0.393556,tcp,http,FIN,10,8,860,1096,43.195886,62,252,15733.47,19494.04883,2,2,43.728444,47.669145,2124.837873,71.805867,255,3882971404,3084071099,255,0.120313,0.059201,0.061112,86,137,1,103,3,1,1,1,1,2,0,0,1,1,3,0,Normal,0
21,22,0.338017,tcp,http,FIN,10,6,998,268,44.376468,254,252,21277.04,5301.508789,2,1,35.218111,56.579801,1928.55071,82.022258,255,2665974075,3521361798,255,0.154433,0.055109,0.099324,100,45,1,0,6,1,1,1,1,1,0,0,1,2,3,0,Normal,0
22,23,0.964656,tcp,ftp,CON,14,12,690,950,25.915974,62,252,5315.885,7223.300293,5,6,74.204308,59.700727,3869.615403,65.75518,255,996917599,59266788,255,0.112429,0.052589,0.05984,49,79,0,0,1,3,1,1,1,2,1,1,0,2,1,0,Normal,0
28,29,0.546425,tcp,http,FIN,10,8,1168,1040,31.111315,254,252,15401.93,13322.96289,2,2,60.713889,71.65757,3133.151417,109.403391,255,1648692252,4159782394,255,0.08489,0.043864,0.041026,117,130,1,109,6,1,1,1,1,2,0,0,1,1,8,0,Normal,0
30,31,0.725996,tcp,http,FIN,10,16,822,12220,34.435451,62,252,8154.315,126248.625,2,5,80.666222,43.607801,5370.273866,3485.812422,255,1602967781,379821754,255,0.105602,0.037093,0.068509,82,764,1,5532,6,1,1,1,1,2,0,0,1,2,1,0,Normal,0
31,32,0.274605,tcp,http,FIN,10,6,1228,268,54.623913,62,252,32220.83,6525.736816,2,1,27.216333,44.842199,1456.62599,63.563766,255,621256643,3708416778,255,0.100635,0.050384,0.050251,123,45,1,0,5,1,1,1,1,4,0,0,1,1,8,0,Normal,0
32,33,0.337456,tcp,http,FIN,10,8,800,2260,50.37694,62,252,17068.89,46892.03906,2,2,37.495111,41.27,1942.887532,76.181,255,4112395778,844999239,255,0.091221,0.029011,0.06221,80,283,1,704,3,1,1,1,1,1,0,0,1,1,8,0,Normal,0


### Encode categorical features and normalize numeric features.


In [283]:
# Test the expected categorical columns to one hot encoding 
pd.get_dummies(train_df, columns=['proto', 'service', 'state']).head()
pd.get_dummies(test_df, columns=['proto', 'service', 'state']).head()
# Send the expected numeric and categorical columns to normalization. Send target to text_index.  
NumericColumns = ['dur','spkts','dpkts','sbytes','dbytes','rate','sttl','dttl','sload','dload','sloss','dloss','sinpkt','dinpkt','sjit','djit','swin','stcpb','dtcpb','tcprtt',
               'synack','ackdat','smean','dmean','trans_depth','response_body_len','ct_srv_src','ct_state_ttl','ct_dst_ltm','ct_dst_sport_ltm','ct_dst_src_ltm','is_ftp_login','ct_ftp_cmd',
               'ct_flw_http_mthd','ct_src_ltm','ct_srv_dst','is_sm_ips_ports']
CategoricalColumns = ['proto', 'service', 'state']
for i in NumericColumns:
    encode_numeric_zscore(train_df,i)
    encode_numeric_zscore(test_df,i)
for i in CategoricalColumns:
    encode_text_dummy(train_df,i)
    encode_text_dummy(test_df,i)
encode_text_index(train_df,'attack_cat')
encode_text_index(test_df,'attack_cat') 


array(['Backdoor', 'DoS', 'Exploits', 'Fuzzers', 'Normal',
       'Reconnaissance', 'Worms'], dtype=object)

### Drop Unique Rows

In [284]:
train_categories = set(train_df['attack_cat'].unique())
test_categories = set(test_df['attack_cat'].unique())

unique_to_train = train_categories - test_categories
unique_to_test = test_categories - train_categories

print("Unique categories in Train:", unique_to_train)
print("Unique categories in Test:", unique_to_test)

# Drop unique categories in the train dataset
train_df = train_df[~train_df['attack_cat'].isin(unique_to_train)]

# Drop unique categories in the test dataset
test_df = test_df[~test_df['attack_cat'].isin(unique_to_test)]

print("Train categories:", train_df['attack_cat'].nunique())
print("Test categories:", test_df['attack_cat'].nunique())

#Fix to binary problem where 1 means safe and 0 means attack
train_df['attack_cat'] = train_df['attack_cat'].apply(lambda x: 1 if x == 5 else 0)
test_df['attack_cat'] = test_df['attack_cat'].apply(lambda x: 1 if x == 4 else 0)


print("Updated Train categories:", train_df['attack_cat'].nunique())
print("Updated Test categories:", test_df['attack_cat'].nunique())


Unique categories in Train: {7}
Unique categories in Test: set()
Train categories: 7
Test categories: 7
Updated Train categories: 2
Updated Test categories: 2


### Make First Model

In [285]:
#Figure out how to turn the output into 0 and 1, not 0-7
x,y = to_xy(train_df,"attack_cat")
x_test,y_test = to_xy(test_df,"attack_cat")

y = np.argmax(y, axis=1)
y_test = np.argmax(y_test, axis=1)

print("Train X shape:", x.shape)
print("Train Y shape:", y.shape)
print("Test X shape:", x_test.shape)
print("Test Y shape:", y_test.shape)

model = Sequential()
model.add(Dense(100, input_dim=x.shape[1], activation='relu')) # More neurons
model.add(Dropout(0.1))  # Dropout to reduce overfitting
model.add(Dense(50, activation='relu')) # Additional layer
model.add(Dropout(0.1))  # Dropout to reduce overfitting
model.add(Dense(25, activation='relu'))
model.add(Dropout(0.1))  # Dropout to reduce overfitting
model.add(Dense(1, activation='sigmoid')) # Output layer
adam = optimizers.Adam(learning_rate=0.001, beta_1=0.999, beta_2=0.999, epsilon=None, amsgrad=False)
model.compile(loss='binary_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=3, verbose=2, mode='auto')  
checkpointer = ModelCheckpoint(filepath="dnn/model.keras", verbose=0, save_best_only=True) # save best model
model.fit(x, y, validation_data=(x_test,y_test), callbacks=[monitor], verbose=2, epochs=1000) 
model.load_weights('dnn/model.keras') # load weights from best model 

# Make predictions on the test set
pred = model.predict(x_test)

predicted_classes = (pred > 0.5).astype(int).flatten()  # Ensure it's 1D
true_classes = y_test.flatten()  # Ensure it's 1D

# Evaluate the accuracy score
score = metrics.accuracy_score(true_classes, predicted_classes)
print("Accuracy score: {}".format(score))

# Print classification report
print(metrics.classification_report(true_classes, predicted_classes))


CLASS
CLASS
Train X shape: (81059, 60)
Train Y shape: (81059,)
Test X shape: (35179, 60)
Test Y shape: (35179,)
Epoch 1/1000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


2534/2534 - 5s - 2ms/step - loss: 0.6036 - val_loss: 0.5175
Epoch 2/1000
2534/2534 - 3s - 1ms/step - loss: 0.5856 - val_loss: 0.5165
Epoch 3/1000
2534/2534 - 3s - 1ms/step - loss: 0.5856 - val_loss: 0.5157
Epoch 4/1000
2534/2534 - 3s - 1ms/step - loss: 0.5856 - val_loss: 0.5167
Epoch 5/1000
2534/2534 - 5s - 2ms/step - loss: 0.5856 - val_loss: 0.5161
Epoch 6/1000
2534/2534 - 4s - 1ms/step - loss: 0.5856 - val_loss: 0.5159
Epoch 6: early stopping
[1m 127/1100[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 801us/step

  saveable.load_own_variables(weights_store.get(inner_path))


[1m1100/1100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 796us/step
Accuracy score: 0.20165439608857558
              precision    recall  f1-score   support

           0       0.20      1.00      0.34      7094
           1       0.00      0.00      0.00     28085

    accuracy                           0.20     35179
   macro avg       0.10      0.50      0.17     35179
weighted avg       0.04      0.20      0.07     35179



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
