## CSC-180 Project 2
##### By: Derek Chen, Nicolas Gueliemo, Xai Yang, Katrina Yu

### Imports & Provided Functions from Lab

In [29]:
from collections.abc import Sequence
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import random
from matplotlib.pyplot import figure, show
from sklearn.model_selection import train_test_split
import os
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import optimizers


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        print('CLASS')
        unique_values = df[target].nunique()
        if unique_values > 2:
            # If more than 2 classes, convert to binary (e.g., 0 for normal, 1 for attacks)
            df[target] = np.where(df[target] > 0, 1, 0)
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32).reshape(-1, 1)
    else:
        # Regression
        print('REG')
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

# Function to plot results
def plot(results):
    fig, ax = plt.subplots(figsize=(10, 6))

    for activation in results['Activation'].unique():
        subset = results[results['Activation'] == activation]
        ax.plot(subset['Neurons'].astype(str), subset['RMSE'], marker='o', label=activation)

    ax.set_title('Model Performance by Activation Function')
    ax.set_xlabel('Layer Configuration (Neurons)')
    ax.set_ylabel('RMSE')
    ax.legend(title='Activation Function')
    plt.xticks(rotation=45)
    plt.grid()
    plt.tight_layout()
    plt.show()

def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    plt.plot(t['pred'].tolist(), label='Prediction', color='orange')
    plt.plot(t['y'].tolist(), label='Expected', color='blue')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


### Step 1: Data Processing
#### By: Nicolas Gugliemo
#### Includes:
- Read from CSV Files
- Create Data frames
- Clean Data to drop dups

In [30]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

#Set paths and set option to print entire tables
path = "./data/"
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#Set paths for CSV
features_path = os.path.join(path,"NUSW-NB15_features.csv")
train_path = os.path.join(path,"UNSW_NB15_training-set.csv")
test_path = os.path.join(path,"UNSW_NB15_test-set.csv")

#Read the CSVs
features_df = pd.read_csv(features_path,encoding='ISO-8859-1')
train_df = pd.read_csv(train_path,encoding='ISO-8859-1')
test_df = pd.read_csv(test_path,encoding='ISO-8859-1')

#Fix ID's Name
train_df.rename(columns={'ï»¿id': 'id'}, inplace=True)
test_df.rename(columns={'ï»¿id': 'id'}, inplace=True)

#Replace '-' (missing value) with NA
train_df.replace('-', pd.NA, inplace=True)
test_df.replace('-', pd.NA, inplace=True)

#Drop missing values
features_df = features_df.dropna()
train_df = train_df.dropna()
test_df = test_df.dropna()

train_df['attack_cat'] = train_df['attack_cat'].apply(lambda x: 1 if x != 'Normal' else 0)
test_df['attack_cat'] = test_df['attack_cat'].apply(lambda x: 1 if x != 'Normal' else 0)

print('Train Values:')
print(train_df['attack_cat'].value_counts())
print('Test Values:')
print(test_df['attack_cat'].value_counts())


Train Values:
attack_cat
1    61685
0    19488
Name: count, dtype: int64
Test Values:
attack_cat
1    25554
0     9625
Name: count, dtype: int64


### Encode categorical features and normalize numeric features.


In [34]:
#DROP UNIQUE ROWS
CategoricalColumns = ['proto', 'service', 'state']

# One-hot encoding for the categorical columns using encode_text_dummy
for col in CategoricalColumns:
    encode_text_dummy(train_df, col)
    encode_text_dummy(test_df, col)

# Ensure training and test sets have the same columns after encoding
train_df, test_df = train_df.align(test_df, join='inner', axis=1)

# Send the expected numeric and categorical columns to normalization. Send target to text_index.
NumericColumns = ['dur','spkts','dpkts','sbytes','dbytes','rate','sttl','dttl','sload','dload','sloss','dloss','sinpkt','dinpkt','sjit','djit','swin','stcpb','dtcpb','tcprtt',
               'synack','ackdat','smean','dmean','trans_depth','response_body_len','ct_srv_src','ct_state_ttl','ct_dst_ltm','ct_dst_sport_ltm','ct_dst_src_ltm','is_ftp_login','ct_ftp_cmd',
               'ct_flw_http_mthd','ct_src_ltm','ct_srv_dst','is_sm_ips_ports']

scaler = StandardScaler()
train_df[NumericColumns] = scaler.fit_transform(train_df[NumericColumns])
test_df[NumericColumns] = scaler.transform(test_df[NumericColumns])

# Now convert the DataFrame to feature matrix (x) and target vector (y)
x, y = to_xy(train_df, "attack_cat")
x_test, y_test = to_xy(test_df, "attack_cat")

print(train_df['attack_cat'].value_counts())
print(test_df['attack_cat'].value_counts())

CLASS
CLASS
attack_cat
1    61681
0    19478
Name: count, dtype: int64
attack_cat
1    25554
0     9624
Name: count, dtype: int64


### Drop Unique Rows

In [35]:
print("Unique categories in Train (before conversion):", train_df['attack_cat'].unique())
print("Unique categories in Test (before conversion):", test_df['attack_cat'].unique())

train_categories = set(train_df['attack_cat'].unique())
test_categories = set(test_df['attack_cat'].unique())

unique_to_train = train_categories - test_categories
unique_to_test = test_categories - train_categories

print("Unique categories in Train:", unique_to_train)
print("Unique categories in Test:", unique_to_test)

# Drop unique categories in the train dataset
train_df = train_df[~train_df['attack_cat'].isin(unique_to_train)]

# Drop unique categories in the test dataset
test_df = test_df[~test_df['attack_cat'].isin(unique_to_test)]

print("Train categories:", train_df['attack_cat'].nunique())
print("Test categories:", test_df['attack_cat'].nunique())
print(train_df['attack_cat'].value_counts())
print(test_df['attack_cat'].value_counts())
print("Before fix",train_df['attack_cat'].unique())

#Fix to binary problem where 1 means safe and 0 means attack
train_df['attack_cat'] = train_df['attack_cat'].apply(lambda x: 1 if x != 0 else 0)
test_df['attack_cat'] = test_df['attack_cat'].apply(lambda x: 1 if x != 0 else 0)
print(train_df['attack_cat'].value_counts())
print(test_df['attack_cat'].value_counts())
print(train_df['attack_cat'].unique())

print("Updated Train categories:", train_df['attack_cat'].nunique())
print("Updated Test categories:", test_df['attack_cat'].nunique())
# Assuming you have a DataFrame called df and want to print unique values in a specific column


Unique categories in Train (before conversion): [0 1]
Unique categories in Test (before conversion): [0 1]
Unique categories in Train: set()
Unique categories in Test: set()
Train categories: 2
Test categories: 2
attack_cat
1    61681
0    19478
Name: count, dtype: int64
attack_cat
1    25554
0     9624
Name: count, dtype: int64
Before fix [0 1]
attack_cat
1    61681
0    19478
Name: count, dtype: int64
attack_cat
1    25554
0     9624
Name: count, dtype: int64
[0 1]
Updated Train categories: 2
Updated Test categories: 2


### Make First Model

In [36]:
#Figure out how to turn the output into 0 and 1, not 0-7
x,y = to_xy(train_df,"attack_cat")
x_test,y_test = to_xy(test_df,"attack_cat")

unique_train, counts_train = np.unique(y, return_counts=True)
print("Unique values in y_train_binary:", unique_train)
print("Counts in y_train_binary:", counts_train)

# Print unique values and their counts for y_test_binary
unique_test, counts_test = np.unique(y_test, return_counts=True)
print("\nUnique values in y_test_binary:", unique_test)
print("Counts in y_test_binary:", counts_test)
print("Train X shape:", x.shape)
print("Train Y shape:", y.shape)
print("Test X shape:", x_test.shape)
print("Test Y shape:", y_test.shape)
print(train_df['attack_cat'].value_counts())
print(test_df['attack_cat'].value_counts())

model = Sequential()
model.add(Dense(100, input_dim=x.shape[1], activation='relu')) # More neurons
model.add(Dropout(0.1))  # Dropout to reduce overfitting
model.add(Dense(50, activation='relu')) # Additional layer
model.add(Dropout(0.1))  # Dropout to reduce overfitting
model.add(Dense(25, activation='relu'))
model.add(Dropout(0.1))  # Dropout to reduce overfitting
model.add(Dense(1, activation='sigmoid')) # Output layer
model.save('dnn/model.keras')
adam = optimizers.Adam(learning_rate=0.001, beta_1=0.999, beta_2=0.999, epsilon=None, amsgrad=False)
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')
model.save('dnn/model.keras')
checkpointer = ModelCheckpoint(filepath="dnn/model.keras", verbose=0, save_best_only=True) # save best model
model.fit(x, y, validation_data=(x_test,y_test), callbacks=[monitor, checkpointer], verbose=2, epochs=100)
model.load_weights('dnn/model.keras') # load weights from best model
#model.fit(x,y,verbose=2,epochs=10)

# Make predictions on the test set
pred = model.predict(x_test)

predicted_classes = (pred > 0.5).astype(int).flatten()  # Ensure it's 1D
true_classes = y_test.flatten()  # Ensure it's 1D

# Evaluate the accuracy score
score = metrics.accuracy_score(true_classes, predicted_classes)
print("Accuracy score: {}".format(score))

# Print classification report
print(metrics.classification_report(true_classes, predicted_classes))
from sklearn.metrics import classification_report

# Make predictions on the test set
y_pred_probs = model.predict(x_test).flatten()
y_pred_classes = (y_pred_probs > 0.5).astype(int)

# Print classification report
print(classification_report(y_test, y_pred_classes, target_names=['Normal', 'Attack']))

CLASS
CLASS
Unique values in y_train_binary: [0. 1.]
Counts in y_train_binary: [19478 61681]

Unique values in y_test_binary: [0. 1.]
Counts in y_test_binary: [ 9624 25554]
Train X shape: (81159, 59)
Train Y shape: (81159, 1)
Test X shape: (35178, 59)
Test Y shape: (35178, 1)
attack_cat
1    61681
0    19478
Name: count, dtype: int64
attack_cat
1    25554
0     9624
Name: count, dtype: int64


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
2537/2537 - 13s - 5ms/step - accuracy: 0.7817 - loss: 24.4319 - val_accuracy: 0.5041 - val_loss: 0.8134
Epoch 2/100
2537/2537 - 5s - 2ms/step - accuracy: 0.8343 - loss: 0.4553 - val_accuracy: 0.3651 - val_loss: 0.9460
Epoch 3/100
2537/2537 - 6s - 2ms/step - accuracy: 0.8627 - loss: 0.3051 - val_accuracy: 0.3188 - val_loss: 1.0934
Epoch 4/100
2537/2537 - 5s - 2ms/step - accuracy: 0.8673 - loss: 0.3064 - val_accuracy: 0.3486 - val_loss: 2.5481
Epoch 5/100
2537/2537 - 12s - 5ms/step - accuracy: 0.8580 - loss: 0.3177 - val_accuracy: 0.4759 - val_loss: 3.3150
Epoch 6/100
2537/2537 - 9s - 4ms/step - accuracy: 0.8635 - loss: 0.3093 - val_accuracy: 0.4201 - val_loss: 5.5953
Epoch 6: early stopping
[1m1100/1100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Accuracy score: 0.5040934675081017
              precision    recall  f1-score   support

         0.0       0.32      0.75      0.45      9624
         1.0       0.81      0.41      0.55     25554

    accuracy 