## CSC-180 Project 2
##### By: Derek Chen, Nicolas Gueliemo, Xai Yang, Katrina Yu

### Imports & Provided Functions from Lab

In [83]:
from collections.abc import Sequence
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import random
from matplotlib.pyplot import figure, show
from sklearn.model_selection import train_test_split
import os
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import optimizers


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        print('CLASS')
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        print('REG')
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

# Function to plot results
def plot(results):
    fig, ax = plt.subplots(figsize=(10, 6))
    
    for activation in results['Activation'].unique():
        subset = results[results['Activation'] == activation]
        ax.plot(subset['Neurons'].astype(str), subset['RMSE'], marker='o', label=activation)

    ax.set_title('Model Performance by Activation Function')
    ax.set_xlabel('Layer Configuration (Neurons)')
    ax.set_ylabel('RMSE')
    ax.legend(title='Activation Function')
    plt.xticks(rotation=45)
    plt.grid()
    plt.tight_layout()
    plt.show()

def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    plt.plot(t['pred'].tolist(), label='Prediction', color='orange')
    plt.plot(t['y'].tolist(), label='Expected', color='blue')
    plt.ylabel('output')
    plt.legend()
    plt.show()
    
# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


### Step 1: Data Processing
#### By: Nicolas Gugliemo
#### Includes:
- Read from CSV Files
- Create Data frames
- Clean Data to drop dups

In [84]:
import os
import pandas as pd
import numpy as np

#Set paths and set option to print entire tables
path = "./data/"
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#Set paths for CSV
features_path = os.path.join(path,"NUSW-NB15_features.csv")
train_path = os.path.join(path,"UNSW_NB15_training-set.csv")
test_path = os.path.join(path,"UNSW_NB15_test-set.csv")

#Read the CSVs 
features_df = pd.read_csv(features_path,encoding='ISO-8859-1')
train_df = pd.read_csv(train_path,encoding='ISO-8859-1')
test_df = pd.read_csv(test_path,encoding='ISO-8859-1')

#Fix ID's Name
train_df.rename(columns={'ï»¿id': 'id'}, inplace=True)
test_df.rename(columns={'ï»¿id': 'id'}, inplace=True)

#Replace '-' (missing value) with NA
train_df.replace('-', pd.NA, inplace=True)
test_df.replace('-', pd.NA, inplace=True)
  
#Drop missing values
features_df = features_df.dropna()
train_df = train_df.dropna()
test_df = test_df.dropna()

train_df['attack_cat'].replace('Generic', 'Normal', inplace=True)
test_df['attack_cat'].replace('Generic', 'Normal', inplace=True)
##TO-DO, DROP UNIQUE ROWS
train_df[0:10]

print(train_df['attack_cat'].value_counts())


attack_cat
Normal            58984
Exploits          16187
DoS                1791
Fuzzers            1731
Reconnaissance     1703
Analysis            564
Worms               114
Backdoor             99
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['attack_cat'].replace('Generic', 'Normal', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['attack_cat'].replace('Generic', 'Normal', inplace=True)


### Encode categorical features and normalize numeric features.


In [85]:
# Test the expected categorical columns to one hot encoding 
pd.get_dummies(train_df, columns=['proto', 'service', 'state']).head()
pd.get_dummies(test_df, columns=['proto', 'service', 'state']).head()

# Send the expected numeric and categorical columns to normalization. Send target to text_index.  
NumericColumns = ['dur','spkts','dpkts','sbytes','dbytes','rate','sttl','dttl','sload','dload','sloss','dloss','sinpkt','dinpkt','sjit','djit','swin','stcpb','dtcpb','tcprtt',
               'synack','ackdat','smean','dmean','trans_depth','response_body_len','ct_srv_src','ct_state_ttl','ct_dst_ltm','ct_dst_sport_ltm','ct_dst_src_ltm','is_ftp_login','ct_ftp_cmd',
               'ct_flw_http_mthd','ct_src_ltm','ct_srv_dst','is_sm_ips_ports']
CategoricalColumns = ['proto', 'service', 'state']
for i in NumericColumns:
    encode_numeric_zscore(train_df,i)
    encode_numeric_zscore(test_df,i)
for i in CategoricalColumns:
    encode_text_dummy(train_df,i)
    encode_text_dummy(test_df,i)
encode_text_index(train_df,'attack_cat')
encode_text_index(test_df,'attack_cat') #
print(train_df['attack_cat'].value_counts())
print(test_df['attack_cat'].value_counts())

attack_cat
5    58984
3    16187
2     1791
4     1731
6     1703
0      564
7      114
1       99
Name: count, dtype: int64
attack_cat
4    28085
2     5293
1      717
3      535
5      504
6       34
0       11
Name: count, dtype: int64


### Drop Unique Rows

In [86]:
train_categories = set(train_df['attack_cat'].unique())
test_categories = set(test_df['attack_cat'].unique())

unique_to_train = train_categories - test_categories
unique_to_test = test_categories - train_categories

print("Unique categories in Train:", unique_to_train)
print("Unique categories in Test:", unique_to_test)

# Drop unique categories in the train dataset
train_df = train_df[~train_df['attack_cat'].isin(unique_to_train)]

# Drop unique categories in the test dataset
test_df = test_df[~test_df['attack_cat'].isin(unique_to_test)]

print("Train categories:", train_df['attack_cat'].nunique())
print("Test categories:", test_df['attack_cat'].nunique())
print(train_df['attack_cat'].value_counts())
print(test_df['attack_cat'].value_counts())
print("Before fix",train_df['attack_cat'].unique())

#Fix to binary problem where 1 means safe and 0 means attack
train_df['attack_cat'] = train_df['attack_cat'].apply(lambda x: 1 if x == 5 else 0)
test_df['attack_cat'] = test_df['attack_cat'].apply(lambda x: 1 if x == 4 else 0)
print(train_df['attack_cat'].value_counts())
print(test_df['attack_cat'].value_counts())
print(train_df['attack_cat'].unique())

print("Updated Train categories:", train_df['attack_cat'].nunique())
print("Updated Test categories:", test_df['attack_cat'].nunique())
# Assuming you have a DataFrame called df and want to print unique values in a specific column


Unique categories in Train: {7}
Unique categories in Test: set()
Train categories: 7
Test categories: 7
attack_cat
5    58984
3    16187
2     1791
4     1731
6     1703
0      564
1       99
Name: count, dtype: int64
attack_cat
4    28085
2     5293
1      717
3      535
5      504
6       34
0       11
Name: count, dtype: int64
Before fix [5 1 4 6 3 0 2]
attack_cat
1    58984
0    22075
Name: count, dtype: int64
attack_cat
1    28085
0     7094
Name: count, dtype: int64
[1 0]
Updated Train categories: 2
Updated Test categories: 2


### Make First Model

In [87]:
#Figure out how to turn the output into 0 and 1, not 0-7
x,y = to_xy(train_df,"attack_cat")
x_test,y_test = to_xy(test_df,"attack_cat")

y = np.argmax(y, axis=1)
y_test = np.argmax(y_test, axis=1)
unique_train, counts_train = np.unique(y, return_counts=True)
print("Unique values in y_train_binary:", unique_train)
print("Counts in y_train_binary:", counts_train)

# Print unique values and their counts for y_test_binary
unique_test, counts_test = np.unique(y_test, return_counts=True)
print("\nUnique values in y_test_binary:", unique_test)
print("Counts in y_test_binary:", counts_test)
print("Train X shape:", x.shape)
print("Train Y shape:", y.shape)
print("Test X shape:", x_test.shape)
print("Test Y shape:", y_test.shape)
print(train_df['attack_cat'].value_counts())
print(test_df['attack_cat'].value_counts())

model = Sequential()
model.add(Dense(100, input_dim=x.shape[1], activation='relu')) # More neurons
model.add(Dropout(0.1))  # Dropout to reduce overfitting
model.add(Dense(50, activation='relu')) # Additional layer
model.add(Dropout(0.1))  # Dropout to reduce overfitting
model.add(Dense(25, activation='relu'))
model.add(Dropout(0.1))  # Dropout to reduce overfitting
model.add(Dense(1, activation='sigmoid')) # Output layer

adam = optimizers.Adam(learning_rate=0.001, beta_1=0.999, beta_2=0.999, epsilon=None, amsgrad=False)
model.compile(loss='binary_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')  
model.save('dnn/model.keras')
checkpointer = ModelCheckpoint(filepath="dnn/model.keras", verbose=0, save_best_only=True) # save best model
model.fit(x, y, validation_data=(x_test,y_test), callbacks=[monitor], verbose=2, epochs=1000) 
model.load_weights('dnn/model.keras') # load weights from best model 
#model.fit(x,y,verbose=2,epochs=10)

# Make predictions on the test set
pred = model.predict(x_test)

predicted_classes = (pred > 0.5).astype(int).flatten()  # Ensure it's 1D
true_classes = y_test.flatten()  # Ensure it's 1D

# Evaluate the accuracy score
score = metrics.accuracy_score(true_classes, predicted_classes)
print("Accuracy score: {}".format(score))

# Print classification report
print(metrics.classification_report(true_classes, predicted_classes))
from sklearn.metrics import classification_report

# Make predictions on the test set
y_pred_probs = model.predict(x_test).flatten()
y_pred_classes = (y_pred_probs > 0.5).astype(int)

# Print classification report
print(classification_report(y_test, y_pred_classes, target_names=['Normal', 'Attack']))

CLASS
CLASS
Unique values in y_train_binary: [0 1]
Counts in y_train_binary: [22075 58984]

Unique values in y_test_binary: [0 1]
Counts in y_test_binary: [ 7094 28085]
Train X shape: (81059, 60)
Train Y shape: (81059,)
Test X shape: (35179, 60)
Test Y shape: (35179,)
attack_cat
1    58984
0    22075
Name: count, dtype: int64
attack_cat
1    28085
0     7094
Name: count, dtype: int64
Epoch 1/1000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


2534/2534 - 4s - 2ms/step - loss: 0.6046 - val_loss: 0.5179
Epoch 2/1000
2534/2534 - 3s - 1ms/step - loss: 0.5856 - val_loss: 0.5159
Epoch 3/1000
2534/2534 - 3s - 1ms/step - loss: 0.5856 - val_loss: 0.5167
Epoch 4/1000
2534/2534 - 3s - 1ms/step - loss: 0.5856 - val_loss: 0.5167
Epoch 5/1000
2534/2534 - 3s - 1ms/step - loss: 0.5856 - val_loss: 0.5168
Epoch 6/1000
2534/2534 - 4s - 1ms/step - loss: 0.5856 - val_loss: 0.5162
Epoch 7/1000
2534/2534 - 3s - 1ms/step - loss: 0.5856 - val_loss: 0.5170
Epoch 7: early stopping
[1m  74/1100[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 685us/step

  saveable.load_own_variables(weights_store.get(inner_path))


[1m1100/1100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 641us/step
Accuracy score: 0.20165439608857558
              precision    recall  f1-score   support

           0       0.20      1.00      0.34      7094
           1       0.00      0.00      0.00     28085

    accuracy                           0.20     35179
   macro avg       0.10      0.50      0.17     35179
weighted avg       0.04      0.20      0.07     35179

[1m 150/1100[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 682us/step

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m1100/1100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 571us/step
              precision    recall  f1-score   support

      Normal       0.20      1.00      0.34      7094
      Attack       0.00      0.00      0.00     28085

    accuracy                           0.20     35179
   macro avg       0.10      0.50      0.17     35179
weighted avg       0.04      0.20      0.07     35179



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
