In [31]:
import pandas as pd
from joblib import load

# --- Input locations -------------------------------------------------------
# Judging by the file names these are the UNSW-NB15 test split and its
# feature-description table (NOTE(review): confirm against the data folder).
test_path = "./data_local/UNSW_NB15_testing-set.csv"
features_path = "./data_local/NUSW-NB15_features.csv"

# --- Load the tables -------------------------------------------------------
data = pd.read_csv(test_path)
# The feature table is read without a header row and decoded as latin-1.
features = pd.read_csv(
    features_path,
    header=None,
    encoding='latin-1',
)

# --- Load the persisted estimator ------------------------------------------
# NOTE(review): joblib.load unpickles arbitrary objects; only load model
# files that come from a trusted source.
model = load('./models/random_forest_model _dropped.joblib')

# Display (rows, columns) of the freshly loaded test frame.
data.shape

(175341, 45)

In [29]:


def remove_spaces_in_column_names(df):
    """Return a copy of ``df`` whose string column labels contain no spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be normalised. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same data. Every ``' '`` character (leading,
        trailing or interior) is removed from each ``str`` label. Labels that
        are not strings (for example the integer labels produced by
        ``read_csv(header=None)``) are passed through unchanged instead of
        raising ``AttributeError``.
    """
    def _strip_spaces(label):
        # Only str labels have .replace; leave every other label type as is.
        return label.replace(' ', '') if isinstance(label, str) else label

    return df.rename(columns=_strip_spaces)
# Normalise the test frame's column labels (strip every space) before use.
data = data.pipe(remove_spaces_in_column_names)


def remove_invalid_rows(df, column_name, desired_type):
    """Return the rows of ``df`` whose ``column_name`` value can be cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column_name : hashable
        Label of the column whose values are checked.
    desired_type : callable
        Called as ``desired_type(value)`` on each cell; a ``ValueError`` or
        ``TypeError`` marks the row as invalid. Any other exception
        propagates to the caller.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the valid rows, in their original order
        and with their original index labels.
    """
    def _can_cast(value):
        # The converted value itself is not needed, only whether it succeeds.
        try:
            desired_type(value)
        except (ValueError, TypeError):
            return False
        return True

    # Select by *position*: enumerate() yields positions, which only coincide
    # with index labels on a default RangeIndex. Passing them to drop() as
    # labels removes the wrong rows (or raises KeyError) once the frame has
    # been filtered or carries a custom index.
    valid_positions = [
        position
        for position, value in enumerate(df[column_name])
        if _can_cast(value)
    ]
    return df.iloc[valid_positions]




# Names of the features that are expected to hold integer values.
integer_features = [ 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
                    'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smean', 'dmean',
                    'trans_depth', 'response_body_len', 'ct_state_ttl', 'ct_flw_http_mthd',
                    'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
                    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm']

# Discard every row in which any cell equals a placeholder string:
# first '-' and then a single space.
data = data[~(data == '-').any(axis=1)]
data = data[~(data == ' ').any(axis=1)]
# dropna() returns a new frame rather than modifying in place, so the result
# must be re-bound; a bare `data.dropna()` leaves rows with missing values in.
data = data.dropna()


from sklearn.preprocessing import LabelEncoder

# Categorical (nominal) columns of `data` that are turned into integer codes.
nominal_features = [ 'proto', 'state', 'service']

# Encode on a copy so the cleaned `data` frame keeps its original string values.
data_encoded = data.copy()

# A single encoder instance is re-fit for each column in turn, so after the
# loop it only remembers the classes of the last column processed.
# NOTE(review): the encoder is fit on this frame; confirm that the resulting
# integer codes match the mapping the model was trained with.
label_encoder = LabelEncoder()
for feature in nominal_features:
    encoded_values = label_encoder.fit_transform(data_encoded[feature])
    data_encoded[feature] = encoded_values


import numpy as np

# Locate every cell of the encoded frame that is missing.
# Missing values are real NaN/None objects, not the text "NaN" (read_csv with
# default settings parses that text into NaN), so an equality test against the
# string "NaN" can never match them; isna() is the correct check.
rows, cols = np.where(data_encoded.isna().to_numpy())

# Print the positional row number and the column name of each missing cell.
for row, col in zip(rows, cols):
    print(f"Row: {row}, Column: {data_encoded.columns[col]}")


# Display (rows, columns) of the cleaned frame.
data.shape

(81173, 45)

In [27]:
# Feature columns, in the exact order in which they are handed to the model.
columns = [
    'id', 'proto', 'state', 'dur', 'sbytes', 'dbytes',
    'sttl', 'dttl', 'sloss', 'dloss', 'service',
    'sload', 'dload', 'Spkts', 'Dpkts',
    'swin', 'dwin', 'stcpb', 'dtcpb', 'smean', 'dmean',
    'trans_depth', 'response_body_len',
    'sjit', 'djit', 'sinpkt', 'dinpkt',
    'tcprtt', 'synack', 'ackdat',
    'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
    'is_ftp_login', 'ct_ftp_cmd',
    'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
]

# Build the model input in one pass:
#   1. drop the two target columns and `rate`,
#   2. rename the packet-count columns to the capitalised names used in `columns`,
#   3. put the columns in the order listed above.
# NOTE(review): reindex creates any name in `columns` that is absent from the
# frame as an all-NaN column instead of raising; verify no column is missing.
test_data = (
    data_encoded
    .drop(columns=["rate", "attack_cat", "label"])
    .rename(columns={"spkts": "Spkts", "dpkts": "Dpkts"})
    .reindex(columns=columns)
)

# Run the loaded model over the prepared feature frame.
prediction = model.predict(test_data)


In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Ground-truth labels, taken from the cleaned (un-encoded) frame; its rows
# are the same ones the predictions were computed for.
y_test_dropped = data['label']

# Score the predictions with each metric, in the order they are reported.
(
    random_forest_accuracy,
    random_forest_precision,
    random_forest_recall,
    random_forest_f1score,
) = (
    metric(y_test_dropped, prediction)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Emit the report: one line per metric, four decimals, then a blank line.
print(
    "Random Forest:",
    f"Accuracy: {random_forest_accuracy:.4f}",
    f"Precision: {random_forest_precision:.4f}",
    f"Recall: {random_forest_recall:.4f}",
    f"F1-Score: {random_forest_f1score:.4f}",
    sep="\n",
)
print()

Random Forest:
Accuracy: 0.9779
Precision: 0.9726
Recall: 0.9990
F1-Score: 0.9857

