<a href="https://colab.research.google.com/github/jtunde/Python/blob/main/NSLKDDAllClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

def proc_categ(values):
    """Summarise the distinct values of a Series as one line of text.

    Each distinct value contributes an entry of the form
    ``"<value> <count> <percent>%, "``; entries are ordered by descending
    count (the order produced by ``Series.value_counts``).

    Args:
        values: pandas Series whose distinct values are to be counted.
            Values of any type are accepted; they are rendered with ``str``.

    Returns:
        The concatenated entries, or an empty string for an empty Series.
        Percentages are relative to ``len(values)`` and rounded to two
        decimal places.
    """
    categ = values.value_counts()
    count = float(len(values))
    # Scale to a percentage *before* rounding: rounding the fraction first and
    # multiplying by 100 afterwards re-introduces binary float noise
    # (e.g. 53.459999999999994 instead of 53.46).
    return "".join(
        f"{label} {n} {round(100 * float(n) / count, 2)}%, "
        for label, n in categ.items()
    )

def example_plot(normal_values, attack_values):
    """Draw a 3x2 grid comparing three columns of normal and attack traffic.

    Each grid row shows one column, looked up by its integer label: the left
    panel plots it from ``normal_values`` and the right panel plots it from
    ``attack_values``. The figure is displayed with ``plt.show()``.

    Args:
        normal_values: Frame of normal-traffic rows, indexable by 37, 24, 4.
        attack_values: Frame of attack-traffic rows, indexable by 37, 24, 4.
    """
    # One entry per grid row: (column label, left title, right title)
    panels = (
        # dst_host_serror_rate
        (37,
         'Normal: % dest. connections with SYN errors',
         'Attack: % dest. connections with SYN errors'),
        # serror_rate
        (24,
         'Normal: % connections with SYN errors',
         'Attack: % connections with SYN errors'),
        # src_bytes
        (4,
         'Normal: src bytes',
         'Attack: src bytes'),
    )

    _, grid = plt.subplots(len(panels), 2)

    for row, (column, normal_title, attack_title) in enumerate(panels):
        left, right = grid[row, 0], grid[row, 1]
        left.plot(normal_values[column])
        left.set_title(normal_title)
        right.plot(attack_values[column])
        right.set_title(attack_title)

    plt.show()

# Column names, in file order; a column's position in this list is the
# integer label pandas gives it when the file is read without a header.
# (The variable name is kept as-is because other cells refer to it.)
nls_columns = [
    # 0 - 9
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
    'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
    # 10 - 19
    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds',
    # 20 - 29
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate',
    # 30 - 39
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    # 40 - 41
    'dst_host_srv_rerror_rate', 'class',
]

# Position of the class label column (41), derived from the list above so the
# two cannot drift apart.
CLASSIFIER_COLUMN = nls_columns.index('class')


In [None]:
# Load the training sample. header=None tells pandas the file has no header
# row, so columns get integer labels (0, 1, 2, ...); the cells below address
# columns by those numbers.
df = pd.read_csv('sample_data/KDDTrain+_20Percent.txt', header=None)

In [None]:
# Show the first ten rows, then the value distribution of column 41
# (attack type) and column 1 (protocol).
print(df.iloc[:10])
print("Attack type statistics:{}".format(proc_categ(df[41])))
print("Protocol statistics:{}".format(proc_categ(df[1])))

In [None]:
# Separate the rows labelled 'normal' from the rows labelled 'neptune'
# in column 41, report how many of each there are, and plot them side by side.
normal_val = df[df[41] == 'normal']
syn_val = df[df[41] == 'neptune']
print(f"NORMAL data count: {len(normal_val)}")
print(f"SYN data count: {len(syn_val)}")

example_plot(normal_val, syn_val)

In [None]:
def prepare_labels(df, label_col=None):
    """Replace the class labels of ``df`` with integer codes, in place.

    Codes are assigned by descending frequency: the most common label
    becomes 1, the next most common 2, and so on. All labels are translated
    in a single step, so a label that happens to equal an already assigned
    code (e.g. when the column is numeric to begin with) is not relabelled a
    second time.

    Args:
        df: DataFrame holding the class column. It is modified in place.
        label_col: Label of the class column. Defaults to
            ``CLASSIFIER_COLUMN`` when omitted.

    Returns:
        The same ``df`` object, with ``label_col`` holding the integer codes.
        Missing labels stay missing.
    """
    if label_col is None:
        label_col = CLASSIFIER_COLUMN

    # value_counts() orders labels from most to least frequent.
    categ = df[label_col].value_counts()
    codes = {label: code for code, label in enumerate(categ.index, start=1)}

    # Swap the whole column at once instead of patching it class by class;
    # this avoids a half-string/half-int column and code collisions.
    df[label_col] = df[label_col].map(codes)

    return df

# Replace the class labels in column 41 with integer codes.
# prepare_labels returns the frame it was given, so `dflab` and `df` refer to
# the same (now relabelled) object.
dflab = prepare_labels(df)
# Force a numeric dtype on the class column: the preprocessing step drops
# every non-numeric column, and the class column must survive it.
dflab[CLASSIFIER_COLUMN] = pd.to_numeric(dflab[CLASSIFIER_COLUMN])
print(dflab)

In [None]:
def preproc_data(df, label_col=None):
    """Reduce a frame to scaled numeric features plus the class column.

    Three steps are applied, in order:

    1. columns that contain only zeros are removed;
    2. columns with a non-numeric dtype are removed;
    3. every remaining column except ``label_col`` is min-max scaled to the
       [0, 1] interval.

    A column whose values are all identical has no range to scale by; it is
    set to 0.0 rather than divided by zero (which would fill it with NaN).

    Args:
        df: Input DataFrame. It is not modified.
        label_col: Label of the class column, which is left unscaled.
            Defaults to ``CLASSIFIER_COLUMN`` when omitted.

    Returns:
        A new DataFrame with the surviving columns in their original order.
    """
    if label_col is None:
        label_col = CLASSIFIER_COLUMN

    # Keep only columns with at least one non-zero entry.
    df = df.loc[:, (df != 0).any(axis=0)]

    # Drop non-numeric columns; copy so the writes below never reach the
    # caller's frame.
    non_numerical = [
        col for col in df if not pd.api.types.is_numeric_dtype(df[col])
    ]
    df = df.drop(columns=non_numerical).copy()

    # Min-max normalization ([0, 1] interval) of every feature column.
    for col in df:
        if col == label_col:  # the class column keeps its integer codes
            continue
        low = df[col].min()
        span = df[col].max() - low
        if span == 0:
            # Constant column: nothing to scale, and 0/0 would yield NaN.
            df[col] = 0.0
        else:
            df[col] = (df[col] - low) / span

    return df

# Drop the trailing column before preprocessing; it is used neither as a
# feature nor as the label.
# NOTE(review): the class label is column 41 and is still needed afterwards,
# so the dropped column is presumably an extra per-record field that follows
# it in the file - confirm against the dataset description.
dflab_proc = dflab.iloc[:, :-1]

# Strip all-zero and non-numeric columns, then min-max scale the features
dflab_proc = preproc_data(dflab_proc)
print(dflab_proc)

In [None]:
# Prepare dataset for training and for testing
# First half of the rows is used for training, the remainder for prediction
df_train = dflab_proc.iloc[:len(dflab_proc) // 2]
df_pred = dflab_proc.iloc[len(dflab_proc) // 2:]

In [None]:
# Create model
def rand_forest_fit(dfin):
    # Perform a random forest-based classification
    rf = RandomForestClassifier(n_estimators = 200, random_state = 42, max_depth = 30, min_samples_leaf = 4, min_samples_split = 5, oob_score = True)

    # Prepare the data
    data_arr = dfin.to_numpy()
    nb_cols = numpy.shape(data_arr)[1]

    train_data = data_arr[:,0:(nb_cols-2)]
    label_data = data_arr[:,nb_cols-1]

    print(label_data)

    # Now train the model
    rf.fit(train_data, label_data)

    # Show importance
    features = dfin.columns.to_numpy()
    features = features[0:-1] # Delete last column - not used as a feature
    feat_imp = rf.feature_importances_

    feat_sig = [a for a in zip(feat_imp, features)]
    # Sort by importance
    feat_sig.sort(reverse=True,key = lambda x: x[0])

    print("IMPORTANCES: ")
    for sig, feat in feat_sig:
        print("{} : {}".format(nls_columns[feat], sig))

    return rf

# Train the model
rf_model = rand_forest_fit(df_train)

In [None]:
def rand_forest_predict(rf, df):
    # Prepare the data
    data_arr = df.to_numpy()
    nb_cols = numpy.shape(data_arr)[1]

    predict_data = data_arr[:,0:(nb_cols-2)]
    label_data = data_arr[:,(nb_cols-1)]

    prediction = rf.predict(predict_data)
    errors = abs(prediction - label_data)

    print('Mean Absolute Error:', round(numpy.mean(errors), 4), 'degrees.')

    # Plot a selection of features
    fig, axs = plt.subplots(2)

    axs[0].plot(prediction)
    axs[1].plot(label_data)
    plt.show()

# Now predict
rand_forest_predict(rf_model, df_pred)