# Imports

In [1]:
import numpy as np
import pandas as pd

from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import TensorBoard
from sklearn.preprocessing import StandardScaler
# from skfeature.function.similarity_based import fisher_score
from skfeature.function.similarity_based.fisher_score import fisher_score, feature_ranking
from sklearn.metrics import recall_score, accuracy_score, precision_score

Using TensorFlow backend.


# Utils

In [2]:
scaler = StandardScaler()

# Load Data

In [3]:
df = pd.read_csv("dataset.csv")

In [4]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Fisher score

In [7]:
def fisherScore(n_features, df):
    benign = df[df["anomaly"]==0].sample(n=1000, random_state=17)
    malicious = df[df["anomaly"]==1].sample(n=1000, random_state=17)
    temp_df = pd.concat([benign, malicious])
    score = fisher_score(temp_df.iloc[:, :-1].values, temp_df.iloc[:, -1].values)
    ranked_features = list(feature_ranking(score))
    ranked_features.append(n_features)
    return ranked_features

In [8]:
idx_order = fisherScore(115, df)

**Data Prep**

In [10]:
benign_df = df[df["anomaly"]==0].iloc[:, idx_order]

In [11]:
train, validate, test = np.split(benign_df.sample(frac=1, random_state=42), [int(1/3*len(benign_df)), int(2/3*len(benign_df))])

In [12]:
train_scaled = scaler.fit_transform(train.iloc[:, :-1].values)

In [13]:
validate_scaled = scaler.fit_transform(validate.iloc[:, :-1].values)

# Deep Autoencoder

In [9]:
def auto_encoder(input_dim):
    autoencoder = Sequential()
    autoencoder.add(Dense(int(0.75 * input_dim), activation="relu", input_shape=(input_dim,)))
    autoencoder.add(Dense(int(0.5 * input_dim), activation="relu"))
    autoencoder.add(Dense(int(0.33 * input_dim), activation="relu"))
    autoencoder.add(Dense(int(0.25 * input_dim), activation="relu"))
    autoencoder.add(Dense(int(0.33 * input_dim), activation="relu"))
    autoencoder.add(Dense(int(0.5 * input_dim), activation="relu"))
    autoencoder.add(Dense(int(0.75 * input_dim), activation="relu"))
    autoencoder.add(Dense(input_dim))
    return autoencoder

**Train Model**

In [15]:
trained_model = auto_encoder(115)
trained_model.compile(loss="mean_squared_error", optimizer="sgd")
tensorBoard = TensorBoard(log_dir=f"./logs", histogram_freq=0, write_graph=True, write_images=True)

In [34]:
trained_model.fit(train_scaled, 
                  train_scaled, 
                  epochs=60, 
                  batch_size=100, 
                  verbose=1,
                  callbacks=[tensorBoard])

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.callbacks.callbacks.History at 0x1a37529208>

**Test Model**

In [35]:
x_validate_predictions = trained_model.predict(validate_scaled)

In [36]:
mse = np.mean(np.power(validate_scaled - x_validate_predictions, 2), axis=1)

In [37]:
tr = mse.mean() + mse.std()

In [38]:
test_set = pd.concat([test, df[df["anomaly"]==1]], sort=True, ignore_index=True)

In [39]:
test_set = test_set.iloc[:, idx_order]

In [40]:
test_set_scaled = scaler.transform(test_set.iloc[:, :-1].values)

In [41]:
test_pred = trained_model.predict(test_set_scaled)

**Performance**

In [42]:
mse = np.mean(np.power(test_set_scaled - test_pred, 2), axis=1)

In [43]:
predictions = (mse > tr).astype(int)

In [44]:
print('Accuracy')
print(accuracy_score(test_set.iloc[:, -1], predictions))
print('Recall')
print(recall_score(test_set.iloc[:, -1], predictions))
print('Precision')
print(precision_score(test_set.iloc[:, -1], predictions))

Accuracy
0.9723076380504838
Recall
1.0
Precision
0.9723076380504838
