In [39]:
import pandas as pd
from turtleIsolationForests.isolationForest import IsolationForest
from sklearn.preprocessing import StandardScaler

In [40]:
# Load the training data (later used as X_train); the first CSV column is used
# as the row index. Presumably one-hot encoded, judging by the file name.
dataframe = pd.read_csv("./data_OHE.csv", index_col=0)
dataframe

In [41]:
# Load the test data (later used as X_test); the first CSV column is used as
# the row index. Presumably one-hot encoded, judging by the file name.
test_dataframe = pd.read_csv("./test_data_OHE.csv", index_col=0)
test_dataframe

In [42]:
def _split_features_and_labels(frame, label_column='class_normal', label_prefix='class_'):
    """Separate the label column from the feature columns of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the features plus the ``class_*`` columns.
    label_column : str
        Name of the column returned as the label series.
    label_prefix : str
        Every column whose name starts with this prefix is left out of the
        returned feature frame.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        A new frame without the prefixed columns, and the label column.
        ``frame`` itself is not modified.

    Raises
    ------
    KeyError
        If ``label_column`` is not present in ``frame``.
    """
    labels = frame[label_column]
    label_columns = [name for name in frame.columns if name.startswith(label_prefix)]
    # One non-mutating drop instead of an in-place drop per matching column.
    return frame.drop(columns=label_columns), labels


# Same treatment for both sets: keep `class_normal` as the label series and
# strip every `class_*` column from the feature frames.
dataframe, train_labels = _split_features_and_labels(dataframe)
test_dataframe, test_labels = _split_features_and_labels(test_dataframe)

In [43]:
# Feature matrices used from here on; these are aliases of the loaded frames,
# not copies.
X_train, X_test = dataframe, test_dataframe

In [44]:
# Fit the scaler on the training features only, then apply that same fitted
# scaler to both sets so the test data is scaled with the training statistics.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
X_train

In [45]:
# Fraction of training rows whose `class_normal` label is 0. The boolean mask
# is averaged in one vectorized step rather than summed element by element
# with the Python builtin.
contamination = (train_labels == 0).mean()
contamination

In [46]:
# Build the isolation forest with the contamination rate computed from the
# training labels, then fit it on the scaled training features.
# NOTE(review): random_state=None presumably leaves the forest unseeded, so
# results may differ between runs — confirm and consider a fixed seed.
model = IsolationForest(contamination = contamination, random_state = None)
model.fit(X_train)
model

In [57]:
# Show the `threshold` attribute of the fitted model.
# NOTE(review): this cell's execution count (57) is out of sequence with its
# neighbours, and the same expression is displayed again further down.
model.threshold

In [48]:
# Run the fitted model on the scaled test features. The result is used below
# as a table with a 'predicted_as_anomaly' column.
predictions = model.predict(X_test)

In [49]:
# Attach the ground-truth labels by position. Assigning the Series directly
# would align on index, and `test_labels` still carries the CSV index while
# the frame handed to `predict` was rebuilt without one; a mismatch there would
# silently yield NaN or shifted labels.
# Assumes `predict` returns one row per input row, in input order.
predictions['is_normal'] = test_labels.to_numpy()
predictions

In [50]:
# Confusion-matrix counts, with "anomaly" as the positive class.
# Every comparison is parenthesised before being combined: `&` binds tighter
# than `==`, so `a == 0 & (b == True)` would be evaluated as `a == (0 & ...)`.
actual_is_anomaly = (predictions['is_normal'] == 0)
actual_is_normal = (predictions['is_normal'] == 1)
flagged_as_anomaly = (predictions['predicted_as_anomaly'] == True)
flagged_as_normal = (predictions['predicted_as_anomaly'] == False)

true_anomaly = int((actual_is_anomaly & flagged_as_anomaly).sum())   # anomaly, flagged (TP)
false_anomaly = int((actual_is_normal & flagged_as_anomaly).sum())   # normal, wrongly flagged (FP)
false_normal = int((actual_is_anomaly & flagged_as_normal).sum())    # anomaly, missed (FN)
true_normal = int((actual_is_normal & flagged_as_normal).sum())      # normal, not flagged (TN)

# Guard the ratios so an empty denominator (nothing flagged / no anomalies)
# reports 0.0 instead of raising ZeroDivisionError.
flagged_total = true_anomaly + false_anomaly
anomaly_total = true_anomaly + false_normal
precision = true_anomaly / flagged_total if flagged_total else 0.0
recall = true_anomaly / anomaly_total if anomaly_total else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print("precision: " + str(precision))
print("recall: " + str(recall))
print("f1-score: " + str(f1))

In [51]:
# Plain Python lists of the model's flags and of the ground-truth labels, for
# the window-based evaluation below.
# NOTE(review): the two lists have opposite polarity — `predicted_list` comes
# from 'predicted_as_anomaly' while `test_list` comes from 'is_normal' — so an
# element-wise equality between them marks disagreements, not agreements.
# Confirm that downstream comparisons account for this.
predicted_list = list(predictions["predicted_as_anomaly"])
test_list = list(predictions["is_normal"])

In [52]:
# Show the model's `threshold` attribute (the same expression is also
# displayed in an earlier cell).
model.threshold

In [53]:
"""
Questions:
How do we obtain "windows" from our datasets?
What should k be for our data? Look at paper
How do we obtain our threshold? It would be a line between the average True point and average False point
"""

def pak(anomaly_segment_list, ground_truth, threshold, k):
    """Decide whether one window counts as detected.

    Parameters
    ----------
    anomaly_segment_list : sequence
        Values of the window being judged.
    ground_truth : sequence
        Reference values, indexed in parallel with ``anomaly_segment_list``.
    threshold : number
        Each window value is compared against this.
    k : number
        Minimum fraction of matching positions (exclusive bound).

    Returns
    -------
    bool
        ``True`` (after printing "All above threshold") when no window value
        is ``<= threshold``; this includes an empty window. Otherwise whether
        the fraction of positions where the window value equals the ground
        truth value is strictly greater than ``k``.

    Raises
    ------
    IndexError
        If ``ground_truth`` is shorter than ``anomaly_segment_list`` and the
        element-wise comparison is reached.
    """
    # Shortcut: nothing in the window sits at or below the threshold.
    if not any(value <= threshold for value in anomaly_segment_list):
        print("All above threshold")
        return True

    # Positions where the window agrees with the reference, element by element.
    matches = sum(
        1
        for position, value in enumerate(anomaly_segment_list)
        if value == ground_truth[position]
    )
    return matches / len(anomaly_segment_list) > k

In [54]:
import math

def makeWindows(list1, list2, numWindows):
    """Split two parallel sequences into consecutive windows of equal size.

    Parameters
    ----------
    list1 : sequence
        First sequence (the caller passes the model's flags).
    list2 : sequence
        Second sequence, same length as ``list1`` (the caller passes the
        ground-truth labels).
    numWindows : int
        Desired number of windows; must be at least 1.

    Returns
    -------
    (list of list, list of list)
        ``(windows_of_list1, windows_of_list2)``. Both have the same number of
        windows, cut at the same positions. Every window holds
        ``ceil(len(list1) / numWindows)`` items except possibly a shorter last
        one, so the number of windows is at most ``numWindows``. Empty input
        yields ``([], [])``.

    Raises
    ------
    ValueError
        If ``numWindows`` is smaller than 1 or the two sequences differ in
        length.
    """
    if numWindows < 1:
        raise ValueError("numWindows must be at least 1")
    if len(list1) != len(list2):
        raise ValueError("list1 and list2 must have the same length")
    if len(list1) == 0:
        return [], []

    windowSize = math.ceil(len(list1) / numWindows)
    anomaly_segment_lists = []
    ground_truth_lists = []
    # Slice both sequences at identical boundaries so window i of one always
    # lines up with window i of the other, including the final partial window.
    for start in range(0, len(list1), windowSize):
        stop = start + windowSize
        anomaly_segment_lists.append(list(list1[start:stop]))
        ground_truth_lists.append(list(list2[start:stop]))
    return anomaly_segment_lists, ground_truth_lists


In [55]:
# Cut the model's flags and the labels into 100 windows; the call is expected
# to hand back two parallel lists of windows (l1: flags, l2: labels).
l1, l2 = makeWindows(predicted_list, test_list, 100)

In [56]:
# Judge every window against its matching label window and print the verdict,
# using the model's threshold and a 0.20 match-ratio cutoff.
for window_index, segment in enumerate(l1):
    print(pak(segment, l2[window_index], model.threshold, .20))