# 0.0 setting up

In [1]:
import os
# os.getcwd()
# NOTE(review): hard-coded, machine-specific working directory. The relative
# CSV names passed to pd.read_csv in the following cells are resolved against it.
os.chdir("C:\\Users\\DELL\\Desktop\\AnomalyDetectionChallenge")

In [2]:
import pandas as pd
from collections import Counter
import numpy as np
from statistics import mean, stdev
from math import sqrt

----

# 0.1 training datasets:

In [3]:
# Ten "clean" traces, used as the training set. The names data1..data10 are
# looked up by number later (get_commonkey and the globals cell), so they must
# keep this exact naming scheme.
data1 = pd.read_csv("hexacopter-hil-clean-01.kev.csv")
data2 = pd.read_csv("hexacopter-hil-clean-02.kev.csv")
data3 = pd.read_csv("hexacopter-hil-clean-03.kev.csv")
data4 = pd.read_csv("hexacopter-hil-clean-04.kev.csv")
data5 = pd.read_csv("hexacopter-hil-clean-05.kev.csv")
data6 = pd.read_csv("hexacopter-hil-clean-06.kev.csv")
data7 = pd.read_csv("hexacopter-hil-clean-07.kev.csv")
data8 = pd.read_csv("hexacopter-hil-clean-08.kev.csv")
data9 = pd.read_csv("hexacopter-hil-clean-09.kev.csv")
data10 = pd.read_csv("hexacopter-hil-clean-10.kev.csv")

# 0.2 testing datasets

In [4]:
# Five traces used as the testing set; data11..data15 are collected by number
# into testing_dataset in the globals cell.
data11 = pd.read_csv("hexacopter-hil-fifo-ls-01.kev.csv")
data12 = pd.read_csv("hexacopter-hil-fifo-ls-02.kev.csv")
data13 = pd.read_csv("hexacopter-hil-fifo-ls-sporadic.kev.csv")
data14 = pd.read_csv("hexacopter-hil-full-while.kev.csv")
data15 = pd.read_csv("hexacopter-hil-half-while.kev.csv")

----

# 1.0 definition of functions 

In [7]:
def get_commonkey(datasets=None):
    """Return the set of event keys that occur in every given trace.

    A key is the string concatenation of a row's ``class`` and ``event``
    columns. As a side effect each frame gains (or has overwritten) a ``key``
    column holding that concatenation.

    Args:
      datasets: Iterable of DataFrames with ``class`` and ``event`` columns.
        When omitted, the module-level training frames ``data1`` .. ``data10``
        are used.

    Returns:
      A set of key strings common to all frames; an empty set when no frames
      are given. (The original author noted 83 common keys for the ten
      training files.)
    """
    if datasets is None:
        # Plain namespace lookup instead of eval() on a formatted string.
        module_vars = globals()
        datasets = [module_vars['data{}'.format(i)] for i in range(1, 11)]

    key_sets = []
    for frame in datasets:
        frame["key"] = frame["class"] + frame["event"]
        key_sets.append(set(frame["key"]))

    if not key_sets:
        return set()
    return set.intersection(*key_sets)

def gen_model_df(dataframe, keyset=None):
    """Build the (key, numKey, interval) table consumed by the Markov model.

    Steps, in order:
      1. ``key`` = ``class`` + ``event``.
      2. ``numKey`` = position of ``key`` in ``keyset``; rows whose key is not
         in ``keyset`` are dropped.
      3. ``interval`` = difference of ``time`` to the previous *kept* row
         (NaN for the first row).
      4. Rows with ``interval == 0`` (events at the same timestamp) are
         dropped; the original author found no better treatment for them.
      5. The index is reset to 0..n-1.

    The input frame is copied first, so unlike the earlier version the
    caller's frame is not modified and no assignment is made on a filtered
    slice.

    Args:
      dataframe: DataFrame with ``class``, ``event`` and ``time`` columns.
      keyset: Ordered sequence of known keys. Defaults to the module-level
        ``full_keyset``.

    Returns:
      A DataFrame with columns ``key``, ``numKey`` (int64) and ``interval``.
    """
    if keyset is None:
        keyset = full_keyset

    # One dict lookup per row instead of two linear scans of the key list.
    # setdefault keeps the first position, matching list.index().
    index_of = {}
    for position, key in enumerate(keyset):
        index_of.setdefault(key, position)

    events = dataframe.copy()
    events["key"] = events["class"] + events["event"]
    events["numKey"] = events["key"].map(index_of)  # unknown keys -> NaN
    events = events[events["numKey"].notna()].copy()
    events["numKey"] = events["numKey"].astype("int64")
    events["interval"] = events["time"].diff()
    events = events[events["interval"] != 0]  # NaN != 0, so the first row stays
    events = events.reset_index(drop=True)
    return events[["key", "numKey", "interval"]]


def interval_matrix(transitions, interval):
    """Estimate first-order transition statistics from a state sequence.

    ``interval[k]`` is taken (by position) as the time elapsed on the step
    from ``transitions[k-1]`` to ``transitions[k]``; ``interval[0]`` is never
    read.

    Compared with the earlier version, the third matrix is now a real
    per-pair standard deviation (previously it only reflected the last
    observation of each pair), all matrices are float arrays (so a sequence
    without any transition no longer fails in ``np.divide``), and ``interval``
    is read by position rather than by index label.

    Args:
      transitions: Sequence of non-negative integer state ids.
      interval: Sequence of step durations, at least as long as
        ``transitions`` whenever there is more than one state.

    Returns:
      A tuple ``(M, N, X)`` of ``n x n`` float arrays with
      ``n = max(transitions) + 1``:
        M: row-normalised transition probabilities (all-zero rows stay zero).
        N: mean interval per (from, to) pair, 0 where the pair never occurs.
        X: population standard deviation of the interval per pair, 0 where
           the pair never occurs.

    Raises:
      ValueError: If ``transitions`` is empty or ``interval`` is too short.
    """
    states = np.asarray(transitions, dtype=np.intp)
    gaps = np.asarray(interval, dtype=float)
    if states.size == 0:
        raise ValueError("transitions must contain at least one state")
    if states.size > 1 and gaps.size < states.size:
        raise ValueError("interval must have one entry per element of transitions")

    n = int(states.max()) + 1  # number of states
    src, dst = states[:-1], states[1:]
    step_time = gaps[1:states.size]

    # np.add.at accumulates correctly even when a (src, dst) pair repeats.
    counts = np.zeros((n, n))
    np.add.at(counts, (src, dst), 1.0)
    totals = np.zeros((n, n))
    np.add.at(totals, (src, dst), step_time)

    seen = counts > 0
    N = np.divide(totals, counts, out=np.zeros((n, n)), where=seen)

    # Two-pass variance: squared deviations from the final per-pair mean.
    squared_dev = np.zeros((n, n))
    np.add.at(squared_dev, (src, dst), (step_time - N[src, dst]) ** 2)
    X = np.sqrt(np.divide(squared_dev, counts, out=np.zeros((n, n)), where=seen))

    # Convert counts to probabilities row by row.
    row_totals = counts.sum(axis=1, keepdims=True)
    M = np.divide(counts, row_totals, out=np.zeros((n, n)), where=row_totals > 0)

    return M, N, X


def split_testing(data, N):
    """Split ``data`` into ``N`` consecutive, nearly equal parts.

    Part sizes follow ``np.array_split``: the first ``len(data) % N`` parts
    get one extra element. Objects exposing ``.iloc`` (pandas DataFrame /
    Series) are sliced positionally here, because ``np.array_split`` on a
    DataFrame goes through ``DataFrame.swapaxes``, which pandas has
    deprecated. Everything else is delegated to ``np.array_split`` unchanged.

    Args:
      data: Sequence, array or pandas object to split.
      N: Number of parts. Other forms accepted by ``np.array_split`` are
        passed through to it.

    Returns:
      A list of parts; each part of a pandas object keeps its original index
      labels.

    Raises:
      ValueError: If ``N`` is an integer smaller than 1.
    """
    if hasattr(data, "iloc") and isinstance(N, (int, np.integer)):
        if N <= 0:
            raise ValueError("number sections must be larger than 0.")
        base, extra = divmod(len(data), N)
        pieces = []
        start = 0
        for k in range(N):
            stop = start + base + (1 if k < extra else 0)
            pieces.append(data.iloc[start:stop])
            start = stop
        return pieces
    return np.array_split(data, N)


def run_model(model_input):
    """Run interval_matrix on the ``numKey`` and ``interval`` columns.

    Args:
      model_input: Table with ``numKey`` and ``interval`` columns, as
        produced by gen_model_df.

    Returns:
      The (transition, average-time, standard-deviation) matrices returned by
      interval_matrix.
    """
    state_ids = model_input["numKey"]
    step_times = model_input["interval"]
    transition_matrix, averagetime_matrix, std_matrix = interval_matrix(state_ids, step_times)
    return transition_matrix, averagetime_matrix, std_matrix


def train_anomaly(train_data, n_splits=500):
    """Summarise the per-window weighted transition time of the training traces.

    Every trace is cut into ``n_splits`` consecutive windows. For each window
    the transition-probability matrix is multiplied element-wise with the
    average-interval matrix, and the window score is the sum of that product
    divided by its number of non-zero cells.

    Windows whose product has no non-zero cell are skipped: their score would
    be 0/0 = NaN and would turn every returned statistic into NaN.

    Args:
      train_data: Sequence of raw trace DataFrames.
      n_splits: Number of windows per trace (previously hard-coded to 500).

    Returns:
      Tuple ``(mean, std, max, min, peak_to_peak)`` of the window scores.

    Raises:
      ValueError: If no window produced a score.
    """
    window_scores = []

    for frame in train_data:
        for window in split_testing(frame, n_splits):
            tm, am, _ = run_model(gen_model_df(window))
            weighted = tm * am
            active = np.count_nonzero(weighted)
            if active == 0:
                continue
            window_scores.append(np.sum(weighted) / active)

    if not window_scores:
        raise ValueError("no training window produced a score")

    return (np.mean(window_scores), np.std(window_scores), np.max(window_scores),
            np.min(window_scores), np.ptp(window_scores))


def return_bounds(train_result, N):
    """Return the band ``train_result[0] +/- N * train_result[1]``.

    Args:
      train_result: Indexable whose first two items are the centre and the
        spread (train_anomaly returns mean and std in these positions).
      N: Multiplier applied to the spread.

    Returns:
      Tuple ``(upper_bound, lower_bound)``, upper first.
    """
    centre, spread = train_result[0], train_result[1]
    return centre + N * spread, centre - N * spread

def test_anomaly(test_data, bound, n_splits=500):
    """Flag windows whose weighted transition time falls outside ``bound``.

    Each trace is cut into ``n_splits`` windows and scored exactly like in
    train_anomaly (sum of probability * average interval, divided by the
    number of non-zero cells). A window is reported when its score is above
    ``bound[0]`` or below ``bound[1]``.

    Fixes the earlier version, which summed the undefined name ``wti``
    instead of the window's own product and therefore raised NameError.

    Args:
      test_data: Sequence of raw trace DataFrames.
      bound: ``(upper, lower)`` pair as returned by return_bounds.
      n_splits: Number of windows per trace (previously hard-coded to 500).

    Returns:
      List of ``(trace_position, window_index, mean_interval)`` tuples, where
      ``window_index`` is the index of the raw window and ``mean_interval``
      is the unweighted mean of the non-zero average-interval cells.
    """
    upper = bound[0]
    lower = bound[1]
    anomaly_record = []

    for trace_pos, frame in enumerate(test_data):
        for window in split_testing(frame, n_splits):
            tm, am, _ = run_model(gen_model_df(window))
            weighted = tm * am
            active = np.count_nonzero(weighted)
            if active == 0:
                # Score would be 0/0 = NaN, which compares False against both
                # bounds, so such a window was never reported anyway.
                continue

            score = np.sum(weighted) / active
            if score > upper or score < lower:
                mean_interval = np.sum(am) / np.count_nonzero(am)
                anomaly_record.append((trace_pos, window.index, mean_interval))

    return anomaly_record


# 1.1 global variables

In [8]:
# Explicit lists instead of eval() on formatted variable names.
training_dataset = [data1, data2, data3, data4, data5,
                    data6, data7, data8, data9, data10]
testing_dataset = [data11, data12, data13, data14, data15]

# The position of a key in full_keyset is its numeric state id. Sorting makes
# that numbering reproducible; iterating the raw set gives an order that can
# change from one interpreter run to the next.
full_keyset = sorted(get_commonkey())
length = len(full_keyset)

   def connect_to_next_port(self, minimum):
    """Finds the next open port at or above `minimum`.

    Args:
      minimum: Lowest acceptable port; must be 1024 or greater.

    Returns:
      The port reported by `self._find_next_open_port(minimum)`.

    Raises:
      ConnectionError: If the lookup yields no port.
    """
    below_floor = minimum < 1024
    if below_floor:
      # Deliberately left out of "Raises:" above: this reaction to API misuse
      # is not a behaviour callers should rely on.
      raise ValueError('Minimum port must be at least 1024, not %d.' % (minimum,))
    candidate = self._find_next_open_port(minimum)
    if candidate:
      assert candidate >= minimum, 'Unexpected port %d when minimum was %d.' % (candidate, minimum)
      return candidate
    raise ConnectionError('Could not connect to service on %d or higher.' % (minimum,))

# 2. training and testing model

In [27]:
# Baseline statistics of the per-window score over the training traces.
train_result = train_anomaly(training_dataset)

# Fixed: this previously passed the undefined name `result`.
bound = return_bounds(train_result, 3)  # 3*std time-prob is deemed anomaly

test_result = test_anomaly(testing_dataset, bound)

In [51]:
# Each entry is (trace position, index of the flagged window, mean interval),
# as built by test_anomaly. The pasted output below is truncated.
print(test_result)

[(0, RangeIndex(start=0, stop=3211, step=1), 464.4793720917418), (0, RangeIndex(start=12844, stop=16055, step=1), 819.5422055904493), (0, RangeIndex(start=38532, stop=41743, step=1), 740.2231621266967), (0, RangeIndex(start=41743, stop=44954, step=1), 718.4666347573759), (0, RangeIndex(start=54587, stop=57798, step=1), 269.93575122810915), (0, RangeIndex(start=61009, stop=64220, step=1), 553.0117176064458), (0, RangeIndex(start=73853, stop=77064, step=1), 718.8048663100245), (0, RangeIndex(start=77064, stop=80275, step=1), 551.0825943650195), (0, RangeIndex(start=80275, stop=83486, step=1), 1322.4752341530639), (0, RangeIndex(start=93119, stop=96330, step=1), 661.0822408997273), (0, RangeIndex(start=96330, stop=99541, step=1), 149.08523165880587), (0, RangeIndex(start=99541, stop=102752, step=1), 157.90871775767064), (0, RangeIndex(start=112385, stop=115596, step=1), 664.0056487229435), (0, RangeIndex(start=118807, stop=122018, step=1), 518.6059088066194), (0, RangeIndex(start=122018, 

In [42]:
# NOTE(review): test_result2 is not assigned in any visible cell; presumably
# left over from an earlier notebook session. Confirm before re-running.
test_result2  # anomaly in training dataset using 4*std

[(0, 131, 291.7729297285441),
 (0, 463, 446.0485028007956),
 (2, 126, 288.7127478130077),
 (2, 133, 291.9816533246967),
 (3, 76, 325.0002668229807),
 (4, 347, 312.31589160859886),
 (8, 34, 291.1624019461376)]

In [48]:
# def test model:






    #         if len(anomaly_record)/len(w_time_t) > 0.8 and len(anomaly_record)>10:
    #             print("anomaly detected! Location file:{0},fraction:{1}".format(i,j)) 
    #             break



In [18]:
# NOTE(review): w_time_t only exists as a local inside test_anomaly in the
# visible code; this cell presumably ran against an earlier, flat version.
np.mean(w_time_t),np.std(w_time_t),np.max(w_time_t),np.min(w_time_t),np.ptp(w_time_t)

(95.50131558476538,
 23.007848245918634,
 246.02693623791504,
 65.33495402168698,
 180.69198221622804)

In [21]:
# NOTE(review): anomaly_record is a local of test_anomaly in the visible code;
# the 2-tuples printed below do not match the 3-tuples that function builds.
print(anomaly_record)

[(11, 0), (11, 4), (11, 12), (11, 13), (11, 17), (11, 19), (11, 23), (11, 24), (11, 25), (11, 29), (11, 30), (11, 31), (11, 35), (11, 37), (11, 38), (11, 39), (11, 40), (11, 41), (11, 42), (11, 43), (11, 44), (11, 46), (11, 47), (11, 50), (11, 52), (11, 53), (11, 54), (11, 55), (11, 59), (11, 60), (11, 61), (11, 62), (11, 63), (11, 64), (11, 65), (11, 66), (11, 67), (11, 68), (11, 69), (11, 71), (11, 73), (11, 74), (11, 75), (11, 76), (11, 77), (11, 78), (11, 79), (11, 80), (11, 83), (11, 87), (11, 91), (11, 95), (11, 96), (11, 100), (11, 103), (11, 105), (11, 106), (11, 107), (11, 111), (11, 112), (11, 113), (11, 116), (11, 117), (11, 119), (11, 120), (11, 121), (11, 122), (11, 124), (11, 125), (11, 126), (11, 128), (11, 129), (11, 130), (11, 131), (11, 132), (11, 134), (11, 135), (11, 136), (11, 137), (11, 139), (11, 141), (11, 142), (11, 143), (11, 144), (11, 145), (11, 146), (11, 147), (11, 148), (11, 149), (11, 150), (11, 153), (11, 155), (11, 156), (11, 157), (11, 158), (11, 159)

In [24]:
# NOTE(review): w_time_t is not defined at module scope in the visible cells.
len(w_time_t)

2500

In [30]:
# def gen_concat_df(d1,d2):
#     df_0 = pd.concat([d1,d2])
#     return df_0


# def gen_model_df(dataframe):
#     newdata = []
#     dataframe["key"] = dataframe["class"] + dataframe["event"]
#     dataframe["numKey"] = dataframe["key"].apply(lambda x: full_keyset.index(x) if x in full_keyset else -1)  
#     dataframe = dataframe[dataframe.numKey != -1]
#     dataframe["interval"] = dataframe["time"].diff()
#     dataframe = dataframe[dataframe.interval != 0]    # delete events that happen at same time(0.6% total records), current no better method!
#     data_temp=dataframe[["key","numKey","interval"]]
#     return data_temp

# def interval_matrix(transitions,interval):  # return transition,avgtime,variance Matrix
    
#     n = 1+ max(transitions)  # number of states
#     M = [[0]*n for _ in range(n)] # transition matrix
#     N = [[0]*n for _ in range(n)] # average time matrix
#     X = [[0]*n for _ in range(n)] # standard deviation matrix
#     u = 0

#     for (i,j) in zip(transitions,transitions[1:]):
#         u += 1
#         M[i][j] += 1
#         N[i][j] += interval[u]
#         avg_local = N[i][j]/M[i][j]
#         absdis = interval[u]-avg_local
#         sdv_local = sqrt(absdis*absdis/(M[i][j]))
#         X[i][j] = sdv_local
    
#     a = np.array(N)
#     b = np.array(M)
#     N = np.divide(a, b, out = np.zeros_like(a), where = b!= 0) # take average time interval of each event sequence

#     for row in M:
#         s = sum(row)
#         if s > 0:
#             row[:] = [f/s for f in row]
            
#     M = np.array(M)
#     X = np.array(X)
#     return M,N,X

# def concat_df(data1,data2):
    
#     newdata=[]
#     data0 = pd.concat([data1,data2])
#     data0["key"] = data0["class"] + data0["event"]
#     data0["numKey"] = data0["key"].apply(lambda x: full_keyset.index(x) if x in full_keyset else -1)  
#     data0 = data0[data0.numKey != -1]
#     data0.sort_values("time", axis = 0, ascending = True, inplace = True)
#     data0["interval"] = data0["time"].diff()
#     data0 = data0[data0.interval != 0]    # delete events that happen at same time(0.6% total records), current no better method!
# #     data0.index = pd.RangeIndex(len(data0.index))
#     data_temp=data0[["key","numKey","interval"]]
#     return data_temp,data0


In [66]:
# Distinct (numKey, key) pairs of cc0.
# NOTE(review): cc0 is not assigned in any visible cell; confirm its origin.
set(list(zip(cc0['numKey'], cc0['key'])))

{(0, 'PROCESSPROCTHREAD_NAME'),
 (1, 'INT_ENTR0x00000044'),
 (2, 'COMMREPLY_MESSAGE'),
 (3, 'CONTROLTIME'),
 (4, 'USREVENTEVENT-2'),
 (5, 'KER_CALLMSG_DELIVER_EVENT/21'),
 (6, 'COMMREC_PULSE'),
 (7, 'INT_HANDLER_EXIT0x00000049'),
 (8, 'INT_HANDLER_ENTR0x0000002e'),
 (9, 'INT_EXIT0x00000049'),
 (10, 'THREADTHCONDVAR'),
 (11, 'THREADTHREADY'),
 (12, 'THREADTHSIGWAITINFO'),
 (13, 'KER_CALLTIMER_TIMEOUT/75'),
 (14, 'KER_EXITMSG_CURRENT/10'),
 (15, 'COMMSND_PULSE_EXE'),
 (16, 'KER_EXITMSG_REPLYV/15'),
 (17, 'KER_CALLCONNECT_CLIENT_INFO/42'),
 (18, 'KER_CALLSIGNAL_WAITINFO/32'),
 (19, 'KER_EXITMSG_INFO/19'),
 (20, 'USREVENTEVENT-0'),
 (21, 'INT_HANDLER_ENTR0x00000029'),
 (22, 'KER_CALLMSG_ERROR/13'),
 (23, 'KER_EXITMSG_SENDV/11'),
 (24, 'KER_CALLMSG_SENDV/11'),
 (25, 'KER_CALLMSG_CURRENT/10'),
 (26, 'KER_CALLCONNECT_DETACH/40'),
 (27, 'INT_EXIT0x00000044'),
 (28, 'USREVENTEVENT-3'),
 (29, 'KER_CALLMSG_SENDVNC/12'),
 (30, 'INT_HANDLER_ENTR0x00000044'),
 (31, 'KER_CALLSYNC_CONDVAR_SIGNAL/83'),

In [43]:
# Split the first training trace into 100 consecutive parts.
a = split_testing(data1,100)

In [50]:
# Index of part 50; the pasted output shows the parts keep the original
# row labels of data1.
a[50].index

RangeIndex(start=340050, stop=346851, step=1)