In [1]:
import numpy as np
from odds import OD
from utils import auc, fps, sanitise_scores, mean_norm, mean_scale
from sklearn import metrics
import os
import h5py
from time import time
import platform
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import scipy.io as sio
import ast

import multiprocessing as mp
from itertools import repeat

Point outlier datasets

In [2]:

def get_http(data_path):
    """
    get http data, using h5py as is old matlab format.
    takes in data_path, rout to find the data files
    outputs X data matrix, y outlier labels, and name..
    """

    path = data_path+'http/'
    fname = 'http.mat'

    f = h5py.File(path+fname, 'r')
    X = f.get('X')
    y = f.get('y')
    y = np.array(y).reshape(-1)
    X = np.array(X).T
    print('got http, size ', X.shape)
    return X, y, 'HTTP'

def get_smtp(data_path):
    """
    get smtp data, using h5py as is old matlab format.
    """
    path = data_path+'smtp/'
    fname = 'smtp.mat'

    f = h5py.File(path+fname, 'r')
    X = f.get('X')
    y = f.get('y')
    X = np.array(X).T
    y = np.array(y)
    y = y.reshape(-1)
    print('got smtp, size ', X.shape)
    return X, y, 'SMTP'

def get_occupancy(data_path):
    """
    gets smtp data from txt files using pd.
    stitches them together in to one dataset
    Returns X data matrix and y ground truth, 1 for occupancy 0 otherwise
    also returns name for historic reasons.. hmm..
    """
    path = data_path+'occupancy/'

    df = pd.DataFrame()
    for file in sorted(os.listdir(path)):
        # print(file)
        if file[-4:] != '.txt':
            continue
        print(file)
        dfn = pd.read_csv(path+file)
        df = pd.concat([df,dfn])
    occupancy = df.Occupancy
    y = df.Occupancy.to_numpy()
    df = df.drop(['date','Occupancy'], axis=1)
    print(df.info())
    print(df.head())
    X = df.to_numpy()

    return X,  y, 'occupancy'

def do_data_serially(algo_lst, name, timestamp, data_path, save_path, count=32):
    """
    runs each algo count times to get variability in results uses first data from 'name'
    """
    os.makedirs(save_path, exist_ok=True)
    get_dict = {'http':get_http, 'smtp':get_smtp, 'occupancy':get_occupancy}
    X,y,name = get_dict[name](data_path)

    n,p = X.shape
    X_n = mean_scale(X)

    #should also record results to file.
    b = len(algo_lst)

    outs = np.nonzero(y)[0]
    varres_name = save_path+ '{}_varres_{}.txt'.format(name, timestamp)
    # mets = ['fpr', 'auc', 'f1_b']
    for i in range(b):

        algo = algo_lst[i]
        print(algo)
        t1 = time()
        norm=True
        fpr_lst, auc_lst = [],[]
        for c in range(count):
            args = (X, outs, algo)
            fps, auk = experiment(args,  c)
            fpr_lst.append(fps)
            auc_lst.append(auk)

        write_to_file(varres_name, algo, name, norm, fpr_lst, auc_lst)
        t4 = time()-t1
        print('Algorithm {} with data {} took {}m and {}s to run {} times '.format(algo,
                                                                        name,
                                                                        int(t4//60),
                                                                        int(t4%60),
                                                                        count))
        
def experiment(args,  i):
    """
    process to send to separate core to be parallelised.
    takes a list, calculates outlier scores
    calculates fps and auc scores from those outleir scores.
    appends them to the given lists.

    """
    X, outs, algo = args
    od = OD(algo)
    error=False
    out_scores = None
    try:
        out_scores = od.get_os(X)
        fps, auk = get_scores(out_scores, outs)

    except Exception as e:
        print('Error, {}'.format(e))
        error=True
        fps = 'error'
        auk = 'error'
        # raise
    print(f'{algo}, fps = {fps}, auc = {auk}, count = {i}')
    return (fps, auk)

def get_scores(out_scores,outs):
    """
    calculates fps and auc
    returns both.
    """
    fpr = fps(out_scores, outs)
    try:
        auk = auc(out_scores, outs)
    except: # should really specify the error that gets thrown here.
        auk = np.nan
    return fpr, auk

def write_to_file(varres_name, algo, name, norm, fpr_lst, auc_lst):
    """
    saves results to file. file name is varres_name
    algo is the algorithm used for the results set
    name is the data name
    norm is if it is normalised or not, a boolean
    fps_lst is list of false positive rates
    auc_lst is list of auc scores
    """
    with open(varres_name, 'a+') as f:
        head = '{}, {}, {}, {}, '.format(algo, name, norm, 'fpr')
        res = ''.join(['%0.4f, '])*len(fpr_lst)%tuple(fpr_lst)
        f.write(head+res[:-2]+'\n')
        print(head+res[:-2]+'\n')
        head = '{}, {}, {}, {}, '.format(algo, name, norm, 'auc')
        res = ''.join(['%0.4f, '])*len(auc_lst)%tuple(auc_lst)
        f.write(head+res[:-2]+'\n')
        print(head+res[:-2]+'\n')                                   

In [None]:

sys = platform.system()
if sys == 'Windows': # on BFGPU
    data_path = os.path.expanduser('~') +'/Data/'
    save_path = './results/'
elif sys == 'Darwin': # on home mac
    data_path = os.path.expanduser('~') +'/Data/'
    save_path = './results/'
elif sys == 'Linux': # on iridis system
    data_path = '/mainfs/home/jh1c18/Data/'
    save_path = '/mainfs/home/jh1c18/results/'
else:
    print('Where Am I?')
    raise
print('On system: ', sys)

n_workers = os.cpu_count()
print('n_workers = ', n_workers)



algo_lst=['VAR', 'FRO', 'FRL', 'FRR', 'OCSVM', 'GMM', 'IF', 'LSTM','GRU', 'AE', 'VAE', 'OP']
algo_lst = ['FRL', 'OCSVM'] # comment this line out to run all of them
timestamp = datetime.datetime.fromtimestamp(time())
timestamp = timestamp.strftime('%Y-%m-%d_%H-%M-%S')
print(timestamp)
# for name in ['http', 'smtp', 'occupancy']:
for name in ['http']:
    # setting numb workers to 1 to see if that gives more variability
    # do_data(algo_lst, name, timestamp, data_path, save_path, count=32, n_workers=1)
    do_data_serially(algo_lst, name, timestamp, data_path, save_path, count=32)

On system:  Darwin
n_workers =  4
2023-08-29_11-50-19
got http, size  (567498, 3)
FRL
FRL, fps = 0.9485383111442137, auc = 0.9953140360236172, count = 0
FRL, fps = 0.9485622557230597, auc = 0.9953140336233283, count = 1
FRL, fps = 0.948527528809219, auc = 0.9953140568261208, count = 2
FRL, fps = 0.9485239336934252, auc = 0.9953140576262172, count = 3
FRL, fps = 0.9485371133300747, auc = 0.9953140480250616, count = 4
FRL, fps = 0.9485562716675586, auc = 0.9953157234267035, count = 5
FRL, fps = 0.9485646489554739, auc = 0.9953139880178394, count = 6
FRL, fps = 0.9485514834205934, auc = 0.9953140336233284, count = 7
FRL, fps = 0.9485359154601741, auc = 0.9953140184214987, count = 8
FRL, fps = 0.9485251321211556, auc = 0.9953140616266987, count = 9
FRL, fps = 0.9484915550378568, auc = 0.9953140944306468, count = 10
FRL, fps = 0.9485083490533082, auc = 0.9953140448246764, count = 11
FRL, fps = 0.9485431018432322, auc = 0.9953157594310369, count = 12
FRL, fps = 0.9485813953488372, auc = 0.99

To use previously collected results:

In [9]:
def get_and_sort_results(results_path, tss, names, runs, norm):
    df_lst = []
    run_cols = [str(x) for x in range(runs)]
#     print(run_cols)
    for name, timestamp in zip(names, tss):

        fname = f'{name}_varres_{timestamp}.txt'

        df = pd.read_csv(results_path+fname, header=0, 
                         skipinitialspace=True,
                        index_col=False)
        df_lst.append(df)
        
    df = pd.concat(df_lst)
    mdf = df[df['norm']==norm]
    mdf = mdf.drop(['norm'], axis=1)
    mdf['avg']=mdf.mean(axis=1, numeric_only=True)
    mdf = mdf.drop(run_cols, axis=1)
    pdf = mdf.pivot_table(index=['score','name'], columns='algo')

    algos=['OCSVM', 'GMM', 'OP', 'VAR', 'FRO', 'FRL', 'FRR', 'IF', 'AE', 'VAE', 'LSTM','GRU']
    pdf = pdf[[('avg', algos[i]) for i in range(len(algos))]]
    pdf = pdf.round(2)
  
    return df, pdf

In [10]:
tss = ['2021-12-15_14-49-04', '2021-12-14_14-04-56', '2021-12-14_14-04-56']

names = ['HTTP', 'occupancy', 'SMTP']
results_path = './results/'

runs=20
df, pdf = get_and_sort_results(results_path, tss, names, runs, True)
pdf.T

  return func(*args, **kwargs)


Unnamed: 0_level_0,score,auc,auc,auc,fpr,fpr,fpr
Unnamed: 0_level_1,name,HTTP,SMTP,occupancy,HTTP,SMTP,occupancy
Unnamed: 0_level_2,algo,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
avg,OCSVM,0.99,0.85,0.76,0.63,1.0,0.73
avg,GMM,0.19,0.9,0.84,0.94,1.0,0.41
avg,OP,0.5,0.5,0.5,1.0,1.0,1.0
avg,VAR,0.85,0.86,0.66,0.99,1.0,0.77
avg,FRO,0.99,0.82,0.79,0.64,1.0,0.77
avg,FRL,0.99,0.82,0.79,0.87,1.0,0.73
avg,FRR,0.99,0.82,0.79,0.64,1.0,0.77
avg,IF,1.0,0.91,0.85,0.86,1.0,0.6
avg,AE,1.0,0.83,0.78,0.63,1.0,0.69
avg,VAE,1.0,0.77,0.25,0.62,1.0,0.97


Range data 

In [None]:

def get_smd(name, data_path):
  path = data_path+'SMD/'
  test_path = path + 'test/'
  train_path = path + 'train/'
#     X_lst = []
#     y_lst = []
  Xy_dict = {}
  file_lst = sorted(os.listdir(test_path))
  yfile_lst = sorted(os.listdir(path+'test_label/'))
  for i in range(len(file_lst)):
      file = file_lst[i]
#         print(file)
      X = pd.read_csv(test_path+file).values
#         X_lst.append(X)
      y = pd.read_csv(path+'test_label/'+file).values
#         y_lst.append(y)
      Xy_dict[file[:-4]]=X,y

#     X = np.concatenate(X_lst)
#     y = np.concatenate(y_lst)
  print('got {} data, with {} entities'.format(name, len(list(Xy_dict.keys()))))
  return Xy_dict, 'SMD'

def get_SMAP_MSL(name, data_path):
  path = data_path+ 'SMAP_MSL/'
  test_path = path + 'test/'
  outs_fname = 'labeled_anomalies.csv' # gets labels
  df = pd.read_csv(path+outs_fname, index_col='chan_id')
  df = df.drop('P-2')

  df.loc['P-2'] = {
              'spacecraft':'SMAP',
              'anomaly_sequences':'[[5300, 6420]]',
              'class':"['point']",
              'num_values':'8209'
              }
  df = df.sort_values('chan_id')

  df = df[df['spacecraft']==name]
  file_ids = df.index
  # print(file_ids)
  print(df.info())
  # train_path = path + 'train/' #no anomalies in the training data.
  Xy_dict = {}
  for file in sorted(os.listdir(test_path)):
      id = file[:-4]
      # print(id)
      if  id in file_ids:
          X = np.load(test_path+file)
          # print(df.loc[id])
          anomalies = df.loc[id].anomaly_sequences
          # print(anomalies)
          y = get_y_from_ranges(X.shape[0], anomalies)
          Xy_dict[id] = X, y
  print('got {} data, with {} entities'.format(name, len(list(Xy_dict.keys()))))
  return Xy_dict, name


def do_range_data(algo_lst, name, timestamp, count=16, n_workers=16):
    """
    runs each algo count times to get variance uses first data from 'name'
    """
    get_dict = {'SMD':get_smd, 'SMAP':get_SMAP_MSL,'MSL':get_SMAP_MSL, }
    Xy_dict, name = get_dict[name](name, data_path)
    # should also record results to file.

    os.makedirs("./{}/".format('results'), exist_ok=True)
    for key in sorted(Xy_dict.keys()):
        print(key)

        X,y = Xy_dict[key]
        outs = np.nonzero(y)[0]
        varres_name = './results/{}_varres_{}_{}.txt'.format(name, key, timestamp)
        # mets = ['fpr', 'auc', 'f1_b']
        get_range(algo_lst, X, y, varres_name, 'False', count, n_workers)
        X_n = mean_scale(X)
        get_range(algo_lst, X_n, y, varres_name, key, 'True', count, n_workers)

def get_range(algo_lst, X,y, varres_name, machine, norm, count, n_workers):
    b = len(algo_lst)
    for i in range(b):
        algo = algo_lst[i]
        t1 = time()

        fps_lst, auc_lst, f1b_lst = parallel_range_runs(count, X, y, algo, n_workers)
        write_to_file_range(varres_name, algo, name, machine, norm, fps_lst, auc_lst, f1b_lst)
        t2 = time()
        t3 = t2-t1
        print('Algorithm {} with data {} {} took {}m and {}s to run {} times '.format(algo,
                                                                        name, key,
                                                                        int(t3//60),
                                                                        int(t3%60),

def parallel_range_runs(count, X, y, algo, n_workers):
    """
    takes in count number of runs, data matrix X, and algo
    returns scores for each run as a list.
    list of count fpr and auc scores.
    """
    fps_lst=[]
    auc_lst=[]
    f1b_lst=[]

    pool = mp.Pool(n_workers)
    args = (X, y, algo)
    mix_lst = pool.starmap(experiment_range, zip(repeat(args, count), range(count)))


    for fps, auk, f1b in mix_lst:
        fps_lst.append(fps)
        auc_lst.append(auk)
        f1b_lst.append(f1b)

    return fps_lst, auc_lst, f1b_lst

def experiment_range(args,  i):
    """
    process to send to separate core to be parallelised.
    takes a list, calculates outlier scores
    calculates fps and auc scores from those outleir scores.
    appends them to the given lists.

    """
    X, y, algo = args
    od = OD(algo)
    error=False
    out_scores = None
    try:
        out_scores = od.get_os(X)
        fps, auk, f1b, f1n, prec, rec = get_res_on_os(out_scores, y)

    except Exception as e:
        print('Error, {}'.format(e))
        error=True
        fps = np.nan
        auk = np.nan
        f1b = np.nan
        # raise

    return (fps, auk, f1b)

def write_to_file_range(varres_name, algo, name, machine, norm, fpr_lst, auc_lst, f1b_lst):
    """
    saves results to file. file name is varres_name
    algo is the algorithm used for the results set
    name is the data name
    norm is if the data is normalised or not, a boolean
    fps_lst is list of false positive rates
    auc_lst is list of auc scores
    f1b_lst is list of f1 best scores.. see su2019 for approach.
    """
    print(f'writing to {varres_name}')
    with open(varres_name, 'a+') as f:
        head = '{}, {}, {}, {}, '.format(algo, name, machine, norm, 'fpr')
        res = ''.join(['%0.4f, '])*len(fpr_lst)%tuple(fpr_lst)
        f.write(head+res[:-2]+'\n')
        print(head+res[:-2]+'\n')
        head = '{}, {}, {}, {}, '.format(algo, name, machine, norm, 'auc')
        res = ''.join(['%0.4f, '])*len(auc_lst)%tuple(auc_lst)
        f.write(head+res[:-2]+'\n')
        print(head+res[:-2]+'\n')
        head = '{}, {}, {}, {}, '.format(algo, name, machine, norm, 'f1b')
        res = ''.join(['%0.4f, '])*len(f1b_lst)%tuple(f1b_lst)
        f.write(head+res[:-2]+'\n')
        print(head+res[:-2]+'\n')                                                              count))

In [None]:
sys = platform.system()
if sys == 'Windows': # on BFGPU
    data_path = os.path.expanduser('~') +'/Data/'
    save_path = './results/'
elif sys == 'Darwin': # on home mac
    data_path = os.path.expanduser('~') +'/Data/'
    save_path = './results/'
elif sys == 'Linux': # on iridis system
    data_path = '/mainfs/home/jh1c18/Data/'
    save_path = '/mainfs/home/jh1c18/results/'
else:
    print('Where Am I?')
    raise
print('On system: ', sys)

n_workers = os.cpu_count()
print('n_workers = ', n_workers)



algo_lst=['VAR', 'FRO', 'FRL', 'FRR', 'OCSVM', 'GMM', 'IF', 'LSTM','GRU', 'AE', 'VAE', 'OP']
algo_lst = ['AE', 'VAE']
timestamp = datetime.datetime.fromtimestamp(time())
timestamp = timestamp.strftime('%Y-%m-%d_%H-%M-%S')
print(timestamp)

for name in ['MSL', 'SMAP', 'SMD']:
# for name in ['MSL']:
   do_range_data(algo_lst, name, timestamp, count=32, n_workers=n_workers)

In [2]:
timestamp = '2021-12-07_09-57-23'
results_path = os.path.expanduser('~')+ '/Documents/expts/iridis_run/results/'
runs=32

def get_range_df(timestamp, results_path, runs):
    run_cols = [str(x) for x in range(runs)]
    df_lst = []
    for file in sorted(os.listdir(results_path)):
        if timestamp in file:
            print(file)
            name, _, machine, date, time = file.split('_')
            df = pd.read_csv(results_path+file, header=None, names=['algo', 'name', 'norm', 'score']+run_cols, skipinitialspace=True)
    #         print(df.info())
    #         print(df.head())
            df['machine']=machine
            df_lst.append(df)

    df = pd.concat(df_lst)
    df.columns = df.columns.str.strip()
    print(df.info())
    print(df.head())    
    return df
    
df = get_range_df(timestamp, results_path, runs)

MSL_varres_C-1_2021-12-07_09-57-23.txt
MSL_varres_C-2_2021-12-07_09-57-23.txt
MSL_varres_D-14_2021-12-07_09-57-23.txt
MSL_varres_D-15_2021-12-07_09-57-23.txt
MSL_varres_D-16_2021-12-07_09-57-23.txt
MSL_varres_F-4_2021-12-07_09-57-23.txt
MSL_varres_F-5_2021-12-07_09-57-23.txt
MSL_varres_F-7_2021-12-07_09-57-23.txt
MSL_varres_F-8_2021-12-07_09-57-23.txt
MSL_varres_M-1_2021-12-07_09-57-23.txt
MSL_varres_M-2_2021-12-07_09-57-23.txt
MSL_varres_M-3_2021-12-07_09-57-23.txt
MSL_varres_M-4_2021-12-07_09-57-23.txt
MSL_varres_M-5_2021-12-07_09-57-23.txt
MSL_varres_M-6_2021-12-07_09-57-23.txt
MSL_varres_M-7_2021-12-07_09-57-23.txt
MSL_varres_P-10_2021-12-07_09-57-23.txt
MSL_varres_P-11_2021-12-07_09-57-23.txt
MSL_varres_P-14_2021-12-07_09-57-23.txt
MSL_varres_P-15_2021-12-07_09-57-23.txt
MSL_varres_S-2_2021-12-07_09-57-23.txt
MSL_varres_T-12_2021-12-07_09-57-23.txt
MSL_varres_T-13_2021-12-07_09-57-23.txt
MSL_varres_T-4_2021-12-07_09-57-23.txt
MSL_varres_T-5_2021-12-07_09-57-23.txt
MSL_varres_T-8_2

In [3]:
run_cols = [str(x) for x in range(runs)]
norm=True
mdf = df[df['norm']==norm]
mdf = mdf.drop(['norm'], axis=1)
mdf['avg']=mdf.mean(axis=1, numeric_only=True)
mdf = mdf.drop(run_cols, axis=1)
mdf

Unnamed: 0,algo,name,score,machine,avg
0,VAR,MSL,fpr,C-1,
1,VAR,MSL,auc,C-1,
2,VAR,MSL,f1b,C-1,
3,FRO,MSL,fpr,C-1,0.823800
4,FRO,MSL,auc,C-1,0.546597
...,...,...,...,...,...
37,OP,SMD,auc,machine-3-9,
38,OP,SMD,f1b,machine-3-9,
39,RAND,SMD,fpr,machine-3-9,0.989400
40,RAND,SMD,auc,machine-3-9,0.499878


In [4]:
pdf = mdf.pivot_table(index=['score','name'], columns='algo')
# 
pdf.T

Unnamed: 0_level_0,score,auc,auc,auc,f1b,f1b,f1b,fpr,fpr,fpr
Unnamed: 0_level_1,name,MSL,SMAP,SMD,MSL,SMAP,SMD,MSL,SMAP,SMD
Unnamed: 0_level_2,algo,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
avg,AE,0.593596,0.559695,0.793371,0.903416,0.766973,0.962346,0.901173,0.876816,0.957939
avg,DBSCAN,0.595915,0.599298,0.50931,0.839678,0.757904,0.25425,0.868533,0.862137,0.955932
avg,FRL,0.569166,0.575646,0.744195,0.661416,0.694967,0.83892,0.886945,0.86535,0.955025
avg,FRO,0.560025,0.582322,0.74064,0.648379,0.491407,0.877493,0.87717,0.897479,0.962533
avg,FRR,0.561493,0.582064,0.763877,0.677051,0.702304,0.903101,0.873029,0.894358,0.957378
avg,GMM,0.513801,0.466463,0.83155,0.897686,0.726967,0.961826,0.882728,0.889756,0.954721
avg,GRU,0.531696,0.584771,0.750789,0.950269,0.9393,0.927031,0.918219,0.895818,0.959865
avg,IF,0.541549,0.576838,0.815855,0.847951,0.779187,0.907466,0.910027,0.866239,0.955579
avg,LSTM,0.53985,0.576682,0.748992,0.949883,0.937037,0.925303,0.913877,0.895063,0.957876
avg,OCSVM,0.543615,0.553906,0.752939,0.904696,0.826637,0.892739,0.878037,0.875024,0.955986


In [5]:
algos=['OCSVM', 'GMM', 'OP', 'FRO', 'FRL', 'FRR', 'IF', 'AE', 'VAE', 'LSTM','GRU', 'RAND']
pdf = pdf[[('avg', algos[i]) for i in range(len(algos))]]
pdf = pdf.round(2)
pdf.T

Unnamed: 0_level_0,score,auc,auc,auc,f1b,f1b,f1b,fpr,fpr,fpr
Unnamed: 0_level_1,name,MSL,SMAP,SMD,MSL,SMAP,SMD,MSL,SMAP,SMD
Unnamed: 0_level_2,algo,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
avg,OCSVM,0.54,0.55,0.75,0.9,0.83,0.89,0.88,0.88,0.96
avg,GMM,0.51,0.47,0.83,0.9,0.73,0.96,0.88,0.89,0.95
avg,OP,0.5,0.5,0.51,0.69,0.48,0.41,0.84,0.87,0.96
avg,FRO,0.56,0.58,0.74,0.65,0.49,0.88,0.88,0.9,0.96
avg,FRL,0.57,0.58,0.74,0.66,0.69,0.84,0.89,0.87,0.96
avg,FRR,0.56,0.58,0.76,0.68,0.7,0.9,0.87,0.89,0.96
avg,IF,0.54,0.58,0.82,0.85,0.78,0.91,0.91,0.87,0.96
avg,AE,0.59,0.56,0.79,0.9,0.77,0.96,0.9,0.88,0.96
avg,VAE,0.45,0.44,0.35,0.38,0.36,0.35,0.94,0.92,0.99
avg,LSTM,0.54,0.58,0.75,0.95,0.94,0.93,0.91,0.9,0.96
