In [2]:
import pandas as pd

from orion.data import load_signal
from orion import Orion
from orion.data import load_anomalies

from mlprimitives.custom.timeseries_preprocessing import time_segments_aggregate
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from mlprimitives.custom.timeseries_preprocessing import rolling_window_sequences
from orion.primitives.timeseries_preprocessing import slice_array_by_dims
from mlprimitives import load_primitive

import os
import numpy as np
from pandas import Timestamp
import pickle
import json
import glob


cwd = os.getcwd()

# Dataset

Input - timestamp, values, exogenous variables

Output - start, end anomalous intervals, severity

## Loading the dataset

In [3]:
X_train = load_signal('multivariate/S-1-train')
X_test = load_signal('multivariate/S-1-test')

X_train.shape, X_test.shape

((2818, 26), (7331, 26))

In [4]:
with open('processed/SWaT.pickle', 'rb') as f:
    dataset = pickle.load(f)
    
interval = 25 * 60

FileNotFoundError: [Errno 2] No such file or directory: 'processed/SWaT.pickle'

In [4]:
split = len(dataset['X']) // 3
X_train = dataset['X'][:split]
X_train['timestamp'] = X_train.index
y_train = dataset['y'][:split]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## Preprocessing

In [5]:
# Creates an equi-spaced time series by aggregating values over fixed specified interval
# Food for thought, we don't need to bin it by set intervals?
params = {
    "time_column": "timestamp", 
    "interval": 21600, 
    # "interval": interval,
    "method": "mean"
}
primitive = load_primitive('mlprimitives.custom.timeseries_preprocessing.time_segments_aggregate', arguments=params)
X, index = primitive.produce(X=X_train)

X.shape, index.shape

VersionConflict: (numpy 1.21.2 (/Users/lcwong/opt/anaconda3/envs/orion-env/lib/python3.7/site-packages), Requirement.parse('numpy<1.17,>=1.15.4'))

In [6]:
# This primitive is an imputation transformer for filling missing values
params = {
    'X': X
}
primitive = load_primitive('sklearn.impute.SimpleImputer', arguments=params)
primitive.fit()
X = primitive.produce(X=X)

X.shape

NameError: name 'X' is not defined

In [24]:
# This primitive transforms features by scaling each feature to a given range
params = {
    "feature_range": [-1, 1], 
    'X': X,
}
primitive = load_primitive('sklearn.preprocessing.MinMaxScaler', arguments=params)
primitive.fit()
X = primitive.produce(X=X)

X.shape

(2818, 25)


In [25]:
# Uses a rolling window approach to create the sub-sequences out of time series data
params = {
    "target_column": 0, 
    "window_size": 100, 
    'target_size': 1, 
    'step_size': 1
}
primitive = load_primitive('mlprimitives.custom.timeseries_preprocessing.rolling_window_sequences',
                           arguments=params)
X, y, index, target_index = primitive.produce(X=X, index=index)


# Target / target size is the next interval that is trying to predict.
# Index is the start of the interval
X.shape, y.shape, index.shape, target_index.shape

((2718, 100, 25), (2718, 1), (2718,), (2718,))

In [38]:
# Target
params = {
    "target_index": 0, 
    "axis": 2
}
primitive = load_primitive('orion.primitives.timeseries_preprocessing.slice_array_by_dims',
                           arguments=params)
y = primitive.produce(X=X)

# Trying to predict the target sequence which is the first column of X
X.shape, y.shape

((2718, 100, 25), (2718, 100, 1))

## Saving the Data

In [45]:
np.save('processed/data/X', X)
np.save('processed/data/y', y)
np.save('processed/data/index', index)

# NASA

In [21]:
df = pd.read_csv('datasets/NASA/source/labeled_anomalies.csv')
df.head()

Unnamed: 0,chan_id,spacecraft,anomaly_sequences,class,num_values
0,P-1,SMAP,"[[2149, 2349], [4536, 4844], [3539, 3779]]","[contextual, contextual, contextual]",8505
1,S-1,SMAP,"[[5300, 5747]]",[point],7331
2,E-1,SMAP,"[[5000, 5030], [5610, 6086]]","[contextual, contextual]",8516
3,E-2,SMAP,"[[5598, 6995]]",[point],8532
4,E-3,SMAP,"[[5094, 8306]]",[point],8307


In [132]:
MSL_dataset = {
    'train': [],
    'test': [],
    'anomaly': [],
}

SMAP_dataset = {
    'train': [],
    'test': [],
    'anomaly': [],
}

for split in ['train', 'test']:

    for filepath in glob.glob(f'datasets/NASA/source/{split}/**.npy'):
        filename = os.path.basename(filepath)
        signal, _ = filename.split('.')

        info = df[df.chan_id == signal]
    
        if len(info) > 0:
            dataset = pd.DataFrame(np.load(filepath))
            dataset = dataset.reset_index() 
            
            # Metadata
            original_columns = dataset.columns
            dataset['source'] = 'NASA'
            dataset['name'] = info['spacecraft'].iloc[0]
            dataset['signal'] = signal
            
            # Anomaly points
            anomaly_points = [0] * len(dataset)
            if split == 'test':
                for interval in eval(info.anomaly_sequences.iloc[0]):
                    for i in range(interval[0], interval[1]):
                        if i < len(anomaly_points):
                            anomaly_points[i] = 1
            dataset['anomaly'] = anomaly_points
            dataset = dataset[list(dataset.columns[-4:]) + list(dataset.columns[:-4])]
            
            # Anomaly intervals
            anomaly_interval = pd.DataFrame(eval(info.anomaly_sequences.iloc[0]))
            anomaly_interval.columns = ['start', 'end']
            anomaly_interval['source'] = 'NASA'
            anomaly_interval['name'] = name
            anomaly_interval['signal'] = signal
            anomaly_interval = anomaly_interval[list(anomaly_interval.columns[-3:]) + list(anomaly_interval.columns[:-3])]

            if info['spacecraft'].iloc[0] == 'SMAP':
                SMAP_dataset[split].append(dataset)
                if split == 'test':
                    SMAP_dataset['anomaly'].append(anomaly_interval)
            elif info['spacecraft'].iloc[0] == 'MSL':
                MSL_dataset[split].append(dataset)
                if split == 'test':
                    MSL_dataset['anomaly'].append(anomaly_interval)

SMAP_dataset['train'] = pd.concat(SMAP_dataset['train'])
SMAP_dataset['test'] = pd.concat(SMAP_dataset['test'])
SMAP_dataset['anomaly'] = pd.concat(SMAP_dataset['anomaly']).drop_duplicates()
MSL_dataset['train'] = pd.concat(MSL_dataset['train'])
MSL_dataset['test'] = pd.concat(MSL_dataset['test'])
MSL_dataset['anomaly'] = pd.concat(MSL_dataset['anomaly']).drop_duplicates()

In [224]:
def print_summary(train, test, anomaly):
    print('# channels', len(train.columns[5:]))
    print('# contextual', len(anomaly))
    print('# Anomaly', train.anomaly.value_counts().get(1, 0) + test.anomaly.value_counts().get(1, 0))
    print('# Data', len(train) + len(test))

In [225]:
SMAP_dataset['train'].head()

Unnamed: 0,source,name,signal,anomaly,index,0,1,2,3,4,...,15,16,17,18,19,20,21,22,23,24
0,NASA,SMAP,P-7,0,0,0.450982,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,NASA,SMAP,P-7,0,1,0.411767,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,NASA,SMAP,P-7,0,2,0.411767,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,NASA,SMAP,P-7,0,3,0.372547,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,NASA,SMAP,P-7,0,4,0.333332,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [214]:
SMAP_dataset['anomaly'].head()

Unnamed: 0,source,name,signal,start,end
0,NASA,SMAP,P-7,4950,6600
0,NASA,SMAP,E-8,5400,6022
0,NASA,SMAP,T-3,2098,2180
1,NASA,SMAP,T-3,5200,5300
0,NASA,SMAP,T-2,6840,8624


In [226]:
print_summary(SMAP_dataset['train'], SMAP_dataset['test'], SMAP_dataset['anomaly'])

# channels 25
# contextual 68
# Anomaly 55854
# Data 573830


In [228]:
print_summary(MSL_dataset['train'], MSL_dataset['test'], MSL_dataset['anomaly'])

# channels 55
# contextual 36
# Anomaly 7730
# Data 132046


In [161]:
with open('processed/datasets/MSL.pickle', 'wb') as f:
    pickle.dump(MSL_dataset, f)

In [162]:
with open('processed/datasets/SMAP.pickle', 'wb') as f:
    pickle.dump(SMAP_dataset, f)

# SWaT

In [181]:
df = pd.read_excel('datasets/SWaT/A1_A2_2015_dec/SWaT_Dataset_Attack_v0.xlsx')

In [182]:
df.columns = df.iloc[0]
df = df.iloc[1:]
df = df.set_index(pd.to_datetime(df[' Timestamp'], dayfirst=True))

In [184]:
df.head()

Unnamed: 0_level_0,Timestamp,FIT101,LIT101,MV101,P101,P102,AIT201,AIT202,AIT203,FIT201,...,P501,P502,PIT501,PIT502,PIT503,FIT601,P601,P602,P603,Normal/Attack
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-12-28 10:00:00,28/12/2015 10:00:00 AM,2.42706,522.847,2,2,1,262.016,8.39644,328.634,2.44539,...,2,1,250.865,1.64995,189.599,0.000128152,1,1,1,Normal
2015-12-28 10:00:01,28/12/2015 10:00:01 AM,2.44627,522.886,2,2,1,262.016,8.39644,328.634,2.44539,...,2,1,250.865,1.64995,189.679,0.000128152,1,1,1,Normal
2015-12-28 10:00:02,28/12/2015 10:00:02 AM,2.48919,522.847,2,2,1,262.016,8.39451,328.634,2.44232,...,2,1,250.881,1.64995,189.679,0.000128152,1,1,1,Normal
2015-12-28 10:00:03,28/12/2015 10:00:03 AM,2.53435,522.965,2,2,1,262.016,8.39451,328.634,2.44232,...,2,1,250.881,1.64995,189.615,0.000128152,1,1,1,Normal
2015-12-28 10:00:04,28/12/2015 10:00:04 AM,2.56926,523.475,2,2,1,262.016,8.39451,328.634,2.44308,...,2,1,250.881,1.64995,189.503,0.000128152,1,1,1,Normal


In [185]:
raw_abnormalities = """
28/12/2015 10:29:14	10:44:53
28/12/2015 10:51:08	10:58:30 
28/12/2015 11:22:00	11:28:22
28/12/2015 11:47:39	11:54:08
28/12/2015 12:00:55	12:04:10
28/12/2015 12:08:25	12:15:33
28/12/2015 13:10:10	13:26:13
28/12/2015 14:16:20	14:19:00
28/12/2015 14:19:00	14:28:20
29/12/2015 11:11:25	11:15:17
29/12/2015 11:35:40	11:42:50
29/12/2015 11:57:25	12:02:00
29/12/2015 14:38:12	14:50:08
29/12/2015 18:10:43	18:15:01
29/12/2015 18:15:43	18:22:17
29/12/2015 18:30:00	18:42:00
29/12/2015 22:55:18	23:03:00
30/12/2015 01:42:34	1:54:10
30/12/2015 09:51:08	9:56:28
30/12/2015 10:01:50	10:12:01
30/12/2015 17:04:56	17:29:00
31/12/2015 01:17:08	1:45:18
31/12/2015 01:45:19	11:15:27
31/12/2015 15:32:00	15:34:00
31/12/2015 15:47:40	16:07:10
31/12/2015 22:05:34	22:11:40
1/01/2016 10:36:00	10:46:00
1/01/2016 14:21:12	14:28:35
1/01/2016 17:12:40	17:14:20
1/01/2016 17:18:56	17:26:56
1/01/2016 22:16:01	22:25:00
2/01/2015 11:17:02	11:24:50
2/01/2015 11:31:38	11:36:18
2/01/2015 11:43:48	11:50:28
2/01/2015 11:51:42	11:56:38
2/01/2015 13:13:02	13:40:56
"""

In [215]:
split = raw_abnormalities.split()
abnormalities = {
    'start': [],
    'end': []
}
for i in range(0, len(split), 3):
    date, start, end = split[i], split[i+1], split[i+2]
    abnormalities['start'].append(date + ' ' + start)
    abnormalities['end'].append(date + ' ' + end)
    
abnormalities = pd.DataFrame(abnormalities)

abnormalities['source'] = 'SUTD'
abnormalities['name'] = 'SWaT'
abnormalities['signal'] = None
abnormalities['start'] = pd.to_datetime(abnormalities['start'], dayfirst=True)
abnormalities['end'] = pd.to_datetime(abnormalities['end'], dayfirst=True)

abnormalities = abnormalities[list(abnormalities.columns[-3:]) + list(abnormalities.columns[:-3])]

abnormalities.head()

Unnamed: 0,source,name,signal,start,end
0,SUTD,SWaT,,2015-12-28 10:29:14,2015-12-28 10:44:53
1,SUTD,SWaT,,2015-12-28 10:51:08,2015-12-28 10:58:30
2,SUTD,SWaT,,2015-12-28 11:22:00,2015-12-28 11:28:22
3,SUTD,SWaT,,2015-12-28 11:47:39,2015-12-28 11:54:08
4,SUTD,SWaT,,2015-12-28 12:00:55,2015-12-28 12:04:10


In [187]:
y = []
for idx, abnormality in abnormalities.iterrows():
    sub_df = pd.DataFrame(pd.date_range(abnormality['start'], abnormality['end'], freq='1s'))
    sub_df['target'] = 1
    y.append(sub_df)
y = pd.concat(y).set_index(0)

In [209]:
final_df = df.iloc[:, 1:52].reset_index()
final_df = final_df.rename(columns={' Timestamp': 'index'})

final_df['source'] = 'SUTD'
final_df['name'] = 'SWaT'
final_df['signal'] = None
final_df['anomaly'] = df.merge(y, left_index=True, right_index=True, how='left').drop_duplicates()['target'].fillna(0).values

In [210]:
final_df = final_df[list(final_df.columns[-4:]) + list(final_df.columns[:-4])]

In [219]:
split = len(final_df) * 2 // 3
train = final_df.iloc[:split]
test = final_df.iloc[split:]

In [221]:
SWAT_dataset = {
    'train': train,
    'test': test,
    'anomaly': abnormalities
}

In [229]:
print_summary(SWAT_dataset['train'], SWAT_dataset['test'], SWAT_dataset['anomaly'])

# channels 51
# contextual 36
# Anomaly 50762
# Data 449919


In [230]:
with open('processed/datasets/SWaT.pickle', 'wb') as f:
    pickle.dump(SWAT_dataset, f)

In [231]:
with open('processed/datasets/SWaT.pickle', 'rb') as f:
    SWAT_dataset_2 = pickle.load(f)

# WADI

In [235]:
df = pd.read_csv('datasets/WADI/A1_2017_oct/WADI_attackdataLABLE.csv', header=1)

In [236]:
df = df[:-3]

In [237]:
hour = []
counter = -1 
for t in df.Time:
    if t == '00:00.0':
        counter += 1
    if counter % 24 < 10:
        hour.append(f'0{counter % 24}')
    else:
        hour.append(str(counter % 24))
df['hour'] = hour

In [238]:
datetime = (df['Date '] + ' ' + df['hour']+ ':' + df['Time']).apply(lambda x: str(x)[:-2])
df = df.set_index(pd.to_datetime(datetime, dayfirst=True))

In [239]:
raw_abnormalities = """
9/10/17 19:25:00 19:50:16
10/10/17 10:24:10 10:34:00
10/10/17 10:55:00 11:24:00
10/10/17 11:30:40 11:44:50
10/10/17 13:39:30 13:50:40
10/10/17 14:48:17 14:59:55
10/10/17 17:40:00 17:49:40
11/10/17 10:55:00 10:56:27
11/10/17 11:17:54 11:31:20
11/10/17 11:36:31 11:47:00
11/10/17 11:59:00 12:05:00
11/10/17 12:07:30 12:10:52
11/10/17 12:16:00 12:25:36
11/10/17 15:26:30 15:37:00
"""

In [241]:
split = raw_abnormalities.split()
abnormalities = {
    'start': [],
    'end': []
}
for i in range(0, len(split), 3):
    date, start, end = split[i], split[i+1], split[i+2]
    abnormalities['start'].append(date + ' ' + start)
    abnormalities['end'].append(date + ' ' + end)
    
abnormalities = pd.DataFrame(abnormalities)
abnormalities['source'] = 'SUTD'
abnormalities['name'] = 'WADI'
abnormalities['signal'] = None
abnormalities['start'] = pd.to_datetime(abnormalities['start'], dayfirst=True)
abnormalities['end'] = pd.to_datetime(abnormalities['end'], dayfirst=True)

abnormalities = abnormalities[list(abnormalities.columns[-3:]) + list(abnormalities.columns[:-3])]

abnormalities.head()

Unnamed: 0,source,name,signal,start,end
0,SUTD,WADI,,2017-10-09 19:25:00,2017-10-09 19:50:16
1,SUTD,WADI,,2017-10-10 10:24:10,2017-10-10 10:34:00
2,SUTD,WADI,,2017-10-10 10:55:00,2017-10-10 11:24:00
3,SUTD,WADI,,2017-10-10 11:30:40,2017-10-10 11:44:50
4,SUTD,WADI,,2017-10-10 13:39:30,2017-10-10 13:50:40


In [242]:
y = []
for idx, abnormality in abnormalities.iterrows():
    sub_df = pd.DataFrame(pd.date_range(abnormality['start'], abnormality['end'], freq='1s'))
    sub_df['target'] = 1
    y.append(sub_df)
y = pd.concat(y).set_index(0)

In [244]:
final_df = df.iloc[:, 3:130].fillna(0).reset_index()
final_df = final_df.rename(columns={' Timestamp': 'index'})

final_df['source'] = 'SUTD'
final_df['name'] = 'WADI'
final_df['signal'] = None
final_df['anomaly'] = df.merge(y, left_index=True, right_index=True, how='left').drop_duplicates()['target'].fillna(0).values

In [245]:
final_df = final_df[list(final_df.columns[-4:]) + list(final_df.columns[:-4])]

In [247]:
split = len(final_df) * 2 // 3
train = final_df.iloc[:split]
test = final_df.iloc[split:]

In [248]:
WADI_dataset = {
    'train': train,
    'test': test,
    'anomaly': abnormalities
}

In [249]:
print_summary(WADI_dataset['train'], WADI_dataset['test'], WADI_dataset['anomaly'])

# channels 127
# contextual 14
# Anomaly 5134
# Data 172800


In [250]:
with open('processed/datasets/WADI.pickle', 'wb') as f:
    pickle.dump(WADI_dataset, f)

# Yahoo

In [311]:
A1_dataset = {
    'train': [],
    'test': [],
    'anomaly': [],
}

for filepath in glob.glob(f'datasets/YAHOO/A1Benchmark/**.csv'):
    df = pd.read_csv(filepath)
    df['source'] = 'Yahoo'
    df['name'] = 'A1'
    
    filename = os.path.basename(filepath)
    signal, _ = filename.split('.')
    df['signal'] = signal
    df = df[['source', 'name', 'signal', 'is_anomaly', 'timestamp', 'value']]
    df = df.rename(columns={'is_anomaly': 'anomaly', 'timestamp': 'index'})

    anomaly_index = []
    start = None
    prev = None
    for index, anomaly in zip(df['index'], df.anomaly):
        if start is None and anomaly == 1:
            start = [index]
        elif start is not None and anomaly == 0:
            start.append(prev)
            anomaly_index.append(start)
            start = None
        elif start is not None and index == len(df):
            start.append(index)
            anomaly_index.append(start)
        prev = index
        

    anomaly_index = pd.DataFrame(anomaly_index)
    if len(anomaly_index.columns == 2):
        anomaly_index.columns = ['start', 'end']
        anomaly_index['source'] = 'Yahoo'
        anomaly_index['name'] = 'A1'
        anomaly_index['signal'] = signal
        anomaly_index = anomaly_index[['source', 'name', 'signal', 'start', 'end']]
        
    split = (len(df) * 2) // 3
    train, test = df.iloc[:split], df.iloc[split:]
    A1_dataset['train'].append(train)
    A1_dataset['test'].append(test)
    A1_dataset['anomaly'].append(anomaly_index)

A1_dataset['train'] = pd.concat(A1_dataset['train'])
A1_dataset['test'] = pd.concat(A1_dataset['test'])
A1_dataset['anomaly'] = pd.concat(A1_dataset['anomaly'])

In [312]:
print_summary(A1_dataset['train'], A1_dataset['test'], A1_dataset['anomaly'])

# channels 1
# contextual 177
# Anomaly 1669
# Data 94866


In [313]:
b = sum(A1_dataset['anomaly']['start'] == A1_dataset['anomaly']['end'])
a = len(A1_dataset['anomaly'])
print(a, b, a-b)

177 67 110


In [314]:
with open('processed/datasets/A2.pickle', 'wb') as f:
    pickle.dump(A1_dataset, f)

In [324]:
A3_dataset = {
    'train': [],
    'test': [],
    'anomaly': [],
}

for filepath in glob.glob(f'datasets/YAHOO/A4Benchmark/A4Benchmark-**.csv'):
    df = pd.read_csv(filepath)

    df['source'] = 'Yahoo'
    df['name'] = 'A4'
    
    filename = os.path.basename(filepath)
    signal, _ = filename.split('.')
    df['signal'] = signal
    df = df[['source', 'name', 'signal', 'anomaly', 'timestamps', 'value', 'changepoint', 'trend', 'noise',
       'seasonality1', 'seasonality2', 'seasonality3']]
    df = df.rename(columns={'is_anomaly': 'anomaly', 'timestamps': 'index'})

    anomaly_index = []
    start = None
    prev = None
    for index, anomaly in zip(df['index'], df.anomaly):
        if start is None and anomaly == 1:
            start = [index]
        elif start is not None and anomaly == 0:
            start.append(prev)
            anomaly_index.append(start)
            start = None
        elif start is not None and index == len(df):
            start.append(index)
            anomaly_index.append(start)
        prev = index
        

    anomaly_index = pd.DataFrame(anomaly_index)
    if len(anomaly_index.columns == 2):
        anomaly_index.columns = ['start', 'end']
        anomaly_index['source'] = 'Yahoo'
        anomaly_index['name'] = 'A4'
        anomaly_index['signal'] = signal
        anomaly_index = anomaly_index[['source', 'name', 'signal', 'start', 'end']]
        
    split = (len(df) * 2) // 3
    train, test = df.iloc[:split], df.iloc[split:]
    A3_dataset['train'].append(train)
    A3_dataset['test'].append(test)
    A3_dataset['anomaly'].append(anomaly_index)

A3_dataset['train'] = pd.concat(A3_dataset['train'])
A3_dataset['test'] = pd.concat(A3_dataset['test'])
A3_dataset['anomaly'] = pd.concat(A3_dataset['anomaly'])

In [325]:
print_summary(A3_dataset['train'], A3_dataset['test'], A3_dataset['anomaly'])

# channels 7
# contextual 834
# Anomaly 837
# Data 168000


In [326]:
b = sum(A3_dataset['anomaly']['start'] == A3_dataset['anomaly']['end'])
a = len(A3_dataset['anomaly'])
print(a, b, a-b)

834 832 2


In [327]:
with open('processed/datasets/A4.pickle', 'wb') as f:
    pickle.dump(A3_dataset, f)

# NAB

In [376]:
with open('datasets/NAB/labels/combined_windows.json', 'rb') as f:
    labels = json.load(f)

In [377]:
dataset = {}
for filepath, anomalies in labels.items():
    category, filename = filepath.split('/')
    if len(anomalies) != 0:
        dataset.setdefault(category, dict())
        dataset[category].setdefault(filename, [])
        dataset[category][filename].extend(anomalies)

In [413]:
Artificial_dataset = {
    'train': [],
    'test': [],
    'anomaly': [],
}

for filepath in glob.glob(f'datasets/NAB/data/realTweets/**.csv'):
    df = pd.read_csv(filepath)
    df = df.rename(columns={'timestamp': 'index'})
    df['source'] = 'NAB'
    df['name'] = 'Tweets'
    
    filename = os.path.basename(filepath)
    signal, _ = filename.split('.')
    df['signal'] = signal
    
    df['anomaly'] = 0
    for interval in dataset['realTweets'].get(filename, []):
        cond = (interval[0] <= df['index']) & (df['index'] <= interval[1])
        df.loc[cond, 'anomaly'] = df.loc[cond, 'anomaly'] + 1
    
    df = df[['source', 'name', 'signal', 'anomaly', 'index', 'value']]
    
    
    anomaly_index = pd.DataFrame(dataset['realTweets'].get(filename, []))
    if len(anomaly_index.columns == 2):
        anomaly_index.columns = ['start', 'end']
        anomaly_index['source'] = 'NAB'
        anomaly_index['name'] = 'Tweets'
        anomaly_index['signal'] = signal
        anomaly_index = anomaly_index[['source', 'name', 'signal', 'start', 'end']]
        
    
    
    split = (len(df) * 2) // 3
    train, test = df.iloc[:split], df.iloc[split:]
    Artificial_dataset['train'].append(train)
    Artificial_dataset['test'].append(test)
    Artificial_dataset['anomaly'].append(anomaly_index)

    
Artificial_dataset['train'] = pd.concat(Artificial_dataset['train'])
Artificial_dataset['test'] = pd.concat(Artificial_dataset['test'])
Artificial_dataset['anomaly'] = pd.concat(Artificial_dataset['anomaly'])

In [414]:
print_summary(Artificial_dataset['train'], Artificial_dataset['test'], Artificial_dataset['anomaly'])

# channels 1
# contextual 33
# Anomaly 15618
# Data 158631


In [416]:
b = sum(Artificial_dataset['anomaly']['start'] == Artificial_dataset['anomaly']['end'])
a = len(Artificial_dataset['anomaly'])
print(a, b, a-b)

33 0 33


In [417]:
with open('processed/datasets/realTweets.pickle', 'wb') as f:
    pickle.dump(Artificial_dataset, f)