# Pre-processing WADI dataset

Following the implementation of [Deng & Hooi (2021)](https://bhooi.github.io/papers/gdn_aaai2021.pdf), preprocessing consists of two steps: (1) min-max normalization of the data and (2) downsampling of the data.

## Import packages and dataset

The dataset is downloaded from [iTrust](https://itrust.sutd.edu.sg/); this notebook uses the <u>WADI.A1_9 Oct 2017</u> version.

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import pandas as pd
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
import os

In [2]:
# Locations of the raw WADI CSV files and of the generated output,
# all given relative to the directory this notebook runs in.
train_path = "wadi2017/WADI_14days.csv" ## Training dataset
test_path = "wadi2017/WADI_attackdata.csv"  ## Test dataset (attack labels are added further below)
output_dir = "../data/wadi"  ## Destination for the generated files

## Process Data

### Process Training Data

In [3]:
# Load the training CSV (column names are taken from row index 3), replace
# every NaN with 0 and discard the 'Row' counter column.
train = pd.read_csv(train_path, header=3).fillna(0).drop(columns=['Row'])

# Zero-pad each '/'-separated date field to at least two characters.
train['Date'] = train['Date'].apply(
    lambda raw: '/'.join(part.zfill(2) for part in raw.split('/'))
)

# Remove the '.000' fraction, then zero-pad each ':'-separated time field.
train['Time'] = train['Time'].apply(
    lambda raw: ':'.join(part.zfill(2) for part in raw.replace('.000', '').split(':'))
)

In [4]:
# Combine the date and 12-hour clock time strings and parse them in one
# vectorized call rather than invoking strptime once per row in Python.
train['datetime'] = pd.to_datetime(
    train['Date'] + ' ' + train['Time'], format='%m/%d/%Y %I:%M:%S %p'
)

In [None]:
# Sanity check: an independent dateutil parse of every row must agree with
# the values stored in the 'datetime' column.
reparsed = train.apply(
    lambda row: parser.parse(row.Date + ' ' + row.Time, fuzzy=True),
    axis=1,
)
assert train['datetime'].tolist() == reparsed.tolist()

In [None]:
# Put 'datetime' first, leave out the raw 'Date'/'Time' string columns and
# order the rows chronologically.
coi = ['datetime'] + [
    name for name in train.columns if name not in ('Date', 'Time', 'datetime')
]
train = train[coi].sort_values('datetime')

In [None]:
# Columns without a single observed value are filled with 0.
# fillna(..., inplace=True) returns None, so the filled copy must be assigned.
empty_cols = [col for col in train.columns if train[col].isnull().all()]
train[empty_cols] = train[empty_cols].fillna(0)

# Columns that are only partially missing are imputed with their own mean.
# Assign the result back: an in-place fillna on a column selection is not
# guaranteed to modify the parent frame.
for i in train.columns[train.isnull().any(axis=0)]:
    train[i] = train[i].fillna(train[i].mean())

__MINMAX normalization__

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Every column after the first one is scaled.
X_train = train.iloc[:, 1:].to_numpy()

# Fit on the training values only; `scaler` keeps the fitted state so the
# same transformation can be applied to other data afterwards.
scaler = MinMaxScaler().fit(X_train)
normalized_train_X = scaler.transform(X_train)

# Write the scaled values into a copy so `train` keeps the raw readings.
normalized_train = train.copy()
normalized_train.iloc[:, 1:] = normalized_train_X

__Downsample data__

In [None]:
# Collapse every 10 consecutive rows into a single row holding their median.
bucket_ids = np.arange(len(train)) // 10
filtered_train = normalized_train.groupby(bucket_ids).median()

# Rebuild a regular 10-second timestamp grid starting at the first row.
first_stamp = normalized_train.iloc[0, 0]
time = [first_stamp + timedelta(seconds=10 * k) for k in range(len(filtered_train))]
filtered_train['datetime'] = time

# Discard the first 2160 windows (2160 * 10 s = 6 hours) and the final
# window, then restore the column order.
filtered_train = filtered_train.iloc[2160:-1][coi]

In [None]:
# Rename: remove every occurrence of the Windows logging path from the
# column names so that only the bare sensor/actuator names remain.
import re

pat = re.escape('\\\\WIN-25J4RO10SBF\\LOG_DATA\\SUTD_WADI\\LOG_DATA\\')
path_re = re.compile(pat)

rename_col = [path_re.sub('', name) for name in filtered_train.columns]
filtered_train.columns = rename_col

In [None]:
# Drop 'datetime' together with the old row index, then expose a fresh
# 0-based running index as the 'timestamp' column.
final_train = (
    filtered_train
    .reset_index()
    .drop(columns=['datetime', 'index'])
    .reset_index()
    .rename(columns={'index': 'timestamp'})
)

### Process Test Data

__Inject anomaly labels based on the pdf table__

In [None]:
# (start, end) of every labelled attack; both ends are inclusive.
attack_windows = [
    (datetime(2017, 10, 9, 19, 25, 0), datetime(2017, 10, 9, 19, 50, 16)),     # Attack 1
    (datetime(2017, 10, 10, 10, 24, 10), datetime(2017, 10, 10, 10, 34, 0)),   # Attack 2
    (datetime(2017, 10, 10, 10, 55, 0), datetime(2017, 10, 10, 11, 24, 0)),    # Attack 3-4
    (datetime(2017, 10, 10, 11, 30, 40), datetime(2017, 10, 10, 11, 44, 50)),  # Attack 5
    (datetime(2017, 10, 10, 13, 39, 30), datetime(2017, 10, 10, 13, 50, 40)),  # Attack 6
    (datetime(2017, 10, 10, 14, 48, 17), datetime(2017, 10, 10, 14, 59, 55)),  # Attack 7
    (datetime(2017, 10, 10, 17, 40, 0), datetime(2017, 10, 10, 17, 49, 40)),   # Attack 8
    # Attack 9 takes place on 11 Oct. Dating it 10 Oct would place it fully
    # inside Attack 3-4, so it would contribute no labels of its own.
    # NOTE(review): confirm the date against the attack table shipped with the dataset.
    (datetime(2017, 10, 11, 10, 55, 0), datetime(2017, 10, 11, 10, 56, 27)),   # Attack 9
    (datetime(2017, 10, 11, 11, 17, 54), datetime(2017, 10, 11, 11, 31, 20)),  # Attack 10
    (datetime(2017, 10, 11, 11, 36, 31), datetime(2017, 10, 11, 11, 47, 0)),   # Attack 11
    (datetime(2017, 10, 11, 11, 59, 0), datetime(2017, 10, 11, 12, 5, 0)),     # Attack 12
    (datetime(2017, 10, 11, 12, 7, 30), datetime(2017, 10, 11, 12, 10, 52)),   # Attack 13
    (datetime(2017, 10, 11, 12, 16, 0), datetime(2017, 10, 11, 12, 25, 36)),   # Attack 14
    (datetime(2017, 10, 11, 15, 26, 30), datetime(2017, 10, 11, 15, 37, 0)),   # Attack 15
]

attacks = []
for start, end in attack_windows:
    # One timestamp per second from start to end, end included (hence the +1).
    tt = [start + timedelta(seconds=i) for i in range(int((end - start).total_seconds()) + 1)]
    attacks += tt

# Show the bounds of the first and of the last attack as a quick visual check.
print(attack_windows[0][0], attack_windows[0][1])
print(tt[0], tt[-1])

# A set gives O(1) membership tests when the rows are labelled later on.
attacks_set = set(attacks)

__Import Test Data__

In [None]:
# Load the test CSV and discard the 'Row' counter column. NaNs are not
# zero-filled at load time here.
test = pd.read_csv(test_path).drop(columns=['Row'])

# Zero-pad each '/'-separated date field to at least two characters.
test['Date'] = test['Date'].apply(
    lambda raw: '/'.join(part.zfill(2) for part in raw.split('/'))
)

# Remove the '.000' fraction, then zero-pad each ':'-separated time field.
test['Time'] = test['Time'].apply(
    lambda raw: ':'.join(part.zfill(2) for part in raw.replace('.000', '').split(':'))
)

In [None]:
# Combine the date and 12-hour clock time strings and parse them in one
# vectorized call rather than invoking strptime once per row in Python.
test['datetime'] = pd.to_datetime(
    test['Date'] + ' ' + test['Time'], format='%m/%d/%Y %I:%M:%S %p'
)

In [None]:
# Columns without a single observed value are filled with 0.
empty_cols = [col for col in test.columns if test[col].isnull().all()]
test[empty_cols] = test[empty_cols].fillna(0)

# Columns that are only partially missing are imputed with their own mean.
# Assign the result back: an in-place fillna on a column selection is not
# guaranteed to modify the parent frame.
for i in test.columns[test.isnull().any(axis=0)]:
    test[i] = test[i].fillna(test[i].mean())

In [None]:
def attacked(datetime,datetime_list):
    """Return 1 if `datetime` is contained in `datetime_list`, otherwise 0.

    `datetime_list` may be any container that supports the `in` operator.
    """
    return int(datetime in datetime_list)

# Label every row: 1 when its timestamp is one of the attack seconds, else 0.
test['attack'] = test['datetime'].map(lambda stamp: attacked(stamp, attacks_set))

In [None]:
# Use the training frame's columns (minus any 'Date'/'Time'/'datetime'),
# with 'datetime' first and the 'attack' label last, then sort the rows
# chronologically.
feature_cols = [
    name for name in train.columns if name not in ('Date', 'Time', 'datetime')
]
coi = ['datetime'] + feature_cols + ['attack']
test = test[coi].sort_values('datetime')

#test

__Minmax Normalization__

In [None]:
## NORMALIZATION
# Scale everything between the first column and the last column using the
# already-fitted `scaler`; no re-fitting happens here.
feature_slice = slice(1, -1)
X_test = test.iloc[:, feature_slice].to_numpy()
normalized_test_X = scaler.transform(X_test)

# Write the scaled values into a copy so `test` keeps the raw readings.
normalized_test = test.copy()
normalized_test.iloc[:, feature_slice] = normalized_test_X

__Downsample__

In [None]:
# Group every 10 consecutive rows (all columns except the first) into one
# window and compute both the per-window median and the per-window maximum.
test_windows = normalized_test.iloc[:, 1:].groupby(
    np.arange(len(test.iloc[:, 1:])) // 10
)
filtered_test = test_windows.median()
max_ftest = test_windows.max()

In [None]:
# Drop the final 10-row window, as is done for the training data.
final_test = filtered_test.iloc[:-1, :].copy()

# The median of ten 0/1 labels can be 0.5; round() maps it back to a
# whole-number label.
final_test['attack'] = final_test['attack'].round()

# Rebuild a regular 10-second timestamp grid. The start is taken
# positionally (.iloc[0]) so it is the first row of the sorted frame, the
# same way the training grid is anchored, not whichever row has label 0.
first_stamp = test['datetime'].iloc[0]
final_test['datetime'] = [
    first_stamp + timedelta(seconds=10 * k) for k in range(len(final_test))
]

## Renaming
# Remove every occurrence of the Windows logging path from the column names.
import re
pat = re.escape('\\\\WIN-25J4RO10SBF\\LOG_DATA\\SUTD_WADI\\LOG_DATA\\')

rename_col = [re.sub(pat, '', name) for name in final_test.columns]
final_test.columns = rename_col

## Sort columns
# Move 'datetime' to the front and keep the other columns in their order.
sort_col = ['datetime'] + [name for name in final_test if name != 'datetime']
final_test = final_test[sort_col]

In [None]:
# Drop 'datetime' together with the old row index, then expose a fresh
# 0-based running index as the 'timestamp' column.
final_test = (
    final_test
    .reset_index()
    .drop(columns=['datetime', 'index'])
    .reset_index()
    .rename(columns={'index': 'timestamp'})
)

## Generate Output Files 

__Define functions for data generation__

In [None]:

def generate_graph_seq2seq_io_data(
        df, x_offsets, y_offsets, scaler=None):
    """
    Cut a frame of per-step readings into sliding-window (input, target) pairs.

    Columns named 'attack' and 'timestamp' are dropped if present; every
    remaining column becomes one node carrying a single feature.

    :param df: DataFrame with one row per time step.
    :param x_offsets: numpy integer array of row offsets, relative to the
        anchor step t, that make up one input window.
    :param y_offsets: numpy integer array of row offsets, relative to the
        anchor step t, that make up one target window.
    :param scaler: unused; accepted only so existing callers keep working.
    :return: tuple (x, y)
    # x: (num_windows, len(x_offsets), num_nodes, 1)
    # y: (num_windows, len(y_offsets), num_nodes, 1)
    :raises ValueError: if df has too few rows to form a single window.
    """
    # Label and index columns are not model inputs.
    df = df.drop(columns=[c for c in ('attack', 'timestamp') if c in df.columns])

    num_samples = df.shape[0]
    # Trailing axis of size 1 is the per-node feature dimension.
    data = np.expand_dims(df.values, axis=-1)

    # t is the index of the last observation of an input window.
    min_t = abs(min(x_offsets))
    max_t = num_samples - abs(max(y_offsets))  # Exclusive
    if max_t <= min_t:
        raise ValueError(
            "Not enough rows ({}) to build a window with the given offsets".format(num_samples)
        )

    anchors = range(min_t, max_t)
    x = np.stack([data[t + x_offsets, ...] for t in anchors], axis=0)
    y = np.stack([data[t + y_offsets, ...] for t in anchors], axis=0)
    return x, y


def generate_train_val_test(df,test_size,val_ratio,window_size,output_dir):
    """
    Window `df`, split the windows into train/val/test and save them as .npz.

    The last `test_size` windows form the test split. Of the remaining
    windows, the first round((1 - val_ratio) * remaining) form the training
    split and the rest form the validation split. Each split is written to
    `<output_dir>/<split>.npz` with arrays x, y, x_offsets and y_offsets.

    :param df: DataFrame with one row per time step.
    :param test_size: number of trailing windows reserved for testing.
    :param val_ratio: fraction of the non-test windows used for validation.
    :param window_size: number of consecutive steps in each input window.
    :param output_dir: directory for the .npz files; created if missing.
    :raises ValueError: if test_size is negative or exceeds the window count.
    """
    df = df.reset_index(drop=True)
    # Input window: the anchor step and the (window_size - 1) steps before it.
    x_offsets = np.arange(-(window_size - 1), 1, 1)
    # Target window: the single step right after the anchor.
    y_offsets = np.arange(1, 2, 1)
    # x: (num_samples, input_length, num_nodes, input_dim)
    # y: (num_samples, output_length, num_nodes, output_dim)
    x, y = generate_graph_seq2seq_io_data(
        df,
        x_offsets=x_offsets,
        y_offsets=y_offsets,
    )

    print("x shape: ", x.shape, ", y shape: ", y.shape)

    num_samples = x.shape[0]
    num_test = test_size
    if not 0 <= num_test <= num_samples:
        raise ValueError(
            "test_size must be between 0 and {}, got {}".format(num_samples, num_test)
        )
    train_samples = num_samples - num_test
    num_train = int(round(train_samples * (1-val_ratio)))
    num_val = train_samples - num_train

    # Slice the test split from an explicit start index: x[-num_test:] would
    # return every sample when num_test == 0.
    test_start = num_samples - num_test
    splits = {
        "train": (x[:num_train], y[:num_train]),
        "val": (x[num_train: num_train + num_val], y[num_train: num_train + num_val]),
        "test": (x[test_start:], y[test_start:]),
    }

    # Make sure the destination exists before writing into it.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for cat in ["train", "val","test"]:
        _x, _y = splits[cat]
        print(cat, "x: ", _x.shape, "y:", _y.shape)
        np.savez_compressed(
            os.path.join(output_dir, "%s.npz" % cat),
            x=_x,
            y=_y,
            x_offsets=x_offsets.reshape(list(x_offsets.shape) + [1]),
            y_offsets=y_offsets.reshape(list(y_offsets.shape) + [1]),
        )

__Generate Data__

In [None]:
# Parameters for data generation
val_ratio = 0.1    # fraction of the non-test windows held out for validation
window_size = 5    # number of consecutive steps in each input window
# Output dir
output_dir = '../data/wadi'
# Create the directory (and any missing parents) unless it already exists.
# Checking for the substring 'wadi' in the path would never trigger here.
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

In [None]:
## Generate anomaly labels
# Write the per-window attack labels as a single comma-separated line.
labels = final_test.attack.tolist()
with open(output_dir+'/anomaly_labels.txt','w') as label_file:
    label_file.write(','.join(map(str, labels)))

# The labels are stored separately, so the column is removed from the frame.
final_test = final_test.drop(columns=['attack'])

In [None]:
# Stack the training rows on top of the test rows with a fresh 0-based index.
# pd.concat replaces DataFrame.append, which is no longer available in pandas 2.x.
data = pd.concat([final_train, final_test], ignore_index=True)

In [None]:
## Generate final dataset
# The number of rows in final_test is passed as the size of the test split.
generate_train_val_test(
    df=data,
    test_size=len(final_test),
    val_ratio=val_ratio,
    window_size=window_size,
    output_dir=output_dir,
)