In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Raw Data

In [2]:
# Load the raw training CSV; `index_col=0` turns the file's first column into the row index.
raw = pd.read_csv("raw_dataset/train.csv", index_col=0)

In [3]:
# Preview the first five rows of the raw frame.
raw.head(5)

Unnamed: 0_level_0,FIT101,LIT101,MV101,P101,P102,AIT201,AIT202,AIT203,FIT201,MV201,...,P501,P502,PIT501,PIT502,PIT503,FIT601,P601,P602,P603,Normal/Attack
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22/12/2015 4:30:00 PM,0.0,124.3135,1,1,1,251.9226,8.313446,312.7916,0.0,1,...,1,1,9.100231,0.0,3.3485,0.000256,1,1,1,Normal
22/12/2015 4:30:01 PM,0.0,124.392,1,1,1,251.9226,8.313446,312.7916,0.0,1,...,1,1,9.100231,0.0,3.3485,0.000256,1,1,1,Normal
22/12/2015 4:30:02 PM,0.0,124.4705,1,1,1,251.9226,8.313446,312.7916,0.0,1,...,1,1,9.100231,0.0,3.3485,0.000256,1,1,1,Normal
22/12/2015 4:30:03 PM,0.0,124.6668,1,1,1,251.9226,8.313446,312.7916,0.0,1,...,1,1,9.100231,0.0,3.3485,0.000256,1,1,1,Normal
22/12/2015 4:30:04 PM,0.0,124.5098,1,1,1,251.9226,8.313446,312.7916,0.0,1,...,1,1,9.100231,0.0,3.3485,0.000256,1,1,1,Normal


In [4]:

# Give the verdict column a short name, then preview its 0/1 encoding.
# The mapped Series is only displayed here; `raw["label"]` itself keeps its
# original string values because the result is not assigned back.
raw = raw.rename(columns={"Normal/Attack": "label"})
raw["label"].map(dict(Normal=0, Attack=1))

Timestamp
22/12/2015 4:30:00 PM    0
22/12/2015 4:30:01 PM    0
22/12/2015 4:30:02 PM    0
22/12/2015 4:30:03 PM    0
22/12/2015 4:30:04 PM    0
                        ..
28/12/2015 9:59:55 AM    0
28/12/2015 9:59:56 AM    0
28/12/2015 9:59:57 AM    0
28/12/2015 9:59:58 AM    0
28/12/2015 9:59:59 AM    0
Name: label, Length: 495000, dtype: int64

# Class Water and WaterLabel

In [50]:
class WaterLabel(Dataset):
    """Sliding-window dataset that yields (window, label) pairs.

    Each item is a float tensor of shape ``(window_size, 2 * D + 1)`` where
    ``D`` is the number of columns in ``df``; the columns are laid out as
    ``[D data values | D mask values | 1 elapsed-seconds timestamp]``.
    The paired label belongs to the FIRST row of the window and is re-encoded
    from ``{0, 1}`` to ``{+1.0, -1.0}`` via ``1 - 2 * label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature rows. Its index must support subtraction yielding timedeltas
        (``DatetimeIndex`` or ``TimedeltaIndex``); it is used to reject windows
        whose time span is not exactly ``window_size`` seconds.
    label : array-like
        One label per row of ``df``, aligned by position (any index it carries
        is ignored).
    timestamp : iterable of datetimes or timedeltas
        One entry per row of ``df``, aligned by position. Timedeltas are
        converted to seconds as-is; datetimes are first made relative to the
        first entry.
    window_size : int
        Number of rows per window.
    stride_size : int
        Row offset between the starts of consecutive candidate windows.
    """

    def __init__(self, df, label, timestamp, window_size=60, stride_size=10):
        super().__init__()
        self.df = df
        self.window_size = window_size
        self.stride_size = stride_size

        self.data, self.idx, self.label = self.preprocess(df, label)
        # Re-encode 0 -> +1.0 and 1 -> -1.0.
        self.label = 1.0 - 2 * self.label
        self.mask = self.get_mask()
        self.timestamp = self._elapsed_seconds(timestamp)

    @staticmethod
    def _elapsed_seconds(timestamp):
        """Return ``timestamp`` as a list of float seconds.

        ``pd.Timestamp`` has no ``total_seconds()``, so absolute datetimes are
        converted to offsets from the first entry before the conversion.
        """
        stamps = pd.Index(timestamp)
        if isinstance(stamps, pd.DatetimeIndex) and len(stamps) > 0:
            stamps = stamps - stamps[0]
        return [x.total_seconds() for x in stamps]

    def preprocess(self, df, label):
        """Return ``(values, window_starts, window_labels)``.

        Candidate windows start every ``stride_size`` rows. A candidate is kept
        only when the index value ``window_size`` rows after its start is
        exactly ``window_size`` seconds later, which discards windows that
        straddle a gap in the time index.
        """
        start_idx = np.arange(0, len(df) - self.window_size, self.stride_size)
        end_idx = np.arange(self.window_size, len(df), self.stride_size)

        delta_time = df.index[end_idx] - df.index[start_idx]
        idx_mask = np.asarray(delta_time == pd.Timedelta(self.window_size, unit='s'))
        valid_start = start_idx[idx_mask]

        # Index labels strictly by position. Integer keys on a Series are
        # label-based for integer indexes and only fall back to positions
        # (deprecated) for other index types, so work on a plain array.
        label_values = np.asarray(label)

        return df.values, valid_start, label_values[valid_start]

    def get_mask(self):
        """Return a float array shaped like ``self.data``: 0.0 where the value
        equals -1 exactly, 1.0 everywhere else.

        NOTE(review): -1 is presumably the missing-value sentinel -- confirm.
        """
        return (self.data != -1).astype(float)

    def __len__(self):
        """Number of windows that passed the time-span check."""
        return len(self.idx)

    def __getitem__(self, index):
        """Return ``(window_tensor, label)`` for the ``index``-th valid window."""
        start = self.idx[index]
        end = start + self.window_size
        data = self.data[start:end].reshape([self.window_size, -1])
        mask = self.mask[start:end].reshape([self.window_size, -1])
        time = torch.Tensor(self.timestamp[start:end]).reshape([self.window_size, -1])

        # Concatenate along the feature axis: [data | mask | timestamp].
        window = torch.cat(
            [torch.FloatTensor(data), torch.FloatTensor(mask), time], 1
        )
        return window, self.label[index]


# test WaterLabel Class

In [51]:
from datetime import timedelta

def get_waterlabel(root, batch_size,label=False):
    """Build the training-split ``WaterLabel`` dataset from the CSV at ``root``.

    Steps: read the CSV, keep its first 1/14 of rows, encode the label column
    as 0/1, turn ``Timestamp`` into an offset from the first row, z-score the
    first 51 columns, drop columns containing NaN after scaling (e.g. constant
    columns, whose std is 0), and wrap the first 60% of rows in ``WaterLabel``.

    Parameters
    ----------
    root : str
        Path of the CSV file. It must contain ``Timestamp`` and
        ``Normal/Attack`` columns.
    batch_size, label
        Unused; kept so the signature mirrors ``load_water``.

    Returns
    -------
    WaterLabel
        Dataset over the training split.
    """
    data = pd.read_csv(root)
    data = data.rename(columns={"Normal/Attack": "label"})

    # Keep only the first 1/14 of the file; copy so the column assignments
    # below do not write into a slice of the frame that was read.
    data = data.head(int(data.shape[0] / 14)).copy()

    # Values other than these two keys become NaN under `Series.map`.
    data["label"] = data["label"].map({
        "Normal": 0,
        "Attack": 1
    })

    # The file stores dates day-first (e.g. "22/12/2015 4:30:00 PM"); say so
    # explicitly so ambiguous dates such as "2/1/2016" are not read month-first.
    data["Timestamp"] = pd.to_datetime(data["Timestamp"], dayfirst=True)
    # Offset from the first row, taken by position rather than by index label.
    data["Timestamp"] = data["Timestamp"] - data["Timestamp"].iloc[0]
    data = data.set_index("Timestamp")

    # The first 51 columns are treated as the features (hard-coded width).
    # NOTE(review): mean/std are computed over all kept rows, not just the
    # training split.
    feature = data.iloc[:, :51]
    norm_feature = (feature - feature.mean(axis=0)) / feature.std(axis=0)
    norm_feature = norm_feature.dropna(axis=1)

    train_end = int(0.6 * len(data))
    train_df = norm_feature.iloc[:train_end]
    train_label = data["label"].iloc[:train_end]

    # Hand over the timestamps of exactly the rows in `train_df`, since
    # WaterLabel indexes them by row position.
    return WaterLabel(train_df, train_label, timestamp=train_df.index)

In [52]:
# Despite its name, `test` holds the dataset built from the training CSV.
test = get_waterlabel('raw_dataset/train.csv', batch_size=128)

In [53]:
# Per-row timestamps as float seconds.
# NOTE(review): this displays the entire list; slice it (e.g. `[:10]`) to keep the output short.
test.timestamp

[0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 83.0,
 84.0,
 85.0,
 86.0,
 87.0,
 88.0,
 89.0,
 90.0,
 91.0,
 92.0,
 93.0,
 94.0,
 95.0,
 96.0,
 97.0,
 98.0,
 99.0,
 100.0,
 101.0,
 102.0,
 103.0,
 104.0,
 105.0,
 106.0,
 107.0,
 108.0,
 109.0,
 110.0,
 111.0,
 112.0,
 113.0,
 114.0,
 115.0,
 116.0,
 117.0,
 118.0,
 119.0,
 120.0,
 121.0,
 122.0,
 123.0,
 124.0,
 125.0,
 126.0,
 127.0,
 128.0,
 129.0,
 130.0,
 131.0,
 132.0,
 133.0,
 134.0,
 135.0,
 136.0,
 137.0,
 138.0

In [54]:
# Last column of the first window tensor, i.e. its timestamp channel.
test[0][0][:, -1]

tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
        14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27.,
        28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        42., 43., 44., 45., 46., 47., 48., 49., 50., 51., 52., 53., 54., 55.,
        56., 57., 58., 59.])

In [82]:
# start = test.idx[0]
# end = start + test.window_size
# torch.FloatTensor(test.timestamp[start:end]).reshape()


# First (window, label) pair of the dataset.
item = test[0]

In [85]:
# (rows, feature columns) of the array the dataset slices its windows from.
test.data.shape

(21214, 39)

In [91]:
# Window tensor shape: (window_size, data columns + mask columns + 1 timestamp column).
item[0].shape

torch.Size([60, 79])

In [92]:
# Split the window tensor into its channels: the first `dim` columns are the
# data, the next `dim` columns the mask, and the last column the timestamps.
dim = 39
observed_data = item[0][:, :dim]
observed_mask = item[0][:, dim:2*dim]
observed_tp = item[0][:, -1]

In [93]:
# Data channel of the first window (the first `dim` columns of the window tensor).
observed_data

tensor([[-1.9959, -2.2194, -1.9298,  ..., -4.8150, -0.1145, -0.1071],
        [-1.9959, -2.2189, -1.9298,  ..., -4.8150, -0.1145, -0.1071],
        [-1.9959, -2.2185, -1.9298,  ..., -4.8150, -0.1145, -0.1071],
        ...,
        [-1.9959, -2.2239, -1.9298,  ..., -4.8150, -0.1145, -0.1071],
        [-1.9959, -2.2257, -1.9298,  ..., -4.8150, -0.1145, -0.1071],
        [-1.9959, -2.2248, -1.9298,  ..., -4.8150, -0.1145, -0.1071]])

# load water


In [94]:
def load_water(root, batch_size,label=False):
    """Build train/val/test ``DataLoader`` objects from the CSV at ``root``.

    Steps: read the CSV, keep its first quarter of rows, encode the label
    column as 0/1, index by ``Timestamp``, z-score the first 51 columns, drop
    columns containing NaN after scaling (e.g. constant columns, whose std is
    0), then split the rows 60/20/20 in file order.

    Parameters
    ----------
    root : str
        Path of the CSV file. It must contain ``Timestamp`` and
        ``Normal/Attack`` columns.
    batch_size : int
        Batch size used by all three loaders.
    label : bool
        When true the training loader wraps ``WaterLabel`` (window tensors
        carrying data, mask and timestamp channels); otherwise it wraps
        ``Water``. Validation and test loaders always wrap ``Water``.

    Returns
    -------
    tuple
        ``(train_loader, val_loader, test_loader, n_sensor)`` where
        ``n_sensor`` is the number of feature columns left after scaling.

    NOTE(review): ``Water`` is not defined in the visible part of this
    notebook; it has to be in scope before this function is called.
    """
    data = pd.read_csv(root)
    data = data.rename(columns={"Normal/Attack": "label"})

    # Keep only the first quarter of the file; copy so the column assignments
    # below do not write into a slice of the frame that was read.
    data = data.head(int(data.shape[0] / 4)).copy()

    # Values other than these two keys become NaN under `Series.map`.
    data["label"] = data["label"].map({
        "Normal": 0,
        "Attack": 1
    })

    # The file stores dates day-first (e.g. "22/12/2015 4:30:00 PM"); say so
    # explicitly so ambiguous dates such as "2/1/2016" are not read month-first.
    data["Timestamp"] = pd.to_datetime(data["Timestamp"], dayfirst=True)
    data = data.set_index("Timestamp")

    # The first 51 columns are treated as the features (hard-coded width).
    # NOTE(review): mean/std are computed over all kept rows, not just the
    # training split.
    feature = data.iloc[:, :51]
    norm_feature = (feature - feature.mean(axis=0)) / feature.std(axis=0)
    norm_feature = norm_feature.dropna(axis=1)
    n_sensor = len(norm_feature.columns)

    train_end = int(0.6 * len(data))
    val_end = int(0.8 * len(data))

    train_df = norm_feature.iloc[:train_end]
    train_label = data["label"].iloc[:train_end]

    val_df = norm_feature.iloc[train_end:val_end]
    val_label = data["label"].iloc[train_end:val_end]

    test_df = norm_feature.iloc[val_end:]
    test_label = data["label"].iloc[val_end:]

    if label:
        # WaterLabel calls `.total_seconds()` on every timestamp, which exists
        # on timedeltas but not on absolute datetimes, so pass offsets from
        # the first row -- and only for the rows that are in `train_df`.
        elapsed = train_df.index - data.index[0]
        train_set = WaterLabel(train_df, train_label, timestamp=elapsed)
    else:
        train_set = Water(train_df, train_label)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(Water(val_df, val_label), batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(Water(test_df, test_label), batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader, n_sensor

In [100]:
# Build the three loaders; the training loader yields WaterLabel windows (label=True).
(train_loader, val_loader, test_loader, n_sensor) = load_water(
    'raw_dataset/test.csv', batch_size=128, label=True
)

In [101]:
# Split the first training batch into its data / mask / timestamp channels.
# Every window row is laid out as [n_sensor data | n_sensor mask | 1 timestamp],
# so the channel width has to come from this loader's `n_sensor`; reusing a
# `dim` left over from an earlier cell would misalign the mask slice.
dim = n_sensor
for train_batch, label in train_loader:
    batch_len = train_batch.shape[0]
    assert train_batch.shape[-1] == 2 * dim + 1, (
        f"expected {2 * dim + 1} columns per row, got {train_batch.shape[-1]}"
    )
    observed_data = train_batch[:, :, :dim]
    observed_mask = train_batch[:, :, dim:2 * dim]
    observed_tp = train_batch[:, :, -1]
    break  # one batch is enough for this check

In [102]:
# (batch, window_size, columns) of the first training batch.
train_batch.shape

torch.Size([128, 60, 81])

# dataloader

In [None]:
# train_dataloader = DataLoader(
#     train_data_combined, batch_size=batch_size, shuffle=False)
# test_dataloader = DataLoader(
#     test_data_combined, batch_size=batch_size, shuffle=False)

In [None]:
# def get_SWaT_data():
#     # read train data
#     noraml = pd.read_csv('path')

#     return data_obj

In [17]:
# A 40 x 60 float array filled entirely with -1.
mask = np.full((40, 60), -1.0)
mask

array([[-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       ...,
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.]])

In [39]:
# Check the masking rule on a toy grid: only the very first entry equals -1,
# so the mask is 0 there and 1 everywhere else.
tst = np.linspace(-1, 100, num=40 * 60).reshape(40, 60)
mask = (tst != -1).astype(float)
mask

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])