In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset
import numpy as np

from torch.utils.data import DataLoader

# Class Water

In [12]:

class Water(Dataset):
    def __init__(self, df, label, window_size=60, stride_size=10):
        super(Water, self).__init__()
        self.df = df
        self.window_size = window_size
        self.stride_size = stride_size

        self.data, self.idx, self.label = self.preprocess(df,label)
    
    def preprocess(self, df, label):

        start_idx = np.arange(0,len(df)-self.window_size,self.stride_size)
        end_idx = np.arange(self.window_size, len(df), self.stride_size)

        delat_time =  df.index[end_idx]-df.index[start_idx]
        idx_mask = delat_time==pd.Timedelta(self.window_size,unit='s')

        return df.values, start_idx[idx_mask], label[start_idx[idx_mask]]

    def __len__(self):

        length = len(self.idx)

        return length

    def __getitem__(self, index):
        #  N X K X L X D 
        start = self.idx[index]
        end = start + self.window_size
        data = self.data[start:end].reshape([self.window_size,-1, 1])
        
        print(f"data[start: end].shape: {self.data[start:end].shape}")
        print(f"Reshaped data.shape: {data.shape}")
        
        return torch.FloatTensor(data).transpose(0,1)



# load water

In [3]:
root = '/root/zengzihui/ISST/GANF/data/SWaT_Dataset_Attack_v0.csv'
batch_size = 512

In [4]:
data = pd.read_csv(root)
data = data.rename(columns={"Normal/Attack":"label"})
data.label[data.label!="Normal"]=1
data.label[data.label=="Normal"]=0
data["Timestamp"] = pd.to_datetime(data["Timestamp"])
data = data.set_index("Timestamp")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.label[data.label!="Normal"]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.label[data.label=="Normal"]=0


In [5]:
feature = data.iloc[:,:51]
mean_df = feature.mean(axis=0)
std_df = feature.std(axis=0)

norm_feature = (feature-mean_df)/std_df
norm_feature = norm_feature.dropna(axis=1)
n_sensor = len(norm_feature.columns)

In [6]:
print("feature_df: ",feature.shape,
"mean_df: ", mean_df.shape,
"std_df: ",  std_df.shape,
"NA_normal_df: ", ((feature-mean_df)/std_df).shape,
"norma_df: ", norm_feature.shape)

feature_df:  (449919, 51) mean_df:  (51,) std_df:  (51,) NA_normal_df:  (449919, 51) norma_df:  (449919, 44)


In [7]:
train_df = norm_feature.iloc[:int(0.6*len(data))]
train_label = data.label.iloc[:int(0.6*len(data))]

val_df = norm_feature.iloc[int(0.6*len(data)):int(0.8*len(data))]
val_label = data.label.iloc[int(0.6*len(data)):int(0.8*len(data))]

test_df = norm_feature.iloc[int(0.8*len(data)):]
test_label = data.label.iloc[int(0.8*len(data)):]

In [8]:
print("train_df: ",train_df.shape,
"val_df: ", val_df.shape,
"test_df: ",  test_df.shape)

train_df:  (269951, 44) val_df:  (89984, 44) test_df:  (89984, 44)


In [9]:
test_label.shape

(89984,)

In [10]:
train_loader = DataLoader(Water(train_df,train_label), batch_size=batch_size, shuffle=True)

In [11]:
Water(train_df,train_label).__len__()

26990

# Water shape

In [12]:
for i in range(2):
    print(f'++++++++++{i}')
    # temp = Water(train_df, train_label)._ 
    print(Water(train_df, train_label).__getitem__(i).shape)

++++++++++0
data[start: end].shape: (60, 44)
Reshaped data.shape: (60, 44, 1)
torch.Size([44, 60, 1])
++++++++++1
data[start: end].shape: (60, 44)
Reshaped data.shape: (60, 44, 1)
torch.Size([44, 60, 1])


In [30]:
temp = Water(train_df, train_label)

In [45]:
train_label.shape

(269951,)

In [46]:
train_df.shape

(269951, 44)

In [43]:
print(temp.data.shape) 
print(temp.idx)
print(temp.label.shape)

(269951, 44)
[     0     10     20 ... 269870 269880 269890]
(26990,)


# Water Item Label

In [16]:
item  = Water(train_df, train_label)

In [20]:
item.label

Timestamp
2015-12-28 10:00:00    0
2015-12-28 10:00:10    0
2015-12-28 10:00:20    0
2015-12-28 10:00:30    0
2015-12-28 10:00:40    0
                      ..
2015-12-31 12:57:30    0
2015-12-31 12:57:40    0
2015-12-31 12:57:50    0
2015-12-31 12:58:00    0
2015-12-31 12:58:10    0
Name: label, Length: 26990, dtype: object

# get ALL dataset

In [5]:
from get_all_dataset import load_water
import pandas as pd

In [28]:
train_path = "/root/zengzihui/ISST/GANF/data/SWaT_Dataset_Normal_v1.csv"
test_path = "/root/zengzihui/ISST/GANF/data/SWaT_Dataset_Attack_v0.csv"

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [29]:
test_data['Normal/Attack'].unique()

array(['Normal', 'Attack'], dtype=object)

In [30]:
data = pd.concat([train_data, test_data])

In [31]:
data = data.rename(columns={"Normal/Attack":"label"})
# data.label[data.label!="Normal"]=1
# data.label[data.label=="Normal"]=0
data.label = data['label'].map({"Normal": 0,"Attack": 1})


In [32]:
data["Timestamp"] = pd.to_datetime(data["Timestamp"])
data = data.set_index("Timestamp")