In [1]:
import pandas as pd
# import vaex
import numpy as np
import glob
import dask.dataframe as dd
import json
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tqdm import tqdm

In [2]:
def fill_flag(sample):
    if not isinstance(sample['Flag'], str):
        col = 'Data' + str(sample['DLC'])
        sample['Flag'] = sample[col]
    return sample

In [3]:
def convert_canid_bits(cid):
    try:
        s = bin(int(str(cid), 16))[2:].zfill(29)
        bits = list(map(int, list(s)))
        return bits
    except:
        return None

In [4]:
# Read by dask first
attributes = ['Timestamp', 'canID', 'DLC', 
                           'Data0', 'Data1', 'Data2', 
                           'Data3', 'Data4', 'Data5', 
                           'Data6', 'Data7', 'Flag']
folder = './Data/Car-Hacking/'
attack_types = ['DoS', 'Fuzzy', 'gear', 'RPM']
attack = attack_types[1]
file_name = '{}{}_dataset.csv'.format(folder, attack)
print(file_name)
# df = pd.read_csv(file_name, header=None, names=attributes)
# for f in files[1]:
#     print('Reading file: ', f)
#     df = df.append(pd.read_csv(f, header=None))

./Data/Car-Hacking/Fuzzy_dataset.csv


In [5]:
def preprocess(file_name):
    df = dd.read_csv(file_name, header=None, names=attributes)
    print('Reading from {}: DONE'.format(file_name))
    print('Dask processing: -------------')
    df = df.apply(fill_flag, axis=1)
    pd_df = df.compute()
    pd_df = pd_df[['Timestamp', 'canID', 'Flag']].sort_values('Timestamp',  ascending=True)
    pd_df['canBits'] = pd_df.canID.apply(convert_canid_bits)
    pd_df['Flag'] = pd_df['Flag'].apply(lambda x: True if x == 'T' else False)
    print('Dask processing: DONE')
    print('Aggregate data -----------------')
    as_strided = np.lib.stride_tricks.as_strided  
    win = 29
    s = 29
    feature = as_strided(pd_df.canBits, ((len(pd_df) - win) // s + 1, win), (8*s, 8)) #Stride is counted by bytes
    label = as_strided(pd_df.Flag, ((len(pd_df) - win) // s + 1, win), (1*s, 1))
    df = pd.DataFrame({
        'features': pd.Series(feature.tolist()),
        'label': pd.Series(label.tolist())
    }, index= range(len(feature)))

    df['label'] = df['label'].apply(lambda x: 1 if any(x) else 0)
    print('Preprocessing: DONE')
    print('#Normal: ', df[df['label'] == 0].shape[0])
    print('#Attack: ', df[df['label'] == 1].shape[0])
    return df[['features', 'label']].reset_index().drop(['index'], axis=1)

In [6]:
df = preprocess(file_name)

Reading from ./Data/Car-Hacking/Fuzzy_dataset.csv: DONE
Dask processing: -------------


You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta={'Timestamp': 'float64', 'canID': 'object', 'DLC': 'int64', 'Data0': 'object', 'Data1': 'object', 'Data2': 'object', 'Data3': 'object', 'Data4': 'object', 'Data5': 'object', 'Data6': 'object', 'Data7': 'object', 'Flag': 'object'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Dask processing: DONE
Aggregate data -----------------
Preprocessing: DONE
#Normal:  87888
#Attack:  44486


In [5]:
df = dd.read_csv(file_name, header=None, names=attributes)
print('Reading from {}: DONE'.format(file_name))
print('Dask processing: -------------')
df = df.apply(fill_flag, axis=1)
pd_df = df.compute()
pd_df = pd_df[['Timestamp', 'canID', 'Flag']].sort_values('Timestamp',  ascending=True)
pd_df['canBits'] = pd_df.canID.apply(convert_canid_bits)
pd_df['Flag'] = pd_df['Flag'].apply(lambda x: True if x == 'T' else False)
pd_df

Reading from ./Data/Car-Hacking/Fuzzy_dataset.csv: DONE
Dask processing: -------------


You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta={'Timestamp': 'float64', 'canID': 'object', 'DLC': 'int64', 'Data0': 'object', 'Data1': 'object', 'Data2': 'object', 'Data3': 'object', 'Data4': 'object', 'Data5': 'object', 'Data6': 'object', 'Data7': 'object', 'Flag': 'object'})



Unnamed: 0,Timestamp,canID,Flag,canBits
0,1.478196e+09,0545,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1.478196e+09,02b0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1.478196e+09,0002,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,1.478196e+09,0153,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,1.478196e+09,0130,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
125863,1.478201e+09,018f,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
125864,1.478201e+09,0260,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
125865,1.478201e+09,02a0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
125866,1.478201e+09,0329,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
print('#Normal:', pd_df[pd_df['Flag'] == False].shape[0])
print('#Attack:', pd_df[pd_df['Flag'] == True].shape[0])

#Normal: 3347013
#Attack: 491847


In [7]:
as_strided = np.lib.stride_tricks.as_strided  
win = 29
s = 29
feature = as_strided(pd_df.canBits, ((len(pd_df) - win) // s + 1, win), (8*s, 8))
label = as_strided(pd_df.Flag, ((len(pd_df) - win) // s + 1, win), (1*s, 1))
df = pd.DataFrame({
    'features': pd.Series(feature.tolist()),
    'label': pd.Series(label.tolist())
}, index= range(len(feature)))

df['label'] = df['label'].apply(lambda x: 1 if any(x) else 0)

In [8]:
print('#Normal: ', df[df['label'] == 0].shape[0])
print('#Attack: ', df[df['label'] == 1].shape[0])

#Normal:  87888
#Attack:  44486


In [39]:
# np.where(df['label'] == 0)[0].tolist()

In [18]:
for attack in attack_types[1:]:
    print('Attack: {} ==============='.format(attack))
    file_name = '{}{}_dataset.csv'.format(folder, attack)
    df = preprocess(file_name)
    write_tfrecord(df[df['label'] == 1], './Data/TFRecord/{}'.format(attack))
    write_tfrecord(df[df['label'] == 0], './Data/TFRecord/Normal_{}'.format(attack))

Reading from ./Data/Car-Hacking/Fuzzy_dataset.csv: DONE
Dask processing: -------------


You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta={'Timestamp': 'float64', 'canID': 'object', 'DLC': 'int64', 'Data0': 'object', 'Data1': 'object', 'Data2': 'object', 'Data3': 'object', 'Data4': 'object', 'Data5': 'object', 'Data6': 'object', 'Data7': 'object', 'Flag': 'object'})



Dask processing: DONE
Aggregate data -----------------
Preprocessing: DONE


1290001it [07:33, 2847.35it/s]
2548831it [14:31, 2925.12it/s]
You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta={'Timestamp': 'float64', 'canID': 'object', 'DLC': 'int64', 'Data0': 'object', 'Data1': 'object', 'Data2': 'object', 'Data3': 'object', 'Data4': 'object', 'Data5': 'object', 'Data6': 'object', 'Data7': 'object', 'Flag': 'object'})



Reading from ./Data/Car-Hacking/gear_dataset.csv: DONE
Dask processing: -------------
Dask processing: DONE
Aggregate data -----------------
Preprocessing: DONE


1893041it [10:39, 2957.95it/s]
2550073it [14:00, 3035.69it/s]


Reading from ./Data/Car-Hacking/RPM_dataset.csv: DONE
Dask processing: -------------


You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta={'Timestamp': 'float64', 'canID': 'object', 'DLC': 'int64', 'Data0': 'object', 'Data1': 'object', 'Data2': 'object', 'Data3': 'object', 'Data4': 'object', 'Data5': 'object', 'Data6': 'object', 'Data7': 'object', 'Flag': 'object'})



Dask processing: DONE
Aggregate data -----------------
Preprocessing: DONE


2069648it [11:10, 3086.33it/s]
2552026it [13:58, 3042.46it/s]


In [12]:
def create_train_test(df):
    print('Create train - test - val: ')
    train, test = train_test_split(df, test_size=0.3, shuffle=True)
    train, val = train_test_split(train, test_size=0.2, shuffle=True)
    train_ul, train_l = train_test_split(train, test_size=0.1, shuffle=True)
    train_ul = train_ul.reset_index().drop(['index'], axis=1)
    train_l = train_l.reset_index().drop(['index'], axis=1)
    test = test.reset_index().drop(['index'], axis=1)
    val = val.reset_index().drop(['index'], axis=1)
    
    data_info = {
        "train_unlabel": train_ul.shape[0],
        "train_label": train_l.shape[0],
        "validation": val.shape[0],
        "test": test.shape[0]
    }
    
    return data_info, train_ul, train_l, val, test

In [14]:
def write_tfrecord(data, filename):
    tfrecord_writer = tf.io.TFRecordWriter(filename)
    for _, row in tqdm(data.iterrows()):
        tfrecord_writer.write(serialize_example(row['features'], row['label']))
    tfrecord_writer.close()    

In [9]:
%time
for attack in attack_types[1:]:
    file_name = '{}{}_dataset.csv'.format(folder, attack)
    print(file_name + '---------------------------')
    df = preprocess(file_name)
    data_info, train_ul, train_l, val, test = create_train_test(df)
    save_path = './Data/{}/'.format(attack)
    print('Path: ', save_path)
    print('Writing train_unlabel.......................')
    write_tfrecord(train_ul, save_path + "train_unlabel")
    print('Writing train_label.......................')
    write_tfrecord(train_l, save_path + "train_label")
    print('Writing test.......................')
    write_tfrecord(test, save_path + "test")
    print('Writing val.......................')
    write_tfrecord(val, save_path + "val")
    print('Writing data info')
    json.dump(data_info, open(save_path + 'datainfo.txt', 'w'))
    print('==========================================')

CPU times: user 0 ns, sys: 9 µs, total: 9 µs
Wall time: 16.7 µs
./Data/Car-Hacking/Fuzzy_dataset.csv---------------------------
Reading from ./Data/Car-Hacking/Fuzzy_dataset.csv: DONE
Dask processing: -------------


You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta={'Timestamp': 'float64', 'canID': 'object', 'DLC': 'int64', 'Data0': 'object', 'Data1': 'object', 'Data2': 'object', 'Data3': 'object', 'Data4': 'object', 'Data5': 'object', 'Data6': 'object', 'Data7': 'object', 'Flag': 'object'})



Dask processing: DONE
Aggregate data -----------------
Preprocessing: DONE
Create train - test - val: 


1it [00:00,  7.08it/s]

Path:  ./Data/Fuzzy/
Writing train_unlabel.......................


1934770it [10:54, 2954.10it/s]
570it [00:00, 2896.08it/s]

Writing train_label.......................


214975it [01:11, 2997.53it/s]
64it [00:00, 638.36it/s]

Writing test.......................


1151650it [06:20, 3026.12it/s]
184it [00:00, 1837.91it/s]

Writing val.......................


537437it [03:06, 2876.34it/s]


Writing data info
./Data/Car-Hacking/gear_dataset.csv---------------------------
Reading from ./Data/Car-Hacking/gear_dataset.csv: DONE
Dask processing: -------------


You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta={'Timestamp': 'float64', 'canID': 'object', 'DLC': 'int64', 'Data0': 'object', 'Data1': 'object', 'Data2': 'object', 'Data3': 'object', 'Data4': 'object', 'Data5': 'object', 'Data6': 'object', 'Data7': 'object', 'Flag': 'object'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Dask processing: DONE
Aggregate data -----------------
Preprocessing: DONE
Create train - test - val: 


1it [00:00,  5.94it/s]

Path:  ./Data/gear/
Writing train_unlabel.......................


2239328it [12:46, 2921.27it/s]
238it [00:00, 2379.68it/s]

Writing train_label.......................


248815it [01:29, 2769.91it/s]
315it [00:00, 1836.83it/s]

Writing test.......................


1332935it [07:37, 2912.73it/s]
145it [00:00, 1448.68it/s]

Writing val.......................


622036it [03:25, 3025.76it/s]
You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta={'Timestamp': 'float64', 'canID': 'object', 'DLC': 'int64', 'Data0': 'object', 'Data1': 'object', 'Data2': 'object', 'Data3': 'object', 'Data4': 'object', 'Data5': 'object', 'Data6': 'object', 'Data7': 'object', 'Flag': 'object'})



Writing data info
./Data/Car-Hacking/RPM_dataset.csv---------------------------
Reading from ./Data/Car-Hacking/RPM_dataset.csv: DONE
Dask processing: -------------
Dask processing: DONE
Aggregate data -----------------
Preprocessing: DONE
Create train - test - val: 


1it [00:00,  5.68it/s]

Path:  ./Data/RPM/
Writing train_unlabel.......................


2329322it [12:51, 3019.52it/s]
252it [00:00, 2514.30it/s]

Writing train_label.......................


258814it [01:23, 3106.57it/s]
1it [00:00,  9.79it/s]

Writing test.......................


1386503it [07:30, 3074.51it/s]
463it [00:00, 2450.96it/s]

Writing val.......................


647035it [03:27, 3111.81it/s]


Writing data info


In [42]:
# with open(save_path + 'datainfo.txt', 'w') as f:
#     f.write('Train Unlabel: {}\n'.format(train_ul.shape[0]))
#     f.write('Train Label: {}\n'.format(train_l.shape[0]))
#     f.write('Test: {}\n'.format(test.shape[0]))
#     f.write('Validation: {}\n'.format(val.shape[0]))
#     f.close()

In [None]:
# raw_data = tf.data.TFRecordDataset('data')
# feature_description = {
#     'input_features': tf.io.FixedLenFeature([29*29], tf.int64),
#     'label': tf.io.FixedLenFeature([1], tf.int64)
# }

# def _parse_image_function(example_proto):
#   # Parse the input tf.train.Example proto using the dictionary above.
#   return tf.io.parse_single_example(example_proto, feature_description)

# parsed_image_dataset = raw_data.map(_parse_image_function)
# parsed_image_dataset

In [3]:
def serialize_example(x, y):
    """converts x, y to tf.train.Example and serialize"""
    #Need to pay attention to whether it needs to be converted to numpy() form
    input_features = tf.train.Int64List(value = np.array(x).flatten())
    label = tf.train.Int64List(value = np.array([y]))
    features = tf.train.Features(
        feature = {
            "input_features": tf.train.Feature(int64_list = input_features),
            "label" : tf.train.Feature(int64_list = label)
        }
    )
    example = tf.train.Example(features = features)
    return example.SerializeToString()

In [4]:
def read_tfrecord(example):
    input_dim = 841
    feature_description = {
    'input_features': tf.io.FixedLenFeature([input_dim], tf.int64),
    'label': tf.io.FixedLenFeature([1], tf.int64)
    }
    return tf.io.parse_single_example(example, feature_description)

In [5]:
def data_from_tfrecord(tf_filepath, batch_size, repeat_time):
    data = tf.data.TFRecordDataset(tf_filepath)
    data = data.map(read_tfrecord)
    data = data.shuffle(2)
    data = data.repeat(repeat_time + 1)
    data = data.batch(batch_size)
    # print(tf.data.experimental.cardinality(data))
    iterator = data.make_one_shot_iterator()
    return iterator.get_next()

In [6]:
def data_helper(data_tf, sess):
    n_labels = 2
    data = sess.run(data_tf)
    x, y = data['input_features'], data['label']
    size = x.shape[0]
    y_one_hot = np.eye(n_labels)[y].reshape([size, n_labels])
    return x, y_one_hot

In [7]:
def get_size(file_path):
    dataset = data_from_tfrecord(file_path, 1000, 0)
    # print(tf.data.experimental.cardinality(dataset).numpy())
    init = tf.global_variables_initializer()
    size = 0
    with tf.Session() as sess:
        sess.run(init)
        while True:
            try:
                x_l, y_l = data_helper(dataset, sess)
                size += x_l.shape[0]
            except Exception as e:
                print(type(e).__name__)
                break
                
    return size

In [8]:
data_info = json.load(open('./Data/TFRecord/datainfo.txt'))
data_info

{'./Data/TFRecord/RPM': 2069648,
 './Data/TFRecord/DoS': 1085895,
 './Data/TFRecord/Normal_Fuzzy': 2548831,
 './Data/TFRecord/Normal_DoS': 2579848,
 './Data/TFRecord/gear': 1893041,
 './Data/TFRecord/Normal_gear': 2550073,
 './Data/TFRecord/Fuzzy': 1290001,
 './Data/TFRecord/Normal_RPM': 2552026}

In [9]:
def write_tfrecord(data, filename):
    print('Writing {}================= '.format(filename))
    iterator = data.make_one_shot_iterator().get_next()
    init = tf.global_variables_initializer()
    tfrecord_writer = tf.io.TFRecordWriter(filename)
    with tf.Session() as sess:
        sess.run(init)
        while True:
            try:
                batch_data = sess.run(iterator)
                for x, y in zip(batch_data['input_features'], batch_data['label']):
                    tfrecord_writer.write(serialize_example(x, y))
            except:
                break
            
    tfrecord_writer.close()

In [10]:
def train_test_split(source_path, dest_path, DATASET_SIZE,\
                     train_size = 500 * 1000, train_label_size = 100 * 1000):
    # dataset = data_from_tfrecord('./Data/TFRecord/DoS', 1000, 0)
    #DATASET_SIZE = data_info['./Data/TFRecord/DoS']
    #train_size = 500 * 1000
    #train_label_size = 100 * 1000
    val_size = int((DATASET_SIZE - train_size) * 0.2)
    test_size = DATASET_SIZE - train_size - val_size
    print(train_size, val_size, test_size)
    dataset = tf.data.TFRecordDataset(source_path)
    dataset = dataset.shuffle(1000000)
    dataset = dataset.map(read_tfrecord)
    train = dataset.take(train_size)
    train_label = train.take(train_label_size)
    train_unlabel = train.skip(train_label_size)
    val = dataset.skip(train_size)
    test = val.skip(val_size)
    val = val.take(val_size)
    batch_size = 10000
    train_label = train_label.batch(batch_size)
    train_unlabel = train_unlabel.batch(batch_size)
    test = test.batch(batch_size)
    val = val.batch(batch_size)

    train_test_info = {
        "train_unlabel": train_size - train_label_size,
        "train_label": train_label_size,
        "validation": val_size,
        "test": test_size
    }
    json.dump(train_test_info, open(dest_path + 'datainfo.txt', 'w'))
    write_tfrecord(train_label, dest_path + 'train_label')
    write_tfrecord(train_unlabel, dest_path + 'train_unlabel')
    write_tfrecord(test, dest_path + 'test')
    write_tfrecord(val, dest_path + 'val')
    

In [10]:
%%time
data_info = json.load(open('./Data/TFRecord/datainfo.txt'))
attack_types = ['DoS', 'Fuzzy', 'gear', 'RPM']
for attack in attack_types:
    print("Attack: {} ==============".format(attack))
    source = './Data/TFRecord/{}'.format(attack)
    dest = './Data/{}/'.format(attack)
    train_test_split(source, dest, data_info[source])

500000 117179 468716
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
500000 158000 632001
500000 278608 1114433
500000 313929 1255719
CPU times: user 24min 25s, sys: 1min 48s, total: 26min 14s
Wall time: 21min 32s


In [12]:
%%time
normal_size = 0
data_info = json.load(open('./Data/TFRecord/datainfo.txt'))
attack_types = ['DoS', 'Fuzzy', 'gear', 'RPM']
for attack in attack_types:
    normal_size += data_info['./Data/TFRecord/Normal_{}'.format(attack)]
sources = ['./Data/TFRecord/Normal_{}'.format(a) for a in attack_types]
dest = './Data/Normal/'
train_test_split(sources, dest, normal_size, train_size=500*1000*4*3, train_label_size=100*1000*4*3)

6000000 846155 3384623
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
CPU times: user 42min 37s, sys: 3min 41s, total: 46min 19s
Wall time: 36min 18s


In [11]:
# dataset = data_from_tfrecord('./Data/TFRecord/DoS', 1000, 0)
DATASET_SIZE = data_info['./Data/TFRecord/DoS']
train_size = 500 * 1000
train_label_size = 100 * 1000
val_size = int((DATASET_SIZE - train_size) * 0.2)
test_size = DATASET_SIZE - train_size - val_size
print(train_size, val_size, test_size)
dataset = tf.data.TFRecordDataset('./Data/TFRecord/DoS')
dataset = dataset.map(read_tfrecord)
dataset = dataset.shuffle(2)
train = dataset.take(train_size)
train_label = train.take(train_label_size)
train_unlabel = train.skip(train_label_size)
val = dataset.skip(train_size)
test = val.skip(val_size)
val = val.take(val_size)
batch_size = 10000
train = train.batch(batch_size)
test = test.batch(batch_size)
val = val.batch(batch_size)

write_tfrecord(train_label, './Data/DoS/train_label')
write_tfrecord(train_unlabel, './Data/DoS/train_unlabel')
write_tfrecord(test, './Data/DoS/test')
write_tfrecord(val, './Data/DoS/val')

500000 117179 468716


In [None]:
train_test_info = {
        "train_unlabel": train_size - train_label_size,
        "train_label": train_label_size,
        "validation": val_size,
        "test": test_size
}
json.dump(data_info, open(save_path + 'datainfo.txt', 'w'))

In [8]:
iterator = train.make_one_shot_iterator().get_next()
init = tf.global_variables_initializer()
size = 0
with tf.Session() as sess:
    sess.run(init)
    data = sess.run(iterator)

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.


In [15]:
tfrecord_writer = tf.io.TFRecordWriter('./Data/DoS/train')
for x, y in tqdm.tqdm(zip(data['input_features'], data['label'])):
    tfrecord_writer.write(serialize_example(x, y))
tfrecord_writer.close()

TypeError: close() takes 1 positional argument but 2 were given

In [3]:
data_path = './Data/*/'
glob.glob(data_path + 'train_unlabel')

['./Data/RPM/train_unlabel',
 './Data/DoS/train_unlabel',
 './Data/Normal/train_unlabel',
 './Data/gear/train_unlabel',
 './Data/Fuzzy/train_unlabel']