## Data Preprocessing

In [1]:
import pandas as pd

# Load the training set from its parquet file
df = pd.read_parquet(
    "train.parquet",
    engine="fastparquet",
)

In [2]:
# Read the list of target channels from CSV
tar_chan = pd.read_csv("target_channels.csv")

# Keep the 'target_channels' column as a NumPy array of channels to monitor
channels = tar_chan.loc[:, "target_channels"].to_numpy()

In [3]:
# Getting basic info on the dataset: index range, column names and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14728321 entries, 0 to 14728320
Data columns (total 89 columns):
 #   Column           Dtype  
---  ------           -----  
 0   id               int64  
 1   channel_1        float32
 2   channel_10       float32
 3   channel_11       float32
 4   channel_12       float32
 5   channel_13       float32
 6   channel_14       float32
 7   channel_15       float32
 8   channel_16       float32
 9   channel_17       float32
 10  channel_18       float32
 11  channel_19       float32
 12  channel_2        float32
 13  channel_20       float32
 14  channel_21       float32
 15  channel_22       float32
 16  channel_23       float32
 17  channel_24       float32
 18  channel_25       float32
 19  channel_26       float32
 20  channel_27       float32
 21  channel_28       float32
 22  channel_29       float32
 23  channel_3        float32
 24  channel_30       float32
 25  channel_31       float32
 26  channel_32       float32
 27  channel_33

In [4]:
# Checking for missing values
# Lift the row-display cap so the per-column result is not truncated in the cell output
pd.options.display.max_rows = None
# One boolean per column: True if that column holds at least one missing value
df.isnull().any()
# Data is complete

id                 False
channel_1          False
channel_10         False
channel_11         False
channel_12         False
channel_13         False
channel_14         False
channel_15         False
channel_16         False
channel_17         False
channel_18         False
channel_19         False
channel_2          False
channel_20         False
channel_21         False
channel_22         False
channel_23         False
channel_24         False
channel_25         False
channel_26         False
channel_27         False
channel_28         False
channel_29         False
channel_3          False
channel_30         False
channel_31         False
channel_32         False
channel_33         False
channel_34         False
channel_35         False
channel_36         False
channel_37         False
channel_38         False
channel_39         False
channel_4          False
channel_40         False
channel_41         False
channel_42         False
channel_43         False
channel_44         False


In [5]:
# Change dtype of telecommands for memory optimization
# Telecommands are binary controls (only uses values 0 and 1), so they fit in
# int8 instead of a floating-point dtype, which reduces memory usage
# (each column is listed exactly once)
telecommands = ['telecommand_244', 'telecommand_350', 'telecommand_351', 'telecommand_352', 'telecommand_353',
                'telecommand_354', 'telecommand_36', 'telecommand_376', 'telecommand_38', 'telecommand_39', 'telecommand_40']

# Guard the cast: converting to int8 would silently alter any value that is
# not exactly 0 or 1, so verify the "binary" assumption first
assert df[telecommands].isin([0, 1]).all().all(), "telecommand columns contain values other than 0/1"

# Cast each telecommand column in turn
for command in telecommands:
    df[command] = df[command].astype('int8')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14728321 entries, 0 to 14728320
Data columns (total 89 columns):
 #   Column           Dtype  
---  ------           -----  
 0   id               int64  
 1   channel_1        float32
 2   channel_10       float32
 3   channel_11       float32
 4   channel_12       float32
 5   channel_13       float32
 6   channel_14       float32
 7   channel_15       float32
 8   channel_16       float32
 9   channel_17       float32
 10  channel_18       float32
 11  channel_19       float32
 12  channel_2        float32
 13  channel_20       float32
 14  channel_21       float32
 15  channel_22       float32
 16  channel_23       float32
 17  channel_24       float32
 18  channel_25       float32
 19  channel_26       float32
 20  channel_27       float32
 21  channel_28       float32
 22  channel_29       float32
 23  channel_3        float32
 24  channel_30       float32
 25  channel_31       float32
 26  channel_32       float32
 27  channel_33

In [6]:
# Checking for class imbalance: count the rows for each value of the 'is_anomaly' label
df['is_anomaly'].value_counts()
# Dataset features a class imbalance, where only about 10% of data contains anomalies/rare events
# (1,544,104 of 14,728,321 rows are labelled 1)

is_anomaly
0    13184217
1     1544104
Name: count, dtype: int64

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Creating features: every column except the label and the row id
X = df.drop(columns=['is_anomaly', 'id'])
# Target: the anomaly label
y = df['is_anomaly']

# Creating a validation split: the first 80% of rows train, the remaining 20% validate
t_end = int(0.8 * len(X))
X_train, X_val = X.iloc[:t_end], X.iloc[t_end:]
y_train, y_val = y.iloc[:t_end], y.iloc[t_end:]

# Scaling the feature data, with the scaler fitted on non-anomalous training samples only
sc = MinMaxScaler().fit(X_train.loc[y_train == 0])

# Apply the fitted scaling to both splits
X_train = sc.transform(X_train)
X_val = sc.transform(X_val)

In [8]:
# Bin the discrete data using sliding windows
import tensorflow as tf

# Streaming data for memory optimization
# Each dataset is built by slicing the (features, labels) pair along its first axis,
# giving one (features, label) element per row
# NOTE(review): from_tensor_slices is handed the full in-memory arrays; confirm this
# does not duplicate them and actually saves memory for data of this size
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))

def split_windows(ds, window_size, batch_size):
    """Turn a per-row (features, label) dataset into batched sliding windows.

    Consecutive windows overlap: each one starts one row after the previous
    one (shift=1). Trailing rows that cannot fill a complete window are
    dropped. Each window is labelled with the label of its last row.

    Args:
        ds: tf.data.Dataset yielding (features, label) pairs, one per row.
        window_size: number of consecutive rows in each window.
        batch_size: number of windows in each emitted batch.

    Returns:
        A prefetched tf.data.Dataset of (windows, labels) batches, where
        `windows` stacks `window_size` feature rows per example and `labels`
        holds one label per window. The final batch may contain fewer than
        `batch_size` windows.
    """
    ds = ds.window(window_size, shift=1, drop_remainder=True)
    # Every window is already full (drop_remainder=True above), so dropping the
    # remainder here discards nothing; it only gives the window axis a static length.
    ds = ds.flat_map(
        lambda x, y: tf.data.Dataset.zip((
            x.batch(window_size, drop_remainder=True),
            y.batch(window_size, drop_remainder=True),
        ))
    )
    ds = ds.batch(batch_size)
    # Take the last-row label once per batch rather than once per window;
    # the resulting (windows, labels) pairs are the same.
    ds = ds.map(lambda x, y: (x, y[:, -1]), num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

# Window length (rows per window) and number of windows per batch
windows = 128
batch_size = 256

# Replace the per-row datasets with their windowed, batched versions
train_ds = split_windows(train_ds, window_size=windows, batch_size=batch_size)
val_ds = split_windows(val_ds, window_size=windows, batch_size=batch_size)