#  Anomaly Detection in Time-Series

### Use an autoencoder to detect anomalies in ECG time-series data.

#### 1. Prepare the data

In [1]:
import pandas as pd

# Fetch the ECG dataset over HTTPS rather than plain HTTP so the download is
# encrypted and integrity-protected in transit.
ECG_CSV_URL = "https://storage.googleapis.com/download.tensorflow.org/data/ecg.csv"

# header=None: the first row of the file is treated as data, not as column
# names, so pandas labels the columns with integers.
df = pd.read_csv(ECG_CSV_URL, header=None)
print(df.shape)
df.head()

(4998, 141)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,131,132,133,134,135,136,137,138,139,140
0,-0.112522,-2.827204,-3.773897,-4.349751,-4.376041,-3.474986,-2.181408,-1.818286,-1.250522,-0.477492,...,0.792168,0.933541,0.796958,0.578621,0.25774,0.228077,0.123431,0.925286,0.193137,1.0
1,-1.100878,-3.99684,-4.285843,-4.506579,-4.022377,-3.234368,-1.566126,-0.992258,-0.75468,0.042321,...,0.538356,0.656881,0.78749,0.724046,0.555784,0.476333,0.77382,1.119621,-1.43625,1.0
2,-0.567088,-2.59345,-3.87423,-4.584095,-4.187449,-3.151462,-1.74294,-1.490659,-1.18358,-0.394229,...,0.886073,0.531452,0.311377,-0.021919,-0.713683,-0.532197,0.321097,0.904227,-0.421797,1.0
3,0.490473,-1.914407,-3.616364,-4.318823,-4.268016,-3.88111,-2.99328,-1.671131,-1.333884,-0.965629,...,0.350816,0.499111,0.600345,0.842069,0.952074,0.990133,1.086798,1.403011,-0.383564,1.0
4,0.800232,-0.874252,-2.384761,-3.973292,-4.338224,-3.802422,-2.53451,-1.783423,-1.59445,-0.753199,...,1.148884,0.958434,1.059025,1.371682,1.277392,0.960304,0.97102,1.614392,1.421456,1.0


The dataset has 140 columns that hold the ECG readings, plus a final label column encoded as 0 or 1 to indicate whether the ECG is abnormal (0) or normal (1).

In [2]:
from src.utils import plotters

# Preview the first ten recordings; the trailing label column is excluded.
plotters.show_traces(df.iloc[:10, :-1])

Split the data into training, validation, and test sets.

In [3]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the last column holds the label.
data = df.iloc[:, :-1].to_numpy()
labels = df.iloc[:, -1].to_numpy()

# Stage 1: hold out 20% of all rows as the test set.
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, random_state=21, test_size=0.2)

# Stage 2: carve 25% of the remaining rows off as the validation set.
train_data, val_data, train_labels, val_labels = train_test_split(
    train_data, train_labels, random_state=21, test_size=0.25)

# Report how many rows ended up in each split.
for caption, subset in (
    ("Training set size:", train_data),
    ("Validation set size:", val_data),
    ("Test set size:", test_data),
):
    print(caption, len(subset))

Training set size: 2998
Validation set size: 1000
Test set size: 1000


Investigate the distribution of the data.

In [13]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from plotly import graph_objects as go
from src.utils import plotters

# Fit each scaler on the training split only, then transform that same split.
scaler_standard = StandardScaler().fit(train_data)
train_data_standardized = scaler_standard.transform(train_data)

scaler_minmax = MinMaxScaler().fit(train_data)
train_data_normalized = scaler_minmax.transform(train_data)

# Overlay the density of the first feature (index 0) under each treatment.
# To inspect another feature, repeat the call with a different feature_idx
# (e.g. 2 or 3) and a matching title.
plotters.plot_overlay_density(
    [train_data, train_data_standardized, train_data_normalized],
    ["Original", "Standardized", "Normalized"],
    feature_idx=0,
    title="Feature 1 Distribution Comparison",
)

**Why split before normalization/standardization?**

If you use the entire dataset to compute the scaling statistics (mean and standard deviation, or minimum and maximum), information from the validation and test sets will "leak" into the training process. This can make the model's performance on those sets seem better than it really is.

In [5]:
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling maps every training feature onto [0, 1].
# (StandardScaler is the alternative that was tried here.)
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn the per-feature minimum and maximum from the training split only...
scaler.fit(train_data)

# ...then apply that same mapping to all three splits.
train_data, val_data, test_data = (
    scaler.transform(split) for split in (train_data, val_data, test_data)
)

**TensorFlow 3D Array and Axis Example**

This example demonstrates how the `axis` parameter in TensorFlow's `reduce_sum` function works using a 3D array. The array has a shape of `(2, 3, 4)`, containing 2 matrices, each with dimensions `(3, 4)`. 


**Input Array**

    array_3d = tf.constant([
        [
            [1, 2, 3, 4],    # Row 1 of Matrix 1
            [5, 6, 7, 8],    # Row 2 of Matrix 1
            [9, 10, 11, 12]  # Row 3 of Matrix 1
        ],
        [
            [13, 14, 15, 16],  # Row 1 of Matrix 2
            [17, 18, 19, 20],  # Row 2 of Matrix 2
            [21, 22, 23, 24]   # Row 3 of Matrix 2
        ]
    ], dtype=tf.int32)  # Shape: (2, 3, 4)

**Sum along axis=0:**

        Axis 0: Corresponding elements from the 2 matrices are summed
        resulting in 3 rows and 4 columns.
        [[14, 16, 18, 20],  # Row 1
        [22, 24, 26, 28],  # Row 2
        [30, 32, 34, 36]]  # Row 3

**Sum along axis=1:**

        Axis 1: Within each matrix, the 3 rows are summed element-wise,
        leaving one row of 4 column totals per matrix.
        [[15, 18, 21, 24],  # Matrix 1
        [51, 54, 57, 60]]  # Matrix 2

**Sum along axis=2:**

        Axis 2: Within each row, the 4 column values are summed,
        leaving one total per row (3 totals per matrix).
        [[10, 26, 42],  # Matrix 1
        [58, 74, 90]]  # Matrix 2

Separate the data for normal and abnormal ECGs

In [None]:
# Labels arrive as 0/1; cast them to booleans so they can be used as row masks.
train_labels = train_labels.astype(bool)
val_labels = val_labels.astype(bool)
test_labels = test_labels.astype(bool)

# Training split: rows whose label is True are normal ECGs, the rest abnormal.
normal_train_data = train_data[train_labels]
abnormal_train_data = train_data[~train_labels]

# Validation split.
normal_val_data = val_data[val_labels]
abnormal_val_data = val_data[~val_labels]

# Test split.
normal_test_data = test_data[test_labels]
abnormal_test_data = test_data[~test_labels]

# Show the first 10 traces of each class from the training split.
for traces, caption in (
    (normal_train_data, "Normal ECGs"),
    (abnormal_train_data, "Abnormal ECGs"),
):
    plotters.show_traces(pd.DataFrame(traces).iloc[0:10, :], title=caption)

#### 2. Create autoencoder model

An autoencoder is an unsupervised artificial neural network that encodes the data by compressing it into a lower-dimensional representation (the bottleneck layer, or code) and then decodes that representation to reconstruct the original input. The bottleneck layer (or code) holds the compressed representation of the input data.

In [7]:
import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model

# Define the model!
class detector(Model):
  """Fully connected autoencoder used to reconstruct ECG traces.

  The encoder narrows the input through Dense layers of 32, 16 and
  `latent_dim` units; the decoder mirrors it with 16, 32 and `input_dim`
  units. The last decoder layer uses a sigmoid, so reconstructed values lie
  in (0, 1).

  Args:
    input_dim: Number of values per input sample, and therefore the width of
      the reconstruction. Defaults to 140.
    latent_dim: Width of the bottleneck (code) layer. Defaults to 8.
  """

  def __init__(self, input_dim=140, latent_dim=8):
    super().__init__()
    # Encoder: compress the input down to the bottleneck code.
    self.encoder = tf.keras.Sequential([
      layers.Dense(32, activation='relu'),
      layers.Dense(16, activation='relu'),
      layers.Dense(latent_dim, activation='relu'),
    ])
    # Decoder: expand the code back to the original input width.
    self.decoder = tf.keras.Sequential([
      layers.Dense(16, activation='relu'),
      layers.Dense(32, activation='relu'),
      layers.Dense(input_dim, activation='sigmoid'),
    ])

  def call(self, x):
    """Encode `x` to the bottleneck code and return its reconstruction."""
    encoded = self.encoder(x)
    decoded = self.decoder(encoded)
    return decoded

Comparison of Activation Functions

In [14]:
import numpy as np
import plotly.graph_objects as go

# Sample the input axis and precompute the pieces several curves share.
x = np.linspace(-5, 5, 1000)
exp_pos = np.exp(x)
exp_neg = np.exp(-x)
softplus = np.log(1 + exp_pos)
is_positive = x > 0

# Curve name -> activation evaluated on x. Insertion order sets legend order.
activation_functions = {
    'ReLU': np.maximum(0, x),
    'Leaky ReLU': np.where(is_positive, x, 0.01 * x),
    'ELU': np.where(is_positive, x, 1.0 * (exp_pos - 1)),
    'SELU': np.where(is_positive, 1.0507 * x, 1.0507 * 1.67326 * (exp_pos - 1)),
    # The negative slope is fixed at 0.2 here purely for plotting.
    'PReLU': np.where(is_positive, x, 0.2 * x),
    'Swish': x / (1 + exp_neg),
    # tanh-based approximation of GELU.
    'GELU': 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3))),
    'Softplus': softplus,
    'Mish': x * np.tanh(softplus),
    'Sigmoid (traditional)': 1 / (1 + exp_neg),
    'Tanh (traditional)': np.tanh(x),
}

# One line trace per activation function.
fig = go.Figure(
    data=[
        go.Scatter(x=x, y=curve, mode='lines', name=label)
        for label, curve in activation_functions.items()
    ]
)

# Titles, axis labels and theme.
fig.update_layout(
    title='Comparison of Activation Functions',
    xaxis_title='Input',
    yaxis_title='Output',
    legend_title='Activation Functions',
    template='plotly_white',
)

fig.show()


#### 3. Train the model

In [9]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling repeat on every Restart & Run All.
# 21 matches the random_state used for the train/validation/test split.
# NOTE(review): bit-exact repeatability on GPU may additionally need
# tf.config.experimental.enable_op_determinism() — confirm if required.
tf.keras.utils.set_random_seed(21)

# Build a fresh, untrained autoencoder. `detector`, normal_train_data and
# normal_val_data must already be defined by earlier cells.
autoencoder = detector()

# Adam optimizer with mean squared reconstruction error as the loss.
autoencoder.compile(optimizer='adam', loss='mse')

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=10,         # Stop if no improvement for 10 epochs
    restore_best_weights=True  # Roll back to the best-scoring epoch's weights
)

# Learning rate scheduler to reduce the learning rate on plateau
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',  # Monitor validation loss
    factor=0.5,          # Reduce learning rate by a factor of 0.5
    patience=5,          # Wait for 5 epochs with no improvement
    min_lr=1e-6          # Minimum learning rate
)

# Train on normal ECGs only. The input is also the target, so the network
# learns to reconstruct normal traces; validation uses normal ECGs as well.
history = autoencoder.fit(
    normal_train_data,
    normal_train_data,
    epochs=200,
    batch_size=512,
    validation_data=(normal_val_data, normal_val_data),
    callbacks=[early_stopping, lr_scheduler]
)

Epoch 1/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 62ms/step - loss: 0.0310 - val_loss: 0.0291
Epoch 2/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 0.0287 - val_loss: 0.0270
Epoch 3/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0264 - val_loss: 0.0247
Epoch 4/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0242 - val_loss: 0.0223
Epoch 5/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0218 - val_loss: 0.0198
Epoch 6/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0191 - val_loss: 0.0172
Epoch 7/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0165 - val_loss: 0.0147
Epoch 8/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0141 - val_loss: 0.0125
Epoch 9/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x203f2f98c10>

- Add a training loss curve, early stopping, and a learning rate scheduler
- Show the error during evaluation

#### 4. Evaluate the model on the test set

In [10]:
# Reconstruction loss on the normal ECGs of the test split; the input doubles
# as the target because the model is an autoencoder.
test_loss = autoencoder.evaluate(x=normal_test_data, y=normal_test_data)
print(f"Test Loss (MSE): {test_loss}")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0015 
Test Loss (MSE): 0.0014119630213826895


In [11]:
import plotly.graph_objects as go
import numpy as np

# Pass the normal test ECGs through the encoder, then decode the codes back.
enc_img = autoencoder.encoder(normal_test_data)
dec_img = autoencoder.decoder(enc_img)

# Take the first normal test sample and its reconstruction.
input_data = normal_test_data[0]
reconstructed_data = dec_img[0]

# Compute the absolute point-wise reconstruction error.
# It is not passed to the plotting call below.
error = np.abs(input_data - reconstructed_data)
# Plot the input alongside its reconstruction.
# NOTE(review): presumably the helper draws both traces together — confirm in src.utils.plotters.
plotters.compare_org_reconstructed(input_data, reconstructed_data)


In [12]:

# Pass the abnormal test ECGs through the encoder, then decode the codes back.
# The model was trained on normal ECGs only.
enc_img = autoencoder.encoder(abnormal_test_data)
dec_img = autoencoder.decoder(enc_img)

# Take the first abnormal test sample and its reconstruction.
input_data = abnormal_test_data[0]
reconstructed_data = dec_img[0]

# Compute the absolute point-wise reconstruction error.
# It is not passed to the plotting call below.
error = np.abs(input_data - reconstructed_data)
# Plot the input alongside its reconstruction.
# NOTE(review): presumably the helper draws both traces together — confirm in src.utils.plotters.
plotters.compare_org_reconstructed(input_data, reconstructed_data)