#  Anomaly Detection in Time-Series

### Use autoencoder to detect anomalies in ECG time-series data.

#### 1. Prepare the data

In [1]:
import pandas as pd

df = pd.read_csv("http://storage.googleapis.com/" 
                 + "download.tensorflow.org/data/ecg.csv",
                 header=None)
print(df.shape)
df.head()

(4998, 141)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,131,132,133,134,135,136,137,138,139,140
0,-0.112522,-2.827204,-3.773897,-4.349751,-4.376041,-3.474986,-2.181408,-1.818286,-1.250522,-0.477492,...,0.792168,0.933541,0.796958,0.578621,0.25774,0.228077,0.123431,0.925286,0.193137,1.0
1,-1.100878,-3.99684,-4.285843,-4.506579,-4.022377,-3.234368,-1.566126,-0.992258,-0.75468,0.042321,...,0.538356,0.656881,0.78749,0.724046,0.555784,0.476333,0.77382,1.119621,-1.43625,1.0
2,-0.567088,-2.59345,-3.87423,-4.584095,-4.187449,-3.151462,-1.74294,-1.490659,-1.18358,-0.394229,...,0.886073,0.531452,0.311377,-0.021919,-0.713683,-0.532197,0.321097,0.904227,-0.421797,1.0
3,0.490473,-1.914407,-3.616364,-4.318823,-4.268016,-3.88111,-2.99328,-1.671131,-1.333884,-0.965629,...,0.350816,0.499111,0.600345,0.842069,0.952074,0.990133,1.086798,1.403011,-0.383564,1.0
4,0.800232,-0.874252,-2.384761,-3.973292,-4.338224,-3.802422,-2.53451,-1.783423,-1.59445,-0.753199,...,1.148884,0.958434,1.059025,1.371682,1.277392,0.960304,0.97102,1.614392,1.421456,1.0


 The dataset has 140 columns which represents the ECG readings and a labels column which has been encoded to 0 or 1 showing whether the ECG is abnormal or normal.

In [2]:
from src.utils import plotters

plotters.show_traces(df.iloc[0:10, :-1])

Split the data into training and testing sets.

In [3]:
from sklearn.model_selection import train_test_split

# Separate the data and labels
data = df.iloc[:,:-1].values
labels = df.iloc[:,-1].values

# Split the data into training, validation, and test sets
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.2, random_state=21)

# Further split the training data into training and validation sets
train_data, val_data, train_labels, val_labels = train_test_split(
    train_data, train_labels, test_size=0.25, random_state=21)  # 验证集占训练集的 25%

print("Training set size:", len(train_data))
print("Validation set size:", len(val_data))
print("Test set size:", len(test_data))

Training set size: 2998
Validation set size: 1000
Test set size: 1000


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Create a standard scaler
# scaler = StandardScaler()
scaler = MinMaxScaler(feature_range=(0, 1))  # 将特征缩放到 [0, 1] 范围

# Fit and transform the training data
train_data = scaler.fit_transform(train_data)
val_data = scaler.transform(val_data)
test_data = scaler.transform(test_data)
# Reshape the data to (num_samples, 140, 1)
test_data = np.expand_dims(test_data, axis=-1)

Separate the data for normal and abnormal ECGs

In [5]:
import numpy as np

# [0 or 1] showing whether the ECG is [abnormal or normal].

#The labels are either 0 or 1, so I will convert them into boolean(true or false) 
train_labels = train_labels.astype(bool)
num_true = np.sum(train_labels)
num_false = len(train_labels) - num_true
print(f"True: {num_true}, False: {num_false}")

val_labels = val_labels.astype(bool)
num_true = np.sum(val_labels)
num_false = len(val_labels) - num_true
print(f"True: {num_true}, False: {num_false}")

test_labels = test_labels.astype(bool)
num_true = np.sum(test_labels)
num_false = len(test_labels) - num_true
print(f"True: {num_true}, False: {num_false}")

# Separate the data for normal ECG from that of abnormal ones
# Normal ECG data
normal_train_data = train_data[train_labels]
normal_val_data = val_data[val_labels]
normal_test_data = test_data[test_labels]

# Reshape the data to (num_samples, 140, 1)
normal_train_data = np.expand_dims(normal_train_data, axis=-1)
normal_val_data = np.expand_dims(normal_val_data, axis=-1)
normal_test_data = np.expand_dims(normal_test_data, axis=-1)

# Abnormal ECG data
abnormal_train_data = train_data[~train_labels]
abnormal_val_data = val_data[~val_labels]
abnormal_test_data = test_data[~test_labels]

# Plot the first 10 normal and abnormal ECGs
plotters.show_traces(pd.DataFrame(normal_train_data.squeeze(-1)).iloc[0:10, :],
                     title="Normal ECGs")
plotters.show_traces(pd.DataFrame(abnormal_train_data).iloc[0:10, :],
                     title="Abnormal ECGs")

True: 1771, False: 1227
True: 588, False: 412
True: 560, False: 440


#### 2. Create autoencoder model

AutoEncoder is an unsupervised Artificial Neural Network that attempts to encode the data by compressing it into the lower dimensions (bottleneck layer or code) and then decoding the data to reconstruct the original input. The bottleneck layer (or code) holds the compressed representation of the input data.

In [6]:
import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model

# Define the model with 1D CNN!

# input_data = tf.random.normal((32, 140, 1)) 
# 32 samples, sequence length 140, 1 feature
# (batch_size, sequence_length, num_features)

class CNNDetector(Model):
    def __init__(self):
        super(CNNDetector, self).__init__()
        
        # Encoder with 1D CNN
        self.encoder = tf.keras.Sequential([
            layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="tanh"),
            layers.MaxPooling1D(pool_size=2, strides=2, padding="same"),
            layers.Dropout(0.2),
            layers.Conv1D(64, kernel_size=3, strides=1, padding="same", activation="tanh"),
            layers.MaxPooling1D(pool_size=2, strides=2, padding="same"),
            layers.Dropout(0.2),
            layers.Conv1D(32, kernel_size=3, strides=1, padding="same", activation="tanh"),
            layers.MaxPooling1D(pool_size=2, strides=2, padding="same"),
            layers.Dropout(0.2),
            layers.Conv1D(16, kernel_size=3, strides=1, padding="same", activation="tanh"),
            layers.GlobalAveragePooling1D()  # Flatten the output to a latent vector
        ])
        
        # Decoder with 1D CNN
        self.decoder = tf.keras.Sequential([
            layers.Dense(35 * 16, activation="tanh"),  # 将潜在向量展开为一个特定的形状
            layers.Reshape((35, 16)),                 # 35 是根据编码器的下采样比例计算出来的
            layers.Conv1DTranspose(32, kernel_size=3, strides=1, padding="same", activation="tanh"),
            layers.UpSampling1D(size=2),             # 时间步数变为 70
            layers.Conv1DTranspose(64, kernel_size=3, strides=1, padding="same", activation="tanh"),
            layers.UpSampling1D(size=2),             # 时间步数变为 140
            layers.Conv1D(1, kernel_size=3, strides=1, padding="same", activation="tanh")  # 输出形状为 (batch_size, 140, 1)
        ])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


#### 3. Train the model

In [7]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import plotly.graph_objects as go

# Assuming `detector` is already defined and normal_train_data, normal_val_data are prepared
autoencoder = CNNDetector()

# Custom weighted MSE loss
def weighted_mse(y_true, y_pred):
    error = tf.square(y_true - y_pred)
    weights = tf.where(error > 0.015, 5.0, 1.0)  # 放大大于 0.02 的误差
    return tf.reduce_mean(weights * error)


# Early stopping to prevent overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=10,         # Stop if no improvement for 10 epochs
    restore_best_weights=True
)

# Learning rate scheduler to reduce the learning rate on plateau
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',  # Monitor validation loss
    factor=0.5,          # Reduce learning rate by a factor of 0.5
    patience=5,          # Wait for 5 epochs with no improvement
    min_lr=1e-6          # Minimum learning rate
)

tf.keras.optimizers.Adam(learning_rate=0.001)

# Compile the model
autoencoder.compile(optimizer='adam', loss=weighted_mse)

# Train the model
history = autoencoder.fit(
    normal_train_data,
    normal_train_data,
    epochs=20,
    batch_size=128,
    validation_data=(normal_val_data, normal_val_data),
    callbacks=[early_stopping, lr_scheduler]
)

# Plot training and validation loss using Plotly
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        y=history.history['loss'],
        mode='lines',
        name='Training Loss'
    )
)
fig.add_trace(
    go.Scatter(
        y=history.history['val_loss'],
        mode='lines',
        name='Validation Loss'
    )
)
fig.update_layout(
    title='Training and Validation Loss',
    xaxis_title='Epochs',
    yaxis_title='Loss (MSE)',
    template='plotly_white'
)
fig.show()

AttributeError: module 'ml_dtypes' has no attribute 'float8_e3m4'
Epoch 1/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step - loss: 1.0037 - val_loss: 0.0944 - learning_rate: 0.0010
Epoch 2/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 0.0768 - val_loss: 0.0432 - learning_rate: 0.0010
Epoch 3/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 0.0365 - val_loss: 0.0268 - learning_rate: 0.0010
Epoch 4/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 0.0245 - val_loss: 0.0219 - learning_rate: 0.0010
Epoch 5/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 0.0208 - val_loss: 0.0211 - learning_rate: 0.0010
Epoch 6/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 0.0198 - val_loss: 0.0206 - learning_rate: 0.0010
Epoch 7/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2

In [8]:
import numpy as np
import plotly.graph_objects as go
from sklearn.manifold import TSNE

skip = True
if not skip:
    # 假设 model.encoder(normal_data) 和 model.encoder(anomalous_data) 已经得到了潜在空间表示
    latent_space_normal = autoencoder.encoder(normal_train_data).numpy()
    latent_space_anomaly = autoencoder.encoder(abnormal_train_data).numpy()

    # 使用 t-SNE 将高维数据降到 2D
    tsne = TSNE(n_components=2, random_state=42)
    latent_space_2d = tsne.fit_transform(np.concatenate([latent_space_normal, latent_space_anomaly]))

    # 分离降维后的正常和异常数据
    latent_space_2d_normal = latent_space_2d[:len(latent_space_normal)]
    latent_space_2d_anomaly = latent_space_2d[len(latent_space_normal):]

    # 使用 Plotly 绘图
    fig = go.Figure()

    # 添加正常样本的点
    fig.add_trace(go.Scatter(
        x=latent_space_2d_normal[:, 0],
        y=latent_space_2d_normal[:, 1],
        mode='markers',
        name='Normal',
        marker=dict(color='blue', size=5, opacity=0.8)
    ))

    # 添加异常样本的点
    fig.add_trace(go.Scatter(
        x=latent_space_2d_anomaly[:, 0],
        y=latent_space_2d_anomaly[:, 1],
        mode='markers',
        name='Anomalous',
        marker=dict(color='red', size=5, opacity=0.8)
    ))

    # 更新布局
    fig.update_layout(
        title="t-SNE Visualization of Latent Space",
        xaxis_title="t-SNE Component 1",
        yaxis_title="t-SNE Component 2",
        legend=dict(title="Sample Type"),
        template="plotly_white"
    )

    # 显示图形
    fig.show()

Calculate hte threshold with the mean value and standard deviation of the reconstruction loss, or use the 95 percentile.

In [9]:
# Set threshold based on training reconstruction error
train_reconstructed = autoencoder.predict(normal_train_data)
train_reconstruction_error = tf.reduce_mean(
    tf.square(normal_train_data - train_reconstructed),
    axis=1).numpy()
threshold = train_reconstruction_error.mean() + 1 * train_reconstruction_error.std()
print(f"Reconstruction Error Threshold 1 std: {threshold}")

# # Set threshold at the 95th percentile
# threshold = np.percentile(train_reconstruction_error, 95)
# print(f"Reconstruction Error Threshold percentile: {threshold}")


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Reconstruction Error Threshold 1 std: 0.01203011546071514


In [10]:
# Reconstruct the test data to calculate reconstruction error
reconstructed_data = autoencoder.predict(test_data)
reconstruction_error = tf.reduce_mean(
    tf.square(test_data - reconstructed_data),
    axis=1
    ).numpy().squeeze(-1)

# Separate normal and anomaly samples based on test_labels
normal_indices = (test_labels == 1)
anomaly_indices = (test_labels == 0)

# Plot reconstruction errors and threshold
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=np.arange(len(reconstruction_error))[normal_indices],
        y=reconstruction_error[normal_indices],
        mode='markers',
        name='Normal Samples',        
        marker=dict(color='blue')
    )
)
fig.add_trace(
    go.Scatter(
        x=np.arange(len(reconstruction_error))[anomaly_indices],
        y=reconstruction_error[anomaly_indices],
        mode='markers',
        name='Anomaly Samples',        
        marker=dict(color='red')
    )
)
fig.add_trace(
    go.Scatter(
        y=[threshold] * len(reconstruction_error), mode='lines', name='Threshold'
    )
)
fig.update_layout(
    title='Reconstruction Errors and Threshold',
    xaxis_title='Sample Index',
    yaxis_title='Reconstruction Error',
    template='plotly_white'
)
fig.show()

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [11]:
from sklearn.metrics import confusion_matrix

# [0 or 1] showing whether the ECG is [abnormal or normal].
# Test dataset: True: 560, False(anomaly): 440

# Use threshold to classify anomalies
y_pred = (reconstruction_error > threshold).astype(int)
y_pred = 1 - y_pred  # 0: Anomaly, 1: Normal
y_true = test_labels

# Calculate confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=[0, 1]) # 0: Anomaly, 1: Normal
print("Confusion Matrix:\n", cm)

# Extract confusion matrix values
TN, FP = cm[0, 0], cm[1, 0]  # TN: Anomaly correctly classified, FP: Anomaly misclassified as Normal
FN, TP = cm[0, 1], cm[1, 1]  # FN: Normal misclassified as Anomaly, TP: Normal correctly classified

# Correctly reorder the confusion matrix for visualization
# custom_cm = [[TP, FP], [FN, TN]]
custom_cm = [[FN, TN], [TP, FP]]
print("Custom Confusion Matrix:\n", custom_cm)

# Define labels
labels = ["Normal (Positive)", "Anomaly (Negative)"]

fig = go.Figure(
    data=go.Heatmap(
        z=custom_cm,
        x=labels,
        y=labels,
        colorscale="Blues",
        texttemplate="%{z}",
        textfont={"size": 14}
    )
)
fig.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted Labels",
    yaxis_title="True Labels",
    template="plotly_white",
    xaxis=dict(tickmode="array", tickvals=[1, 0], ticktext=labels),
    yaxis=dict(tickmode="array", tickvals=[1, 0], ticktext=labels),
)
fig.show()

Confusion Matrix:
 [[429  11]
 [ 35 525]]
Custom Confusion Matrix:
 [[np.int64(11), np.int64(429)], [np.int64(525), np.int64(35)]]


Model evaluation metrics

In [12]:
# Accuracy
accuracy = (TP + TN) / (TP + TN + FP + FN)

# Precision
precision = TP / (TP + FP) if (TP + FP) > 0 else 0

# Recall (Sensitivity)
recall = TP / (TP + FN) if (TP + FN) > 0 else 0

# Specificity
specificity = TN / (TN + FP) if (TN + FP) > 0 else 0

# F1 Score
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall (Sensitivity): {recall:.2f}")
print(f"Specificity: {specificity:.2f}")
print(f"F1 Score: {f1_score:.2f}")

Accuracy: 0.95
Precision: 0.94
Recall (Sensitivity): 0.98
Specificity: 0.92
F1 Score: 0.96


calculate_probabilities

In [13]:
from sklearn.metrics import precision_recall_curve, auc, roc_curve
import plotly.graph_objects as go

def calculate_probabilities(reconstruction_error, threshold):
    return 1 / (1 + np.exp(-(reconstruction_error - threshold)))

probabilities = calculate_probabilities(reconstruction_error, threshold)

# Min-Max Normalization
min_error = np.min(reconstruction_error)
max_error = np.max(reconstruction_error)
normalized_error = (reconstruction_error - min_error) / (max_error - min_error)

# Z-Score Normalization
mean_error = np.mean(reconstruction_error)
std_error = np.std(reconstruction_error)
z_scores = (reconstruction_error - mean_error) / std_error
standarlized_error = 1 / (1 + np.exp(-z_scores))

# # log error
# log_error = np.log1p(reconstruction_error)  # 使用对数扩展

probabilities = standarlized_error

Plot reconstruction loss distribution

In [14]:
import plotly.graph_objects as go
import numpy as np

y_true_binary = np.array(y_true, dtype=int)

# Negative and Positive samples based on y_true_binary
negative_samples = reconstruction_error[y_true_binary == 0]
positive_samples = reconstruction_error[y_true_binary == 1]

# Create histogram for Negative Samples
hist_negative = go.Histogram(
    x=negative_samples,
    nbinsx=50,
    opacity=0.3,
    name="Negative Samples",
    marker_color="blue"
)

# Create histogram for Positive Samples
hist_positive = go.Histogram(
    x=positive_samples,
    nbinsx=50,
    opacity=0.3,
    name="Positive Samples",
    marker_color="red"
)

# Create figure and layout
fig = go.Figure()
fig.add_trace(hist_negative)
fig.add_trace(hist_positive)

# Update layout
fig.update_layout(
    title="Reconstruction Error Distribution",
    xaxis_title="Reconstruction Error",
    yaxis_title="Frequency",
    barmode="overlay",  # Overlay histograms
    template="plotly_white"
)

# Show figure
fig.show()


The model's ability to distinguish between positive and negative samples is insufficient:

- This might be because the autoencoder's learning capacity is limited and it fails to effectively capture the differences between positive and negative samples.

- Adding more layers or neurons to the encoder part of the autoencoder can enhance the model's ability to represent the data.

Plot precision-recall curve

In [15]:
y_true_binary = np.array(y_true, dtype=int)

# Calculate precision-recall values
precision_values, recall_values, _ = precision_recall_curve(y_true_binary, probabilities)
pr_auc = auc(recall_values, precision_values)

# Calculate ROC curve values
fpr, tpr, _ = roc_curve(y_true_binary, probabilities)
roc_auc = auc(fpr, tpr)

# Plot PR curve
fig_pr = go.Figure()
fig_pr.add_trace(go.Scatter(
    x=recall_values, y=precision_values,
    mode='lines', name='PR Curve',
    line=dict(color='blue', width=2)
))
fig_pr.add_trace(go.Scatter(
    x=[0, 1], y=[1, 0],
    mode='lines', name='Random Classifier',
    line=dict(dash='dash', color='gray')
))
fig_pr.update_layout(
    title=f'Precision-Recall Curve (AUC: {pr_auc:.2f})',
    xaxis_title='Recall',
    yaxis_title='Precision',
    template='plotly_white'
)
fig_pr.show()

# Plot ROC curve
fig_roc = go.Figure()
fig_roc.add_trace(go.Scatter(
    x=fpr, y=tpr,
    mode='lines', name='ROC Curve',
    line=dict(color='red', width=2)
))
fig_roc.add_trace(go.Scatter(
    x=[0, 1], y=[0, 1],
    mode='lines', name='Random Classifier',
    line=dict(dash='dash', color='gray')
))
fig_roc.update_layout(
    title=f'Receiver Operating Characteristic (AUC: {roc_auc:.2f})',
    xaxis_title='False Positive Rate (FPR)',
    yaxis_title='True Positive Rate (TPR)',
    template='plotly_white'
)
fig_roc.show()

# Print AUC values
print(f"PR AUC: {pr_auc:.2f}")
print(f"ROC AUC: {roc_auc:.2f}")


PR AUC: 0.36
ROC AUC: 0.03


In [36]:
import plotly.graph_objects as go
import numpy as np

enc_img = autoencoder.encoder(normal_test_data)
dec_img = autoencoder.decoder(enc_img)

input_data = normal_test_data[0].squeeze(-1)
reconstructed_data = dec_img[0]
# 确保数据的维度正确
input_data = tf.squeeze(input_data)
reconstructed_data = tf.squeeze(reconstructed_data)
print(f"Input data shape before squeeze: {input_data.shape}")
print(f"Reconstructed data shape: {reconstructed_data.shape}")

# Plot the error between the input and the reconstructed data
plotters.compare_org_reconstructed(input_data, reconstructed_data)


Input data shape before squeeze: (140,)
Reconstructed data shape: (140,)


In [41]:
enc_img = autoencoder.encoder(abnormal_test_data)
dec_img = autoencoder.decoder(enc_img)

input_data = abnormal_test_data[0].squeeze(-1)
reconstructed_data = dec_img[0]
input_data = tf.squeeze(input_data)
reconstructed_data = tf.squeeze(reconstructed_data)
print(f"Input data shape before squeeze: {input_data.shape}")
print(f"Reconstructed data shape: {reconstructed_data.shape}")

# Plot the error between the input and the reconstructed data
plotters.compare_org_reconstructed(input_data, reconstructed_data)

Input data shape before squeeze: (140,)
Reconstructed data shape: (140,)
