#  Anomaly Detection in Time-Series

### Use autoencoder to detect anomalies in ECG time-series data.

#### 1. Prepare the data

In [1]:
import pandas as pd

# ECG dataset hosted under download.tensorflow.org. It is fetched over HTTPS
# so the file cannot be altered in transit.
ECG_CSV_URL = "https://storage.googleapis.com/download.tensorflow.org/data/ecg.csv"

# header=None: the first row is data, so pandas assigns integer column names.
df = pd.read_csv(ECG_CSV_URL, header=None)
print(df.shape)
df.head()

(4998, 141)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,131,132,133,134,135,136,137,138,139,140
0,-0.112522,-2.827204,-3.773897,-4.349751,-4.376041,-3.474986,-2.181408,-1.818286,-1.250522,-0.477492,...,0.792168,0.933541,0.796958,0.578621,0.25774,0.228077,0.123431,0.925286,0.193137,1.0
1,-1.100878,-3.99684,-4.285843,-4.506579,-4.022377,-3.234368,-1.566126,-0.992258,-0.75468,0.042321,...,0.538356,0.656881,0.78749,0.724046,0.555784,0.476333,0.77382,1.119621,-1.43625,1.0
2,-0.567088,-2.59345,-3.87423,-4.584095,-4.187449,-3.151462,-1.74294,-1.490659,-1.18358,-0.394229,...,0.886073,0.531452,0.311377,-0.021919,-0.713683,-0.532197,0.321097,0.904227,-0.421797,1.0
3,0.490473,-1.914407,-3.616364,-4.318823,-4.268016,-3.88111,-2.99328,-1.671131,-1.333884,-0.965629,...,0.350816,0.499111,0.600345,0.842069,0.952074,0.990133,1.086798,1.403011,-0.383564,1.0
4,0.800232,-0.874252,-2.384761,-3.973292,-4.338224,-3.802422,-2.53451,-1.783423,-1.59445,-0.753199,...,1.148884,0.958434,1.059025,1.371682,1.277392,0.960304,0.97102,1.614392,1.421456,1.0


The dataset has 140 columns that represent the ECG readings, plus a final label column encoded as 0 (abnormal ECG) or 1 (normal ECG).

In [2]:
from src.utils import plotters

# Preview the first ten recordings; the last column (the label) is left out.
first_ten_traces = df.iloc[:10, :-1]
plotters.show_traces(first_ten_traces)

Split the data into training, validation, and test sets.

In [3]:
from sklearn.model_selection import train_test_split

# Features are every column except the last one; the last column is the label.
data = df.iloc[:, :-1].values
labels = df.iloc[:, -1].values

# Step 1: hold out 20% of all rows as the test set.
train_data, test_data, train_labels, test_labels = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=21,
)

# Step 2: carve the validation set out of the remaining rows.
# The validation set is 25% of the remaining training rows.
train_data, val_data, train_labels, val_labels = train_test_split(
    train_data,
    train_labels,
    test_size=0.25,
    random_state=21,
)

# Report how many rows ended up in each split.
for caption, split in (
    ("Training set size:", train_data),
    ("Validation set size:", val_data),
    ("Test set size:", test_data),
):
    print(caption, len(split))

Training set size: 2998
Validation set size: 1000
Test set size: 1000


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from plotly import graph_objects as go

# Standardization and min-max normalization, each fitted on the training
# split, so the two rescalings can be compared side by side.
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()

train_data_standardized = scaler_standard.fit_transform(train_data)
train_data_normalized = scaler_minmax.fit_transform(train_data)

def plot_overlay_density(data_list, labels, feature_idx, title):
    """
    Draw overlaid density histograms of a single feature with Plotly.

    One semi-transparent histogram trace is added per dataset so the
    distribution of the same feature under different preprocessing methods
    can be compared in one figure. The figure is shown, not returned.

    Args:
        data_list: Iterable of 2-D arrays indexable as ``data[:, feature_idx]``.
        labels: Iterable of legend names, one per entry of ``data_list``.
        feature_idx: Column index of the feature to plot.
        title: Figure title.

    Raises:
        ValueError: If ``data_list`` and ``labels`` have different lengths.
    """
    # Materialize both inputs so their lengths can be compared; a bare zip()
    # would silently drop the unmatched datasets or labels.
    data_list = list(data_list)
    labels = list(labels)
    if len(data_list) != len(labels):
        raise ValueError(
            f"data_list has {len(data_list)} entries but labels has "
            f"{len(labels)}; they must match one-to-one"
        )

    fig = go.Figure()
    for data, label in zip(data_list, labels):
        fig.add_trace(go.Histogram(
            x=data[:, feature_idx],
            histnorm='density',
            name=label,
            opacity=0.7  # keep overlapping bars visible through each other
        ))
    fig.update_layout(
        title=title,
        xaxis_title="Value",
        yaxis_title="Density",
        barmode='overlay',  # draw the histograms on top of each other
        template="plotly_white"
    )
    fig.show()

# Compare the distribution of the first feature under each preprocessing method.
plot_overlay_density(
    data_list=[train_data, train_data_standardized, train_data_normalized],
    labels=["Original", "Standardized", "Normalized"],
    feature_idx=0,  # index 0 is the first feature
    title="Feature 1 Distribution Comparison",
)

**Why split before standardizing?**

If you use the entire dataset to calculate the mean and standard deviation, information from the test set will "leak" into the training process. This can make the model's performance on the test set seem better than it really is.

In [4]:
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler that maps each feature of the training split to [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn the per-feature min/max from the training split only, then apply that
# same transformation to every split. Validation/test values are scaled with
# the training statistics, so they are not guaranteed to stay inside [0, 1].
scaler.fit(train_data)
train_data = scaler.transform(train_data)
val_data = scaler.transform(val_data)
test_data = scaler.transform(test_data)

In [8]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

# Toy example: how a few outliers affect min-max-normalized data.
# Every variable is prefixed with `demo_` so this cell does not overwrite the
# ECG pipeline's `data`, `scaler`, or `df` variables defined in earlier cells.

# Generate sample data
np.random.seed(42)
demo_values = np.random.normal(50, 15, size=(1000, 1))  # normally distributed sample

# Append two extreme outliers while keeping the (n_samples, 1) column shape
demo_values_with_outliers = np.vstack([demo_values, [[200], [300]]])

# Min-max normalize the sample that contains the outliers
demo_scaler = MinMaxScaler()
demo_normalized = demo_scaler.fit_transform(demo_values_with_outliers)

# Collect the original and normalized values in a DataFrame
demo_df = pd.DataFrame({
    "Original Data": demo_values_with_outliers.flatten(),
    "Normalized Data": demo_normalized.flatten(),
})

# Plot both distributions; melt() reshapes to one (Type, Value) row per
# observation so Plotly can color the histogram by series.
fig = px.histogram(
    demo_df.melt(var_name="Type", value_name="Value"),
    x="Value",
    color="Type",
    nbins=50,
    title="Original vs Normalized Data Distribution",
)
fig.show()


**TensorFlow 3D Array and Axis Example**

Below is an example of working with a 3D array in TensorFlow to demonstrate how the `axis` parameter in `reduce_sum` works. The array has a shape of `(2, 3, 4)`, meaning it contains 2 matrices, each of size `(3, 4)`. By specifying different `axis` values, we can sum along various dimensions.

**Input Array**

    array_3d = tf.constant([
        [
            [1, 2, 3, 4],    # Row 1 of Matrix 1
            [5, 6, 7, 8],    # Row 2 of Matrix 1
            [9, 10, 11, 12]  # Row 3 of Matrix 1
        ],
        [
            [13, 14, 15, 16],  # Row 1 of Matrix 2
            [17, 18, 19, 20],  # Row 2 of Matrix 2
            [21, 22, 23, 24]   # Row 3 of Matrix 2
        ]
    ], dtype=tf.int32)  # Shape: (2, 3, 4)

**Sum along axis=0:**

        [[14, 16, 18, 20],  # Row 1
        [22, 24, 26, 28],  # Row 2
        [30, 32, 34, 36]]  # Row 3

**Sum along axis=1:**

        [[15, 18, 21, 24],  # Matrix 1
        [51, 54, 57, 60]]  # Matrix 2

**Sum along axis=2:**

        [[10, 26, 42],  # Matrix 1
        [58, 74, 90]]  # Matrix 2

Separate the data for normal and abnormal ECGs

In [6]:
# The labels are 0 or 1; cast them to booleans so they can be used as row masks.
train_labels, val_labels, test_labels = (
    split_labels.astype(bool)
    for split_labels in (train_labels, val_labels, test_labels)
)

# Rows whose label is True are the normal ECGs ...
normal_train_data = train_data[train_labels]
normal_val_data = val_data[val_labels]
normal_test_data = test_data[test_labels]

# ... and the inverted mask selects the abnormal ECGs.
abnormal_train_data = train_data[~train_labels]
abnormal_val_data = val_data[~val_labels]
abnormal_test_data = test_data[~test_labels]

# Plot the first 10 normal and the first 10 abnormal training ECGs.
for traces, caption in (
    (normal_train_data, "Normal ECG"),
    (abnormal_train_data, "Abnormal ECG"),
):
    plotters.show_traces(pd.DataFrame(traces).iloc[:10, :], title=caption)

#### 2. Create autoencoder model

An autoencoder is an unsupervised artificial neural network that encodes data by compressing it into a lower-dimensional representation (the bottleneck layer, or code) and then decodes that representation to reconstruct the original input. The bottleneck layer holds the compressed representation of the input data.

In [7]:
import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model

#Now let's define the model!
#Here I have used the Model Subclassing API (but we can also use the Sequential API)
#The model has 2 parts : 1. Encoder and 2. Decoder
 
class detector(Model):
  """Fully connected autoencoder with an encoder and a mirrored decoder.

  The encoder narrows the input through Dense layers of 32, 16 and 8 units
  (the 8-unit layer is the bottleneck); the decoder widens it again through
  16 and 32 units to an output of ``n_features`` units.

  Args:
    n_features: Number of units in the final decoder layer, i.e. the width of
      the reconstructed output. Defaults to 140, matching the 140 ECG reading
      columns used in this notebook.
  """

  def __init__(self, n_features=140):
    super().__init__()
    self.encoder = tf.keras.Sequential([
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(8, activation='relu'),  # bottleneck / code layer
    ])
    self.decoder = tf.keras.Sequential([
        layers.Dense(16, activation='relu'),
        layers.Dense(32, activation='relu'),
        # Sigmoid bounds every output to (0, 1), the range the min-max
        # scaled training data occupies.
        layers.Dense(n_features, activation='sigmoid'),
    ])

  def call(self, x):
    """Encode ``x`` to the bottleneck and return the decoder's reconstruction."""
    encoded = self.encoder(x)
    decoded = self.decoder(encoded)
    return decoded

#### 3. Train the model

In [None]:
# Compile and train the model on normal ECGs only: the input is also the
# target, so the network learns to reconstruct normal signals.
autoencoder = detector()
autoencoder.compile(optimizer='adam', loss='mse')

# Keep the History object returned by fit() instead of discarding it. It holds
# the per-epoch `loss` / `val_loss` values needed to plot training curves, and
# assigning it stops the cell from printing a bare History repr.
history = autoencoder.fit(
    normal_train_data,
    normal_train_data,
    epochs=150,
    batch_size=512,
    validation_data=(normal_val_data, normal_val_data),
)

Epoch 1/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - loss: 0.0305 - val_loss: 0.0298
Epoch 2/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0294 - val_loss: 0.0284
Epoch 3/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0280 - val_loss: 0.0266
Epoch 4/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0261 - val_loss: 0.0244
Epoch 5/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0238 - val_loss: 0.0220
Epoch 6/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0214 - val_loss: 0.0195
Epoch 7/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0188 - val_loss: 0.0169
Epoch 8/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0161 - val_loss: 0.0144
Epoch 9/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x275703599d0>

- **TODO:** add a training-loss curve, early stopping, and a learning-rate scheduler
- **TODO:** display the reconstruction error during evaluation

#### 4. Evaluate the model on the test set

In [9]:
# Reconstruction loss on the normal ECGs of the test split
# (the input doubles as the target).
test_loss = autoencoder.evaluate(x=normal_test_data, y=normal_test_data)
print(f"Test Loss (MSE): {test_loss}")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0020 
Test Loss (MSE): 0.0019699621479958296


In [None]:
import plotly.graph_objects as go
import numpy as np

# Pass the normal test ECGs through the encoder, then through the decoder.
enc_img = autoencoder.encoder(normal_test_data)
dec_img = autoencoder.decoder(enc_img)

# First normal test sample and its reconstruction.
input_data, reconstructed_data = normal_test_data[0], dec_img[0]

# Compute the error: element-wise absolute difference between input and
# reconstruction (kept in `error`; not passed to the plot below).
error = np.abs(input_data - reconstructed_data)

# Compare the original sample with its reconstruction using the project's
# plotting helper.
plotters.compare_org_reconstructed(input_data, reconstructed_data)


In [11]:

# Pass the abnormal test ECGs through the encoder, then through the decoder.
enc_img = autoencoder.encoder(abnormal_test_data)
dec_img = autoencoder.decoder(enc_img)

# First abnormal test sample and its reconstruction.
input_data, reconstructed_data = abnormal_test_data[0], dec_img[0]

# Compute the error: element-wise absolute difference between input and
# reconstruction (kept in `error`; not passed to the plot below).
error = np.abs(input_data - reconstructed_data)

# Compare the original sample with its reconstruction using the project's
# plotting helper.
plotters.compare_org_reconstructed(input_data, reconstructed_data)