# Test rig anomaly detection

## Setup

### Library imports

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow import keras
from keras import layers

In [None]:
# Switch the working directory to the project's code folder before importing
# from the `scripts` package (presumably so that the package is importable).
# NOTE(review): the path is hard-coded to one user's Windows profile and has no
# drive letter, so it only resolves on that machine/drive.
os.chdir('\\Users\\iokhotnikov\\Documents\\Python\\hhl\\test_rig\\code')
from scripts.utils.readers import DataReader, Preprocessor
from scripts.utils.config import FEATURES, FEATURES_NO_TIME

In [None]:
# Move to the project root: read_data() opens 'data/processed/...' relative to
# the current working directory.
os.chdir('\\Users\\iokhotnikov\\Documents\\Python\\hhl\\test_rig')

### Data import

In [None]:
def read_data(mode='preprocessed'):
    """Load the test-rig measurements and append running-time columns.

    Args:
        mode: Selects the data source.

            * ``'raw'`` -- read through ``DataReader.read_all_raw_data``,
              pass the result through ``Preprocessor.remove_step_zero`` and
              sort the rows by ``DATE`` and ``TIME``.
            * ``'processed'`` -- load
              ``data/processed/combined_timed_data.csv`` (relative to the
              current working directory). ``'preprocessed'`` is accepted as
              an alias so that the default argument selects this branch.

    Returns:
        The loaded ``DataFrame`` with three extra columns that treat every
        row as one second of operation:

        * ``'RUNNING TIME'`` -- timestamps one second apart, starting at
          midnight of the earliest ``DATE``;
        * ``'RUNNING DURATION'`` -- elapsed time as a timedelta;
        * ``'RUNNING HOURS'`` -- elapsed time in hours (float32).

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode == 'raw':
        df = DataReader.read_all_raw_data(verbose=True,
                                          features_to_read=FEATURES)
        df = Preprocessor.remove_step_zero(df, inplace=False)
        df.sort_values(by=['DATE', 'TIME'], inplace=True, ignore_index=True)
    elif mode in ('processed', 'preprocessed'):
        csv_path = os.path.join('data', 'processed', 'combined_timed_data.csv')
        # 'TIME' and 'DATE' are converted explicitly below, so the deprecated
        # `infer_datetime_format` flag is not passed to read_csv.
        df = pd.read_csv(csv_path,
                         parse_dates=True,
                         dtype=dict.fromkeys(FEATURES_NO_TIME, np.float32))
        id_columns = ['STEP', 'UNIT', 'TEST', 'ARMANI']
        df[id_columns] = df[id_columns].astype(np.uint8)
        df['TIME'] = pd.to_datetime(df['TIME'])
        df['DATE'] = pd.to_datetime(df['DATE'])
    else:
        # Without this branch an unknown mode left `df` unbound and failed
        # later with an unrelated UnboundLocalError.
        raise ValueError(
            f"Unknown mode {mode!r}; expected 'raw' or 'processed'")

    n_rows = len(df)
    # Take midnight of the earliest date directly from the datetime values.
    # Formatting a Timestamp into a string and appending ' 00:00:00' would
    # produce a doubled time part ('YYYY-MM-DD 00:00:00 00:00:00').
    first_day = pd.to_datetime(df['DATE']).min().normalize()
    df['RUNNING TIME'] = pd.date_range(start=first_day,
                                       periods=n_rows,
                                       freq='s')
    # One second per row; computed once and reused for both columns.
    elapsed = pd.to_timedelta(range(n_rows), unit='s')
    df['RUNNING DURATION'] = elapsed
    df['RUNNING HOURS'] = (elapsed.total_seconds() / 3600).astype(np.float32)
    return df

In [None]:
# Load the pre-combined CSV rather than re-reading the raw files.
df = read_data(mode='processed')

### Inspect and cleanup

In [None]:
# Number of rows recorded for every (UNIT, TEST) run and for every
# (UNIT, TEST, STEP) step. A single groupby pass replaces re-filtering the
# whole frame with boolean masks for each combination.
# `sort=False` keeps groups in order of first appearance; only the means are
# used below, so the order of the lists does not affect the report.
# The row counts are reported as seconds, i.e. one row per second is assumed.
test_lengths = df.groupby(['UNIT', 'TEST'], sort=False).size().tolist()
step_lengths = df.groupby(['UNIT', 'TEST', 'STEP'],
                          sort=False).size().tolist()
mean_test_dur_sec = np.mean(test_lengths)
mean_step_dur_sec = np.mean(step_lengths)
print(
    f'Mean test duration {mean_test_dur_sec:.2f} seconds = '
    f'{mean_test_dur_sec/60:.2f} minutes = '
    f'{mean_test_dur_sec/3600:.2f} hours'
)
print(
    f'Mean step duration {mean_step_dur_sec:.2f} seconds = '
    f'{mean_step_dur_sec/60:.2f} minutes = '
    f'{mean_step_dur_sec/3600:.2f} hours'
)

In [None]:
def plot_feature(df, feature):
    """Show a wide line plot of one column against 'RUNNING HOURS'.

    Args:
        df: Frame containing a 'RUNNING HOURS' column and ``feature``.
        feature: Name of the column to draw on the y axis.
    """
    _, axis = plt.subplots(figsize=(20, 5), tight_layout=True)
    axis.plot(df['RUNNING HOURS'], df[feature])
    axis.set_xlabel('TIME, HOURS')
    axis.set_ylabel(feature)
    plt.show()


def plot_data(df):
    """Plot every column of ``df`` whose name does not contain 'RUNNING'.

    Columns with 'RUNNING' in their name are the time axes and are skipped.
    """
    for column in df.columns:
        if 'RUNNING' in column:
            continue
        plot_feature(df, column)

In [None]:
# plot_data(df[FEATURES_NO_TIME + ['RUNNING HOURS']])

### Feature engineering

In [None]:
# INITIAL_TREND_FEATURES = [
#     'M1 CURRENT', 'M1 TORQUE', 'PT4', 'D1 RPM', 'D1 CURRENT', 'D1 TORQUE',
#     'M2 RPM', 'M2 Amp', 'M2 Torque', 'CHARGE PT', 'CHARGE FLOW', 'M3 Amp',
#     'M3 Torque', 'Servo PT', 'SERVO FLOW', 'HSU IN', 'TT2', 'HSU OUT',
#     'M5 Amp', 'M5 Torque', 'M6 RPM', 'M6 Amp', 'M6 Torque', 'M7 RPM', 'M7 Amp',
#     'M7 Torque', 'Vibration 1', ' Vibration 2'
# ]

In [None]:
# ENGINEERED_FEATURES = [
#     'DRIVE POWER', 'LOAD POWER', 'CHARGE MECH POWER', 'CHARGE HYD POWER',
#     'SERVO MECH POWER', 'SERVO HYD POWER', 'SCAVENGE POWER',
#     'MAIN COOLER POWER', 'GEARBOX COOLER POWER'
# ]
# df['DRIVE POWER'] = (df['M1 SPEED'] * df['M1 TORQUE'] * np.pi / 30 /
#                      1e3).astype(np.float32)
# df['LOAD POWER'] = abs(df['D1 RPM'] * df['D1 TORQUE'] * np.pi / 30 /
#                        1e3).astype(np.float32)
# df['CHARGE MECH POWER'] = (df['M2 RPM'] * df['M2 Torque'] * np.pi / 30 /
#                            1e3).astype(np.float32)
# df['CHARGE HYD POWER'] = (df['CHARGE PT'] * 1e5 * df['CHARGE FLOW'] * 1e-3 /
#                           60 / 1e3).astype(np.float32)
# df['SERVO MECH POWER'] = (df['M3 RPM'] * df['M3 Torque'] * np.pi / 30 /
#                           1e3).astype(np.float32)
# df['SERVO HYD POWER'] = (df['Servo PT'] * 1e5 * df['SERVO FLOW'] * 1e-3 / 60 /
#                          1e3).astype(np.float32)
# df['SCAVENGE POWER'] = (df['M5 RPM'] * df['M5 Torque'] * np.pi / 30 /
#                         1e3).astype(np.float32)
# df['MAIN COOLER POWER'] = (df['M6 RPM'] * df['M6 Torque'] * np.pi / 30 /
#                            1e3).astype(np.float32)
# df['GEARBOX COOLER POWER'] = (df['M7 RPM'] * df['M7 Torque'] * np.pi / 30 /
#                               1e3).astype(np.float32)

In [None]:
# TREND_FEATURES = ENGINEERED_FEATURES + [
#     'PT4', 'HSU IN', 'TT2', 'HSU OUT', 'Vibration 1', ' Vibration 2'
# ]

Selecting the signal used for training: only the `Vibration 1` channel is kept and the rest of the dataframe is released to free memory.

In [None]:
# data = df[TREND_FEATURES + ['RUNNING HOURS']].copy(deep=True)
# raw_data = df[TREND_FEATURES].copy(deep=True)
# vibration = raw_data.pop('Vibration 1')
# Keep only the 'Vibration 1' column (as float32) and drop the reference to
# the full dataframe; `df` is no longer available after this cell.
vibration = df.pop('Vibration 1').astype(np.float32)
del df

In [None]:
# Standardise the signal to zero mean and unit standard deviation. The two
# statistics are kept so that other data can be scaled the same way.
vibration_mean, vibration_std = vibration.mean(), vibration.std()
train_vibration = vibration.sub(vibration_mean).div(vibration_std)

In [None]:
# Display the number of samples in the standardised signal.
train_vibration.values.shape

In [None]:
# Window length in samples used for every sequence fed to the model.
TIME_STEPS = 3600


def create_sequences(values, time_steps=TIME_STEPS):
    """Cut ``values`` into overlapping windows with a stride of one sample.

    Args:
        values: Array-like whose first axis is the sample axis.
        time_steps: Number of consecutive samples in each window.

    Returns:
        Array of shape ``(len(values) - time_steps + 1, time_steps, ...)``
        where window ``i`` holds ``values[i:i + time_steps]``. Every window
        is copied, so the result is roughly ``time_steps`` times larger than
        the input.

    Raises:
        ValueError: If ``time_steps`` is not positive or ``values`` holds
            fewer than ``time_steps`` samples.
    """
    values = np.asarray(values)
    n_windows = len(values) - time_steps + 1
    if time_steps < 1 or n_windows < 1:
        # np.stack on an empty list would fail with an unrelated message.
        raise ValueError(
            f'time_steps must be between 1 and len(values)={len(values)}, '
            f'got {time_steps}')
    return np.stack(
        [values[start:start + time_steps] for start in range(n_windows)])

In [None]:
# Reshape the signal to a single-feature column, then window it; the result
# has shape (windows, TIME_STEPS, 1).
x_train = create_sequences(train_vibration.values.reshape(-1, 1))
print(x_train.shape)

In [None]:
# LSTM autoencoder: the first LSTM compresses a window into a single vector,
# which is repeated for every time step and decoded back into a sequence of
# the same shape as the input.
# The window length and feature count are read from the training data so that
# the decoder always matches the windows that were actually built.
n_steps, n_features = x_train.shape[1], x_train.shape[2]
model = keras.models.Sequential([
    layers.LSTM(128, input_shape=(n_steps, n_features)),
    layers.Dropout(0.2),
    # Repeat the encoded vector once per time step of the window.
    layers.RepeatVector(n_steps),
    layers.LSTM(128, return_sequences=True),
    layers.Dropout(0.2),
    # Time distributed layer to get an output with right shape
    layers.TimeDistributed(layers.Dense(n_features))
])
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='mse')
model.summary()

In [None]:
# Train the autoencoder to reproduce its own input (input == target).
# NOTE(review): with epochs=5 and patience=5 the EarlyStopping callback does
# not appear able to trigger, because at most four epochs can follow the best
# one -- raise `epochs` or lower `patience` if early stopping is wanted.
# NOTE(review): validation_split holds out the trailing 10% of the windows;
# since consecutive windows overlap, the first validation windows share most
# of their samples with the last training windows.
history = model.fit(x_train,
                    x_train,
                    epochs=5,
                    batch_size=128,
                    validation_split=0.1,
                    callbacks=[
                        keras.callbacks.EarlyStopping(monitor="val_loss",
                                                      patience=5,
                                                      mode="min")
                    ])


In [None]:
# Plot the per-epoch training and validation loss recorded by model.fit.
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["val_loss"], label="Validation Loss")
plt.legend()
plt.show()

In [None]:
# Get train MAE loss.
# Reconstruct every training window and average the absolute error over
# axis 1 (the time-step axis), which leaves one error value per window and
# feature.
x_train_pred = model.predict(x_train)
train_mae_loss = np.mean(np.abs(x_train_pred - x_train), axis=1)

plt.hist(train_mae_loss, bins=50)
plt.xlabel("Train MAE loss")
plt.ylabel("No of samples")
plt.show()

# Get reconstruction loss threshold.
# The threshold is the largest reconstruction error observed on the training
# windows themselves.
threshold = np.max(train_mae_loss)
print("Reconstruction error threshold: ", threshold)

In [None]:
# Checking how the first sequence is learnt
plt.plot(x_train[2])
plt.plot(x_train_pred[2])
plt.show()