In [None]:
import pandas as pd

In [None]:
# Load the semicolon-separated CSV of readings.
data = pd.read_csv('../assets/trains/102.csv', sep=';')

# Parse the timestamp column and put the rows in chronological order.
timestamp_col = 'timestamps_UTC'
data[timestamp_col] = pd.to_datetime(data[timestamp_col])
data = data.sort_values(by=timestamp_col)

# Remove rows that follow a gap of more than 30 minutes.
# The first row has no predecessor, so its diff is NaT and every comparison
# with it is False. Testing "gap > 30 min" and negating the result keeps that
# first row (and rows exactly 30 minutes apart); a plain "diff < 30 min" test
# would silently discard them.
max_gap = pd.Timedelta(minutes=30)
data = data[~(data[timestamp_col].diff() > max_gap)]

In [None]:
# Pair every 'RS_E_InAirTemp_PC2' reading with its timestamp.
Timestamps = data['timestamps_UTC']
AirTemp = data['RS_E_InAirTemp_PC2']

# Keep only the samples at positions 2000..3999 of the timestamp-indexed series.
ts = pd.Series(AirTemp.to_numpy(), index=Timestamps).iloc[2000:4000]

# Min-max scaling: the smallest value maps to 0 and the largest to 1.
ts_floor = ts.min()
ts = (ts - ts_floor) / (ts.max() - ts_floor)

In [None]:
import tensorflow as tf
from keras.src.layers import LSTM, TimeDistributed, Dense
from keras.src.layers import Dropout
from keras.src.layers import RepeatVector
from keras import Sequential
import numpy as np
import pandas as pd

# Number of consecutive samples in each training window.
sequence_length = 30


def create_sequences(values, time_steps=sequence_length):
    """Cut ``values`` into overlapping windows of ``time_steps`` samples.

    Window ``i`` holds ``values[i : i + time_steps]`` for every ``i`` in
    ``range(len(values) - time_steps)``. The window that would end on the very
    last sample is not produced, so the result always contains exactly
    ``len(values) - time_steps`` windows.

    Args:
        values: Sequence of samples (list, NumPy array, pandas Series, ...).
            It is converted to a NumPy array once, so the windows are plain
            array slices regardless of the input container.
        time_steps: Length of each window.

    Returns:
        A NumPy array whose first axis indexes the windows and whose second
        axis indexes the samples inside a window.

    Raises:
        ValueError: If ``values`` does not contain more than ``time_steps``
            samples, i.e. no window can be built.
    """
    samples = np.asarray(values)
    n_windows = len(samples) - time_steps
    if n_windows <= 0:
        # np.stack([]) would fail with an unrelated-looking message; be explicit.
        raise ValueError(
            f"create_sequences needs more than {time_steps} samples to build a "
            f"window, got {len(samples)}"
        )
    return np.stack(
        [samples[start:start + time_steps] for start in range(n_windows)]
    )

# Create the training windows and add a trailing axis of size 1, giving a
# 3-D array of shape (windows, time_steps, 1).
x_train = create_sequences(ts)
x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))

# Fix the Python, NumPy and TensorFlow random seeds so that weight
# initialisation, dropout and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# Build the model: an LSTM autoencoder. The first LSTM compresses a window
# into a single vector, RepeatVector copies that vector once per time step and
# the second LSTM + TimeDistributed(Dense) rebuild the window from it.
model = Sequential([
    LSTM(128, input_shape=(x_train.shape[1], x_train.shape[2])),
    Dropout(0.2),
    RepeatVector(x_train.shape[1]),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    TimeDistributed(Dense(x_train.shape[2]))
])

model.compile(optimizer='adam', loss='mae')

# Train the model to reproduce its own input (input and target are identical).
model.fit(x_train, x_train, epochs=10, batch_size=32, validation_split=0.1)

# Anomaly detection: reconstruct every window and average the absolute error
# over the time axis, which leaves one error value per window.
x_train_pred = model.predict(x_train)
train_mae_loss = np.mean(np.abs(x_train_pred - x_train), axis=1)

In [None]:
# Flag every window whose reconstruction error is more than two standard
# deviations above the mean error. Comparing against 2 * std alone ignores the
# baseline error level, so that cut-off does not measure how far a window
# deviates from the typical error.
anomaly_threshold = np.mean(train_mae_loss) + 2 * np.std(train_mae_loss)
anomalies = train_mae_loss > anomaly_threshold

In [None]:
# Plot anomalies
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 5))

# All three curves share the timestamps that come after the first
# `sequence_length` samples.
tail = ts[sequence_length:]
for curve, curve_label in (
    (tail, 'AirTemp'),
    (train_mae_loss, 'Train MAE loss'),
    (anomalies, 'Anomaly'),
):
    plt.plot(tail.index, curve, label=curve_label)

plt.legend()


In [None]:
# Plot the per-window MAE loss against the window position.
loss_figure = plt.figure(figsize=(15, 5))
loss_axes = loss_figure.gca()
loss_axes.plot(train_mae_loss, label='Train MAE loss')

loss_axes.legend()


In [None]:
from processing.outliers.moving_average import moving_average
from processing.outliers.outliers import compute_outliers

# Smooth the series with a moving average and let the project helper plot the
# outliers it derives from the smoothed and raw signals.
window_size = 10
ts_ma = moving_average(ts, window_size)
compute_outliers(ts_ma, ts).plot()

# Absolute sample-to-sample change of (moving average - raw signal).
diff = (ts_ma - ts).diff().abs()

# Plot the original data and the moving average, along with the outliers.
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 5))
plt.plot(ts.index, ts, label='Original data')
plt.plot(ts_ma.index, ts_ma, label='Moving average (n=%d)' % window_size)

# A sample is marked as an outlier when that change is bigger than 0.3.
is_outlier = diff > 0.3
plt.plot(diff[is_outlier].index, ts[is_outlier], 'ro', label='Outliers')
plt.legend(loc='best')

# Outlier detection using ARIMA (continued in the cells below)

In [None]:
# Load the semicolon-separated CSV of readings.
data = pd.read_csv('../assets/trains/102.csv', sep=';')

# Parse the timestamp column and put the rows in chronological order.
timestamp_col = 'timestamps_UTC'
data[timestamp_col] = pd.to_datetime(data[timestamp_col])
data = data.sort_values(by=timestamp_col)

# Remove rows that follow a gap of more than 30 minutes.
# The first row has no predecessor, so its diff is NaT and every comparison
# with it is False. Testing "gap > 30 min" and negating the result keeps that
# first row (and rows exactly 30 minutes apart); a plain "diff < 30 min" test
# would silently discard them.
data = data[~(data[timestamp_col].diff() > pd.Timedelta(minutes=30))]

# Second copy of the data without the rows where 'RS_E_InAirTemp_PC2' is 0.
data_clean = data[data['RS_E_InAirTemp_PC2'] != 0]

AirTemp = data['RS_E_InAirTemp_PC2']
Timestamps = data['timestamps_UTC']

AirTemp_clean = data_clean['RS_E_InAirTemp_PC2']
Timestamps_clean = data_clean['timestamps_UTC']


def _scaled_head(values, timestamps, limit=100000):
    """Return the first ``limit`` samples as a timestamp-indexed, min-max scaled Series.

    The smallest value maps to 0 and the largest to 1. A constant series
    divides by zero and therefore comes back as NaN.
    """
    series = pd.Series(values.values, index=timestamps).iloc[:limit]
    lowest = series.min()
    return (series - lowest) / (series.max() - lowest)


# Raw series and zero-free series, each normalised on its own value range.
ts = _scaled_head(AirTemp, Timestamps)
ts_clean = _scaled_head(AirTemp_clean, Timestamps_clean)

In [None]:
# Import the library
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt

# One ARIMA model is fitted on the original series and one on the clean series.


def _fit_and_plot_residuals(series, order=(5, 1, 0)):
    """Fit an ARIMA model of the given order on ``series`` and plot its residuals.

    Returns the model, the fit result and the residuals wrapped in a
    single-column DataFrame (column label 0).
    """
    arima = ARIMA(series, order=order)
    fitted = arima.fit()
    resid_frame = pd.DataFrame(fitted.resid)
    resid_frame.plot(label='Residuals')
    plt.legend()
    return arima, fitted, resid_frame


# Original data first, then the clean data (same order of fits and plots).
model, model_fit, residuals = _fit_and_plot_residuals(ts)
model_clean, model_fit_clean, residuals_clean = _fit_and_plot_residuals(ts_clean)

In [None]:
# Residuals whose absolute value exceeds this threshold are treated as outliers.
residual_threshold = 0.3


def _plot_residual_outliers(series, resid_frame, series_label):
    """Plot ``series`` and mark the samples whose residual exceeds the threshold.

    The flagged samples are selected with a positional boolean mask rather
    than by looking their timestamps up in ``series``: a label lookup returns
    every row sharing a timestamp, so duplicated timestamps would make the x
    and y arrays passed to ``plt.plot`` differ in length.

    Assumes ``resid_frame`` has one row per sample of ``series``, in the same
    order (as for residuals of a model fitted on that series) -- TODO confirm
    if the residuals ever come from another source.

    Returns the rows of ``resid_frame`` that were flagged.
    """
    is_outlier = (resid_frame[0].abs() > residual_threshold).to_numpy()
    flagged = series[is_outlier]

    plt.figure(figsize=(15, 5))
    plt.plot(series.index, series, label=series_label)
    plt.plot(flagged.index, flagged, 'ro', label='Outliers')
    plt.legend(loc='best')
    plt.show()

    return resid_frame[is_outlier]


# Detect and plot the outliers for the original data, then for the clean data.
outliers = _plot_residual_outliers(ts, residuals, 'Original data')
outliers_clean = _plot_residual_outliers(ts_clean, residuals_clean, 'Clean data')
