In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import joblib
import os

from matplotlib import pyplot as plt
from keras.layers import Input, Dense, LSTM, TimeDistributed, RepeatVector
from keras.models import Model
from keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler

#### Declare global values

In [None]:
# Station whose yearly CSV files are read and whose artefacts are written.
station_location = "Beluru"

# Every path below hangs off the parent of the current working directory.
_parent_dir = os.path.dirname(os.getcwd())
directory = "{}/dataset/{}".format(_parent_dir, station_location)
dataStorage = "{}/train/{}/data/".format(_parent_dir, station_location)
result = "{}/train/{}/result_lstm/".format(_parent_dir, station_location)

# File-name suffixes selecting the yearly CSV used for testing / training.
testYear = "_%d" % 2018
trainYear = "_%d" % 2019

#### Plot the training losses

In [None]:
def plot_lossfunction(history):
    """Draw the training and validation loss curves on one figure.

    Parameters
    ----------
    history : mapping
        Must provide the keys 'loss' and 'val_loss', each a sequence with
        one value per epoch.
    """
    fig, ax = plt.subplots(figsize=(14, 6), dpi=80)

    # (history key, matplotlib format string, legend label)
    curves = (
        ('loss', 'b', 'Train'),
        ('val_loss', 'r', 'Validation'),
    )
    for key, fmt, label in curves:
        ax.plot(history[key], fmt, label=label, linewidth=2)

    ax.set_title('Model loss', fontsize=16)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss (mae)')
    ax.legend(loc='upper right')
    plt.show()

#### Frequencies of the healthy sensor signal

In [None]:
def plot_fft(data, title):
    """Plot the real part of the first column of an FFT result.

    Parameters
    ----------
    data : numpy.ndarray
        2-D complex array; only column 0 is plotted.
    title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(14, 6), dpi=80)
    # No `animated=True`: animated artists are skipped by regular drawing and
    # only show up when drawn explicitly (blitting), so on interactive
    # backends the curve would be missing from a static figure.
    ax.plot(data[:, 0].real, label='FFT', color='blue', linewidth=1)
    # Use the Axes API rather than pyplot's implicit "current axes".
    ax.legend(loc='lower left')
    ax.set_title(title, fontsize=16)
    plt.show()

#### Define the autoencoder network model

In [None]:
def autoencoder_model(X):
    """Build (but do not compile) an LSTM autoencoder shaped after `X`.

    Parameters
    ----------
    X : array with at least three dimensions
        Only ``X.shape[1]`` (sequence length) and ``X.shape[2]`` (feature
        count) are read; the values of `X` are not used.

    Returns
    -------
    keras Model
        Maps an input of shape ``(X.shape[1], X.shape[2])`` to an output of
        the same shape.
    """
    n_steps = X.shape[1]
    n_features = X.shape[2]

    inputs = Input(shape=(n_steps, n_features))

    # Encoder: 16 -> 4 units; the second LSTM returns only its final state.
    # Note the l2 factor is 0.00, i.e. the regulariser adds no penalty.
    encoded = LSTM(16, activation='relu', return_sequences=True,
                   kernel_regularizer=regularizers.l2(0.00))(inputs)
    encoded = LSTM(4, activation='relu', return_sequences=False)(encoded)

    # Repeat the encoded vector once per time step to feed the decoder.
    repeated = RepeatVector(n_steps)(encoded)

    # Decoder: mirror of the encoder, 4 -> 16 units.
    decoded = LSTM(4, activation='relu', return_sequences=True)(repeated)
    decoded = LSTM(16, activation='relu', return_sequences=True)(decoded)

    # One Dense projection per time step back to the feature count.
    output = TimeDistributed(Dense(n_features))(decoded)
    return Model(inputs=inputs, outputs=output)

In [None]:
def fillWithLine(y, spiketnt, timestamp, waterlevel):
    """Replace flagged readings with the nearest unflagged neighbour.

    Readings whose flag equals 1 are blanked, then filled from the last
    preceding unflagged reading (forward fill). Flagged readings at the very
    start, which have no predecessor, take the first following unflagged
    reading (backward fill). Despite the name, no linear interpolation is
    performed.

    Parameters
    ----------
    y : any
        Unused; kept so existing callers keep working.
    spiketnt : array-like
        One flag per reading; a value equal to 1 (or True) marks a spike.
    timestamp : any
        Unused for the result; kept so existing callers keep working.
    waterlevel : array-like
        The readings to rectify.

    Returns
    -------
    numpy.ndarray
        Rectified readings, same length as `waterlevel`. If every reading is
        flagged there is nothing to fill from and the result is all NaN.

    Raises
    ------
    ValueError
        If `spiketnt` and `waterlevel` differ in length.
    """
    readings = pd.Series(np.asarray(waterlevel))
    keep = np.asarray(spiketnt) != 1
    if keep.shape != readings.shape:
        raise ValueError(
            "spiketnt and waterlevel must have the same length, got "
            + str(keep.shape) + " and " + str(readings.shape)
        )

    # where() blanks the flagged positions; ffill()/bfill() replace the
    # deprecated fillna(method=...) spelling.
    return readings.where(keep).ffill().bfill().values

In [None]:
# def saveToExcelFile(df, time_name, water_name, filename):
# #     check if directory correct
#     filename = result + filename + "_result.csv"
    
#     if not os.path.exists( directory):
#         os.makedirs( directory)
        
#     df = df.rename_axis("timestamp")
#     df = df.rename(
#         columns={
#             time_name:"timestamp",
#             water_name:"waterlevel"
#         })
#     df.to_csv(filename)

In [None]:
# Plots a single series; called from data() below to show the cleaned readings.
def plotOriGraph(df_new, timestamp, waterlevel, waterlevel_flat, title):
    """Show one series as an interactive Plotly marker+line chart.

    Parameters
    ----------
    df_new : unused
    timestamp : sequence
        x values.
    waterlevel : sequence
        y values.
    waterlevel_flat : unused
    title : str
        Figure title.
    """
    fig = px.scatter(x=timestamp, y=waterlevel)
    fig.update_traces(mode='markers+lines')
    fig.update_xaxes(rangeslider_visible=True)

    layout = {
        "title": title,
        "xaxis": {"title": "timestamp"},
        "yaxis": {"title": "waterlevel"},
    }
    fig.update_layout(layout)
    fig.show()

In [None]:
def data(filename, water_name, time_name):
    """Load one CSV, clean it, plot it and return it indexed by time.

    Steps:
    - read only the `time_name` and `water_name` columns;
    - drop rows whose water level is NaN;
    - drop duplicate timestamps, keeping the last occurrence;
    - parse the timestamps (day first), use them as the index and sort by
      them so the index is monotonic;
    - plot the cleaned series via plotOriGraph.

    Parameters
    ----------
    filename : str
        Path of the CSV file.
    water_name : str
        Name of the water-level column.
    time_name : str
        Name of the timestamp column.

    Returns
    -------
    pandas.DataFrame
        The `water_name` column, indexed by the sorted DatetimeIndex built
        from `time_name`.
    """
    df = pd.read_csv(filename, usecols=[time_name, water_name])
    df_new = df[df[water_name].notna()]
    print("after droppping na: " + str(df_new.shape))

    # The files contain duplicate timestamps; keep the last reading of each.
    df_new = df_new.drop_duplicates(subset=time_name, keep='last', ignore_index=True)
    print("after droppping duplicates: " + str(df_new.shape))

    df_new[time_name] = pd.DatetimeIndex(df_new[time_name], dayfirst=True)
    df_new = df_new.set_index(time_name).sort_index()
    print("original size: " + str(df.shape))
    print("after sort index: " + str(df_new.shape))

    # String labels for the plot's x axis. The previous format "%D-M%-Y%" was
    # not a day-month-year pattern: "%D" and "%-Y" are platform-specific
    # directives and "M" was emitted literally. The time of day is included
    # so that several readings on the same day keep distinct labels.
    timestamp = df_new.index.strftime("%d-%m-%Y %H:%M:%S")
    waterlevel = df_new[water_name].values
    print(timestamp.shape)

    plotOriGraph(df_new, timestamp, waterlevel, None, "Original")

    return df_new

In [None]:
def plotGraph(df_new, timestamp, waterlevel, waterlevel_flat, title):
    """Overlay the original and a processed series on one Plotly figure.

    Parameters
    ----------
    df_new : unused
    timestamp : sequence
        Shared x values.
    waterlevel : sequence
        y values of the trace named 'Original'.
    waterlevel_flat : sequence
        y values of the second trace, which is named after `title`.
    title : str
        Figure title and name of the second trace.
    """
    fig = go.Figure()

    # (trace name, y values) in drawing order.
    series = (
        ('Original', waterlevel),
        (title, waterlevel_flat),
    )
    for trace_name, values in series:
        trace = go.Scatter(x=timestamp, y=values,
                           mode='lines+markers', name=trace_name)
        fig.add_trace(trace)

    fig.update_xaxes(rangeslider_visible=True)
    fig.update_layout(
        title=title,
        xaxis={"title": "timestamp"},
        yaxis={"title": "waterlevel"},
    )
    fig.show()

#### Train & Test

In [None]:
# Yearly CSV used as the test set (original note: "the one mad cannot detect").
filename_test = "{}/{}{}.csv".format(directory, station_location, testYear)
test_data = data(filename_test, water_name='actual_reading', time_name='timestamp')

In [None]:
# Yearly CSV used as the training set (original note: "the one mad cannot detect").
filename = "{}/{}{}.csv".format(directory, station_location, trainYear)
train_data = data(filename, water_name='actual_reading', time_name='timestamp')

In [None]:
# Bare last expression: the notebook renders the frame as a rich table,
# as the later `.head()` cells in this notebook already do.
test_data.head()

In [None]:
# Bare last expression: the notebook renders the frame as a rich table,
# as the later `.head()` cells in this notebook already do.
train_data.head()

#### Transforming data from the time domain to the frequency domain using fast Fourier transform

In [None]:
# np.fft.fft transforms along the LAST axis by default. The frames returned
# by data() hold a single column, so the default would run a length-1 FFT on
# every row and hand back the input unchanged. axis=0 transforms along the
# rows (the time axis) instead; the result keeps the (n, 1) shape that
# plot_fft indexes with [:, 0].
train_fft = np.fft.fft(train_data.to_numpy(), axis=0)
test_fft = np.fft.fft(test_data.to_numpy(), axis=0)

plot_fft(train_fft, "Training")

plot_fft(test_fft, "Testing")

#### Normalize the data

In [None]:
# Learn the min/max from the training data only, then apply the same
# scaling to both sets.
scaler = MinMaxScaler()
scaler.fit(train_data)
X_train = scaler.transform(train_data)
X_test = scaler.transform(test_data)

#### Reshape inputs for LSTM 
- samples, timesteps, features

In [None]:
# Insert a length-1 middle axis: (rows, features) -> (rows, 1, features).
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
print("Training data shape:", X_train.shape)

X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
print("Test data shape:", X_test.shape)

#### Create the autoencoder model

In [None]:
# Build the autoencoder from the training array's shape, then compile it
# with mean absolute error as the reconstruction loss.
model = autoencoder_model(X_train)
model.compile(loss='mae', optimizer='adam')
model.summary()

#### Fit the model to the data

In [None]:
nb_epochs = 100
batch_size = 10

# Stop when val_loss has not improved by at least 1e-3 for 5 epochs and
# roll back to the best weights seen.
monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-3,
    patience=5,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

# Autoencoder: the input is also the target. 5% of the training samples are
# held out for validation.
fit_result = model.fit(
    X_train, X_train,
    epochs=nb_epochs,
    batch_size=batch_size,
    validation_split=0.05,
    callbacks=[monitor],
)
history = fit_result.history

plot_lossfunction(history)

#### Plot the loss distribution of the training set

In [None]:
# Reconstruct the training set and drop the middle axis so the result lines
# up with train_data row for row.
train_reconstruction = model.predict(X_train)
X_pred = pd.DataFrame(
    train_reconstruction.reshape(train_reconstruction.shape[0],
                                 train_reconstruction.shape[2]),
    columns=train_data.columns,
    index=train_data.index,
)
X_pred.head()

In [None]:
scored_train = pd.DataFrame(index=train_data.index)

# Per-row mean absolute reconstruction error on the training set.
Xtrain = X_train.reshape(X_train.shape[0], X_train.shape[2])
scored_train['Loss_mae'] = np.mean(np.abs(X_pred - Xtrain), axis=1)
# Fixed anomaly cut-off; the test-set scoring cell uses the same value.
scored_train['Threshold'] = 0.008
scored_train['Anomaly'] = scored_train['Loss_mae'] > scored_train['Threshold']

# sns.distplot is deprecated; histplot with stat='density' plus kde=True
# draws a density-normalised histogram with a KDE overlay in its place.
fig, ax = plt.subplots(figsize=(16, 9), dpi=80)
ax.set_title('Loss Distribution', fontsize=16)
sns.histplot(scored_train['Loss_mae'], bins=20, kde=True, stat='density',
             color='blue', ax=ax);

#### Calculate the loss on the test set

In [None]:
# Reconstruct the test set and drop the middle axis so the result lines up
# with test_data row for row.
test_reconstruction = model.predict(X_test)
X_pred = pd.DataFrame(
    test_reconstruction.reshape(test_reconstruction.shape[0],
                                test_reconstruction.shape[2]),
    columns=test_data.columns,
    index=test_data.index,
)
X_pred.head()

In [None]:
# Per-row mean absolute reconstruction error on the test set.
Xtest = X_test.reshape(X_test.shape[0], X_test.shape[2])
test_loss = (X_pred - Xtest).abs().mean(axis=1)

scored_test = pd.DataFrame(index=test_data.index)
scored_test['Loss_mae'] = test_loss
# Fixed anomaly cut-off.
scored_test['Threshold'] = 0.008
scored_test['Anomaly'] = scored_test['Threshold'] < scored_test['Loss_mae']
scored_test.head()

#### Merge both test and train data in a single dataframe for plotting

In [None]:
scoredBoth = pd.concat([scored_train, scored_test])

# Reconstruction loss against the anomaly threshold over time (log y axis).
# - Only the two numeric columns are plotted, so the two colours map
#   unambiguously to Loss_mae (blue) and Threshold (red).
# - The rows are concatenated train-then-test, which need not be
#   chronological; sorting avoids a line that runs backwards in time.
# - The lower y limit sits below the 0.008 threshold; with the previous
#   limit of 1e-2 the threshold line fell outside the visible range.
scoredBoth[['Loss_mae', 'Threshold']].sort_index().plot(
    logy=True, figsize=(16, 9), ylim=[1e-4, 1e2], color=['blue', 'red']);

#### Save the result

In [None]:
# Take the flags straight from the test-set scores. Going through the
# train+test concatenation only works while the two indexes never share a
# timestamp, and buys nothing because only test rows are needed here.
test_data['anomalies'] = scored_test['Anomaly']

# A short summary instead of printing the whole boolean Series.
print("anomalies flagged:", int(test_data['anomalies'].sum()), "of", len(test_data))

# Replace flagged readings with neighbouring unflagged ones.
# NOTE: this cell adds columns to test_data in place, so earlier cells that
# consume test_data (scaling, prediction) must not be re-run after it without
# reloading the data first.
test_data['rectified'] = fillWithLine(
    test_data,
    test_data['anomalies'].values,
    test_data.index,
    test_data['actual_reading'].values,
)
test_data.head()

In [None]:
# Original readings overlaid with the rectified series.
plotGraph(
    df_new=test_data,
    timestamp=test_data.index,
    waterlevel=test_data['actual_reading'].values,
    waterlevel_flat=test_data['rectified'].values,
    title="Rectified",
)

In [None]:
# Neither joblib.dump nor model.save is relied on to create the target
# directory, so make sure it exists before writing.
os.makedirs(dataStorage, exist_ok=True)

# Persist the fitted scaler next to the trained model so both can be
# reloaded together.
scaler_filename = dataStorage + "scaler_data_" + station_location + trainYear
joblib.dump(scaler, scaler_filename)
model.save(dataStorage + station_location + trainYear + ".h5")

In [None]:
# for i in range(2016,2022):
# csvName = station_location + trainYear
# saveToExcelFile(test_data,"timestamp","actual_reading", csvName)