In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import warnings
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import class_weight, shuffle
from sklearn.model_selection import train_test_split


import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding, concatenate, Reshape, Flatten, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

warnings.filterwarnings("ignore")
BATCH_SIZE = 32
EPOCHS = 300
ec = LabelEncoder() # encode the categorical status
sc = StandardScaler() # scale the whole dataset to make the model learn faster

class status_LSTM:
    def __init__(self, eq_id, hour_horizontal, hour_vertical):
        self.eq_id = eq_id
        self.status_table = self.query_status()
        self.hour_horizontal = hour_horizontal
        self.hour_vertical = hour_vertical
        
        self.start_date = self.status_table.iloc[0]["TIMESTAMP_START"].date() + timedelta(days=1) # add one day to make it start from 00:00:00
        self.end_date = self.status_table.iloc[len(self.status_table)-1]["TIMESTAMP_START"].date()
        self.important_state_name = ["Break down Maintenance", "Utility Problem", "Maintenance", "IT Problem", \
                                    "Waiting For Spares", "MOTOR ERROR", "Half Yearly PM", "IT Maintenance", \
                                    "Machine Failure", "HANG UP", "Waiting For Repair", "BAD WEDGE FORM", "INDEXER PROBLEM",\
                                    "PC Buyoff Failed", "SHORT TAIL", "WIRE BREAK"]
        
        self.timeframe_table = self.generate_time(self.start_date.strftime("%d/%m/%Y"), self.end_date.strftime("%d/%m/%Y"), \
                                                  self.hour_horizontal, self.hour_vertical)

        self.impt_state_seq, self.impt_duration_seq = self.status_sequence(self.timeframe_table, self.status_table, important="YES")
        self.major_down_arr = self.major_down(self.timeframe_table, self.status_table, 6, 3600)
         
    def query_status(self):
            try:
                oracle_string = "oracle+cx_oracle://{username}:{password}@{hostname}:{port}/{database}"
                engine = create_engine(
                    oracle_string.format(
                        username = 'TFM4CEBERUS',
                        password = 'TFM4CEBERUS',
                        hostname = 'ome-db.bth.infineon.com',
                        port = '1538',
                        database = 'ome'
                        )
                    )
            except Exception as e:
                print(str(e))

            query = f"""select EQ_ID, TIMESTAMP_START, TIMESTAMP_END, DURATION, STATE_NAME, LEVEL3_NAME, LEVEL3 
                    from (SELECT
                      eq.eq_id, eq.name, eq.eq_type_ident
                    , data.timestamp_start,data.timestamp_end
                    , ROUND((data.timestamp_end - data.timestamp_start)*24*60*60,0) AS Duration
                    , data.tr25_3_status,data.tr25_4_status,data.tr25_5_status,data.eq_status
                    , level5s.state_name
                    , level5.state_name Level5_Name, level5.state_sign Level5
                    , level4.state_name Level4_Name, level4.state_sign Level4
                    , level3.state_name Level3_Name, level3.state_sign Level3
                    ,mh.device
                    ,mh.package,
                    mh.lotid as lot,
                    mh.product,
                    mh.operation

                    FROM OMEDATA.EQUIPMENT_STATE_HISTORY data
                    , OMEADMIN.EQUIPMENT_INSTANCES eq
                    , V_EQ_STATES level5s
                    , OMEADMIN.DEF_STANDARD_STATEMODEL level5
                    , OMEADMIN.DEF_STANDARD_STATEMODEL level4
                    , OMEADMIN.DEF_STANDARD_STATEMODEL level3
                    , OMEDATA.METAKEY_HISTORY mh

                    WHERE data.eq_ident  = eq.eq_ident
                    AND  data.eq_status = level5s.state_ident(+)
                    AND level5.state_ident = data.tr25_5_status
                    AND level4.state_ident = data.tr25_4_status
                    AND level3.state_ident = data.tr25_3_status
                    AND  data.metakey_ident =mh.ident(+)
                    and data.timestamp_start > sysdate - 1050)
                    where eq_id = '{self.eq_id}'
                    ORDER BY TIMESTAMP_START"""

            status = pd.read_sql(query, engine)
            status.columns = map(lambda x: str(x).upper(), status.columns) 

            return status

    def generate_time(self, start_date:str, end_date:str, hours_row:int, hour:int):
        start = datetime.strptime(start_date, '%d/%m/%Y')
        end = datetime.strptime(end_date, '%d/%m/%Y')

        dates = []
        while start+timedelta(hours=hours_row)<=end:
            row = [start, start+timedelta(hours=hours_row)]
            dates.append(row)
            start += timedelta(hours=hour)

        return pd.DataFrame(dates, columns=['TIMESTAMP_START', 'TIMESTAMP_END'])


    def major_down(self, input_df, status_table, hour, threshold):
            hour = pd.Timedelta(hours=hour)
            major_down = []
            
            # timeframe table must be a subset of the status table to correctly determine major down
            if status_table.iloc[0]["TIMESTAMP_START"] >= input_df.iloc[0]["TIMESTAMP_START"]:
                raise Exception("Timeframe table must be a subset of the status table")
            if status_table.iloc[len(status_table)-1]["TIMESTAMP_START"] <= input_df.iloc[len(input_df)-1]["TIMESTAMP_START"]:
                raise Exception("Timeframe table must be a subset of the status table")

            for idx, row in input_df.iterrows():
                start = row['TIMESTAMP_END']
                end = start+hour
                frame = status_table[(status_table['TIMESTAMP_START']>start) & (status_table['TIMESTAMP_START']<end)]
                UD = frame.loc[frame['LEVEL3']=='UDT']

                if len(UD) == 0: #no record within this 6 hours:
                    major_down.append(0)
                else:
                    time_diff = (UD['TIMESTAMP_END']-UD['TIMESTAMP_START']).dt.seconds
                    if any(time_diff>threshold):
                        major_down.append(1)
                    else:
                        major_down.append(0)
            return major_down

    def status_sequence(self, input_table, status_table, important=None):
        status_seq = []
        duration_seq = []
        
        # validation check
        if status_table.iloc[0]["TIMESTAMP_START"] > input_table.iloc[0]["TIMESTAMP_START"]:
            raise Exception("Timeframe table must be a subset of the status table")
        if status_table.iloc[len(status_table)-1]["TIMESTAMP_START"] <= input_table.iloc[len(input_table)-1]["TIMESTAMP_START"]:
                raise Exception("Timeframe table must be a subset of the status table")
        
        for idx, row in input_table.iterrows():
            start = row["TIMESTAMP_START"]
            end = row["TIMESTAMP_END"]
            
            condition = (status_table["TIMESTAMP_START"]>=start) & (status_table["TIMESTAMP_START"]<=end)
            if important == "YES":
                condition = (status_table["TIMESTAMP_START"]>=start) & (status_table["TIMESTAMP_START"]<=end) & \
                                (status_table.STATE_NAME.isin(self.important_state_name))
            
            table = status_table[condition]
            status_seq.append(table["STATE_NAME"].values)
            duration_seq.append(table["DURATION"].values)

        return status_seq, duration_seq

    def preprocess(self, status_seq, duration_seq):
        X_seq = []
        for ele, dur in zip(status_seq, duration_seq):
            tmp = []
            for idx in range(len(ele)):
                tmp.append([ele[idx], int(dur[idx])])
            X_seq.append(tmp)
        return np.array(X_seq)

# Training without excluding unimportant state name
##### Skipped for now, training took too long (one epoch 30 mins!) as the padded time stamp contains too many status values 

In [2]:
# hour = 24
# start = datetime.now()
# print(f"Training by looking back {hour} hours of alarm data")
# wba124 = status_LSTM("WBA124", hour, 3)

# # pad the alarm to train on LSTM
# unpadded_status_arr = wba124.encoded_status_seq
# unpadded_duration_arr = wba124.duration_seq

# padded_status_arr = np.zeros([len(unpadded_status_arr),len(max(unpadded_status_arr,key = lambda x: len(x)))])
# padded_duration_arr = np.zeros([len(unpadded_duration_arr),len(max(unpadded_duration_arr,key = lambda x: len(x)))])
# for i,j in enumerate(unpadded_status_arr):
#     padded_status_arr[i][0:len(j)] = j
#     padded_duration_arr[i][0:len(j)] = unpadded_duration_arr[i]

# # standard scale for the model to learn faster
# padded_X_seq = wba124.preprocess(padded_status_arr, padded_duration_arr)
# for i in range(padded_X_seq.shape[1]):
#     padded_X_seq[:, i, :] = sc.fit_transform(padded_X_seq[:, i, :])

# #train_val_test split
# val_percentage = 0.2
# test_percentage = 0.1

# test_index = int(len(padded_X_seq) * (1-test_percentage))
# val_index = int(len(padded_X_seq) * (1- val_percentage - test_percentage))

# X_train_seq, X_val_seq, X_test_seq = padded_X_seq[:val_index], padded_X_seq[val_index:test_index], padded_X_seq[test_index:]
# y_train_seq, y_val_seq, y_test_seq = wba124.major_down_arr[:val_index], wba124.major_down_arr[val_index:test_index], wba124.major_down_arr[test_index:]

# X_train_seq = X_train_seq.reshape(X_train_seq.shape[0], X_train_seq.shape[1], 2)
# X_val_seq = X_val_seq.reshape(X_val_seq.shape[0], X_val_seq.shape[1], 2)
# X_test_seq = X_test_seq.reshape(X_test_seq.shape[0], X_test_seq.shape[1], 2)

In [3]:
# ## Training took too long, one epoch 30 MINS!!! ##
# seq_result = {}

# class_weights = class_weight.compute_class_weight('balanced',
#                                              np.unique(y_train_seq),
#                                              y_train_seq)
# class_weights_dict = dict(enumerate(class_weights))

# #need to reinitialize the model because x_train_seq changes in shape
# model = Sequential()
# model.add(LSTM(128, input_shape=(X_train_seq.shape[1:]), return_sequences=True))
# model.add(Dropout(0.5))

# model.add(LSTM(128, input_shape=(X_train_seq.shape[1:])))
# model.add(Dropout(0.5))

# model.add(Dense(32, activation = 'relu'))
# model.add(Dropout(0.2))

# model.add(Dense(2, activation = 'softmax'))

# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# history = model.fit(np.array(X_train_seq), np.array(y_train_seq), 
#                 batch_size=BATCH_SIZE, epochs=EPOCHS, 
#                 validation_data=(np.array(X_val_seq), np.array(y_val_seq)),
#                 class_weight = class_weights_dict)

# evaluate = model.evaluate(np.array(X_test_seq), np.array(y_test_seq)) #loss, mse

# seq_result[hour] = evaluate
# end = datetime.now()
# time = end - start
# print(f"Training took a total of {time.seconds} seconds")

# Training with only important state name and duration
## Padding the sequence with max length

In [5]:
seq_result = {}
# lookback = [24,48,72,96]

# for hour in lookback:
hour = 12
start = datetime.now()
print(f"Training by looking back {hour} hours of important STATE NAME data")
wba124 = status_LSTM("WBA124", hour, 3)

# pad the alarm to train on LSTM
unpadded_status_arr = wba124.encoded_impt_state_seq
unpadded_duration_arr = wba124.impt_duration_seq

padded_status_arr = np.zeros([len(unpadded_status_arr),len(max(unpadded_status_arr,key = lambda x: len(x)))])
padded_duration_arr = np.zeros([len(unpadded_duration_arr),len(max(unpadded_duration_arr,key = lambda x: len(x)))])
for i,j in enumerate(unpadded_status_arr):
    padded_status_arr[i][0:len(j)] = j
    padded_duration_arr[i][0:len(j)] = unpadded_duration_arr[i]

# standard scale for the model to learn faster
padded_X_seq = wba124.preprocess(padded_status_arr, padded_duration_arr)
for i in range(padded_X_seq.shape[1]):
    padded_X_seq[:, i, :] = sc.fit_transform(padded_X_seq[:, i, :])

#train_val_test split
val_percentage = 0.2
test_percentage = 0.1

test_index = int(len(padded_X_seq) * (1-test_percentage))
val_index = int(len(padded_X_seq) * (1- val_percentage - test_percentage))

X_train_seq, X_val_seq, X_test_seq = padded_X_seq[:val_index], padded_X_seq[val_index:test_index], padded_X_seq[test_index:]
y_train_seq, y_val_seq, y_test_seq = wba124.major_down_arr[:val_index], wba124.major_down_arr[val_index:test_index], wba124.major_down_arr[test_index:]

X_train_seq = X_train_seq.reshape(X_train_seq.shape[0], X_train_seq.shape[1], 2)
X_val_seq = X_val_seq.reshape(X_val_seq.shape[0], X_val_seq.shape[1], 2)
X_test_seq = X_test_seq.reshape(X_test_seq.shape[0], X_test_seq.shape[1], 2)

class_weights = class_weight.compute_class_weight('balanced',
                                             np.unique(y_train_seq),
                                             y_train_seq)
class_weights_dict = dict(enumerate(class_weights))

#need to reinitialize the model because x_train_seq changes in shape
model = Sequential()
model.add(LSTM(256, input_shape=(X_train_seq.shape[1:]), return_sequences=True))
model.add(Dropout(0.5))

model.add(LSTM(256, input_shape=(X_train_seq.shape[1:])))
model.add(Dropout(0.5))

model.add(Dense(32, activation = 'relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation = 'softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(np.array(X_train_seq), np.array(y_train_seq), 
                batch_size=BATCH_SIZE, epochs=EPOCHS, 
                validation_data=(np.array(X_val_seq), np.array(y_val_seq)),
                class_weight = class_weights_dict)

evaluate = model.evaluate(np.array(X_test_seq), np.array(y_test_seq)) #loss, mse

seq_result[hour] = evaluate
end = datetime.now()
time = end - start
print(f"Training took a total of {time.seconds} seconds")

Training by looking back 12 hours of important STATE NAME data
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/1

In [6]:
seq_result

{12: [0.6964600682258606, 0.07508939504623413]}

In [10]:
X_train_seq.shape # this is padded with the max length of all sequences

(5872, 127, 2)

# Training with only important state name and duration
## Padding the sequence with the average length
#### https://towardsdatascience.com/using-tensorflow-ragged-tensors-2af07849a7bd
#### Apparently can also greatly help boost accracy

In [23]:
seq_result = {}

lookback = [12,24,48,72,96]
for hour in lookback:
    start = datetime.now()
    print(f"Training by looking back {hour} hours of important STATE NAME data")
    wba124 = status_LSTM("WBA124", hour, 3)

    # pad the alarm to train on LSTM
    unpadded_status_arr = wba124.encoded_impt_state_seq
    unpadded_duration_arr = wba124.impt_duration_seq

    mean_length = int(np.mean([len(x) for x in unpadded_status_arr]))
    padded_status_arr = np.zeros([len(unpadded_status_arr), mean_length])
    padded_duration_arr = np.zeros([len(unpadded_duration_arr), mean_length])

    for i,j in enumerate(unpadded_status_arr):
        padded_status_arr[i][0:len(j)] = j[:mean_length]
        padded_duration_arr[i][0:len(unpadded_duration_arr[i])] = unpadded_duration_arr[i][:mean_length]

    # standard scale for the model to learn faster
    padded_X_seq = wba124.preprocess(padded_status_arr, padded_duration_arr)
    for i in range(padded_X_seq.shape[1]):
        padded_X_seq[:, i, :] = sc.fit_transform(padded_X_seq[:, i, :])

    #train_val_test split
    val_percentage = 0.2
    test_percentage = 0.1

    test_index = int(len(padded_X_seq) * (1-test_percentage))
    val_index = int(len(padded_X_seq) * (1- val_percentage - test_percentage))

    X_train_seq, X_val_seq, X_test_seq = padded_X_seq[:val_index], padded_X_seq[val_index:test_index], padded_X_seq[test_index:]
    y_train_seq, y_val_seq, y_test_seq = wba124.major_down_arr[:val_index], wba124.major_down_arr[val_index:test_index], wba124.major_down_arr[test_index:]

    X_train_seq = X_train_seq.reshape(X_train_seq.shape[0], X_train_seq.shape[1], 2)
    X_val_seq = X_val_seq.reshape(X_val_seq.shape[0], X_val_seq.shape[1], 2)
    X_test_seq = X_test_seq.reshape(X_test_seq.shape[0], X_test_seq.shape[1], 2)

    class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train_seq),
                                                 y_train_seq)
    class_weights_dict = dict(enumerate(class_weights))

    #need to reinitialize the model because x_train_seq changes in shape
    model = Sequential()
    model.add(LSTM(256, input_shape=(X_train_seq.shape[1:]), return_sequences=True))
    model.add(Dropout(0.5))

    model.add(LSTM(256, input_shape=(X_train_seq.shape[1:])))
    model.add(Dropout(0.5))

    model.add(Dense(32, activation = 'relu'))
    model.add(Dropout(0.2))

    model.add(Dense(2, activation = 'softmax'))

    callbacks = [EarlyStopping(monitor='val_accuracy', mode='max', patience=5)]

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(np.array(X_train_seq), np.array(y_train_seq), 
                    batch_size=BATCH_SIZE, epochs=EPOCHS, 
                    validation_data=(np.array(X_val_seq), np.array(y_val_seq)),
                    class_weight = class_weights_dict, callbacks=callbacks)

    evaluate = model.evaluate(np.array(X_test_seq), np.array(y_test_seq)) #loss, mse

    seq_result[hour, mean_length] = evaluate
    end = datetime.now()
    time = end - start
    print(f"Training took a total of {time.seconds} seconds")

Training by looking back 12 hours of important STATE NAME data
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Training took a total of 214 seconds
Training by looking back 24 hours of important STATE NAME data
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Training took a total of 191 seconds
Training by looking back 48 hours of important STATE NAME data
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Training took a total of 257 seconds
Training by looking back 72 hours of important STATE NAME data
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Training

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training took a total of 278 seconds


In [24]:
seq_result

{(12, 6): [0.6557784676551819, 0.6348448395729065],
 (24, 12): [0.6868782043457031, 0.5519713163375854],
 (48, 24): [0.7382815480232239, 0.4169653654098511],
 (72, 37): [0.6986674070358276, 0.5454545617103577],
 (96, 49): [0.7332104444503784, 0.3353293538093567]}

# Training with Embedding Layer, multiple inputs

### one categorical variable + one numerical variable
#### model overfits very badly, introduce some callbacks and Dropout
#### TODO: check if the target value is computed correctly, values looks suspicious

In [2]:
def label_encode(statename_seq): # do this the manual way as we are not certain if sklearn LabelEncoder can handle 3D array
    all_unique_statename = [set(ele) for ele in statename_seq]
    unique_statenames = set()
    for ele in all_unique_statename:
        unique_statenames |= ele
    
    enc_label = 1  #start encoding from 1 as we have to pad the sequence with 0
    mapping_dict = {}
    for ele in unique_statenames:
        mapping_dict[ele] = enc_label
        enc_label += 1

        enc_array = []
        
    #X_seq is a 3D array
    for timestamp in statename_seq:
        tmp_arr = []
        for ele in timestamp:
            tmp_arr.append(mapping_dict[ele])
        enc_array.append(np.array(tmp_arr))

    return np.array(enc_array), len(unique_statenames)+1

In [9]:
def remove_empty_datapoint(impt_state_seq, impt_duration_seq, major_down_arr):
    print(f"Remove {len([ele for ele in impt_state_seq if len(ele)==0])} rows on WBA124 out of {len(impt_state_seq)} as it has no data")
    bool_arr = [len(ele)==0 for ele in impt_state_seq] #this is to find the index to remove for bot y array and X_seq
    idx_remove = np.where(bool_arr)[0]
    
    major_down_arr = np.delete(np.array(major_down_arr), idx_remove) # remove the corresponding y value as well
    impt_state_seq = np.delete(impt_state_seq, idx_remove) #remove rows with no state name
    impt_duration_seq = np.delete(impt_duration_seq, idx_remove)
    return impt_state_seq, impt_duration_seq, major_down_arr

In [10]:
def randomly_select_negative(X_seq, X_dur, major_down_arr, ratio):
    bool_arr = [ele==0 for ele in major_down_arr]
    major_arr = np.array(major_down_arr)[np.where(bool_arr)[0]]
    print(len(major_arr), len(major_down_arr))
    negative = X_seq[np.where(bool_arr)[0]]
    duration = X_dur[np.where(bool_arr)[0]]
    
    positive_bool_arr = [ele==1 for ele in major_down_arr]
    positive_major_arr = np.array(major_down_arr)[np.where(positive_bool_arr)[0]]
    print(len(positive_major_arr))
    positive = X_seq[np.where(positive_bool_arr)[0]]
    positive_dur = X_dur[np.where(positive_bool_arr)[0]]
    
    discard, keep, dur_discard, dur_keep, target_discard, target_keep = train_test_split(negative, duration, major_arr, test_size=ratio)
    
    handpicked = np.concatenate((positive, keep))
    dur = np.concatenate((positive_dur, dur_keep))
    target = np.concatenate((positive_major_arr, target_keep))
    
    if len(handpicked) != len(dur) or len(handpicked) != len(target):
        raise Exception("Length of training inputs are different")
    
    return handpicked, dur, target

In [11]:
seq_result = {}
lookback = [12, 24, 48, 72]
count = 0
mode = "max"
for hour in lookback:
    if count  == 0:
        monitor = "val_recall"
    else:
        monitor = f"val_recall_{count}"
    
    count += 1
    #NEW MACHINES WITH LITTLE DATA: 120, 121 122, 125
    #CAN USE 123, 124, 126, 127, 128, 129, 130, 131, 132, 133
    wba123 = status_LSTM("WBA123", hour, 3)
    wba124 = status_LSTM("WBA124", hour, 3)
    wba126 = status_LSTM("WBA126", hour, 3)
    wba127 = status_LSTM("WBA127", hour, 3)
    wba128 = status_LSTM("WBA128", hour, 3)
    
    #remove data points
    wba123.impt_state_seq, wba123.impt_duration_seq, wba123.major_down_arr = remove_empty_datapoint(wba123.impt_state_seq, wba123.impt_duration_seq, wba123.major_down_arr)
    wba124.impt_state_seq, wba124.impt_duration_seq, wba124.major_down_arr = remove_empty_datapoint(wba124.impt_state_seq, wba124.impt_duration_seq, wba124.major_down_arr)
    wba126.impt_state_seq, wba126.impt_duration_seq, wba126.major_down_arr = remove_empty_datapoint(wba126.impt_state_seq, wba126.impt_duration_seq, wba126.major_down_arr)
    wba127.impt_state_seq, wba127.impt_duration_seq, wba127.major_down_arr = remove_empty_datapoint(wba127.impt_state_seq, wba127.impt_duration_seq, wba127.major_down_arr)
    wba128.impt_state_seq, wba128.impt_duration_seq, wba128.major_down_arr = remove_empty_datapoint(wba128.impt_state_seq, wba128.impt_duration_seq, wba128.major_down_arr)

    start = datetime.now()
    tmp1 = np.concatenate((wba123.impt_state_seq, wba124.impt_state_seq, wba126.impt_state_seq, wba127.impt_state_seq, wba128.impt_state_seq))
    encoded_X_seq, n_statename = label_encode(tmp1)
    numerical_X_seq = np.concatenate((wba123.impt_duration_seq, wba124.impt_duration_seq, wba126.impt_duration_seq, wba127.impt_duration_seq, wba128.impt_duration_seq,)) # one numerical variable (duration associated with the statename)
    target = np.concatenate((wba123.major_down_arr, wba124.major_down_arr, wba126.major_down_arr, wba127.major_down_arr, wba128.major_down_arr, ))
    
    # downsample negative data
    encoded_X_seq, numerical_X_seq, target = randomly_select_negative(encoded_X_seq, numerical_X_seq,  target, 0.2)
    
    #shuffle the X and Y values to make generalize better
    shuffled = shuffle(encoded_X_seq, numerical_X_seq, target)
    encoded_X_seq = shuffled[0]
    numerical_X_seq = shuffled[1]
    target = shuffled[2]
    
    # padding to average length
    mean_length = int(np.mean([len(x) for x in encoded_X_seq]))
    padded_statename = np.zeros([len(encoded_X_seq), mean_length])
    padded_duration = np.zeros([len(encoded_X_seq), mean_length])
    for i,j in enumerate(encoded_X_seq):
        padded_statename[i][0:len(j)] = j[:mean_length]
        padded_duration[i][0:len(j)] = numerical_X_seq[i][:mean_length]

    #train_val_test split
    X_train_statename_seq, X_val_statename_seq, X_train_duration_seq, X_val_duration_seq, y_train_seq, y_val_seq =  train_test_split(padded_statename, padded_duration, target, test_size=0.4, random_state=42, stratify=target)
    X_val_statename_seq, X_test_statename_seq, X_val_duration_seq, X_test_duration_seq, y_val_seq, y_test_seq =  train_test_split(X_val_statename_seq, X_val_duration_seq, y_val_seq, test_size=0.4, random_state=42, stratify=y_val_seq)    
    
    # embed the categorical variable using Keras Functional API
    in_layer = Input(shape=(X_train_statename_seq.shape[1],), name="statename")
    em_layer = Embedding(n_statename, 128, mask_zero=True)(in_layer)
    em_layer = Reshape((X_train_statename_seq.shape[1], -1))(em_layer)

    # input layer for numerical variable
    in_num = Input(shape=(X_train_duration_seq.shape[1], 1), name="duration")

    merge = concatenate([em_layer, in_num])
    lstm_layer1 = Bidirectional(LSTM(128, return_sequences=True))(merge)
    dropout1 = Dropout(0.5)(lstm_layer1)
    lstm_layer2 = LSTM(64)(dropout1)
    dropout2 = Dropout(0.5)(lstm_layer2)
    dense = Dense(64, activation='relu')(dropout2)
    classifier = Dense(1, activation='sigmoid')(dense)

    all_inputs = [in_layer, in_num]
    model = Model(inputs=all_inputs, outputs=classifier)

    # model seems to be overfitting, try to reduce overfitting by reduce LR, but model should take longer to converge so use a larger EPOCH
    callbacks = [ReduceLROnPlateau(monitor=monitor, factor=0.2, patience=5, min_lr=0.001), \
                EarlyStopping(monitor=monitor, mode=mode, patience=30, restore_best_weights=True)]

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), 'accuracy'])

    model.fit([X_train_statename_seq, X_train_duration_seq], y_train_seq,
             validation_data=([X_val_statename_seq, X_val_duration_seq], y_val_seq),
             callbacks=callbacks, epochs=EPOCHS, batch_size=BATCH_SIZE)

    evaluate = model.evaluate([X_test_statename_seq, X_test_duration_seq], y_test_seq) #loss, mse
    
    pred = model.predict([X_test_statename_seq, X_test_duration_seq])
    classes = []
    for ele in pred:
        classes.append(int((ele>0.5)[0]))

    from sklearn.metrics import classification_report, confusion_matrix

    cm = confusion_matrix(y_test_seq, classes)
    print(cm)
    
    seq_result[hour] = [evaluate, cm]

    end = datetime.now()
    time = end - start
    print(f"Training took {time.seconds} seconds to complete.")

Remove 2786 rows on WBA124 out of 8389 as it has no data
Remove 2900 rows on WBA124 out of 8381 as it has no data
Remove 2215 rows on WBA124 out of 8381 as it has no data
Remove 2675 rows on WBA124 out of 8389 as it has no data
Remove 2478 rows on WBA124 out of 8389 as it has no data
25078 28875
3797
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
[[526 277]
 [310 298]]
Training took 107 seconds to complete.
Remove 1862 rows on WBA124 out of 8385 as it has no data
Remove 1791 rows on WBA124 out of 8377 as it has no data
Remove 1459 rows on WBA124 out of 8377 as it has no data
Remove 1595 rows on WBA124 out of 8385 as it has no data
R

32610 36946
4336
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
[[722 322]
 [397 297]]
Training took 365 seconds to complete.
Remove 810 rows on WBA124 out of 8369 as it has no data
Remove 660 rows on WBA124 out of 8361 as it has no data
Remove 777 rows on WBA124 out of 8361 as it has no data
Remove 585 rows on WBA124 out of 8369 as it has no data
Remove 548 rows on WBA124 out of 8369 as it has no data


34045 38449
4404
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
[[715 375]
 [410 295]]
Training took 511 seconds to complete.


In [12]:
seq_result # monitor val recall bidirectional

{12: [[0.6746309399604797,
   0.5182608962059021,
   0.49013158679008484,
   0.5839830040931702],
  array([[526, 277],
         [310, 298]])],
 24: [[0.6764256358146667,
   0.4761904776096344,
   0.4367469847202301,
   0.5698323845863342],
  array([[628, 319],
         [374, 290]])],
 48: [[0.6651474833488464,
   0.479806125164032,
   0.42795389890670776,
   0.586306095123291],
  array([[722, 322],
         [397, 297]])],
 72: [[0.6695340275764465,
   0.44029849767684937,
   0.41843971610069275,
   0.5626741051673889],
  array([[715, 375],
         [410, 295]])]}

In [8]:
seq_result

{84: [[1.2854076623916626,
   0.5663600564002991,
   0.6096181273460388,
   0.6653782725334167],
  array([[774, 330],
         [276, 431]])],
 96: [[1.0571415424346924,
   0.5884892344474792,
   0.5768688321113586,
   0.6787280440330505],
  array([[829, 286],
         [300, 409]])],
 120: [[1.2010926008224487,
   0.5956112742424011,
   0.5344585180282593,
   0.6804123520851135],
  array([[874, 258],
         [331, 380]])]}

In [6]:
seq_result # downsample negative samples

{12: [[0.6739060282707214,
   0.49929675459861755,
   0.5838815569877625,
   0.5686968564987183],
  array([[448, 356],
         [253, 355]])],
 24: [[0.6818949580192566,
   0.4321766495704651,
   0.4114114046096802,
   0.5337879657745361],
  array([[587, 360],
         [392, 274]])],
 48: [[0.6943953633308411,
   0.4988763928413391,
   0.319884717464447,
   0.6001150608062744],
  array([[821, 223],
         [472, 222]])],
 72: [[1.1131274700164795,
   0.5409638285636902,
   0.6386913061141968,
   0.6458449363708496],
  array([[709, 381],
         [254, 449]])]}

In [6]:
seq_result # Bidirectional, leaving class 1 weight as variable

{5: [[0.4488883316516876,
   0.27272728085517883,
   0.3744680881500244,
   0.814032793045044],
  array([[4748,  704],
         [ 441,  264]]),
  '              precision    recall  f1-score   support\n\n           0       0.92      0.87      0.89      5452\n           1       0.27      0.37      0.32       705\n\n    accuracy                           0.81      6157\n   macro avg       0.59      0.62      0.60      6157\nweighted avg       0.84      0.81      0.83      6157\n'],
 10: [[0.5950702428817749,
   0.18969346582889557,
   0.6056737303733826,
   0.6585999727249146],
  array([[3628, 1824],
         [ 278,  427]]),
  '              precision    recall  f1-score   support\n\n           0       0.93      0.67      0.78      5452\n           1       0.19      0.61      0.29       705\n\n    accuracy                           0.66      6157\n   macro avg       0.56      0.64      0.53      6157\nweighted avg       0.84      0.66      0.72      6157\n'],
 15: [[0.6356426477432251,
 

In [6]:
seq_result #Bidirecitonal, 5 EQ under same EQ family, remove empty datapoints, shuffle+masking

{12: [0.6014404296875,
  0.1650485396385193,
  0.44810542464256287,
  0.6304535865783691],
 24: [0.5659424066543579,
  0.2021428644657135,
  0.4262048304080963,
  0.7228491902351379],
 48: [0.532018780708313,
  0.21800947189331055,
  0.530259370803833,
  0.7219125032424927],
 72: [0.4936830699443817, 0.23517654836177826, 0.5, 0.7561728358268738]}

In [7]:
seq_result # everything above, except bidirectional

{12: [0.6382883191108704,
  0.16345123946666718,
  0.5016447305679321,
  0.5972342491149902],
 24: [0.6094536185264587,
  0.1734505146741867,
  0.5639097690582275,
  0.6156550645828247],
 48: [0.5899127125740051, 0.0, 0.0, 0.8825616836547852],
 72: [0.5766468048095703,
  0.16018734872341156,
  0.48510637879371643,
  0.649772584438324]}