In [1]:
# Environment check: list the compute devices TensorFlow detects on this machine.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 7825575034658721821]

# Binary classification
Predict whether an asset will fail within a certain time frame (e.g. a given number of cycles).

In [2]:
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Setting seed for reproducibility
np.random.seed(1234)  
# A Python variable called PYTHONHASHSEED has no effect on hashing: the interpreter
# reads the *environment variable* of that name. Export it so child processes
# inherit a fixed hash seed. Hash randomization of the already-running kernel is
# decided at interpreter start-up, so to pin it for this process the variable
# must be set before Jupyter is launched.
PYTHONHASHSEED = 0  # kept so existing code that reads this name still works
os.environ['PYTHONHASHSEED'] = str(PYTHONHASHSEED)

from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Dense, Dropout, LSTM

# define path to save model
# (further down this file is read back with load_model; the fit/checkpoint call
# that would write it is commented out)
model_path = 'binary_model.h5'

## Data Ingestion

In [3]:
# Load the aircraft-engine run-to-failure training data (space-separated, no header row).
train_df = pd.read_csv('PM_train.txt', sep=" ", header=None)
# Columns 26 and 27 are discarded before the 26 column names are assigned.
train_df = train_df.drop(columns=train_df.columns[[26, 27]])
train_df.columns = (['id', 'cycle', 'setting1', 'setting2', 'setting3']
                    + ['s' + str(k) for k in range(1, 22)])

# Order rows by engine id, then chronologically by cycle.
train_df = train_df.sort_values(by=['id', 'cycle'])


## Data Preprocessing

In [4]:
#######
# TRAIN
#######
# Data Labeling - generate column RUL (Remaining Useful Life, i.e. time to failure):
# for every row, RUL = (largest cycle recorded for that id) - (cycle of the row).
rul = pd.DataFrame(train_df.groupby('id')['cycle'].max()).reset_index()
rul.columns = ['id', 'max']
train_df = train_df.merge(rul, on=['id'], how='left')
train_df['RUL'] = train_df['max'] - train_df['cycle']
# the helper column 'max' is only needed to compute RUL
train_df.drop('max', axis=1, inplace=True)
# generate label columns for training data
# we will only make use of "label1" for binary classification, 
# while trying to answer the question: is a specific engine going to fail within w cycles?
w = 10
# w0 = 10
# label1 = 1 when RUL <= w, otherwise 0
train_df['label1'] = np.where(train_df['RUL'] <= w, 1, 0 )
# train_df['label2'] = train_df['label1']
# train_df.loc[train_df['RUL'] <= w0, 'label2'] = 2

# MinMax normalization (from 0 to 1)
# 'cycle_norm' starts as a copy of 'cycle'; it is scaled below while the raw
# 'cycle' column is excluded from scaling and stays untouched.
train_df['cycle_norm'] = train_df['cycle']
# every column except the identifier, raw cycle and target columns is scaled
cols_normalize = train_df.columns.difference(['id','cycle','RUL','label1'])
min_max_scaler = preprocessing.MinMaxScaler()
# NOTE(review): the scaler is fitted on all rows currently in train_df, i.e. before the
# validation ids are separated from it further down -- confirm this is intended.
norm_train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df[cols_normalize]), 
                             columns=cols_normalize, 
                             index=train_df.index)
# put the unscaled columns back together with the scaled ones ...
join_df = train_df[train_df.columns.difference(cols_normalize)].join(norm_train_df)
# ... and restore the original column order
train_df = join_df.reindex(columns = train_df.columns)


## Now I want to split the training data into a training set and a validation set: engines 1–80 are used for training, and engines 81–100 are held out as the evaluation set for the PdM policy.

In [5]:
# Engine ids held out for validation: the last 20 (81..100).  TODO: make this random
list_ID = np.arange(81, 101)

In [6]:
list_ID

array([ 81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,
        94,  95,  96,  97,  98,  99, 100])

In [7]:
# Rows of the held-out engines. Must be evaluated while train_df still contains those ids.
validation_df = train_df[train_df['id'].isin(list_ID)]

In [8]:
# Keep only the engines that are not held out for validation.
train_df = train_df.loc[~train_df['id'].isin(list_ID)]

## LSTM

### seq_array creation, which is needed as input for the LSTM

In [9]:
# pick a large window size of 50 cycles
# (number of consecutive rows of one engine that form a single LSTM input sample)
sequence_length = 50

# function to reshape features into (samples, time steps, features) 
def gen_sequence(id_df, seq_length, seq_cols):
    """Yield sliding windows of ``seq_length`` consecutive rows for one id.

    No padding is used: only full-length windows are produced, so an id with
    ``seq_length`` rows or fewer yields nothing (such ids have to be dropped
    or padded elsewhere if they must be scored).

    Args:
        id_df: DataFrame holding the rows of a single id.
        seq_length: number of rows per window.
        seq_cols: column labels used as features.

    Yields:
        ``id_df[seq_cols].values[start:start + seq_length, :]`` for
        ``start = 0, 1, ..., len(id_df) - seq_length - 1``. The window ending
        on the very last row is therefore not produced.

    Example:
        192 rows with seq_length 50 give 142 windows:
        rows [0:50], [1:51], [2:52], ..., [141:191].
    """
    feature_rows = id_df[seq_cols].values
    window_count = feature_rows.shape[0] - seq_length
    for first_row in range(window_count):
        yield feature_rows[first_row:first_row + seq_length, :]
        
# pick the feature columns: the three settings, the normalised cycle counter
# and the sensor channels s1..s21
sensor_cols = ['s' + str(i) for i in range(1,22)]
sequence_cols = ['setting1', 'setting2', 'setting3', 'cycle_norm']
sequence_cols.extend(sensor_cols)

# generator for the sequences: one list of windows per engine id
seq_gen = (list(gen_sequence(train_df[train_df['id']==engine_id], sequence_length, sequence_cols))
           for engine_id in train_df['id'].unique())

# generate sequences and convert to numpy array
# An engine with sequence_length rows or fewer yields an empty list; handing that to
# np.concatenate next to the 3-D window lists fails on a dimension mismatch, so such
# engines are skipped (they contribute no labels either, see gen_labels).
seq_array = np.concatenate([windows for windows in seq_gen if windows]).astype(np.float32)
seq_array.shape 

(12138, 50, 25)

In [10]:
# we always take the measurements of the last 50 cycles as input!
# Every sequence is reduced by a length of 50 (=sequence_length). We have 80 training sets, 80*50 = 4000 "less" inputs
# train_df.shape = (16138, 30)
# seq_array.shape = (12138, 50, 25)

### label_array creation, which is used as the "true" output in the training of the LSTM (more specifically, using the binary_cross_entropy loss function)

In [11]:
# function to generate labels
def gen_labels(id_df, seq_length, label):
    """Return the target rows that pair with the windows of one id.

    The first ``seq_length`` label rows are dropped: the first window of an id
    spans ``seq_length`` rows and takes the label that follows it as target,
    and every later window is matched with the next label, one step at a time.

    Args:
        id_df: DataFrame holding the rows of a single id.
        seq_length: number of leading rows to discard.
        label: list of label column names (e.g. ``['label1']``); passing a
            list keeps the result two-dimensional.

    Returns:
        2-D array with the rows ``seq_length`` .. end of ``id_df[label]``.
    """
    targets = id_df[label].values
    return targets[seq_length:, :]

# generate labels: one 2-D block of targets per engine id, stacked in id order
label_gen = []
for engine_id in train_df['id'].unique():
    engine_rows = train_df[train_df['id'] == engine_id]
    label_gen.append(gen_labels(engine_rows, sequence_length, ['label1']))
label_array = np.concatenate(label_gen).astype(np.float32)
label_array.shape

(12138, 1)

In [12]:
# number of input features per time step (size of the last axis of seq_array)
nb_features = seq_array.shape[2]
# number of label columns; used as the unit count of the output layer
nb_out = label_array.shape[1]

In [13]:
# Next, we build a deep network. 
# The first layer is an LSTM layer with 100 units followed by another LSTM layer with 50 units. 
# Dropout is also applied after each LSTM layer to control overfitting. 
# Final layer is a Dense output layer with single unit and sigmoid activation since this is a binary classification problem.
# build the network
nb_features = seq_array.shape[2]
nb_out = label_array.shape[1]

model = Sequential()

# first LSTM returns the full sequence so that the second LSTM can consume it
model.add(LSTM(
         input_shape=(sequence_length, nb_features),
         units=100,
         return_sequences=True))
model.add(Dropout(0.2))

# second LSTM returns only its last output, which feeds the classifier head
model.add(LSTM(
          units=50,
          return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(units=nb_out, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output
model.summary()

# fit the network
# history = model.fit(seq_array, label_array, epochs=100, batch_size=200, validation_split=0.05, verbose=2,
#           callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min'),
#                        keras.callbacks.ModelCheckpoint(model_path,monitor='val_loss', save_best_only=True, mode='min', verbose=0)]
#           )

# # list all data in history
# print(history.history.keys())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 50, 100)           50400     
_________________________________________________________________
dropout (Dropout)            (None, 50, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 80,651
Trainable params: 80,651
Non-trainable params: 0
_________________________________________________________________
None


# Optimize the heuristic probability threshold: apply the trained LSTM predictor to the training data, simulate the resulting replacement decisions, and minimize the objective function, i.e. the total expected cost rate.


In [14]:
# Import the model which has already been trained in the past and can simply be called back already trained
# Fail right here when the file is missing: otherwise `estimator` would stay
# undefined and the first predict call further down would raise a NameError.
if not os.path.isfile(model_path):
    raise FileNotFoundError(
        "Trained model file '{}' not found; train and save the model first.".format(model_path))
estimator = load_model(model_path)

In [15]:
# Cost definitions
# C_p: cost booked when a unit is replaced preventively
C_p = 100
# C_c: cost booked when a unit runs until its last recorded cycle (failure)
C_c = 500
# DT: spacing, in cycles, between two consecutive decision points
DT  = 10  

In [16]:
# Window offsets at which a decision may be taken: every DT cycles (DT = 10), up to
# but excluding 500. The step reuses DT so the decision grid cannot drift from it.
array_decisions = np.arange(0, 500, DT)

# The goal is to find the optimal Pr threshold that leads to minimization of the expected total cost rate. And then use this threshold in the validation set.

In [17]:
def minimizer_training_set(PR_thres):
    """Expected cost rate of the threshold policy on the engines in ``train_df``.

    For every engine id the classifier ``estimator`` is queried at each window
    offset ``cycle`` contained in ``array_decisions``; the window covers rows
    ``cycle`` .. ``cycle + sequence_length - 1`` of the engine's
    ``sequence_cols``. The first time the predicted probability is at least
    ``PR_thres`` the engine is replaced preventively (cost ``C_p``, lifetime
    ``sequence_length + cycle``). If that never happens the engine runs to
    failure (cost ``C_c``, lifetime equal to its last recorded ``cycle``).

    Reads the notebook-level names ``train_df``, ``estimator``,
    ``sequence_cols``, ``sequence_length``, ``nb_features``,
    ``array_decisions``, ``C_p`` and ``C_c``.

    Args:
        PR_thres: probability threshold at or above which a preventive
            replacement is triggered.

    Returns:
        ``mean(costs) / mean(lifetimes)`` over all engines (the objective
        function).
    """
    engine_ids = train_df['id'].unique()
    # One slot per engine actually present. A hard-coded length would raise an
    # IndexError as soon as the training set holds more engines than slots.
    costs_array = np.zeros(len(engine_ids))
    t_LC_array = np.zeros(len(engine_ids))

    for counter, engine_id in enumerate(engine_ids):
        # Filter the frame once per engine instead of once per cycle.
        engine_df = train_df[train_df['id'] == engine_id]
        engine_features = engine_df[sequence_cols].values
        preventive_replacement = False

        for cycle in range(engine_df.shape[0] - sequence_length + 1):
            # decisions are only allowed at the offsets listed in array_decisions
            if cycle not in array_decisions:
                continue

            seq_array_training_k = engine_features[cycle:sequence_length + cycle]
            seq_array_training_k = np.asarray(seq_array_training_k).astype(np.float32).reshape(
                1, sequence_length, nb_features)
            prob_RUL_smaller_DT = estimator.predict(seq_array_training_k).reshape(1)

            # evaluate decision heuristics
            if PR_thres <= prob_RUL_smaller_DT:
                t_LC_array[counter] = sequence_length + cycle
                costs_array[counter] = C_p
                preventive_replacement = True
                break

        if not preventive_replacement:
            # no alarm was raised in time: the component fails at its last cycle
            t_LC_array[counter] = engine_df['cycle'].iloc[-1]
            costs_array[counter] = C_c

    expected_cost = np.mean(costs_array) / np.mean(t_LC_array)   # this is the objective function

    return expected_cost
        
        


In [18]:
# PR_thres = np.arange(0.1, 1.0, 0.1)
# coarse grid of candidate probability thresholds to evaluate
PR_thres = np.array([0.1, 0.3, 0.5, 0.7, 0.9])

In [19]:
# Evaluate the cost-rate objective once for every candidate threshold.
expected_cost_on_grid = np.array(
    [minimizer_training_set(threshold) for threshold in PR_thres], dtype=float)


In [20]:
expected_cost_on_grid

array([0.51712993, 0.51216389, 0.53578263, 0.60940773, 0.78287771])

In [21]:
# Perfect prognostics on the training set
import math
# Size everything from the engines actually present in train_df. With a hard-coded
# length, fewer engines would leave zero-filled slots that drag the mean lifetime
# down, and more engines would raise an IndexError.
train_engine_ids = train_df['id'].unique()
t_LC_perfect_array  = np.zeros(len(train_engine_ids))
for counter, engine_id in enumerate(train_engine_ids):
    last_cycle = train_df.loc[train_df['id'] == engine_id, 'cycle'].iloc[-1]
    # largest multiple of DT that does not exceed the engine's last recorded cycle
    t_LC_perfect_array[counter] = math.floor(last_cycle / DT) * DT

costs_perfect_array = np.ones(len(train_engine_ids))*C_p # a perfect policy will only lead to preventive replacements

expected_cost_perfect = np.mean(costs_perfect_array)/np.mean(t_LC_perfect_array)
expected_cost_perfect

0.507292327203551

In [22]:
t_LC_perfect_array

array([190., 280., 170., 180., 260., 180., 250., 150., 200., 220., 240.,
       170., 160., 180., 200., 200., 270., 190., 150., 230., 190., 200.,
       160., 140., 230., 190., 150., 160., 160., 190., 230., 190., 200.,
       190., 180., 150., 170., 190., 120., 180., 210., 190., 200., 190.,
       150., 250., 210., 230., 210., 190., 210., 210., 190., 250., 190.,
       270., 130., 140., 230., 170., 180., 180., 170., 280., 150., 200.,
       310., 190., 360., 130., 200., 210., 210., 160., 220., 210., 150.,
       230., 190., 180.])

In [23]:
PR_thres[np.argmin(expected_cost_on_grid)]

0.3

In [24]:
# Relative excess of each grid cost rate over the perfect-prognostics cost rate, in percent.
M = (expected_cost_on_grid-expected_cost_perfect)/expected_cost_perfect *100
M

array([ 1.93923723,  0.9603073 ,  5.61615002, 20.12949914, 54.32476798])

In [25]:
# second grid of candidate thresholds (replaces the previous contents of PR_thres)
PR_thres = np.array([0.6, 0.7, 0.8])

In [26]:
# Re-evaluate the cost-rate objective on the current threshold grid.
expected_cost_on_grid = np.array(
    [minimizer_training_set(threshold) for threshold in PR_thres], dtype=float)


In [27]:
# Percentage excess over the perfect-prognostics cost rate for the current grid.
M = (expected_cost_on_grid-expected_cost_perfect)/expected_cost_perfect *100
M

array([ 5.4144385 , 20.12949914, 29.82711671])

In [28]:
# spot check: percentage excess over perfect prognostics for a threshold of 0.2
(minimizer_training_set(0.2)-expected_cost_perfect)/expected_cost_perfect *100

1.1545862732521006

In [29]:
# spot check: percentage excess over perfect prognostics for a threshold of 0.90
(minimizer_training_set(0.90)-expected_cost_perfect)/expected_cost_perfect *100

54.32476797777639

## Now take the "optimal" heuristic probability threshold found by applying the decisions on the training set with the trained LSTM classifier, and evaluate the decisions on the validation data set (ids 81 to 100), i.e., obtain the total expected cost rate and the error metric on the validation set.


In [30]:
# Result buffers for the validation run: one slot per held-out engine
# (20 matches the number of ids in list_ID).
costs_array = np.zeros(20)
t_LC_array  = np.zeros(20)

In [31]:
# Cost definitions (same values as used for the training-set evaluation)
# C_p: cost of a preventive replacement, C_c: cost of running to failure,
# DT: spacing in cycles between decision points
C_p = 100
C_c = 500
DT  = 10  

In [32]:
# Probability threshold applied on the validation engines.
# NOTE(review): hard-coded; the coarse training grid above reported 0.3 as its argmin,
# so confirm that 0.80 is the threshold that is meant to be evaluated here.
validation_PR_thres = 0.80

validation_ids = validation_df['id'].unique()
# Size the result arrays from the data, so this cell does not rely on buffers of a
# hard-coded length that were allocated in another cell.
costs_array = np.zeros(len(validation_ids))
t_LC_array  = np.zeros(len(validation_ids))

for counter, engine_id in enumerate(validation_ids):
    # filter the frame once per engine instead of once per cycle
    engine_df = validation_df[validation_df['id'] == engine_id]
    engine_features = engine_df[sequence_cols].values
    preventive_replacement = False

    for cycle in range(engine_df.shape[0] - sequence_length + 1):
        # decisions are only allowed at the offsets listed in array_decisions
        if cycle not in array_decisions:
            continue

        seq_array_validation_k = engine_features[cycle:sequence_length + cycle]
        seq_array_validation_k = np.asarray(seq_array_validation_k).astype(np.float32).reshape(
            1, sequence_length, nb_features)
        prob_RUL_smaller_DT = estimator.predict(seq_array_validation_k).reshape(1)

        # evaluate decision heuristics
        if validation_PR_thres <= prob_RUL_smaller_DT:
            t_LC_array[counter] = sequence_length + cycle
            costs_array[counter] = C_p
            print('ID:', engine_id, ' preventive replacement informed at cycle:', t_LC_array[counter])
            print('component lifecycle:', t_LC_array[counter])
            preventive_replacement = True
            break

    if not preventive_replacement:
        # no alarm was raised in time: the component fails at its last recorded cycle
        t_LC_array[counter] = engine_df['cycle'].iloc[-1]
        print('ID:', engine_id, ' component failure at t:', t_LC_array[counter])
        costs_array[counter] = C_c

ID: 81  preventive replacement informed at cycle: 240.0
component lifecycle: 240.0
ID: 82  preventive replacement informed at cycle: 210.0
component lifecycle: 210.0
ID: 83  preventive replacement informed at cycle: 290.0
component lifecycle: 290.0
ID: 84  component failure at t: 267.0
ID: 85  preventive replacement informed at cycle: 180.0
component lifecycle: 180.0
ID: 86  preventive replacement informed at cycle: 270.0
component lifecycle: 270.0
ID: 87  preventive replacement informed at cycle: 170.0
component lifecycle: 170.0
ID: 88  preventive replacement informed at cycle: 210.0
component lifecycle: 210.0
ID: 89  preventive replacement informed at cycle: 210.0
component lifecycle: 210.0
ID: 90  preventive replacement informed at cycle: 150.0
component lifecycle: 150.0
ID: 91  preventive replacement informed at cycle: 130.0
component lifecycle: 130.0
ID: 92  preventive replacement informed at cycle: 340.0
component lifecycle: 340.0
ID: 93  preventive replacement informed at cycle:

In [33]:
costs_array

array([100., 100., 100., 500., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 500., 100., 500., 100., 100., 100., 100.])

In [34]:
t_LC_array

array([240., 210., 290., 267., 180., 270., 170., 210., 210., 150., 130.,
       340., 150., 258., 280., 336., 200., 150., 180., 200.])

In [35]:
t_LC_array.mean()

221.05

In [36]:
# Perfect prognostics
import math
# Size everything from the engines actually present in validation_df. With a
# hard-coded length, fewer engines would leave zero-filled slots that drag the mean
# lifetime down, and more engines would raise an IndexError.
validation_engine_ids = validation_df['id'].unique()
t_LC_perfect_array  = np.zeros(len(validation_engine_ids))
for counter, engine_id in enumerate(validation_engine_ids):
    last_cycle = validation_df.loc[validation_df['id'] == engine_id, 'cycle'].iloc[-1]
    # largest multiple of DT that does not exceed the engine's last recorded cycle
    t_LC_perfect_array[counter] = math.floor(last_cycle / DT) * DT

costs_perfect_array = np.ones(len(validation_engine_ids))*C_p # a perfect policy will only lead to preventive replacements

expected_cost_perfect = np.mean(costs_perfect_array)/np.mean(t_LC_perfect_array)
expected_cost_perfect

0.45454545454545453

In [37]:
t_LC_perfect_array

array([240., 210., 290., 260., 180., 270., 170., 210., 210., 150., 130.,
       340., 150., 250., 280., 330., 200., 150., 180., 200.])

In [38]:
t_LC_perfect_array.mean()

220.0

In [39]:
# Evaluation of the expected cost per unit time, Eqns. (3) and (4) of our paper.
# Computed as mean cost per engine divided by mean lifetime per engine.
expected_cost_LSTM = np.mean(costs_array)/np.mean(t_LC_array)
expected_cost_LSTM

0.7238181406921511

In [40]:
# evaluation of the metric defined in the paper:
# relative excess of the LSTM policy's cost rate over the perfect-prognostics cost rate
M = (expected_cost_LSTM - expected_cost_perfect) / expected_cost_perfect
M # NOTE(review): the recorded output is ~0.59, so an earlier remark that this value is "very small" looks stale

0.5923999095227325

In [41]:
M*100 # error percentage

59.239990952273246