In [1]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 10372356642719980145]

# Binary classification
Predict if an asset will fail within certain time frame (e.g. cycles)

In [2]:
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Setting seed for reproducibility
np.random.seed(1234)
# A Python variable called PYTHONHASHSEED has no effect on hashing: the
# interpreter reads the *environment variable* of that name, and only at
# start-up. Exporting it here makes it visible to child processes; to affect
# this kernel it must already be set before the kernel is launched.
PYTHONHASHSEED = 0
os.environ['PYTHONHASHSEED'] = str(PYTHONHASHSEED)

from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Dense, Dropout, LSTM

# define path to save model
model_path = 'binary_model.h5'

## Data Ingestion

In [3]:
# Read the training data: aircraft engine run-to-failure records.
# The file is space separated and has no header row.
train_columns = (['id', 'cycle', 'setting1', 'setting2', 'setting3']
                 + ['s%d' % sensor for sensor in range(1, 22)])

train_df = pd.read_csv('PM_train.txt', sep=" ", header=None)
# Columns at positions 26 and 27 are discarded before naming the rest
# (presumably empty columns created by trailing separators - verify against the file).
surplus_columns = train_df.columns[[26, 27]]
train_df = train_df.drop(columns=surplus_columns)
train_df.columns = train_columns

# Order the rows by engine and, within an engine, by cycle
train_df = train_df.sort_values(by=['id', 'cycle'])


In [4]:
train_df

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,519.49,2388.26,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,519.68,2388.22,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640


## Data Preprocessing

In [5]:
#######
# TRAIN
#######
# Data labeling - generate column RUL (Remaining Useful Life, or time to failure):
# RUL = (largest cycle recorded for the engine) - (current cycle)
rul = train_df.groupby('id')['cycle'].max().reset_index()
rul.columns = ['id', 'max']
train_df = pd.merge(train_df, rul, how='left', on=['id'])
train_df = train_df.assign(RUL=train_df['max'] - train_df['cycle']).drop(columns='max')

# Label column for binary classification: "label1" answers the question
# "is this engine going to fail within w cycles?" (1 when RUL <= w, else 0).
w = 10
# w0 = 10
train_df['label1'] = np.where(train_df['RUL'] <= w, 1, 0)
# train_df['label2'] = train_df['label1']
# train_df.loc[train_df['RUL'] <= w0, 'label2'] = 2

# MinMax normalization (from 0 to 1) of every column except the identifiers,
# the raw cycle counter and the targets. 'cycle_norm' is a copy of 'cycle'
# that does get scaled.
# NOTE(review): the scaler is fitted on all engine ids, including the ids that
# a later cell holds out as validation data.
train_df['cycle_norm'] = train_df['cycle']
cols_normalize = train_df.columns.difference(['id', 'cycle', 'RUL', 'label1'])
min_max_scaler = preprocessing.MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(train_df[cols_normalize])
norm_train_df = pd.DataFrame(scaled_values,
                             index=train_df.index,
                             columns=cols_normalize)

# Re-attach the scaled columns to the untouched ones and restore the column order
unscaled_cols = train_df.columns.difference(cols_normalize)
join_df = train_df[unscaled_cols].join(norm_train_df)
train_df = join_df.reindex(columns=train_df.columns)


In [6]:
# Cycle number of the row where RUL == 0 for each engine, i.e. the last recorded
# cycle of every engine. train_df still contains all engine ids at this point
# (the hold-out ids are removed in a later cell).
train_df2 = train_df.query('RUL == 0')['cycle'].to_numpy()

In [7]:
train_df2.mean(), train_df2.std()

(206.31, 46.11045326170629)

## Next, I split the training data into a training set and a validation set: 80 engines (ids 1 to 80) are used to train the model, and the remaining 20 engines (ids 81 to 100) are held out to evaluate the PdM policy.

In [8]:
# Engine ids 81..100 (the last 20) form the hold-out set.
# TODO: make this selection random
list_ID = np.arange(81, 101)

In [9]:
list_ID

array([ 81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,
        94,  95,  96,  97,  98,  99, 100])

In [10]:
# Rows belonging to the held-out engine ids
is_validation_row = train_df['id'].isin(list_ID)
validation_df = train_df[is_validation_row]

In [11]:
validation_df

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s15,s16,s17,s18,s19,s20,s21,RUL,label1,cycle_norm
16138,81,1,0.212644,0.750000,0.0,0.0,0.250000,0.411380,0.411546,0.0,...,0.464025,0.0,0.250000,0.0,0.0,0.565891,0.610743,239,0,0.000000
16139,81,2,0.632184,0.666667,0.0,0.0,0.433735,0.331589,0.432647,0.0,...,0.509427,0.0,0.333333,0.0,0.0,0.596899,0.622756,238,0,0.002770
16140,81,3,0.471264,0.916667,0.0,0.0,0.403614,0.335295,0.238859,0.0,...,0.489804,0.0,0.500000,0.0,0.0,0.697674,0.644573,237,0,0.005540
16141,81,4,0.494253,0.500000,0.0,0.0,0.361446,0.519948,0.381668,0.0,...,0.444017,0.0,0.333333,0.0,0.0,0.488372,0.726871,236,0,0.008310
16142,81,5,0.637931,0.666667,0.0,0.0,0.662651,0.424024,0.461344,0.0,...,0.277799,0.0,0.333333,0.0,0.0,0.697674,0.603286,235,0,0.011080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,0.477011,0.250000,0.0,0.0,0.686747,0.587312,0.782917,0.0,...,0.656791,0.0,0.750000,0.0,0.0,0.271318,0.109500,4,1,0.540166
20627,100,197,0.408046,0.083333,0.0,0.0,0.701807,0.729453,0.866475,0.0,...,0.727203,0.0,0.583333,0.0,0.0,0.124031,0.366197,3,1,0.542936
20628,100,198,0.522989,0.500000,0.0,0.0,0.665663,0.684979,0.775321,0.0,...,0.922278,0.0,0.833333,0.0,0.0,0.232558,0.053991,2,1,0.545706
20629,100,199,0.436782,0.750000,0.0,0.0,0.608434,0.746021,0.747468,0.0,...,0.823394,0.0,0.583333,0.0,0.0,0.116279,0.234466,1,1,0.548476


In [12]:
# Keep only the engines that are NOT in the hold-out list for training
train_df = train_df.loc[~train_df['id'].isin(list_ID)]

In [13]:
train_df[train_df['id']==1][-12:]

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s15,s16,s17,s18,s19,s20,s21,RUL,label1,cycle_norm
180,1,181,0.465517,0.916667,0.0,0.0,0.671687,0.559625,0.648042,0.0,...,0.565987,0.0,0.75,0.0,0.0,0.155039,0.374206,11,0,0.498615
181,1,182,0.442529,0.75,0.0,0.0,0.903614,0.676695,0.748987,0.0,...,0.734513,0.0,0.75,0.0,0.0,0.224806,0.392847,10,1,0.501385
182,1,183,0.505747,0.583333,0.0,0.0,0.611446,0.570961,0.620864,0.0,...,0.689496,0.0,0.666667,0.0,0.0,0.325581,0.325877,9,1,0.504155
183,1,184,0.522989,0.75,0.0,0.0,0.861446,0.749945,0.848582,0.0,...,0.724894,0.0,0.583333,0.0,0.0,0.224806,0.166252,8,1,0.506925
184,1,185,0.545977,0.583333,0.0,0.0,0.78012,0.70678,0.711512,0.0,...,0.667564,0.0,0.583333,0.0,0.0,0.286822,0.242751,7,1,0.509695
185,1,186,0.655172,0.25,0.0,0.0,0.692771,0.525834,0.743585,0.0,...,0.761062,0.0,0.75,0.0,0.0,0.255814,0.223971,6,1,0.512465
186,1,187,0.229885,0.5,0.0,0.0,0.635542,0.459124,0.759959,0.0,...,0.757599,0.0,0.666667,0.0,0.0,0.217054,0.259597,5,1,0.515235
187,1,188,0.114943,0.75,0.0,0.0,0.76506,0.683235,0.684166,0.0,...,0.753367,0.0,0.666667,0.0,0.0,0.286822,0.089202,4,1,0.518006
188,1,189,0.465517,0.666667,0.0,0.0,0.894578,0.547853,0.772451,0.0,...,0.744132,0.0,0.583333,0.0,0.0,0.263566,0.301712,3,1,0.520776
189,1,190,0.344828,0.583333,0.0,0.0,0.731928,0.614345,0.737677,0.0,...,0.759523,0.0,0.833333,0.0,0.0,0.271318,0.239299,2,1,0.523546


## LSTM

In [14]:
# pick a large window size of 50 cycles
sequence_length = 50

# function to reshape features into (samples, time steps, features) 
def gen_sequence(id_df, seq_length, seq_cols):
    """Yield sliding windows of ``seq_length`` consecutive rows for one engine.

    Only full-length windows are produced and no padding is applied, so an
    engine with no more than ``seq_length`` rows yields nothing (for testing,
    such engines have to be dropped or their sequences padded).

    Parameters:
        id_df: DataFrame holding the rows of a single id.
        seq_length: number of rows per window.
        seq_cols: list of column names used as features.

    Yields:
        2-D arrays of shape (seq_length, len(seq_cols)). With n rows the
        window starts are 0, 1, ..., n - seq_length - 1; the window that would
        end on the very last row is not produced.

    Example: 192 rows and seq_length = 50 give 142 windows:
        start 0   -> rows 0..49
        start 1   -> rows 1..50
        ...
        start 141 -> rows 141..190
    """
    # all feature rows of this id in a single matrix
    feature_matrix = id_df[seq_cols].values
    n_rows = feature_matrix.shape[0]
    for first_row in range(n_rows - seq_length):
        yield feature_matrix[first_row:first_row + seq_length, :]
        
# Feature columns: the three settings, the normalised cycle counter and the
# 21 sensor channels s1..s21
sensor_cols = ['s{}'.format(channel) for channel in range(1, 22)]
sequence_cols = ['setting1', 'setting2', 'setting3', 'cycle_norm'] + sensor_cols

# Lazily build, for every engine id, the list of its sliding windows
seq_gen = (
    list(gen_sequence(train_df.loc[train_df['id'] == engine_id], sequence_length, sequence_cols))
    for engine_id in train_df['id'].unique()
)

# Stack the windows of all engines into one float32 array
# laid out as (samples, time steps, features)
seq_array = np.concatenate(list(seq_gen)).astype(np.float32)
seq_array.shape

(12138, 50, 25)

In [15]:
# we always take the measurements of the last 50 cycles as input!
# Every sequence is reduced by a length of 50 (=sequence_length). We have 80 training sets, 80*50 = 4000 "less" inputs
# train_df.shape = (16138, 30)
# seq_array.shape = (12138, 50, 25)

In [16]:
# function to generate labels
def gen_labels(id_df, seq_length, label):
    """Return the label rows of one engine, without its first ``seq_length`` rows.

    Parameters:
        id_df: DataFrame holding the rows of a single id.
        seq_length: number of leading rows to discard.
        label: list of label column names (a list, so the result stays 2-D).

    Returns:
        2-D array of shape (n_rows - seq_length, len(label)), for example
        [[0], [0], ..., [1]]. Row i is the label stored at row
        ``seq_length + i`` of ``id_df``: the first window of ``seq_length``
        rows gets a single target and every later window gets the next label,
        step by step.
    """
    label_matrix = id_df[label].values
    n_rows = label_matrix.shape[0]
    # the first seq_length labels have no full window in front of them
    return label_matrix[seq_length:n_rows, :]

# Collect the label block of every engine, then stack them into one float32 array
label_gen = []
for engine_id in train_df['id'].unique():
    engine_rows = train_df[train_df['id'] == engine_id]
    label_gen.append(gen_labels(engine_rows, sequence_length, ['label1']))
label_array = np.concatenate(label_gen).astype(np.float32)
label_array.shape

(12138, 1)

In [17]:
# label_array

In [18]:
nb_features = seq_array.shape[2]
nb_out = label_array.shape[1]

In [19]:
# Next, we build a deep network.
# The first layer is an LSTM layer with 100 units followed by another LSTM layer with 50 units.
# Dropout is also applied after each LSTM layer to control overfitting.
# Final layer is a Dense output layer with single unit and sigmoid activation since this is a binary classification problem.
# build the network
nb_features = seq_array.shape[2]  # number of feature columns per time step
nb_out = label_array.shape[1]     # number of label columns (output units)

model = Sequential()

# First recurrent layer returns the full sequence so a second LSTM can be stacked on it
model.add(LSTM(
         input_shape=(sequence_length, nb_features),
         units=100,
         return_sequences=True))
model.add(Dropout(0.2))

# Second recurrent layer only returns its final state
model.add(LSTM(
          units=50,
          return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(units=nb_out, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()

# Training is disabled in this cell: the call below is commented out.
# fit the network
# history = model.fit(seq_array, label_array, epochs=100, batch_size=200, validation_split=0.05, verbose=2,
#           callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min'),
#                        keras.callbacks.ModelCheckpoint(model_path,monitor='val_loss', save_best_only=True, mode='min', verbose=0)]
#           )

# # list all data in history
# print(history.history.keys())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 50, 100)           50400     
_________________________________________________________________
dropout (Dropout)            (None, 50, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 80,651
Trainable params: 80,651
Non-trainable params: 0
_________________________________________________________________
None


## PdM first policy evaluation on the validation set.

For each validation set, I need to give the on-line sensor data as input to the trained LSTM.

This gives me the probability $P(RUL_k \le w)$ at the discrete decision times $t_k = k \cdot DT$, which are the only times at which a maintenance decision can be taken.

One issue that I see: The LSTM needs the measurements sequence from the last 50 cycles as input. This means that the PdM decision policy evaluation can only start from cycle 50 onwards. What if my component fails within the first 50 cycles?

In [20]:
# Load the saved classifier from disk. Fail right here with a clear message
# when the file is missing: silently skipping the load would leave `estimator`
# undefined and only surface later as a NameError in the evaluation loop.
if not os.path.isfile(model_path):
    raise FileNotFoundError(
        "Trained model file '{}' not found; train the network and save it "
        "before running the policy evaluation.".format(model_path))
estimator = load_model(model_path)

In [21]:
validation_df[validation_df['id']==82]

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s15,s16,s17,s18,s19,s20,s21,RUL,label1,cycle_norm
16378,82,1,0.500000,0.333333,0.0,0.0,0.156627,0.252889,0.326300,0.0,...,0.223548,0.0,0.250000,0.0,0.0,0.542636,0.779067,213,0,0.000000
16379,82,2,0.356322,0.583333,0.0,0.0,0.259036,0.306518,0.230250,0.0,...,0.289727,0.0,0.333333,0.0,0.0,0.744186,0.869373,212,0,0.002770
16380,82,3,0.281609,0.750000,0.0,0.0,0.195783,0.384783,0.198683,0.0,...,0.241631,0.0,0.250000,0.0,0.0,0.666667,0.806131,211,0,0.005540
16381,82,4,0.632184,0.500000,0.0,0.0,0.234940,0.283192,0.307225,0.0,...,0.208157,0.0,0.333333,0.0,0.0,0.503876,0.791356,210,0,0.008310
16382,82,5,0.770115,0.833333,0.0,0.0,0.334337,0.190756,0.288150,0.0,...,0.284340,0.0,0.166667,0.0,0.0,0.620155,0.649544,209,0,0.011080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16587,82,210,0.511494,0.083333,0.0,0.0,0.572289,0.824722,0.878292,0.0,...,0.771835,0.0,0.666667,0.0,0.0,0.248062,0.234328,4,1,0.578947
16588,82,211,0.477011,0.166667,0.0,0.0,0.722892,0.740353,0.754389,0.0,...,0.762216,0.0,0.666667,0.0,0.0,0.162791,0.186689,3,1,0.581717
16589,82,212,0.522989,0.833333,0.0,0.0,0.737952,0.663397,0.751013,0.0,...,0.803771,0.0,0.666667,0.0,0.0,0.279070,0.474593,2,1,0.584488
16590,82,213,0.293103,0.583333,0.0,0.0,0.683735,0.565511,0.732782,0.0,...,0.843401,0.0,0.833333,0.0,0.0,0.271318,0.202983,1,1,0.587258


In [22]:
# validation_df[validation_df['id']==82]['cycle'].iloc[-1]

In [23]:
# validation_df[validation_df['id']==81].shape[0]

In [49]:
# Assumptions for the costs, taken from the 2019 RESS paper
C_p = 100  # cost booked when a component is replaced preventively
C_c = 120  # cost booked when a component runs to failure
DT  = 10  # Decisions can be taken every DT=10 cycles

In [50]:
# Offsets at which a decision may be taken: every DT cycles, from 0 up to
# (but excluding) 400. The step uses the DT constant instead of repeating
# the literal 10, so the grid cannot drift away from DT.
array_decisions = np.arange(0, 400, DT)

## First PdM policy evaluation on the whole validation data set (ids 81 to 100)


In [51]:
# One slot per validation engine. The size is taken from the data rather than
# a hard-coded 20, so a different hold-out selection cannot leave unused zero
# entries or overflow the arrays.
n_validation_engines = validation_df['id'].nunique()
costs_array = np.zeros(n_validation_engines)  # replacement cost booked per engine
t_LC_array  = np.zeros(n_validation_engines)  # life-cycle length per engine

In [52]:
# Evaluate the heuristic replacement policy on every validation engine.
# For each engine, walk through the admissible decision offsets; at each one,
# feed the window of `sequence_length` rows starting at that offset to the
# classifier and replace preventively as soon as the predicted probability
# reaches the cost ratio C_p / C_c. If that never happens, the engine is
# recorded as running to failure at its last recorded cycle.
decision_offsets = set(array_decisions.tolist())  # O(1) membership test
replacement_threshold = C_p / C_c

# `engine_id` instead of `id`: a top-level loop variable called `id` would
# overwrite the builtin for the rest of the notebook.
for counter, engine_id in enumerate(validation_df['id'].unique()):
    # Select this engine's rows once, instead of re-filtering validation_df on every cycle
    engine_df = validation_df[validation_df['id'] == engine_id]
    engine_features = engine_df[sequence_cols].values.astype(np.float32)
    n_windows = engine_features.shape[0] - sequence_length + 1
    preventive_replacement = False

    for cycle in range(n_windows):
        if cycle not in decision_offsets:
            continue

        # a single window, shaped (1, sequence_length, nb_features) for the network
        seq_array_validation_k = engine_features[cycle:sequence_length + cycle]
        seq_array_validation_k = seq_array_validation_k.reshape(1, sequence_length, nb_features)
        # plain float rather than a 1-element array, so the comparison below is unambiguous
        prob_RUL_smaller_DT = float(estimator.predict(seq_array_validation_k).reshape(1)[0])

        # evaluate decision heuristics
        if replacement_threshold <= prob_RUL_smaller_DT:
            t_LC_array[counter] = sequence_length + cycle
            costs_array[counter] = C_p
            print('ID:', engine_id, ' preventive replacement informed at cycle:', t_LC_array[counter])
            print('component lifecycle:', t_LC_array[counter])
            preventive_replacement = True
            break

    if not preventive_replacement:
        # no replacement was triggered: the life cycle ends at the last recorded cycle
        t_LC_array[counter] = engine_df['cycle'].iloc[-1]
        print('ID:', engine_id, ' component failure at t:', t_LC_array[counter])
        costs_array[counter] = C_c

ID: 81  preventive replacement informed at cycle: 240.0
component lifecycle: 240.0
ID: 82  preventive replacement informed at cycle: 210.0
component lifecycle: 210.0
ID: 83  preventive replacement informed at cycle: 290.0
component lifecycle: 290.0
ID: 84  component failure at t: 267.0
ID: 85  preventive replacement informed at cycle: 180.0
component lifecycle: 180.0
ID: 86  preventive replacement informed at cycle: 270.0
component lifecycle: 270.0
ID: 87  preventive replacement informed at cycle: 170.0
component lifecycle: 170.0
ID: 88  preventive replacement informed at cycle: 200.0
component lifecycle: 200.0
ID: 89  preventive replacement informed at cycle: 210.0
component lifecycle: 210.0
ID: 90  preventive replacement informed at cycle: 150.0
component lifecycle: 150.0
ID: 91  preventive replacement informed at cycle: 130.0
component lifecycle: 130.0
ID: 92  preventive replacement informed at cycle: 340.0
component lifecycle: 340.0
ID: 93  preventive replacement informed at cycle:

In [53]:
costs_array

array([100., 100., 100., 120., 100., 100., 100., 100., 100., 100., 100.,
       100., 100., 100., 100., 100., 100., 100., 100., 100.])

In [54]:
t_LC_array

array([240., 210., 290., 267., 180., 270., 170., 200., 210., 150., 130.,
       340., 150., 250., 280., 330., 200., 150., 180., 200.])

In [55]:
# t_LC_perfect_array is first assigned in the later "Perfect prognostics" cell,
# so on a fresh Restart & Run All this cell raised NameError. Compute it here
# the same way: each validation engine's last recorded cycle, rounded down to
# a multiple of DT.
t_LC_perfect_array = np.array(
    [(validation_df.loc[validation_df['id'] == engine_id, 'cycle'].iloc[-1] // DT) * DT
     for engine_id in validation_df['id'].unique()],
    dtype=float)
t_LC_perfect_array

array([240., 210., 290., 260., 180., 270., 170., 210., 210., 150., 130.,
       340., 150., 250., 280., 330., 200., 150., 180., 200.])

In [56]:
# Evaluation of the expected cost per unit time, Eqns. (3) and (4) of our paper.
expected_cost_LSTM = np.mean(costs_array)/np.mean(t_LC_array)
expected_cost_LSTM

0.4594041391858085

In [57]:
ratio = costs_array/t_LC_array
np.mean(ratio) # indeed taking the mean of the ratio gives a different result!

0.4928275879781793

In [58]:
# Perfect prognostics
import math  # NOTE(review): belongs in the notebook's import cell; kept here so the name stays defined

# Last recorded cycle of every validation engine, in order of first appearance
last_cycles = [validation_df.loc[validation_df['id'] == engine_id, 'cycle'].iloc[-1]
               for engine_id in validation_df['id'].unique()]

# Life cycle under perfect prognostics: the last cycle rounded down to a multiple of DT.
# The array is sized from the data (not a hard-coded 20) and built without a manual
# counter or a loop variable named `id`, which would shadow the builtin.
t_LC_perfect_array = np.array([math.floor(last_cycle / DT) * DT for last_cycle in last_cycles],
                              dtype=float)

# a perfect policy will only lead to preventive replacements
costs_perfect_array = np.ones(len(last_cycles)) * C_p

expected_cost_perfect = np.mean(costs_perfect_array)/np.mean(t_LC_perfect_array)
expected_cost_perfect

0.45454545454545453

In [59]:
# evaluation of the metric defined in the paper
M = (expected_cost_LSTM - expected_cost_perfect) / expected_cost_perfect
M # it obtains a very small value

0.010689106208778788

In [60]:
M*100

1.0689106208778787