In [1]:
''' Data Preparation, Label, Normalization'''

import pandas as pd
import numpy as np
import pickle
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score


def printScores(y_pred, y_true):
    """Print the confusion matrix, precision and recall of a set of predictions.

    Args:
        y_pred: predicted labels.
        y_true: ground-truth labels.

    Note that the parameter order here (predictions first) is the reverse of
    the ``(y_true, y_pred)`` order the scikit-learn metric functions take.
    Nothing is returned; the report is written to stdout.
    """
    print()  # blank line separating this report from earlier output
    print(confusion_matrix(y_true, y_pred))
    print( 'precision = ', precision_score(y_true, y_pred),
           ', recall = ', recall_score(y_true, y_pred))


dataFiles = ['upload/train_FD001.txt', 'upload/test_FD001.txt', 'upload/RUL_FD001.txt']
# engine id, cycle counter, three operational settings, then sensors s1..s21
dataColumns = (['id', 'cycle', 'setting1', 'setting2', 'setting3']
               + ['s%d' % n for n in range(1, 22)])

train_path, test_path, rul_path = dataFiles

# read data
# Columns 26 and 27 are dropped before the 26 names are applied
# (presumably empty columns caused by trailing separators in the raw files -- TODO confirm).
train_df = pd.read_csv(train_path, sep=" ", header=None)
train_df = train_df.drop(columns=train_df.columns[[26, 27]])
train_df.columns = dataColumns

test_df = pd.read_csv(test_path, sep=" ", header=None)
test_df = test_df.drop(columns=test_df.columns[[26, 27]])
test_df.columns = dataColumns

# ground-truth file: one value per row; the engine id is taken to be the row number (1-based)
rul_df = pd.read_csv(rul_path, sep=" ", header=None)
rul_df = rul_df.drop(columns=rul_df.columns[[1]])
rul_df.columns = ['more']
rul_df['id'] = rul_df.index + 1

# train set: RUL = highest cycle recorded for the engine minus the current cycle
train_df = train_df.sort_values(['id', 'cycle'])
rul = train_df.groupby('id')['cycle'].max().rename('max').reset_index()
train_df = (
    train_df
    .merge(rul, on=['id'], how='left')
    .assign(RUL=lambda frame: frame['max'] - frame['cycle'])
    .drop(columns='max')   # helper column only needed for the subtraction
)

# test set: the ground-truth file gives the cycles remaining after the last
# observed cycle, so the final cycle is (last observed cycle + 'more')
test_df = test_df.sort_values(['id', 'cycle'])
rul = test_df.groupby('id')['cycle'].max().rename('max').reset_index()
# pop() removes 'more' from rul_df and returns it; the sum aligns on the row index
rul_df['max'] = rul['max'] + rul_df.pop('more')
test_df = (
    test_df
    .merge(rul_df, on=['id'], how='left')
    .assign(RUL=lambda frame: frame['max'] - frame['cycle'])
    .drop(columns='max')
)

# label data: label1 is 1 when RUL is at most w1 cycles, otherwise 0
w1 = 30
for frame in (train_df, test_df):
    frame['label1'] = np.where(frame['RUL'] <= w1, 1, 0)

# normalize train data
train_df['cycle_norm'] = train_df['cycle']   # copy of 'cycle' that gets scaled; 'cycle' itself stays raw
cols_normalize = train_df.columns.difference(['id', 'cycle', 'RUL', 'label1'])   # feature columns
min_max_scaler = preprocessing.MinMaxScaler()
scaled_train = min_max_scaler.fit_transform(train_df[cols_normalize])
norm_train_df = pd.DataFrame(scaled_train, columns=cols_normalize, index=train_df.index)

# persist the fitted scaler
with open('min_max_scaler.pickle', 'wb') as scaler_file:
    pickle.dump(min_max_scaler, scaler_file)

# put the untouched columns and the scaled columns side by side,
# then restore the original column order
untouched_cols = train_df.columns.difference(cols_normalize)
join_df = pd.concat([train_df[untouched_cols], norm_train_df], axis=1)
train_df = join_df.loc[:, train_df.columns]

# normalize test data with the scaler fitted above (transform only, no re-fit)
test_df['cycle_norm'] = test_df['cycle']
scaled_test = min_max_scaler.transform(test_df[cols_normalize])
norm_test_df = pd.DataFrame(scaled_test, columns=cols_normalize, index=test_df.index)

untouched_test_cols = test_df.columns.difference(cols_normalize)
test_join_df = pd.concat([test_df[untouched_test_cols], norm_test_df], axis=1)
# restore the original column order and renumber the rows from 0
test_df = test_join_df.loc[:, test_df.columns].reset_index(drop=True)

# describe data and use only some columns
def describe():
    """Print a short summary of the module-level ``train_df`` and ``test_df``.

    Prints the shape of both frames, the value counts of ``label1`` in the
    train set, and the train-set columns whose standard deviation is exactly 0.
    Returns nothing.
    """
    for title, frame in (('train set', train_df), ('test set', test_df)):
        print(title, frame.shape)
    print('check distribution \n', train_df['label1'].value_counts())
    summary = train_df.describe().T   # one row per column, one column per statistic
    unchanging_cols = summary.index[summary['std'] == 0].tolist()
    print('unchanging cols', unchanging_cols)
    # on this data set: ['setting3', 's1', 's5', 's10', 's16', 's18', 's19']

print('Describe data:')
describe()

# candidate features: scaled cycle counter, the three settings and sensors s1..s21
feature_cols = ['cycle_norm', 'setting1', 'setting2', 'setting3'] + ['s%d' % n for n in range(1, 22)]
# columns reported by describe() as having zero standard deviation in the train set
constant_cols = ['setting3', 's1', 's5', 's10', 's16', 's18', 's19']
feature_cols = [name for name in feature_cols if name not in constant_cols]

# keep only the bookkeeping columns plus the selected features
cols = ['id', 'cycle', 'RUL', 'label1'] + feature_cols
train_df = train_df.loc[:, cols]
test_df = test_df.loc[:, cols]

Describe data:
train set (20631, 29)
test set (13096, 29)
check distribution 
 0    17531
1     3100
Name: label1, dtype: int64
unchanging cols ['setting3', 's1', 's5', 's10', 's16', 's18', 's19']


In [2]:
# Preview the first five rows of the prepared training frame.
train_df.head(5)

Unnamed: 0,id,cycle,RUL,label1,cycle_norm,setting1,setting2,s2,s3,s4,...,s8,s9,s11,s12,s13,s14,s15,s17,s20,s21
0,1,1,191,0,0.0,0.45977,0.166667,0.183735,0.406802,0.309757,...,0.242424,0.109755,0.369048,0.633262,0.205882,0.199608,0.363986,0.333333,0.713178,0.724662
1,1,2,190,0,0.00277,0.609195,0.25,0.283133,0.453019,0.352633,...,0.212121,0.100242,0.380952,0.765458,0.279412,0.162813,0.411312,0.333333,0.666667,0.731014
2,1,3,189,0,0.00554,0.252874,0.75,0.343373,0.369523,0.370527,...,0.272727,0.140043,0.25,0.795309,0.220588,0.171793,0.357445,0.166667,0.627907,0.621375
3,1,4,188,0,0.00831,0.54023,0.5,0.343373,0.256159,0.331195,...,0.318182,0.124518,0.166667,0.889126,0.294118,0.174889,0.166603,0.333333,0.573643,0.662386
4,1,5,187,0,0.01108,0.390805,0.333333,0.349398,0.257467,0.404625,...,0.242424,0.14996,0.255952,0.746269,0.235294,0.174734,0.402078,0.416667,0.589147,0.704502


In [3]:
# Preview the first five rows of the prepared test frame.
test_df.head(5)

Unnamed: 0,id,cycle,RUL,label1,cycle_norm,setting1,setting2,s2,s3,s4,...,s8,s9,s11,s12,s13,s14,s15,s17,s20,s21
0,1,1,142,0,0.0,0.632184,0.75,0.545181,0.310661,0.269413,...,0.212121,0.127614,0.208333,0.646055,0.220588,0.13216,0.308965,0.333333,0.55814,0.661834
1,1,2,141,0,0.00277,0.344828,0.25,0.150602,0.379551,0.222316,...,0.166667,0.146684,0.386905,0.739872,0.264706,0.204768,0.213159,0.416667,0.682171,0.686827
2,1,3,140,0,0.00554,0.517241,0.583333,0.376506,0.346632,0.322248,...,0.227273,0.158081,0.386905,0.69936,0.220588,0.15564,0.458638,0.416667,0.728682,0.721348
3,1,4,139,0,0.00831,0.741379,0.5,0.370482,0.285154,0.408001,...,0.19697,0.105717,0.255952,0.573561,0.25,0.17009,0.257022,0.25,0.666667,0.66211
4,1,5,138,0,0.01108,0.58046,0.5,0.391566,0.352082,0.332039,...,0.166667,0.102396,0.27381,0.73774,0.220588,0.152751,0.300885,0.166667,0.658915,0.716377


In [6]:
import pandas as pd
import numpy as np
import keras
import pickle
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

'''
LSTM
'''
# function to generate LSTM input windows; stacked result is [?, 50, 18] (50 = sequence_length, 18 = len(feature_cols))
def gen_sequence(id_df, seq_length, seq_cols):
    """Yield sliding windows of ``seq_length`` consecutive rows from ``id_df``.

    Args:
        id_df: DataFrame to slice (the callers pass the rows of one engine id).
        seq_length: number of rows in each window.
        seq_cols: list of column names to include in each window.

    Yields:
        2-D arrays of shape ``(seq_length, len(seq_cols))``, starting at rows
        0, 1, ..., ``n_rows - seq_length - 1``. The window made of the final
        ``seq_length`` rows is not produced, so ``n_rows - seq_length`` windows
        are yielded in total, and none at all when ``n_rows <= seq_length``.
    """
    values = id_df[seq_cols].values
    n_rows = values.shape[0]
    for first in range(n_rows - seq_length):
        yield values[first:first + seq_length, :]

# function to generate labels [?, 1]
def gen_labels(id_df, seq_length, label):
    """Return the label rows of ``id_df`` from position ``seq_length`` onward.

    Args:
        id_df: DataFrame to read from (the callers pass the rows of one engine id).
        seq_length: number of leading rows to skip.
        label: list of label column names; it must be a list so the selected
            values are 2-D.

    Returns:
        Array of shape ``(n_rows - seq_length, len(label))``. Row ``k`` of the
        result is the label of input row ``k + seq_length``.
    """
    return id_df[label].values[seq_length:, :]

sequence_length = 50

train_ids = train_df['id'].unique()

# generate LSTM matrix: one list of windows per engine id, concatenated along axis 0
seq_gen = [
    list(gen_sequence(train_df[train_df['id'] == engine_id], sequence_length, feature_cols))
    for engine_id in train_ids
]
seq_array = np.concatenate(seq_gen).astype(np.float32)

# generate labels: one 2-D label array per engine id, concatenated in the same order
label_gen = [
    gen_labels(train_df[train_df['id'] == engine_id], sequence_length, ['label1'])
    for engine_id in train_ids
]
label_array = np.concatenate(label_gen).astype(np.float32)


# build LSTM network
nb_features = seq_array.shape[2]   # size of the last axis of the input windows
nb_out = label_array.shape[1]      # number of label columns

# two stacked LSTM layers, each followed by dropout, then a sigmoid output layer
model = Sequential([
    LSTM(units=100, return_sequences=True, input_shape=(sequence_length, nb_features)),
    Dropout(0.2),
    LSTM(units=50, return_sequences=False),
    Dropout(0.2),
    Dense(units=nb_out, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model; 10% of the samples are held out for validation and
# early stopping watches the validation loss
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=0,
                                               verbose=0, mode='auto')
model.fit(
    seq_array, label_array,
    epochs=10, batch_size=200, validation_split=0.1, verbose=1,
    callbacks=[early_stopping],
)

# check performance on train data set
# Sequential.predict_classes() has been removed from recent Keras/TensorFlow
# releases. For a model with a single sigmoid output it returned
# (predict(x) > 0.5).astype('int32'), so that is computed directly here.
train_proba = model.predict(seq_array, verbose=1, batch_size=200)
y_pred = (train_proba > 0.5).astype('int32')
y_true = label_array
printScores(y_pred, y_true)

# slice the test frame once per engine id and reuse the slices below
test_ids = test_df['id'].unique()
rows_by_engine = [test_df[test_df['id'] == engine_id] for engine_id in test_ids]

# engines with fewer than sequence_length rows are left out of the evaluation
y_mask = [len(rows) >= sequence_length for rows in rows_by_engine]

# generate input LSTM matrix for test: the last sequence_length rows of each kept engine
last_windows = [rows[feature_cols].values[-sequence_length:]
                for rows, keep in zip(rows_by_engine, y_mask) if keep]
seq_array_test = np.asarray(last_windows).astype(np.float32)

# generate labels for test: label1 of the final row of each kept engine, as a column vector
last_labels = test_df.groupby('id')['label1'].nth(-1)[y_mask].values
label_array_test = last_labels.reshape(last_labels.shape[0], 1).astype(np.float32)

# check performance on test data set
# Sequential.predict_classes() has been removed from recent Keras/TensorFlow
# releases. For a model with a single sigmoid output it returned
# (predict(x) > 0.5).astype('int32'), so that is computed directly here.
# verbose=0 keeps the silent behaviour predict_classes had by default.
test_proba = model.predict(seq_array_test, verbose=0)
y_pred_test = (test_proba > 0.5).astype('int32')
y_true_test = label_array_test
printScores(y_pred_test, y_true_test)


Train on 14067 samples, validate on 1564 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

[[12485    46]
 [  476  2624]]
precision =  0.9827715355805243 , recall =  0.8464516129032258

[[68  0]
 [ 3 22]]
precision =  1.0 , recall =  0.88
