# Predictive Maintenance

## Dataset 2 - Binary Classification using XGBoost

- Label 0 = remaining useful life (RUL) of the machine is greater than 30 cycles
- Label 1 = RUL of the machine is less than or equal to 30 cycles

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

import keras
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation

# Seed NumPy's global random generator for reproducibility.
np.random.seed(1234)  
# NOTE(review): this line only binds a Python variable named PYTHONHASHSEED.
# It does not set the PYTHONHASHSEED environment variable, which the
# interpreter reads at start-up - confirm whether hash-seed control is needed.
PYTHONHASHSEED = 0
%matplotlib inline

Using TensorFlow backend.


## Read Train and Test Data

In [2]:
# Load the training set: a space-separated text file without a header row.
train_df = pd.read_csv('data/train_02.txt', sep=" ", header=None)
# Discard the columns at positions 26 and 27 so that 26 columns remain
# (presumably empty columns caused by trailing separators - verify against the file).
train_df = train_df.drop(columns=train_df.columns[[26, 27]])
train_df.columns = [
    'id', 'cycle',
    'setting1', 'setting2', 'setting3',
    's1', 's2', 's3', 's4', 's5', 's6', 's7',
    's8', 's9', 's10', 's11', 's12', 's13', 's14',
    's15', 's16', 's17', 's18', 's19', 's20', 's21',
]

In [3]:
# Load the test set: a space-separated text file without a header row.
test_df = pd.read_csv('data/test_02.txt', sep=" ", header=None)
# Discard the columns at positions 26 and 27 so that 26 columns remain
# (presumably empty columns caused by trailing separators - verify against the file).
test_df = test_df.drop(columns=test_df.columns[[26, 27]])
test_df.columns = [
    'id', 'cycle',
    'setting1', 'setting2', 'setting3',
    's1', 's2', 's3', 's4', 's5', 's6', 's7',
    's8', 's9', 's10', 's11', 's12', 's13', 's14',
    's15', 's16', 's17', 's18', 's19', 's20', 's21',
]

In [4]:
# Load the ground-truth file (space-separated, no header row).
truth_df = pd.read_csv('data/truth_02.txt', sep=" ", header=None)
# Remove the column at position 1; every other column is kept.
truth_df = truth_df.drop(columns=truth_df.columns[[1]])

In [5]:
# Order the rows by id first and by cycle second, then preview the frame.
train_df = train_df.sort_values(by=['id', 'cycle'])
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,...,183.06,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,...,130.42,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,...,164.22,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,...,130.72,2387.61,8068.66,9.3528,0.02,329,2212,100.0,10.59,6.4701
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,...,164.31,2028.0,7861.23,10.8963,0.02,309,1915,84.93,14.13,8.5286


## Generate Labels for Train Data

In [6]:
# Data labelling: RUL = (largest cycle recorded for the id) - (current cycle).
rul = train_df.groupby('id')['cycle'].max().rename('max').reset_index()
train_df = (
    train_df
    .merge(rul, on=['id'], how='left')
    .assign(RUL=lambda frame: frame['max'] - frame['cycle'])
    .drop(columns='max')  # 'max' was only a helper for computing RUL
)
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,...,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071,148
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,...,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665,147
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,...,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723,146
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,...,2387.61,8068.66,9.3528,0.02,329,2212,100.0,10.59,6.4701,145
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,...,2028.0,7861.23,10.8963,0.02,309,1915,84.93,14.13,8.5286,144


In [7]:
# Label thresholds, expressed in cycles of RUL.
w1 = 30
w0 = 24
# label1 is 1 where RUL <= w1 and 0 elsewhere.
train_df['label1'] = (train_df['RUL'] <= w1).astype(int)
# label2 starts as a copy of label1; rows with RUL <= w0 are overwritten with 2.
train_df['label2'] = train_df['label1'].mask(train_df['RUL'] <= w0, 2)
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s15,s16,s17,s18,s19,s20,s21,RUL,label1,label2
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,...,9.3461,0.02,334,2223,100.0,14.73,8.8071,148,0,0
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,...,9.3774,0.02,330,2212,100.0,10.41,6.2665,147,0,0
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,...,10.8941,0.02,309,1915,84.93,14.08,8.6723,146,0,0
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,...,9.3528,0.02,329,2212,100.0,10.59,6.4701,145,0,0
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,...,10.8963,0.02,309,1915,84.93,14.13,8.5286,144,0,0


## Generate Labels for Test Data

In [8]:
# Largest cycle recorded for each id in the test data.
rul = test_df.groupby('id')['cycle'].max().rename('max').reset_index()
# The single remaining truth column is named 'more'.
truth_df.columns = ['more']
# Ids are derived from the row position (index + 1).
truth_df['id'] = truth_df.index + 1
# Index-aligned sum of the last recorded cycle and 'more'.
truth_df['max'] = rul['max'] + truth_df['more']
truth_df = truth_df.drop(columns='more')

In [9]:
# Attach the per-id 'max' from truth_df, derive RUL = max - cycle, then drop the helper.
test_df = (
    test_df
    .merge(truth_df, on=['id'], how='left')
    .assign(RUL=lambda frame: frame['max'] - frame['cycle'])
    .drop(columns='max')
)
test_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL
0,1,1,9.9987,0.2502,100.0,489.05,605.03,1497.17,1304.99,10.52,...,2388.18,8114.1,8.6476,0.03,369,2319,100.0,28.42,17.1551,275
1,1,2,20.0026,0.7,100.0,491.19,607.82,1481.2,1246.11,9.35,...,2388.12,8053.06,9.2405,0.02,364,2324,100.0,24.29,14.8039,274
2,1,3,35.0045,0.84,100.0,449.44,556.0,1359.08,1128.36,5.48,...,2387.75,8053.04,9.3472,0.02,333,2223,100.0,14.98,8.9125,273
3,1,4,42.0066,0.841,100.0,445.0,550.17,1349.69,1127.89,3.91,...,2387.72,8066.9,9.3961,0.02,332,2212,100.0,10.35,6.4181,272
4,1,5,24.9985,0.6213,60.0,462.54,536.72,1253.18,1050.69,7.05,...,2028.05,7865.66,10.8682,0.02,305,1915,84.93,14.31,8.574,271


In [10]:
# Label the test rows with the same w1 / w0 thresholds used for the training data.
# label1 is 1 where RUL <= w1 and 0 elsewhere.
test_df['label1'] = (test_df['RUL'] <= w1).astype(int)
# label2 starts as a copy of label1; rows with RUL <= w0 are overwritten with 2.
test_df['label2'] = test_df['label1'].mask(test_df['RUL'] <= w0, 2)
test_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s15,s16,s17,s18,s19,s20,s21,RUL,label1,label2
0,1,1,9.9987,0.2502,100.0,489.05,605.03,1497.17,1304.99,10.52,...,8.6476,0.03,369,2319,100.0,28.42,17.1551,275,0,0
1,1,2,20.0026,0.7,100.0,491.19,607.82,1481.2,1246.11,9.35,...,9.2405,0.02,364,2324,100.0,24.29,14.8039,274,0,0
2,1,3,35.0045,0.84,100.0,449.44,556.0,1359.08,1128.36,5.48,...,9.3472,0.02,333,2223,100.0,14.98,8.9125,273,0,0
3,1,4,42.0066,0.841,100.0,445.0,550.17,1349.69,1127.89,3.91,...,9.3961,0.02,332,2212,100.0,10.35,6.4181,272,0,0
4,1,5,24.9985,0.6213,60.0,462.54,536.72,1253.18,1050.69,7.05,...,10.8682,0.02,305,1915,84.93,14.31,8.574,271,0,0


## Choose Columns to be Used for Training

In [11]:
# Feature set: the three setting columns followed by the sensor columns s1..s21.
sensor_cols = ['s' + str(sensor_no) for sensor_no in range(1, 22)]
cols = ['setting1', 'setting2', 'setting3', *sensor_cols]

## Train Test Split

In [12]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Hold out the final 5% of rows (no shuffling) as a validation set.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_df[cols],
    train_df['label1'],
    test_size=0.05,
    shuffle=False,
    random_state=42,
)

# Report the split sizes and the number of positive labels on each side.
print("Train_shape: ", X_train.shape, sep="")
print("Val_shape: ", X_val.shape, sep="")
print("No of positives in train: ", Y_train.sum(), sep="")
print("No of positives in val: ", Y_val.sum(), sep="")

Train_shape: (51071, 24)
Val_shape: (2688, 24)
No of positives in train: 7672
No of positives in val: 388


## Training XGBoost

In [13]:
import xgboost as xgb

# Booster configuration passed to xgb.train.
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.006,
    'eval_metric': 'auc',
    'max_depth': 4,
    'colsample_bytree': 0.6,
    'subsample': 0.5,
    'silent': 1,
}

# Wrap the train / validation splits in DMatrix objects.
d_train = xgb.DMatrix(X_train, label=Y_train)
d_valid = xgb.DMatrix(X_val, label=Y_val)
# Both sets are evaluated during training; 'valid' is listed last.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Up to 2000 rounds, with early stopping after 50 rounds and a log line every 25.
gbm = xgb.train(
    params,
    d_train,
    num_boost_round=2000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=25,
)

[0]	train-auc:0.882429	valid-auc:0.908819
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[25]	train-auc:0.968309	valid-auc:0.965909
[50]	train-auc:0.97087	valid-auc:0.967645
[75]	train-auc:0.973904	valid-auc:0.971166
[100]	train-auc:0.974164	valid-auc:0.971863
[125]	train-auc:0.974593	valid-auc:0.972908
[150]	train-auc:0.976067	valid-auc:0.974827
[175]	train-auc:0.976817	valid-auc:0.975536
[200]	train-auc:0.977929	valid-auc:0.975601
[225]	train-auc:0.978566	valid-auc:0.976432
[250]	train-auc:0.979515	valid-auc:0.977604
[275]	train-auc:0.980113	valid-auc:0.978068
[300]	train-auc:0.980866	valid-auc:0.978932
[325]	train-auc:0.981496	valid-auc:0.979534
[350]	train-auc:0.982144	valid-auc:0.98024
[375]	train-auc:0.982661	valid-auc:0.981481
[400]	train-auc:0.983081	valid-auc:0.981922
[425]	train-auc:0.983488	valid-auc:0.982646
[450]	train-auc:0.983857	valid-auc:0.983046
[475]	train-auc:0.9842	valid

In [14]:
from operator import itemgetter

def create_feature_map(features):
    """Write a feature-map file named 'xgb.fmap' in the current directory.

    One line is written per feature, in order, as three tab-separated
    fields: the zero-based position, the feature name, and the literal 'q'
    (the type marker XGBoost's feature-map format uses for quantitative
    features).

    Parameters
    ----------
    features : iterable
        Feature names, in the order they should be numbered.
    """
    # The context manager closes the file even if a write raises.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

def get_importance(gbm, features):
    """Return (feature, score) pairs from ``gbm.get_fscore``, highest score first.

    The feature-map file 'xgb.fmap' is (re)written from ``features`` before
    the scores are requested from the booster.
    """
    create_feature_map(features)
    scores = gbm.get_fscore(fmap='xgb.fmap')
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

# Show the ranked feature scores for the columns the model was trained on.
print(get_importance(gbm, X_train.columns.tolist()))

[('s15', 3193), ('s11', 3073), ('s4', 2416), ('s14', 2184), ('s13', 1924), ('s2', 1634), ('s3', 1508), ('s12', 1483), ('s9', 1431), ('s8', 1425), ('s7', 1331), ('s21', 1308), ('s20', 1088), ('s17', 950), ('setting1', 528), ('setting2', 260), ('s6', 223), ('s16', 68), ('s10', 16), ('s1', 5), ('s18', 1)]


## Results on Train Set

In [15]:
from sklearn.metrics import accuracy_score
# Training metrics: score every row of train_df (this includes the rows split off as X_val).
d_trn = xgb.DMatrix(train_df[cols])
# Predicted probabilities above 0.5 become class 1, the rest class 0.
pred_train = (gbm.predict(d_trn) > 0.5).astype(int)
print('Accurracy: {}'.format(accuracy_score(train_df['label1'], pred_train)))

Accurracy: 0.9608065626220725


In [16]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score

# sklearn's confusion_matrix(y_true, y_pred) puts true labels on the rows and
# predicted labels on the columns, so the caption describes it in those terms.
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm = confusion_matrix(train_df['label1'], pred_train)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[44805,   894],
       [ 1213,  6847]])

## Results on Test Set

In [17]:
# Score every row of the test frame and threshold the probabilities at 0.5.
d_test = xgb.DMatrix(test_df[cols])
pred_test = (gbm.predict(d_test) > 0.5).astype(int)
print('Accurracy: {}'.format(accuracy_score(test_df['label1'], pred_test)))

Accurracy: 0.9819363949280692


In [18]:
# sklearn's confusion_matrix(y_true, y_pred) puts true labels on the rows and
# predicted labels on the columns, so the caption describes it in those terms.
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm = confusion_matrix(test_df['label1'], pred_test)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[32582,   322],
       [  292,   795]])

In [19]:
# Precision and recall over all test rows; F1 is their harmonic mean.
precision_test = precision_score(y_true=test_df['label1'], y_pred=pred_test)
recall_test = recall_score(y_true=test_df['label1'], y_pred=pred_test)
f1_test = 2 * (precision_test * recall_test) / (precision_test + recall_test)
print(
    'Precision: ', precision_test, '\n',
    'Recall: ', recall_test, '\n',
    'F1-score:', f1_test,
)

Precision:  0.711727842435094 
 Recall:  0.7313707451701932 
 F1-score: 0.721415607985481


In [20]:
# label1 of the final row of each id, as a float32 column vector.
label_array_test_last = (
    test_df.groupby('id')['label1'].nth(-1).values
    .reshape(-1, 1)
    .astype(np.float32)
)
label_array_test_last.shape

(259, 1)

In [21]:
# Feature values of the final row of each id, in order of first appearance of the id.
seq_array_test_last = np.asarray(
    [
        test_df.loc[test_df['id'] == unit_id, cols].values[-1]
        for unit_id in test_df['id'].unique()
    ]
).astype(np.float32)
seq_array_test_last.shape

(259, 24)

In [22]:
# Wrap the last-row feature array for XGBoost and copy the feature names from d_trn,
# since the DMatrix is built from a bare NumPy array.
d_test_last = xgb.DMatrix(seq_array_test_last)
d_test_last.feature_names = d_trn.feature_names
# Predicted probabilities above 0.5 become class 1, the rest class 0.
pred_test_last = (gbm.predict(d_test_last) > 0.5).astype(int)
acc = accuracy_score(label_array_test_last, pred_test_last)
print('Accurracy: {}'.format(acc))

Accurracy: 0.9806949806949807


In [23]:
# Confusion matrix for the last-row predictions.
# sklearn's confusion_matrix(y_true, y_pred) puts true labels on the rows and
# predicted labels on the columns, so the caption describes it in those terms.
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm = confusion_matrix(label_array_test_last, pred_test_last)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[196,   2],
       [  3,  58]])

In [24]:
# Precision and recall on the last-row predictions; F1 is their harmonic mean.
precision_test = precision_score(y_true=label_array_test_last, y_pred=pred_test_last)
recall_test = recall_score(y_true=label_array_test_last, y_pred=pred_test_last)
f1_test = 2 * (precision_test * recall_test) / (precision_test + recall_test)
print(
    'Precision: ', precision_test, '\n',
    'Recall: ', recall_test, '\n',
    'F1-score:', f1_test,
)

Precision:  0.9666666666666667 
 Recall:  0.9508196721311475 
 F1-score: 0.9586776859504132


## Summary of Results on Test Set

In [25]:
# One-row summary table built from the most recently computed metrics.
results_df = pd.DataFrame(
    {
        'Accuracy': acc,
        'Precision': precision_test,
        'Recall': recall_test,
        'F1-score': f1_test,
    },
    index=['XGBoost'],
)
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1-score
XGBoost,0.980695,0.966667,0.95082,0.958678
