# Predictive Maintenance

## Dataset 2 - Binary Classification using LightGBM

- Label 0 = RUL of Machine is greater than 30 cycles
- Label 1 = RUL of Machine is less than or equal to 30 cycles

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

import keras
from pykalman import KalmanFilter
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation

# Seed NumPy's global random number generator so NumPy-driven randomness is repeatable.
np.random.seed(1234)  
# NOTE(review): this only binds a Python variable named PYTHONHASHSEED; it does not set
# the PYTHONHASHSEED environment variable, so it does not influence hash seeding.
PYTHONHASHSEED = 0
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

Using TensorFlow backend.


## Read Train and Test Data

In [2]:
# Load the training file: space-delimited text without a header row.
train_df = pd.read_csv('data/train_02.txt', sep=" ", header=None)
# Discard the columns at positions 26 and 27 so that 26 columns remain to be named.
train_df = train_df.drop(train_df.columns[[26, 27]], axis='columns')
# Column layout: id, cycle, three settings, then sensors s1..s21.
train_df.columns = (
    ['id', 'cycle', 'setting1', 'setting2', 'setting3']
    + ['s' + str(i) for i in range(1, 22)]
)

In [3]:
# Load the test file: space-delimited text without a header row.
test_df = pd.read_csv('data/test_02.txt', sep=" ", header=None)
# Discard the columns at positions 26 and 27 so that 26 columns remain to be named.
test_df = test_df.drop(test_df.columns[[26, 27]], axis='columns')
# Column layout: id, cycle, three settings, then sensors s1..s21.
test_df.columns = (
    ['id', 'cycle', 'setting1', 'setting2', 'setting3']
    + ['s' + str(i) for i in range(1, 22)]
)

In [4]:
# Load the ground-truth file: space-delimited text without a header row.
truth_df = pd.read_csv('data/truth_02.txt', sep=" ", header=None)
# Discard the column at position 1.
truth_df = truth_df.drop(truth_df.columns[[1]], axis='columns')

In [5]:
# Order the training rows by id and, within each id, by cycle; then preview the top rows.
train_df = train_df.sort_values(by=['id', 'cycle'])
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,...,183.06,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,...,130.42,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,...,164.22,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,...,130.72,2387.61,8068.66,9.3528,0.02,329,2212,100.0,10.59,6.4701
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,...,164.31,2028.0,7861.23,10.8963,0.02,309,1915,84.93,14.13,8.5286


## Generate Labels for Train Data

In [6]:
# Data Labeling - generate column RUL
# For every id, find the largest cycle value; RUL is that maximum minus the row's cycle.
rul = train_df.groupby('id')['cycle'].max().reset_index(name='max')
train_df = pd.merge(train_df, rul, how='left', on=['id'])
# pop() removes the helper 'max' column while handing back its values for the subtraction.
train_df['RUL'] = train_df.pop('max') - train_df['cycle']
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,...,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071,148
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,...,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665,147
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,...,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723,146
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,...,2387.61,8068.66,9.3528,0.02,329,2212,100.0,10.59,6.4701,145
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,...,2028.0,7861.23,10.8963,0.02,309,1915,84.93,14.13,8.5286,144


In [7]:
# generate label columns for training data
w1 = 30
w0 = 24
within_w1 = train_df['RUL'] <= w1
within_w0 = train_df['RUL'] <= w0
# label1: 1 when RUL <= w1, otherwise 0.
train_df['label1'] = np.where(within_w1, 1, 0)
# label2: 2 when RUL <= w0, otherwise the same value as label1.
train_df['label2'] = np.select([within_w0, within_w1], [2, 1], default=0)
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s15,s16,s17,s18,s19,s20,s21,RUL,label1,label2
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,...,9.3461,0.02,334,2223,100.0,14.73,8.8071,148,0,0
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,...,9.3774,0.02,330,2212,100.0,10.41,6.2665,147,0,0
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,...,10.8941,0.02,309,1915,84.93,14.08,8.6723,146,0,0
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,...,9.3528,0.02,329,2212,100.0,10.59,6.4701,145,0,0
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,...,10.8963,0.02,309,1915,84.93,14.13,8.5286,144,0,0


In [8]:
# Largest cycle recorded per id in the test data.
rul = test_df.groupby('id')['cycle'].max().reset_index(name='max')
# The remaining truth column is named 'more'; ids are assigned as row position + 1.
# NOTE(review): assumes truth rows are ordered by id starting at 1 - confirm against the data.
truth_df.columns = ['more']
truth_df['id'] = truth_df.index + 1
# 'max' = last recorded test cycle + 'more'; the two Series are added by row index.
# pop() drops the 'more' column once its values have been used.
truth_df['max'] = truth_df.pop('more') + rul['max']

## Generate Labels for Test Data

In [9]:
# generate RUL for test data
# Attach each id's 'max' from truth_df, then RUL is that value minus the row's cycle.
test_df = pd.merge(test_df, truth_df, how='left', on=['id'])
# pop() removes the helper 'max' column while handing back its values for the subtraction.
test_df['RUL'] = test_df.pop('max') - test_df['cycle']
test_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL
0,1,1,9.9987,0.2502,100.0,489.05,605.03,1497.17,1304.99,10.52,...,2388.18,8114.1,8.6476,0.03,369,2319,100.0,28.42,17.1551,275
1,1,2,20.0026,0.7,100.0,491.19,607.82,1481.2,1246.11,9.35,...,2388.12,8053.06,9.2405,0.02,364,2324,100.0,24.29,14.8039,274
2,1,3,35.0045,0.84,100.0,449.44,556.0,1359.08,1128.36,5.48,...,2387.75,8053.04,9.3472,0.02,333,2223,100.0,14.98,8.9125,273
3,1,4,42.0066,0.841,100.0,445.0,550.17,1349.69,1127.89,3.91,...,2387.72,8066.9,9.3961,0.02,332,2212,100.0,10.35,6.4181,272
4,1,5,24.9985,0.6213,60.0,462.54,536.72,1253.18,1050.69,7.05,...,2028.05,7865.66,10.8682,0.02,305,1915,84.93,14.31,8.574,271


In [10]:
# generate label columns w0 and w1 for test data
within_w1 = test_df['RUL'] <= w1
within_w0 = test_df['RUL'] <= w0
# label1: 1 when RUL <= w1, otherwise 0.
test_df['label1'] = np.where(within_w1, 1, 0)
# label2: 2 when RUL <= w0, otherwise the same value as label1.
test_df['label2'] = np.select([within_w0, within_w1], [2, 1], default=0)
test_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s15,s16,s17,s18,s19,s20,s21,RUL,label1,label2
0,1,1,9.9987,0.2502,100.0,489.05,605.03,1497.17,1304.99,10.52,...,8.6476,0.03,369,2319,100.0,28.42,17.1551,275,0,0
1,1,2,20.0026,0.7,100.0,491.19,607.82,1481.2,1246.11,9.35,...,9.2405,0.02,364,2324,100.0,24.29,14.8039,274,0,0
2,1,3,35.0045,0.84,100.0,449.44,556.0,1359.08,1128.36,5.48,...,9.3472,0.02,333,2223,100.0,14.98,8.9125,273,0,0
3,1,4,42.0066,0.841,100.0,445.0,550.17,1349.69,1127.89,3.91,...,9.3961,0.02,332,2212,100.0,10.35,6.4181,272,0,0
4,1,5,24.9985,0.6213,60.0,462.54,536.72,1253.18,1050.69,7.05,...,10.8682,0.02,305,1915,84.93,14.31,8.574,271,0,0


## Choose Columns to be Used for Training

In [11]:
# pick the feature columns: the three settings followed by sensors s1..s21
sensor_cols = ['s{}'.format(i) for i in range(1, 22)]
cols = ['setting1', 'setting2', 'setting3'] + sensor_cols

## Train Test Split

In [12]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Split train_df into training and validation parts without shuffling: 5% of the rows
# go to validation, with features taken from `cols` and the target from 'label1'.
features = train_df[cols]
target = train_df['label1']
X_train, X_val, Y_train, Y_val = train_test_split(
    features, target, test_size=0.05, random_state=42, shuffle=False)

# Report the size of each part and how many positive labels it contains.
print("Train_shape: " + str(X_train.shape))
print("Val_shape: " + str(X_val.shape))
print("No of positives in train: " + str(Y_train.sum()))
print("No of positives in val: " + str(Y_val.sum()))

Train_shape: (51071, 24)
Val_shape: (2688, 24)
No of positives in train: 7672
No of positives in val: 388


## Training the Model

In [13]:
# Wrap the training and validation splits as LightGBM datasets.
lgb_train = lgb.Dataset(X_train, label=Y_train)
lgb_eval = lgb.Dataset(X_val, label=Y_val)

# Booster configuration: binary objective evaluated with AUC.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 12,
    'learning_rate': 0.001,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
}

print('Start training...')

# Train for up to 1000 rounds, evaluating on both datasets and logging every 25 rounds;
# training stops early after 100 rounds without improvement.
# NOTE(review): newer LightGBM releases are reported to replace the `early_stopping_rounds`
# and `verbose_eval` keyword arguments with callbacks - confirm against the installed version.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=100,
    verbose_eval=25,
)

Start training...
Training until validation scores don't improve for 100 rounds.
[25]	training's auc: 0.96812	valid_1's auc: 0.967976
[50]	training's auc: 0.973474	valid_1's auc: 0.973474
[75]	training's auc: 0.973673	valid_1's auc: 0.973722
[100]	training's auc: 0.97423	valid_1's auc: 0.97483
[125]	training's auc: 0.975235	valid_1's auc: 0.976426
[150]	training's auc: 0.975863	valid_1's auc: 0.976816
[175]	training's auc: 0.975991	valid_1's auc: 0.976956
[200]	training's auc: 0.976167	valid_1's auc: 0.977189
[225]	training's auc: 0.97621	valid_1's auc: 0.977218
[250]	training's auc: 0.976253	valid_1's auc: 0.977041
[275]	training's auc: 0.976435	valid_1's auc: 0.977231
[300]	training's auc: 0.976427	valid_1's auc: 0.977125
[325]	training's auc: 0.976402	valid_1's auc: 0.976987
Early stopping, best iteration is:
[230]	training's auc: 0.976277	valid_1's auc: 0.977437


## Results on Train Set

In [14]:
from sklearn.metrics import accuracy_score
# training metrics
# Score every row of train_df (this includes the rows held out as X_val) with the best
# iteration, turn scores above 0.5 into class 1, and report accuracy against label1.
train_scores = gbm.predict(train_df[cols], num_iteration=gbm.best_iteration)
pred_train = (train_scores > 0.5).astype(int)
train_accuracy = accuracy_score(train_df['label1'], pred_train)
print('Accurracy: {}'.format(train_accuracy))

Accurracy: 0.9381312896445246


In [15]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score

# Confusion matrix of the training predictions.
# scikit-learn's confusion_matrix(y_true, y_pred) places true labels on the rows and
# predicted labels on the columns, so the printed legend states exactly that.
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm = confusion_matrix(train_df['label1'], pred_train)
print(cm)

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels
[[44751   948]
 [ 2378  5682]]


## Results on Test Set

In [16]:
# Score every row of test_df with the best iteration, turn scores above 0.5 into
# class 1, and report accuracy against label1.
test_scores = gbm.predict(test_df[cols], num_iteration=gbm.best_iteration)
pred_test = (test_scores > 0.5).astype(int)
test_accuracy = accuracy_score(test_df['label1'], pred_test)
print('Accurracy: {}'.format(test_accuracy))

Accurracy: 0.976817392839281


In [17]:
# Confusion matrix of the predictions on every test row.
# scikit-learn's confusion_matrix(y_true, y_pred) places true labels on the rows and
# predicted labels on the columns, so the printed legend states exactly that.
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm = confusion_matrix(test_df['label1'], pred_test)
print(cm)

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels
[[32569   335]
 [  453   634]]


In [18]:
# compute precision and recall on every test row
precision_test = precision_score(test_df['label1'], pred_test)
recall_test = recall_score(test_df['label1'], pred_test)
# F1 is derived from the two values above: 2 * P * R / (P + R).
pr_product = precision_test * recall_test
pr_sum = precision_test + recall_test
f1_test = 2 * pr_product / pr_sum
print('Precision: ', precision_test, '\n', 'Recall: ', recall_test, '\n', 'F1-score:', f1_test)

Precision:  0.6542827657378741 
 Recall:  0.5832566697332107 
 F1-score: 0.6167315175097277


In [19]:
# label1 of the final row of each id, stored as a float32 column vector of shape (n_ids, 1).
last_labels = test_df.groupby('id')['label1'].nth(-1).values
label_array_test_last = last_labels.reshape(last_labels.shape[0], 1).astype(np.float32)
label_array_test_last.shape

(259, 1)

In [20]:
# Feature row (columns in `cols`) of the final row of each id, as a float32 matrix.
# This uses the same groupby('id').nth(-1) selection that builds label_array_test_last,
# so features and labels come from the same rows in the same order. It also avoids
# re-filtering the whole frame once per id and no longer shadows the built-in `id`.
seq_array_test_last = test_df.groupby('id')[cols].nth(-1).values

seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)
seq_array_test_last.shape

(259, 24)

In [21]:
# Score only the final row of each id with the best iteration, turn scores above 0.5
# into class 1, and report accuracy against the matching last-row labels.
last_scores = gbm.predict(seq_array_test_last, num_iteration=gbm.best_iteration)
pred_test_last = (last_scores > 0.5).astype(int)
acc = accuracy_score(label_array_test_last, pred_test_last)
print('Accurracy: {}'.format(acc))

Accurracy: 0.915057915057915


In [22]:
# compute the confusion matrix for the last-row predictions
# scikit-learn's confusion_matrix(y_true, y_pred) places true labels on the rows and
# predicted labels on the columns, so the printed legend states exactly that.
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm = confusion_matrix(label_array_test_last, pred_test_last)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[190,   8],
       [ 14,  47]])

In [23]:
# compute precision and recall on the last row of each id
precision_test = precision_score(label_array_test_last, pred_test_last)
recall_test = recall_score(label_array_test_last, pred_test_last)
# F1 is derived from the two values above: 2 * P * R / (P + R).
pr_product = precision_test * recall_test
pr_sum = precision_test + recall_test
f1_test = 2 * pr_product / pr_sum
print('Precision: ', precision_test, '\n', 'Recall: ', recall_test, '\n', 'F1-score:', f1_test)

Precision:  0.8545454545454545 
 Recall:  0.7704918032786885 
 F1-score: 0.810344827586207


## Summary of Results on Test Set

In [24]:
# One-row summary table of the last-row test metrics, indexed by model name.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-score']
metric_values = [acc, precision_test, recall_test, f1_test]
results_df = pd.DataFrame(
    {name: [value] for name, value in zip(metric_names, metric_values)},
    columns=metric_names,
    index=['LightGBM'],
)
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1-score
LightGBM,0.915058,0.854545,0.770492,0.810345
