# Predictive Maintenance

## Dataset 2 - Binary Classification using SVM

- Label 0 = RUL of Machine is greater than 30 cycles
- Label 1 = RUL of Machine is less than or equal to 30 cycles

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pykalman import KalmanFilter
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score
from tqdm import tqdm_notebook as tqdm

# Setting seeds for reproducibility
# Seed NumPy's global random number generator.
np.random.seed(1234)
# A bare `PYTHONHASHSEED = 0` assignment only binds a Python variable that nothing reads;
# the setting is an environment variable. Export it so processes started from this
# kernel inherit it. NOTE: the hash seed of the already-running kernel is fixed at
# interpreter start-up and is not changed by this line.
PYTHONHASHSEED = 0
os.environ['PYTHONHASHSEED'] = str(PYTHONHASHSEED)
%matplotlib inline

## Read Train and Test Data

In [2]:
# Load the training data: space-separated text without a header row.
train_df = pd.read_csv('data/train_02.txt', sep=" ", header=None)
# Discard the columns at positions 26 and 27 so that exactly 26 columns remain
# (presumably empty columns produced by trailing separators - confirm against the file).
train_df = train_df.drop(columns=train_df.columns[[26, 27]])
# Column layout: unit id, cycle counter, three settings, then sensors s1..s21.
train_df.columns = (['id', 'cycle', 'setting1', 'setting2', 'setting3']
                    + ['s{}'.format(n) for n in range(1, 22)])

In [3]:
# Load the test data: space-separated text without a header row.
test_df = pd.read_csv('data/test_02.txt', sep=" ", header=None)
# Discard the columns at positions 26 and 27 so that exactly 26 columns remain
# (presumably empty columns produced by trailing separators - confirm against the file).
test_df = test_df.drop(columns=test_df.columns[[26, 27]])
# Same column layout as the training data: id, cycle, three settings, sensors s1..s21.
test_df.columns = (['id', 'cycle', 'setting1', 'setting2', 'setting3']
                   + ['s{}'.format(n) for n in range(1, 22)])

In [4]:
# Load the ground truth file (space-separated, no header row) and
# discard its column at position 1, keeping only the first column.
truth_df = pd.read_csv('data/truth_02.txt', sep=" ", header=None)
truth_df = truth_df.drop(columns=truth_df.columns[[1]])

In [5]:
# Order the rows by unit id and then by cycle within each unit.
train_df = train_df.sort_values(by=['id', 'cycle'])
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,...,183.06,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,...,130.42,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,...,164.22,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,...,130.72,2387.61,8068.66,9.3528,0.02,329,2212,100.0,10.59,6.4701
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,...,164.31,2028.0,7861.23,10.8963,0.02,309,1915,84.93,14.13,8.5286


## Generate Labels for Train Data

In [6]:
# Data labeling - derive the RUL column for the training data.
# Step 1: the largest cycle number recorded for each id.
rul = (train_df.groupby('id', as_index=False)['cycle']
               .max()
               .rename(columns={'cycle': 'max'}))
# Step 2: attach that per-id maximum to every row of the same id.
train_df = pd.merge(train_df, rul, how='left', on='id')
# Step 3: RUL is the number of cycles between a row and its id's last recorded cycle.
train_df['RUL'] = train_df['max'] - train_df['cycle']
# The helper column is no longer needed once RUL exists.
train_df = train_df.drop(columns='max')
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,...,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071,148
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,...,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665,147
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,...,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723,146
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,...,2387.61,8068.66,9.3528,0.02,329,2212,100.0,10.59,6.4701,145
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,...,2028.0,7861.23,10.8963,0.02,309,1915,84.93,14.13,8.5286,144


In [7]:
# Build the label columns for the training data.
#   label1: 1 when RUL <= w1, otherwise 0
#   label2: 2 when RUL <= w0, otherwise the same value as label1
w1 = 30
w0 = 24
train_df['label1'] = (train_df['RUL'] <= w1).astype(int)
train_df['label2'] = np.where(train_df['RUL'] <= w0, 2, train_df['label1'])
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s15,s16,s17,s18,s19,s20,s21,RUL,label1,label2
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,...,9.3461,0.02,334,2223,100.0,14.73,8.8071,148,0,0
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,...,9.3774,0.02,330,2212,100.0,10.41,6.2665,147,0,0
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,...,10.8941,0.02,309,1915,84.93,14.08,8.6723,146,0,0
3,1,4,42.0077,0.8416,100.0,445.0,549.51,1354.03,1126.38,3.91,...,9.3528,0.02,329,2212,100.0,10.59,6.4701,145,0,0
4,1,5,25.0005,0.6203,60.0,462.54,537.07,1257.71,1047.93,7.05,...,10.8963,0.02,309,1915,84.93,14.13,8.5286,144,0,0


## Normalize Train and Test Data

In [8]:
# MinMax normalization of the training data.
# 'cycle_norm' starts as a copy of 'cycle', so a scaled cycle count is available
# while the raw 'cycle' column itself is left unscaled.
train_df['cycle_norm'] = train_df['cycle']
# Everything except the identifier, the raw cycle and the target columns is scaled.
cols_normalize = train_df.columns.difference(['id', 'cycle', 'RUL', 'label1', 'label2'])
min_max_scaler = preprocessing.MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(train_df[cols_normalize])
norm_train_df = pd.DataFrame(scaled_values, index=train_df.index, columns=cols_normalize)
# Put the untouched columns and the scaled columns back together ...
join_df = train_df[train_df.columns.difference(cols_normalize)].join(norm_train_df)
# ... and restore the column order the frame had before scaling.
train_df = join_df.reindex(columns=train_df.columns)
train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s16,s17,s18,s19,s20,s21,RUL,label1,label2,cycle_norm
0,1,1,0.833134,0.997625,1.0,0.060269,0.181576,0.311201,0.273095,0.146592,...,0.0,0.322917,0.651163,1.0,0.156036,0.159082,148,0,0,0.0
1,1,2,0.999767,0.998575,1.0,0.0,0.131847,0.2966,0.245535,0.0,...,0.0,0.28125,0.627907,1.0,0.007888,0.014562,147,0,0,0.002653
2,1,3,0.595096,0.73848,0.0,0.238089,0.016332,0.035297,0.056997,0.293184,...,0.0,0.0625,0.0,0.0,0.133745,0.151414,146,0,0,0.005305
3,1,4,0.999993,0.999525,1.0,0.0,0.128269,0.298795,0.246979,0.0,...,0.0,0.270833,0.627907,1.0,0.01406,0.026144,145,0,0,0.007958
4,1,5,0.595137,0.736698,0.0,0.238089,0.01413,0.037871,0.058152,0.293184,...,0.0,0.0625,0.0,0.0,0.13546,0.14324,144,0,0,0.01061


In [9]:
# Scale the test data with the scaler that was fitted on the training data
# (transform only - the scaler is not refitted here).
test_df['cycle_norm'] = test_df['cycle']
test_scaled = min_max_scaler.transform(test_df[cols_normalize])
norm_test_df = pd.DataFrame(test_scaled, index=test_df.index, columns=cols_normalize)
# Recombine untouched and scaled columns, restore the previous column order,
# and renumber the rows from 0.
test_join_df = test_df[test_df.columns.difference(cols_normalize)].join(norm_test_df)
test_df = test_join_df.reindex(columns=test_df.columns).reset_index(drop=True)
test_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,cycle_norm
0,1,1,0.238019,0.29715,1.0,0.597937,0.637673,0.68655,0.676888,0.61718,...,0.993662,0.632503,0.114199,1.0,0.6875,0.854123,1.0,0.625514,0.633951,0.0
1,1,2,0.476162,0.831354,1.0,0.626985,0.663272,0.643289,0.535166,0.507937,...,0.993496,0.487219,0.331283,0.0,0.635417,0.864693,1.0,0.483882,0.500205,0.002653
2,1,3,0.833282,0.997625,1.0,0.060269,0.187815,0.312475,0.251745,0.146592,...,0.992477,0.487171,0.37035,0.0,0.3125,0.651163,1.0,0.164609,0.165078,0.005305
3,1,4,0.999967,0.998812,1.0,0.0,0.134324,0.287038,0.250614,0.0,...,0.992394,0.52016,0.388254,0.0,0.302083,0.627907,1.0,0.00583,0.023186,0.007958
4,1,5,0.595089,0.737886,0.0,0.238089,0.010918,0.025599,0.064796,0.293184,...,0.001213,0.041177,0.927248,0.0,0.020833,0.0,0.0,0.141632,0.145822,0.01061


## Generate Labels for Test Data

In [10]:
# Compute, per id, the cycle number at which the unit is expected to stop ('max').
# Last observed cycle for every id in the test data:
rul = (test_df.groupby('id', as_index=False)['cycle']
              .max()
              .rename(columns={'cycle': 'max'}))
# The single ground-truth column is named 'more' and added to the last observed cycle.
truth_df.columns = ['more']
# ids are assigned as row index + 1, and the addition below pairs rows of `rul` and
# `truth_df` by index label (presumably truth rows are ordered by id starting at 1 - verify).
truth_df['id'] = truth_df.index + 1
truth_df['max'] = rul['max'] + truth_df['more']
truth_df = truth_df.drop(columns='more')

In [11]:
# Derive the RUL column for the test data.
# Attach each id's 'max' cycle from truth_df, subtract the current cycle,
# then discard the helper column.
test_df = pd.merge(test_df, truth_df, how='left', on='id')
test_df['RUL'] = test_df['max'] - test_df['cycle']
test_df = test_df.drop(columns='max')
test_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s14,s15,s16,s17,s18,s19,s20,s21,cycle_norm,RUL
0,1,1,0.238019,0.29715,1.0,0.597937,0.637673,0.68655,0.676888,0.61718,...,0.632503,0.114199,1.0,0.6875,0.854123,1.0,0.625514,0.633951,0.0,275
1,1,2,0.476162,0.831354,1.0,0.626985,0.663272,0.643289,0.535166,0.507937,...,0.487219,0.331283,0.0,0.635417,0.864693,1.0,0.483882,0.500205,0.002653,274
2,1,3,0.833282,0.997625,1.0,0.060269,0.187815,0.312475,0.251745,0.146592,...,0.487171,0.37035,0.0,0.3125,0.651163,1.0,0.164609,0.165078,0.005305,273
3,1,4,0.999967,0.998812,1.0,0.0,0.134324,0.287038,0.250614,0.0,...,0.52016,0.388254,0.0,0.302083,0.627907,1.0,0.00583,0.023186,0.007958,272
4,1,5,0.595089,0.737886,0.0,0.238089,0.010918,0.025599,0.064796,0.293184,...,0.041177,0.927248,0.0,0.020833,0.0,0.0,0.141632,0.145822,0.01061,271


In [12]:
# Build the label columns for the test data using the thresholds w1 and w0.
#   label1: 1 when RUL <= w1, otherwise 0
#   label2: 2 when RUL <= w0, otherwise the same value as label1
test_df['label1'] = (test_df['RUL'] <= w1).astype(int)
test_df['label2'] = np.where(test_df['RUL'] <= w0, 2, test_df['label1'])
test_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s16,s17,s18,s19,s20,s21,cycle_norm,RUL,label1,label2
0,1,1,0.238019,0.29715,1.0,0.597937,0.637673,0.68655,0.676888,0.61718,...,1.0,0.6875,0.854123,1.0,0.625514,0.633951,0.0,275,0,0
1,1,2,0.476162,0.831354,1.0,0.626985,0.663272,0.643289,0.535166,0.507937,...,0.0,0.635417,0.864693,1.0,0.483882,0.500205,0.002653,274,0,0
2,1,3,0.833282,0.997625,1.0,0.060269,0.187815,0.312475,0.251745,0.146592,...,0.0,0.3125,0.651163,1.0,0.164609,0.165078,0.005305,273,0,0
3,1,4,0.999967,0.998812,1.0,0.0,0.134324,0.287038,0.250614,0.0,...,0.0,0.302083,0.627907,1.0,0.00583,0.023186,0.007958,272,0,0
4,1,5,0.595089,0.737886,0.0,0.238089,0.010918,0.025599,0.064796,0.293184,...,0.0,0.020833,0.0,0.0,0.141632,0.145822,0.01061,271,0,0


## Choose Columns to be Used for Training

In [13]:
# Feature columns: the three settings, the scaled cycle count, then sensors s1..s21.
sensor_cols = ['s{}'.format(i) for i in range(1, 22)]
cols = ['setting1', 'setting2', 'setting3', 'cycle_norm'] + sensor_cols

## Train Test Split

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Hold out 5% of the rows for validation. shuffle=False keeps the existing row order,
# so the validation rows are the tail of train_df rather than a random sample.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_df[cols],
    train_df['label1'],
    test_size=0.05,
    shuffle=False,
    random_state=42,
)

# Report the split sizes and how many positive (label1 == 1) rows each part holds.
print("Train_shape: {}".format(X_train.shape))
print("Val_shape: {}".format(X_val.shape))
print("No of positives in train: {}".format(Y_train.sum()))
print("No of positives in val: {}".format(Y_val.sum()))

Train_shape: (51071, 25)
Val_shape: (2688, 25)
No of positives in train: 7672
No of positives in val: 388


## Training SVM Classifier

In [15]:
from sklearn.metrics import accuracy_score
print('Start training...')

# RBF-kernel support vector classifier; hyper-parameters are collected in one place.
svm_params = dict(C=4.0, kernel='rbf', degree=3, gamma='auto', shrinking=True,
                  verbose=True, max_iter=-1, random_state=42)
clf = SVC(**svm_params)
clf.fit(X_train, Y_train)

# Accuracy on the held-out validation rows.
val_accuracy = accuracy_score(Y_val, clf.predict(X_val))
print("Validation Accuracy: {}".format(val_accuracy))

Start training...
[LibSVM]Validation Accuracy: 0.9244791666666666


## Results on Train Set

In [16]:
# Training metrics, computed over every row of train_df.
# clf.predict returns class labels (0 or 1) directly, so the former
# `np.where(pred > 0.5, 1, 0)` thresholding step was a no-op and has been removed.
pred_train = clf.predict(train_df[cols])
print('Accurracy: {}'.format(accuracy_score(train_df['label1'], pred_train)))

Accurracy: 0.9379452742796555


In [17]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score

# confusion_matrix(y_true, y_pred) places true labels on the rows and predicted labels
# on the columns; the printed legend now says so (it previously stated the reverse).
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm = confusion_matrix(train_df['label1'], pred_train)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[44828,   871],
       [ 2465,  5595]])

## Results on Test Set

In [18]:
# Test metrics, computed over every row of test_df.
# clf.predict returns class labels (0 or 1) directly, so the former
# `np.where(pred > 0.5, 1, 0)` thresholding step was a no-op and has been removed.
pred_test = clf.predict(test_df[cols])
print('Accurracy: {}'.format(accuracy_score(test_df['label1'], pred_test)))

Accurracy: 0.9760230649289517


In [19]:
# confusion_matrix(y_true, y_pred) places true labels on the rows and predicted labels
# on the columns; the printed legend now says so (it previously stated the reverse).
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm = confusion_matrix(test_df['label1'], pred_test)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[32638,   266],
       [  549,   538]])

In [20]:
# Compute precision, recall and F1 over every row of the test data.
precision_test = precision_score(test_df['label1'], pred_test)
recall_test = recall_score(test_df['label1'], pred_test)
# Use sklearn's f1_score instead of the hand-written 2*P*R / (P + R), which divides
# by zero when precision and recall are both 0.
f1_test = f1_score(test_df['label1'], pred_test)
print( 'Precision: ', precision_test, '\n', 'Recall: ', recall_test,'\n', 'F1-score:', f1_test )

Precision:  0.6691542288557214 
 Recall:  0.49494020239190434 
 F1-score: 0.5690111052353252


In [21]:
# label1 of the last row of every id in the test data,
# converted to a float32 column vector of shape (number of ids, 1).
last_labels = test_df.groupby('id')['label1'].nth(-1)
label_array_test_last = last_labels.values.reshape(-1, 1).astype(np.float32)
label_array_test_last.shape

(259, 1)

In [22]:
# Feature vector of the last row of every id in the test data, as a float32 matrix.
# groupby(...).tail(1) picks those rows in one pass; the previous list comprehension
# re-filtered the whole frame once per id and used the builtin name `id` as its
# loop variable.
# Rows are returned in their original frame order - assumes each id's rows are stored
# contiguously in test_df, in which case this matches the order of test_df['id'].unique().
last_rows = test_df.groupby('id', sort=False).tail(1)
seq_array_test_last = last_rows[cols].to_numpy().astype(np.float32)
seq_array_test_last.shape

(259, 25)

In [23]:
# Accuracy on the last recorded row of every id in the test data.
# clf.predict returns class labels (0 or 1) directly, so the former
# `np.where(pred > 0.5, 1, 0)` thresholding step was a no-op and has been removed.
pred_test_last = clf.predict(seq_array_test_last)
acc = accuracy_score(label_array_test_last, pred_test_last)
print('Accurracy: {}'.format(acc))

Accurracy: 0.9266409266409267


In [24]:
# Confusion matrix for the last-row predictions (the predictions themselves were
# produced earlier; this cell only tabulates them).
# confusion_matrix(y_true, y_pred) places true labels on the rows and predicted labels
# on the columns; the printed legend now says so (it previously stated the reverse).
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm = confusion_matrix(label_array_test_last, pred_test_last)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[197,   1],
       [ 18,  43]])

In [25]:
# Compute precision, recall and F1 on the last recorded row of every id.
precision_test = precision_score(label_array_test_last, pred_test_last)
recall_test = recall_score(label_array_test_last, pred_test_last)
# Use sklearn's f1_score instead of the hand-written 2*P*R / (P + R), which divides
# by zero when precision and recall are both 0.
f1_test = f1_score(label_array_test_last, pred_test_last)
print( 'Precision: ', precision_test, '\n', 'Recall: ', recall_test,'\n', 'F1-score:', f1_test )

Precision:  0.9772727272727273 
 Recall:  0.7049180327868853 
 F1-score: 0.8190476190476191


## Summary of Results on Test Set

In [26]:
# One-row summary table of the last-row test metrics for the SVM model.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-score']
metric_values = [acc, precision_test, recall_test, f1_test]
results_df = pd.DataFrame([metric_values], columns=metric_names, index=['SVM'])
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1-score
SVM,0.926641,0.977273,0.704918,0.819048
