In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn import tree
from sklearn import preprocessing
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, Normalizer

%matplotlib inline

## Data Ingestion

In [33]:
def readDataset(dset=1, label_window=10, data_dir='./CMAPSSData'):
    """
    Read one training file and add remaining-useful-life and label columns.

    Parameters
    ----------
    dset: int
        Index of the data set; the file read is ``train_FD00<dset>.txt``.
    label_window: int
        Rows whose RUL is strictly smaller than this value get ``L1 = 1``,
        all other rows get ``L1 = 0``.
    data_dir: str
        Directory that contains the data files.

    Returns
    -------
    data: pandas.DataFrame
        The columns ``unit``, ``cycle``, ``os1``-``os3``, ``sm1``-``sm21``,
        followed by ``RUL`` and ``L1``.
    """

    # Read data file (space separated, no header row)

    file_name = 'train_FD00{}.txt'.format(dset)
    columns = ['unit', 'cycle', 'os1', 'os2', 'os3'] + ['sm{}'.format(j + 1) for j in range(23)]

    data = pd.read_csv(os.path.join(data_dir, file_name), delimiter=' ', names=columns)

    # Remove the last two columns (dropped by name rather than by position).
    # NOTE(review): presumably these are empty columns produced by trailing
    # delimiters in the raw file - confirm against the data.

    data = data.drop(columns=['sm22', 'sm23'])

    # Remaining Useful Life (RUL) at each cycle: the largest cycle number
    # observed for the unit (its end of life) minus the current cycle.

    data['RUL'] = data.groupby('unit')['cycle'].transform('max') - data['cycle']

    # Create label column

    data['L1'] = np.where(data['RUL'] < label_window, 1, 0)

    return data


In [34]:
# Load the FD001 training file; readDataset also appends the RUL and L1 columns.
data = readDataset(dset=1)
# data

## Data Preprocessing

In [36]:
# Smooth every measurement channel with a trailing moving average, computed
# separately for each unit so that the window never spans two units.
smoothing_window = 5
smoothed_columns = ['sm{}'.format(channel) for channel in range(1, 22)]

data_smoothed = data.copy()

# Group by the units actually present in the data instead of assuming that
# they are numbered 1..100. The columns are replaced as a whole, so they take
# the floating point dtype of the averaged values.
data_smoothed[smoothed_columns] = (
    data.groupby('unit')[smoothed_columns]
        .transform(lambda values: values.rolling(smoothing_window).sum() / smoothing_window)
)

# The first (smoothing_window - 1) rows of each unit have no complete window
# and are therefore NaN; drop them.
data_smoothed = data_smoothed.dropna()

## Split data into training and validation sets

In [66]:
# Measurement channels (the sm<j> columns) used as classifier inputs
measurement_channels = [7, 11, 14]

# Units 1..80 are used for training, the remaining units up to 100 for validation.
all_units = np.arange(1, 101)
training_units = np.arange(1, 81)
# setdiff1d returns a sorted array, so the order of the validation units does
# not depend on the iteration order of a Python set.
validation_units = np.setdiff1d(all_units, training_units)

data_shift = 1 # Change this for N-step ahead prediction


def sliceData(data, channels, units):

    """
    Select the rows of the given units and split them into inputs and labels.

    Parameters
    ----------
    data: pandas.DataFrame
        The data to be sliced; needs the columns 'unit', 'L1' and one
        'sm<j>' column for every requested channel.
    channels: list, tuple or ndarray
        The channel numbers j whose 'sm<j>' columns form the inputs.
    units: list, tuple or ndarray
        The unit numbers whose rows are kept.

    Returns
    -------
    X: pandas.DataFrame
        The selected measurement columns of the selected rows.
    Y: pandas.Series
        The 'L1' column of the selected rows.
    """

    unit_mask = data['unit'].isin(units)
    input_columns = ['sm{}'.format(channel) for channel in channels]

    X = data.loc[unit_mask, input_columns]
    Y = data.loc[unit_mask, 'L1']

    return X, Y


def shiftData(X, Y, shift):

    """
    Shift input and output data for N-step ahead predictions.

    Row i of the inputs is paired with row i + shift of the outputs. After
    that, for every position where the shifted output equals 1 (except the
    last such position), the row located `shift` positions later is removed
    from both arrays.

    Parameters
    ----------
    X: pandas.DataFrame
        The input data set.
    Y: pandas.Series or pandas.DataFrame
        The output data set, with the same number of rows as X.
    shift: int
        The number of shifts; must be at least 1.

    Returns
    -------
    X, Y: numpy.ndarray
        The shifted input and output arrays.

    Raises
    ------
    ValueError
        If shift is smaller than 1 (a slice ending at -0 would be empty).
    """

    if shift < 1:
        raise ValueError('shift must be a positive integer, got {}'.format(shift))

    X = X.iloc[:-shift].to_numpy()
    Y = Y.iloc[shift:].to_numpy()

    # Use the `shift` argument here, not the notebook-level `data_shift`, so
    # the offset always matches the shift that was actually applied above.
    rows_to_delete = np.where(Y == 1)[0][:-1] + shift
    X = np.delete(X, rows_to_delete, axis=0)
    Y = np.delete(Y, rows_to_delete)

    return X, Y


# Full smoothed rows (all columns) of the training and of the remaining units
is_training_row = data_smoothed['unit'].isin(training_units)
training_data = data_smoothed.loc[is_training_row]
validation_data = data_smoothed.loc[~is_training_row]

# Classifier inputs (selected channels) and labels for both sets
(X_training, Y_training) = sliceData(data_smoothed, measurement_channels, training_units)
(X_validation, Y_validation) = sliceData(data_smoothed, measurement_channels, validation_units)

# Scale the inputs; the scaler is fitted on the training inputs only and then
# applied to both sets (MaxAbsScaler was tried as an alternative).
transformer = MinMaxScaler()
transformer.fit(X_training)
X_training, X_validation = (
    transformer.transform(X_training),
    transformer.transform(X_validation),
)


# Train and evaluate the model using the training set

# random_state is fixed so that repeated runs build the same tree
# (scikit-learn permutes the candidate features at every split, so equally
# good splits could otherwise be resolved differently from run to run).
dtc = tree.DecisionTreeClassifier(criterion='entropy', max_depth=7, random_state=0)
dtc.fit(X_training, Y_training)
Y_predicted = dtc.predict(X_training)

total = X_training.shape[0]
mislabeled = (Y_training != Y_predicted).sum()
message = 'Number of mislabeled points in the training set: {}/{} ({:.2f}%)\n'
print(message.format(mislabeled, total, mislabeled/total*100))

# Evaluate the model using the validation set

Y_predicted = dtc.predict(X_validation)

total = X_validation.shape[0]
mislabeled = (Y_validation != Y_predicted).sum()
message = 'Number of mislabeled points in the validation set: {}/{} ({:.2f}%)'
print(message.format(mislabeled, total, mislabeled/total*100))

Number of mislabeled points in the training set: 194/15818 (1.23%)

Number of mislabeled points in the validation set: 86/4413 (1.95%)


In [67]:
# Decisions can be taken every DT = 10 cycles
DT = 10
# Cost assigned to a unit that is replaced preventively
C_p = 100
# Cycle numbers at which a decision can be made
array_decisions = np.arange(0, 400, DT)

In [68]:
def minimizer_training_set(C_c):
    """
    Evaluate the replacement heuristic on the training units for a given C_c.

    For every unit in `training_data` the rows are visited in order. At each
    row whose position plus `smoothing_window` is listed in `array_decisions`,
    the classifier `dtc` gives the probability of class 1; the unit is
    replaced preventively (cost `C_p`) the first time this probability is at
    least `C_p / C_c`. A unit that is never replaced is charged `C_c` at its
    last recorded cycle.

    Reads the notebook-level names `training_data`, `measurement_channels`,
    `transformer`, `dtc`, `C_p`, `array_decisions` and `smoothing_window`.

    Parameters
    ----------
    C_c: int or float
        Cost assigned to a unit that runs to failure.

    Returns
    -------
    expected_cost: float
        Mean cost over the units divided by the mean life-cycle length.
    """

    print('C_c=', C_c)

    feature_columns = ['sm{}'.format(j) for j in measurement_channels]
    PR_thres = C_p/C_c

    # Size the result arrays from the data rather than assuming 80 units.
    unit_ids = training_data['unit'].unique()
    costs_array = np.zeros(len(unit_ids))
    t_LC_array = np.zeros(len(unit_ids))

    for counter, unit in enumerate(unit_ids):

        # Filter the rows of this unit once and reuse them below.
        unit_data = training_data.loc[training_data['unit'] == unit]
        X = transformer.transform(unit_data[feature_columns].to_numpy())

        preventive_replacement = False

        for cycle in range(X.shape[0]):

            # NOTE(review): assumes the first row of every unit corresponds to
            # cycle number `smoothing_window` (earlier rows removed by the
            # smoothing step) - confirm if the preprocessing changes.
            if smoothing_window + cycle not in array_decisions:
                continue

            # reshape(1, -1) works for any number of measurement channels
            prob_RUL_smaller_DT = dtc.predict_proba(X[cycle].reshape(1, -1))[0, 1]

            # evaluate decision heuristics
            if PR_thres <= prob_RUL_smaller_DT:
                t_LC_array[counter] = smoothing_window + cycle
                costs_array[counter] = C_p
                preventive_replacement = True
                break

        if not preventive_replacement:
            t_LC_array[counter] = unit_data['cycle'].iloc[-1]
            print('Unit:', unit, ', component failure at t:', t_LC_array[counter])
            costs_array[counter] = C_c

    expected_cost = np.mean(costs_array) / np.mean(t_LC_array)   # this is the objective function
    return expected_cost

In [69]:
# Grid of C_c values to evaluate. In minimizer_training_set, C_c is the cost
# charged to a unit that runs to failure, and C_p / C_c is the probability
# threshold for a preventive replacement.
C_c = np.array([10000, 5000, 2000, 1000, 500, 300, 200, 165, 150, 120])

In [70]:
# Objective value (mean cost / mean life-cycle length) for every C_c on the grid
expected_cost_on_grid = np.array(
    [minimizer_training_set(failure_cost) for failure_cost in C_c],
    dtype=float,
)


C_c= 10000
C_c= 5000
C_c= 2000
C_c= 1000
C_c= 500
Unit: 70 , component failure at t: 137.0
C_c= 300
Unit: 24 , component failure at t: 147.0
Unit: 57 , component failure at t: 137.0
Unit: 70 , component failure at t: 137.0
C_c= 200
Unit: 24 , component failure at t: 147.0
Unit: 27 , component failure at t: 156.0
Unit: 45 , component failure at t: 158.0
Unit: 57 , component failure at t: 137.0
Unit: 70 , component failure at t: 137.0
C_c= 165
Unit: 7 , component failure at t: 259.0
Unit: 23 , component failure at t: 168.0
Unit: 24 , component failure at t: 147.0
Unit: 27 , component failure at t: 156.0
Unit: 36 , component failure at t: 158.0
Unit: 42 , component failure at t: 196.0
Unit: 45 , component failure at t: 158.0
Unit: 57 , component failure at t: 137.0
Unit: 58 , component failure at t: 147.0
Unit: 70 , component failure at t: 137.0
Unit: 74 , component failure at t: 166.0
C_c= 150
Unit: 7 , component failure at t: 259.0
Unit: 10 , component failure at t: 222.0
Unit: 15 , com

In [71]:
# Show the objective value obtained for each C_c on the grid
expected_cost_on_grid 

array([0.5242464 , 0.5242464 , 0.52321779, 0.51948052, 0.53994986,
       0.55018873, 0.54226475, 0.55091978, 0.56310557, 0.52724298])

In [72]:
# Reference value of the objective for the 80 training units. According to the
# author's note it was computed in a separate LSTM notebook (not part of this file).
expected_cost_perfect = 0.507292327203551 # for the 80 training data points, I computed in the LSTM notebook

In [73]:
# Relative excess of the achieved objective over the reference value, in percent
# (original author note: "optimal 0.9")
excess_cost = expected_cost_on_grid - expected_cost_perfect
M = excess_cost / expected_cost_perfect * 100
M

array([ 3.34207077,  3.34207077,  3.13930674,  2.4025974 ,  6.43761651,
        8.45595291,  6.89393939,  8.60006163, 11.00218636,  3.93277337])