# Feature engineering on the datasets (4 of 4)

The first step in predictive maintenance applications is feature engineering, which requires bringing the different data sources together to create features that best describe a machine's health condition at a given point in time.

### Environment Setup:

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt

### Load the Datasets

In [2]:
# All timestamp columns in the CSV files share this layout.
timestamp_format = "%Y-%m-%d %H:%M:%S"

# 1. Telemetry dataset - telemetry.csv
telemetry = pd.read_csv('data/telemetry.csv').assign(
    datetime=lambda df: pd.to_datetime(df['datetime'], format=timestamp_format))

# 2. Errors dataset - errors.csv (error IDs are a small fixed set -> category)
errors = (pd.read_csv('data/errors.csv')
          .assign(datetime=lambda df: pd.to_datetime(df['datetime'], format=timestamp_format))
          .astype({'errorID': 'category'}))

# 3. Machines dataset - machines.csv (no timestamp column to parse)
machines = pd.read_csv('data/machines.csv').astype({'model': 'category'})

# 4. Failures dataset - failures.csv
failures = (pd.read_csv('data/failures.csv')
            .assign(datetime=lambda df: pd.to_datetime(df['datetime'], format=timestamp_format))
            .astype({'failure': 'category'}))

# 5. Maintenance dataset - maint.csv
# (the variable keeps its original spelling because later cells refer to it)
maintainance = (pd.read_csv('data/maint.csv')
                .assign(datetime=lambda df: pd.to_datetime(df['datetime'], format=timestamp_format))
                .astype({'comp': 'category'}))

### Feature engineering on the telemetry dataset

### Lag Features from Telemetry

The telemetry dataset is time-stamped, so we pick window sizes of 3 hours and 24 hours for the lag features and compute the following rolling aggregate measures over each window:

- mean
- standard deviation
- minimum
- maximum

These features help represent the short term history of the telemetry over the chosen lag window. In the following we calculate for every 3 hours the rolling mean, standard deviation, minimum and maximum of the telemetry data over the last 3 hour lag window.

In [3]:
def _aggregate_telemetry_3h(frame, columns, how, suffix):
    """Aggregate telemetry readings into 3-hour bins, per machine.

    Parameters
    ----------
    frame : pandas.DataFrame
        Long-format telemetry with 'datetime', 'machineID' and the sensor
        columns named in ``columns``.
    columns : list of str
        Sensor columns to aggregate.
    how : str
        Name of the resampler method to apply, e.g. 'mean', 'std', 'min', 'max'.
    suffix : str
        Text appended to every sensor name to build the output column names.

    Returns
    -------
    pandas.DataFrame
        One row per (machineID, datetime) bin with one aggregated column per
        sensor, named ``<sensor><suffix>``.
    """
    pieces = []
    for col in columns:
        # wide layout: one row per timestamp, one column per machine
        wide = pd.pivot_table(frame, index='datetime', columns='machineID', values=col)
        # closed='left', label='right': a bin covers [t, t + 3h) and is stamped
        # with its right edge, so a row only summarises readings before its label
        binned = wide.resample('3H', closed='left', label='right')
        # back to long layout, indexed by (machineID, datetime)
        pieces.append(getattr(binned, how)().unstack())
    result = pd.concat(pieces, axis=1)
    result.columns = [col + suffix for col in columns]
    return result.reset_index()


fields = ['volt', 'rotate', 'pressure', 'vibration']

# 1. 3h mean values for features in the telemetry dataset
telemetry_mean_3h = _aggregate_telemetry_3h(telemetry, fields, 'mean', '_mean_3h')

# 2. 3h standard deviation values for features in the telemetry dataset
telemetry_std_3h = _aggregate_telemetry_3h(telemetry, fields, 'std', '_sd_3h')

# 3. 3h min values for features in the telemetry dataset
telemetry_min_3h = _aggregate_telemetry_3h(telemetry, fields, 'min', '_min_3h')

# 4. 3h max values for features in the telemetry dataset
telemetry_max_3h = _aggregate_telemetry_3h(telemetry, fields, 'max', '_max_3h')

telemetry_mean_3h.head()

Unnamed: 0,machineID,datetime,volt_mean_3h,rotate_mean_3h,pressure_mean_3h,vibration_mean_3h
0,1,2015-01-01 09:00:00,170.028993,449.533798,94.592122,40.893502
1,1,2015-01-01 12:00:00,164.192565,403.949857,105.687417,34.255891
2,1,2015-01-01 15:00:00,168.134445,435.781707,107.793709,41.239405
3,1,2015-01-01 18:00:00,165.514453,430.472823,101.703289,40.373739
4,1,2015-01-01 21:00:00,168.809347,437.11112,90.91106,41.738542


To capture longer-term effects, we can expand the window to 24 hours. In the following we calculate, for timepoints taken every 3 hours, the rolling mean and standard deviation of the telemetry data over a 24 hour lag window.

In [4]:
def _rolling_telemetry_24h(frame, columns, how, suffix):
    """Compute 24-row rolling aggregates of telemetry, sampled every 3 hours.

    Parameters
    ----------
    frame : pandas.DataFrame
        Long-format telemetry with 'datetime', 'machineID' and the sensor
        columns named in ``columns``.
    columns : list of str
        Sensor columns to aggregate.
    how : str
        Name of the rolling-window method to apply, e.g. 'mean' or 'std'.
    suffix : str
        Text appended to every sensor name to build the output column names.

    Returns
    -------
    pandas.DataFrame
        One row per (machineID, datetime) timepoint with one column per sensor,
        named ``<sensor><suffix>``. Rows where the first sensor's rolling value
        is missing (the window is not yet full) are dropped.
    """
    pieces = []
    for col in columns:
        # wide layout: one row per timestamp, one column per machine
        wide = pd.pivot_table(frame, index='datetime', columns='machineID', values=col)
        # rolling(24) counts rows, not hours
        # NOTE(review): assumes one reading per hour with no gaps - confirm
        rolled = getattr(wide.rolling(24), how)()
        # NOTE(review): first() keeps the earliest non-null rolling value of each
        # [t, t + 3h) bin, so the 24h window ends near the start of the bin rather
        # than at its label - confirm this alignment is intended
        sampled = rolled.resample('3H', closed='left', label='right').first()
        pieces.append(sampled.unstack())
    result = pd.concat(pieces, axis=1)
    result.columns = [col + suffix for col in columns]
    result = result.reset_index()
    # the original index labels are kept on purpose: the next cell aligns the
    # feature frames on them
    return result.loc[result[columns[0] + suffix].notnull()]


fields = ['volt', 'rotate', 'pressure', 'vibration']

# 1. 24h mean values for features in the telemetry dataset
telemetry_mean_24h = _rolling_telemetry_24h(telemetry, fields, 'mean', '_mean_24h')

# 2. 24h standard deviation values for features in the telemetry dataset
telemetry_std_24h = _rolling_telemetry_24h(telemetry, fields, 'std', '_std_24h')

# Notice that a 24h rolling average is not available at the earliest timepoints
telemetry_std_24h.head(10)

Unnamed: 0,machineID,datetime,volt_std_24h,rotate_std_24h,pressure_std_24h,vibration_std_24h
7,1,2015-01-02 06:00:00,11.23312,48.717395,10.07988,5.853209
8,1,2015-01-02 09:00:00,12.519402,48.385076,10.17154,6.163231
9,1,2015-01-02 12:00:00,13.370357,42.432317,9.471669,6.195076
10,1,2015-01-02 15:00:00,13.299281,41.346121,8.731229,5.687944
11,1,2015-01-02 18:00:00,13.954518,43.490234,8.061653,5.898069
12,1,2015-01-02 21:00:00,14.40274,42.626186,10.408012,5.94189
13,1,2015-01-03 00:00:00,15.513819,40.395881,10.833294,5.737671
14,1,2015-01-03 03:00:00,15.72697,39.648116,11.9047,5.601191
15,1,2015-01-03 06:00:00,15.635083,41.828592,11.326412,5.583521
16,1,2015-01-03 09:00:00,13.995465,40.843882,11.036546,5.561553


Next, the columns of the lag feature datasets created from the 3 and 24 hr windows  are merged to create the final feature set from telemetry.

In [5]:
# merge columns of feature sets created earlier
telemetry_feat = pd.concat([telemetry_mean_3h,
                            telemetry_std_3h.iloc[:, 2:6],
                            telemetry_mean_24h.iloc[:, 2:6],
                            telemetry_std_24h.iloc[:, 2:6]], axis=1).dropna()
telemetry_feat.describe()

Unnamed: 0,machineID,volt_mean_3h,rotate_mean_3h,pressure_mean_3h,vibration_mean_3h,volt_sd_3h,rotate_sd_3h,pressure_sd_3h,vibration_sd_3h,volt_mean_24h,rotate_mean_24h,pressure_mean_24h,vibration_mean_24h,volt_std_24h,rotate_std_24h,pressure_std_24h,vibration_std_24h
count,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0
mean,50.5,170.771416,446.612973,100.855301,40.384328,13.299858,44.456335,8.885822,4.44066,170.772579,446.61344,100.854547,40.384564,14.918849,49.947916,10.046687,5.002065
std,28.86612,9.497201,33.122865,7.408854,3.476641,6.966647,23.216421,4.656341,2.320066,4.71697,18.074248,4.733098,2.059861,2.260726,7.682126,1.713726,0.79982
min,1.0,125.532506,211.811184,72.118639,26.569635,0.025509,0.078991,0.027417,0.015278,155.812721,266.010419,91.057429,35.060087,6.380619,18.385248,4.145308,2.144863
25%,25.75,164.445576,427.568132,96.238173,38.147839,8.027455,26.903567,5.370647,2.684308,168.07089,441.549802,98.668589,39.354062,13.358663,44.668824,8.924317,4.460347
50%,50.5,170.43079,448.385564,100.233139,40.146085,12.495595,41.7957,8.345785,4.173854,170.210913,449.212554,100.097694,40.072752,14.853948,49.613807,9.921407,4.958594
75%,75.25,176.607051,468.44768,104.40452,42.227392,17.689118,59.103715,11.789645,5.898997,172.459612,456.368008,101.611121,40.833364,16.394764,54.821318,10.980498,5.48448
max,100.0,241.420717,586.682904,162.309656,69.311324,58.444332,179.903039,35.659369,18.305595,220.782618,499.096975,152.310351,61.932124,27.664538,103.819404,28.654103,12.325783


In [6]:
# preview the first rows of the combined telemetry features
telemetry_feat.head(n=5)

Unnamed: 0,machineID,datetime,volt_mean_3h,rotate_mean_3h,pressure_mean_3h,vibration_mean_3h,volt_sd_3h,rotate_sd_3h,pressure_sd_3h,vibration_sd_3h,volt_mean_24h,rotate_mean_24h,pressure_mean_24h,vibration_mean_24h,volt_std_24h,rotate_std_24h,pressure_std_24h,vibration_std_24h
7,1,2015-01-02 06:00:00,180.133784,440.60832,94.137969,41.551544,21.322735,48.770512,2.135684,10.037208,169.733809,445.179865,96.797113,40.38516,11.23312,48.717395,10.07988,5.853209
8,1,2015-01-02 09:00:00,176.364293,439.349655,101.553209,36.10558,18.95221,51.329636,13.789279,6.737739,170.614862,446.364859,96.849785,39.736826,12.519402,48.385076,10.17154,6.163231
9,1,2015-01-02 12:00:00,160.384568,424.385316,99.598722,36.094637,13.04708,13.702496,9.988609,1.639962,169.893965,447.009407,97.7156,39.498374,13.370357,42.432317,9.471669,6.195076
10,1,2015-01-02 15:00:00,170.472461,442.933997,102.380586,40.483002,16.642354,56.290447,3.305739,8.854145,171.243444,444.233563,96.66606,40.22937,13.299281,41.346121,8.731229,5.687944
11,1,2015-01-02 18:00:00,163.263806,468.937558,102.726648,40.921802,17.424688,38.68038,9.105775,3.060781,170.792486,448.440437,95.766838,40.055214,13.954518,43.490234,8.061653,5.898069


### Lag Features from Errors

The error dataset also comes with timestamps. This enables us to calculate lag features, although, since the error IDs are categorical values, they cannot be averaged over time intervals like the telemetry measurements. Instead, we count the number of errors of each type in a lagging window.

The error data is first reformatted to have one entry per machine per time at which at least one error occurred:

In [7]:
# create a column for each error type
error_count = pd.get_dummies(errors.set_index('datetime')).reset_index()
error_count.columns = ['datetime', 'machineID', 'error_1', 'error_2', 'error_3', 'error_4', 'error_5']

# combine errors for a given machine in a given hour
error_count = error_count.groupby(['machineID', 'datetime']).sum().reset_index()
error_count.head()

Unnamed: 0,machineID,datetime,error_1,error_2,error_3,error_4,error_5
0,1,2015-01-03 07:00:00,1,0,0,0,0
1,1,2015-01-03 20:00:00,0,0,1,0,0
2,1,2015-01-04 06:00:00,0,0,0,0,1
3,1,2015-01-10 15:00:00,0,0,0,1,0
4,1,2015-01-22 10:00:00,0,0,0,1,0


We then merge with the telemetry dataset to create a full feature dataset, filling all blank entries with 0.0 (`fillna(0.0)`), i.e. these are points in time where no errors occurred:

In [8]:
# every (timestamp, machine) pair present in the telemetry data
timepoints = telemetry[['datetime', 'machineID']]

# attach the error indicators; timepoints with no matching error row get 0.0
error_count = pd.merge(timepoints, error_count, how='left', on=['machineID', 'datetime'])
error_count = error_count.fillna(0.0)
error_count.describe()

Unnamed: 0,machineID,error_1,error_2,error_3,error_4,error_5
count,876100.0,876100.0,876100.0,876100.0,876100.0,876100.0
mean,50.5,0.001153,0.001128,0.000957,0.00083,0.000406
std,28.866087,0.033934,0.033563,0.030913,0.028795,0.020154
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,25.75,0.0,0.0,0.0,0.0,0.0
50%,50.5,0.0,0.0,0.0,0.0,0.0
75%,75.25,0.0,0.0,0.0,0.0,0.0
max,100.0,1.0,1.0,1.0,1.0,1.0


Finally for the lag feature, we can compute the total number of errors of each type over the last 24 hours, for timepoints taken every three hours:

In [9]:
fields = ['error_%d' % i for i in range(1, 6)]

# For each error type: pivot to one column per machine, sum over a rolling
# window of 24 rows, keep one value per 3-hour bin (stamped with the bin's
# right edge), then return to a (machineID, datetime) long layout.
rolled_counts = [
    (pd.pivot_table(error_count, index='datetime', columns='machineID', values=col)
     .rolling(24).sum()
     .resample('3H', closed='left', label='right').first()
     .unstack())
    for col in fields
]

error_count = pd.concat(rolled_counts, axis=1)
error_count.columns = [name + '_count' for name in fields]

# drop the earliest timepoints, where the rolling window is not yet full
error_count = error_count.reset_index().dropna()
error_count.describe()

Unnamed: 0,machineID,error_1_count,error_2_count,error_3_count,error_4_count,error_5_count
count,291400.0,291400.0,291400.0,291400.0,291400.0,291400.0
mean,50.5,0.027649,0.027069,0.022907,0.019904,0.009753
std,28.86612,0.166273,0.164429,0.151453,0.14082,0.098797
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,25.75,0.0,0.0,0.0,0.0,0.0
50%,50.5,0.0,0.0,0.0,0.0,0.0
75%,75.25,0.0,0.0,0.0,0.0,0.0
max,100.0,2.0,2.0,2.0,2.0,2.0


In [10]:
# preview the first rows of the 24h error counts
error_count.head(n=5)

Unnamed: 0,machineID,datetime,error_1_count,error_2_count,error_3_count,error_4_count,error_5_count
7,1,2015-01-02 06:00:00,0.0,0.0,0.0,0.0,0.0
8,1,2015-01-02 09:00:00,0.0,0.0,0.0,0.0,0.0
9,1,2015-01-02 12:00:00,0.0,0.0,0.0,0.0,0.0
10,1,2015-01-02 15:00:00,0.0,0.0,0.0,0.0,0.0
11,1,2015-01-02 18:00:00,0.0,0.0,0.0,0.0,0.0


###  Maintenance Features

The maintenance dataset records each component replacement. We shall use this information to calculate how long it has been since each component was last replaced. This quantity is expected to correlate with component failure: the longer since the last replacement, the more degradation has taken place and the more probable a failure becomes.

Domain knowledge plays a big role in understanding the predictors of a failure. From the maintenance dataset, the days since the last component replacement are calculated for each component type as features for prediction.

In [11]:
# create an indicator column for each component type
comp_rep = pd.get_dummies(maintainance.set_index('datetime')).reset_index()
comp_rep.columns = ['datetime', 'machineID', 'comp1', 'comp2', 'comp3', 'comp4']

# combine replacements for a given machine in a given hour
comp_rep = comp_rep.groupby(['machineID', 'datetime']).sum().reset_index()

# add timepoints where no components were replaced; the outer join keeps
# maintenance records that have no matching telemetry timestamp, so they can
# still seed the forward-fill below
comp_rep = (telemetry[['datetime', 'machineID']]
            .merge(comp_rep, on=['datetime', 'machineID'], how='outer')
            .fillna(0)
            .sort_values(by=['machineID', 'datetime']))

components = ['comp1', 'comp2', 'comp3', 'comp4']
for comp in components:
    # timestamp of the replacement on rows where one happened, NaT elsewhere
    replaced = comp_rep[comp] >= 1
    change_date = comp_rep['datetime'].where(replaced)

    # forward-fill the most recent replacement date *within each machine*, so
    # one machine's replacement date never carries over into the next machine's
    # rows; rows before a machine's first recorded replacement stay NaT
    comp_rep[comp] = change_date.groupby(comp_rep['machineID']).ffill()

# keep only timepoints after the start of 2015: earlier rows exist solely to
# seed the forward-fill (copy so the assignments below act on an independent frame)
comp_rep = comp_rep.loc[comp_rep['datetime'] > pd.to_datetime('2015-01-01')].copy()

# replace dates of most recent component change with days since most recent component change
for comp in components:
    comp_rep[comp] = (comp_rep['datetime'] - comp_rep[comp]) / np.timedelta64(1, 'D')

comp_rep.describe()

Unnamed: 0,machineID,comp1,comp2,comp3,comp4
count,876100.0,876100.0,876100.0,876100.0,876100.0
mean,50.5,53.525185,51.540806,52.725962,53.834191
std,28.866087,62.491679,59.269254,58.873114,59.707978
min,1.0,0.0,0.0,0.0,0.0
25%,25.75,13.291667,12.125,13.125,13.0
50%,50.5,32.791667,29.666667,32.291667,32.5
75%,75.25,68.708333,66.541667,67.333333,70.458333
max,100.0,491.958333,348.958333,370.958333,394.958333


In [12]:
# preview the days-since-replacement features
comp_rep.head(n=5)

Unnamed: 0,datetime,machineID,comp1,comp2,comp3,comp4
0,2015-01-01 06:00:00,1,19.0,214.0,154.0,169.0
1,2015-01-01 07:00:00,1,19.041667,214.041667,154.041667,169.041667
2,2015-01-01 08:00:00,1,19.083333,214.083333,154.083333,169.083333
3,2015-01-01 09:00:00,1,19.125,214.125,154.125,169.125
4,2015-01-01 10:00:00,1,19.166667,214.166667,154.166667,169.166667


### Machine Features

Machine features here include the model and the age (number of years in service). This information is readily available in the dataset, so no further modifications are required.

With these newly created feature datasets, we can merge them together to create the final feature matrix.

In [13]:
# Left-join every feature set onto the telemetry features: error counts and
# component ages match on (datetime, machineID); the static machine
# attributes match on machineID alone.
final_feat = (
    telemetry_feat
    .merge(error_count, how='left', on=['datetime', 'machineID'])
    .merge(comp_rep, how='left', on=['datetime', 'machineID'])
    .merge(machines, how='left', on=['machineID'])
)

final_feat.describe()

Unnamed: 0,machineID,volt_mean_3h,rotate_mean_3h,pressure_mean_3h,vibration_mean_3h,volt_sd_3h,rotate_sd_3h,pressure_sd_3h,vibration_sd_3h,volt_mean_24h,...,error_1_count,error_2_count,error_3_count,error_4_count,error_5_count,comp1,comp2,comp3,comp4,age
count,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,...,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0,291300.0
mean,50.5,170.771416,446.612973,100.855301,40.384328,13.299858,44.456335,8.885822,4.44066,170.772579,...,0.027659,0.027075,0.022901,0.019907,0.009756,53.350748,51.318963,52.512108,53.619118,11.33
std,28.86612,9.497201,33.122865,7.408854,3.476641,6.966647,23.216421,4.656341,2.320066,4.71697,...,0.166301,0.164446,0.151435,0.140832,0.098813,62.418063,59.140743,58.765904,59.606886,5.827625
min,1.0,125.532506,211.811184,72.118639,26.569635,0.025509,0.078991,0.027417,0.015278,155.812721,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25.75,164.445576,427.568132,96.238173,38.147839,8.027455,26.903567,5.370647,2.684308,168.07089,...,0.0,0.0,0.0,0.0,0.0,13.25,12.0,13.0,12.875,6.75
50%,50.5,170.43079,448.385564,100.233139,40.146085,12.495595,41.7957,8.345785,4.173854,170.210913,...,0.0,0.0,0.0,0.0,0.0,32.625,29.5,32.125,32.375,12.0
75%,75.25,176.607051,468.44768,104.40452,42.227392,17.689118,59.103715,11.789645,5.898997,172.459612,...,0.0,0.0,0.0,0.0,0.0,68.5,66.25,67.0,70.125,16.0
max,100.0,241.420717,586.682904,162.309656,69.311324,58.444332,179.903039,35.659369,18.305595,220.782618,...,2.0,2.0,2.0,2.0,2.0,491.875,348.875,370.875,394.875,20.0


In [14]:
# preview the first rows of the final feature matrix
final_feat.head(n=5)

Unnamed: 0,machineID,datetime,volt_mean_3h,rotate_mean_3h,pressure_mean_3h,vibration_mean_3h,volt_sd_3h,rotate_sd_3h,pressure_sd_3h,vibration_sd_3h,...,error_2_count,error_3_count,error_4_count,error_5_count,comp1,comp2,comp3,comp4,model,age
0,1,2015-01-02 06:00:00,180.133784,440.60832,94.137969,41.551544,21.322735,48.770512,2.135684,10.037208,...,0.0,0.0,0.0,0.0,20.0,215.0,155.0,170.0,model3,18
1,1,2015-01-02 09:00:00,176.364293,439.349655,101.553209,36.10558,18.95221,51.329636,13.789279,6.737739,...,0.0,0.0,0.0,0.0,20.125,215.125,155.125,170.125,model3,18
2,1,2015-01-02 12:00:00,160.384568,424.385316,99.598722,36.094637,13.04708,13.702496,9.988609,1.639962,...,0.0,0.0,0.0,0.0,20.25,215.25,155.25,170.25,model3,18
3,1,2015-01-02 15:00:00,170.472461,442.933997,102.380586,40.483002,16.642354,56.290447,3.305739,8.854145,...,0.0,0.0,0.0,0.0,20.375,215.375,155.375,170.375,model3,18
4,1,2015-01-02 18:00:00,163.263806,468.937558,102.726648,40.921802,17.424688,38.68038,9.105775,3.060781,...,0.0,0.0,0.0,0.0,20.5,215.5,155.5,170.5,model3,18


## Label Construction

Labelling here is done by taking a time window prior to the failure of an asset and labelling the feature records that fall into that window as "about to fail due to a problem" while labelling all other records as "normal".

This time window should be picked according to the business case: in some situations it may be enough to predict failures hours in advance, while in others days or weeks may be needed to allow e.g. for arrival of replacement parts.

The prediction problem for this example scenario is to estimate the probability that a machine will fail in the near future due to a failure of a certain component. More specifically, the goal is to compute the probability that a machine will fail in the next 24 hours due to a certain component failure (component 1, 2, 3, or 4).

Below, a categorical `failure` feature is created to serve as the label. All records within a 24 hour window before a failure of component 1 have `failure=comp1`, and so on for components 2, 3, and 4; all records not within 24 hours of a component failure have `failure=none`.

In [15]:
# attach the recorded failure (if any) to the feature row with the same
# timestamp and machine; rows without a failure get a missing label
labeled_features = pd.merge(final_feat, failures, how='left', on=['datetime', 'machineID'])

# within each machine, copy a failure label back onto up to 7 preceding rows
failure_label = labeled_features.groupby("machineID")["failure"].bfill(limit=7)

# every row still unlabeled is marked with the extra category "none"
failure_label = failure_label.cat.add_categories(["none"]).fillna('none')
labeled_features['failure'] = failure_label
labeled_features.head()

Unnamed: 0,machineID,datetime,volt_mean_3h,rotate_mean_3h,pressure_mean_3h,vibration_mean_3h,volt_sd_3h,rotate_sd_3h,pressure_sd_3h,vibration_sd_3h,...,error_3_count,error_4_count,error_5_count,comp1,comp2,comp3,comp4,model,age,failure
0,1,2015-01-02 06:00:00,180.133784,440.60832,94.137969,41.551544,21.322735,48.770512,2.135684,10.037208,...,0.0,0.0,0.0,20.0,215.0,155.0,170.0,model3,18,none
1,1,2015-01-02 09:00:00,176.364293,439.349655,101.553209,36.10558,18.95221,51.329636,13.789279,6.737739,...,0.0,0.0,0.0,20.125,215.125,155.125,170.125,model3,18,none
2,1,2015-01-02 12:00:00,160.384568,424.385316,99.598722,36.094637,13.04708,13.702496,9.988609,1.639962,...,0.0,0.0,0.0,20.25,215.25,155.25,170.25,model3,18,none
3,1,2015-01-02 15:00:00,170.472461,442.933997,102.380586,40.483002,16.642354,56.290447,3.305739,8.854145,...,0.0,0.0,0.0,20.375,215.375,155.375,170.375,model3,18,none
4,1,2015-01-02 18:00:00,163.263806,468.937558,102.726648,40.921802,17.424688,38.68038,9.105775,3.060781,...,0.0,0.0,0.0,20.5,215.5,155.5,170.5,model3,18,none


Below is an example of records that are labeled as failure=comp4 in the failure column. Notice that the first 8 records all occur in the 24-hour window before the first recorded failure of component 4. The next 8 records are within the 24 hour window before another failure of component 4.

In [17]:
# first 16 feature rows labeled as an upcoming component-4 failure
is_comp4 = labeled_features['failure'].eq('comp4')
labeled_features[is_comp4].head(16)

Unnamed: 0,machineID,datetime,volt_mean_3h,rotate_mean_3h,pressure_mean_3h,vibration_mean_3h,volt_sd_3h,rotate_sd_3h,pressure_sd_3h,vibration_sd_3h,...,error_3_count,error_4_count,error_5_count,comp1,comp2,comp3,comp4,model,age,failure
17,1,2015-01-04 09:00:00,166.281848,453.787824,106.187582,51.99008,24.276228,23.621315,11.176731,3.394073,...,1.0,0.0,1.0,22.125,217.125,157.125,172.125,model3,18,comp4
18,1,2015-01-04 12:00:00,175.412103,445.450581,100.887363,54.251534,34.918687,11.001625,10.580336,2.921501,...,1.0,0.0,1.0,22.25,217.25,157.25,172.25,model3,18,comp4
19,1,2015-01-04 15:00:00,157.347716,451.882075,101.28938,48.602686,24.617739,28.950883,9.966729,2.356486,...,1.0,0.0,1.0,22.375,217.375,157.375,172.375,model3,18,comp4
20,1,2015-01-04 18:00:00,176.45055,446.033068,84.521555,47.638836,8.0714,76.511343,2.636879,4.108621,...,1.0,0.0,1.0,22.5,217.5,157.5,172.5,model3,18,comp4
21,1,2015-01-04 21:00:00,190.325814,422.692565,107.393234,49.552856,8.390777,7.176553,4.262645,7.598552,...,1.0,0.0,1.0,22.625,217.625,157.625,172.625,model3,18,comp4
22,1,2015-01-05 00:00:00,169.985134,458.929418,91.494362,54.882021,9.451483,12.052752,3.685906,6.621183,...,0.0,0.0,1.0,22.75,217.75,157.75,172.75,model3,18,comp4
23,1,2015-01-05 03:00:00,149.082619,412.180336,93.509785,54.386079,19.075952,30.715081,3.090266,6.53061,...,0.0,0.0,1.0,22.875,217.875,157.875,172.875,model3,18,comp4
24,1,2015-01-05 06:00:00,185.782709,439.531288,99.41366,51.558082,14.495664,45.663743,4.289212,7.330397,...,0.0,0.0,1.0,0.0,218.0,158.0,0.0,model3,18,comp4
1337,1,2015-06-18 09:00:00,169.324639,453.923471,101.313249,53.092274,28.155693,42.557599,7.688674,2.488851,...,0.0,0.0,1.0,89.125,29.125,14.125,134.125,model3,18,comp4
1338,1,2015-06-18 12:00:00,190.691297,441.577271,97.192512,44.025425,6.296827,47.271008,7.577957,4.648336,...,0.0,0.0,1.0,89.25,29.25,14.25,134.25,model3,18,comp4
