In [1]:
import numpy as np
import scipy.stats as scs
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
# GridSearchCV is imported from sklearn.model_selection: the legacy
# sklearn.grid_search module was deprecated in scikit-learn 0.18 and removed in 0.20,
# so the old import fails on any current install.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelBinarizer



In [3]:
# Load the raw equipment log export.
# NOTE(review): this is an absolute, machine-specific path; point it at a
# project-relative data directory so the notebook can be re-run elsewhere.
csv_path = '/mnt/c/Users/kurtrm/Downloads/bigml_5acff779eba31d156f0000fa.csv'
raw_df = pd.read_csv(csv_path)

In [4]:
# Class balance of the target column: count of rows per 'Failure' label.
raw_df['Failure'].value_counts()

No     8703
Yes      81
Name: Failure, dtype: int64

In [5]:
# List the raw column names to see which ones need renaming.
raw_df.columns

Index(['Date', 'Temperature', 'Humidity', 'Operator', 'Measure1', 'Measure2',
       'Measure3', 'Measure4', 'Measure5', 'Measure6', 'Measure7', 'Measure8',
       'Measure9', 'Measure10', 'Measure11', 'Measure12', 'Measure13',
       'Measure14', 'Measure15', 'Hours Since Previous Failure', 'Failure',
       '﻿Date.year', '﻿Date.month', '﻿Date.day-of-month', '﻿Date.day-of-week',
       '﻿Date.hour', '﻿Date.minute', '﻿Date.second'],
      dtype='object')

Fifteen columns have generic names (`Measure1` through `Measure15`). In the spirit of creativity, I renamed them to readings we might actually see in equipment logs, although the values are not necessarily realistic for those readings.

In [6]:
# Summarise dtype and non-null counts for the fifteen generic 'MeasureN' columns.
measure_columns = [f'Measure{i}' for i in range(1, 16)]
raw_df[measure_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 15 columns):
Measure1     8784 non-null int64
Measure2     8784 non-null int64
Measure3     8784 non-null int64
Measure4     8784 non-null int64
Measure5     8784 non-null int64
Measure6     8784 non-null int64
Measure7     8784 non-null int64
Measure8     8784 non-null int64
Measure9     8784 non-null int64
Measure10    8784 non-null int64
Measure11    8784 non-null int64
Measure12    8784 non-null int64
Measure13    8784 non-null int64
Measure14    8784 non-null int64
Measure15    8784 non-null int64
dtypes: int64(15)
memory usage: 1.0 MB


In [7]:
# Stand-in equipment reading names, one per generic 'MeasureN' column.
fake_categories = [
    'discharge_pressure',
    'suction_pressure',
    'differential_pressure',
    'rotor_temperature',
    'packing_leakage_rate',
    'axial_bearing_temp',
    'horizontal_bearing_temp',
    'frequency',
    'noise',
    'voltage',
    'current',
    'speed',
    'system_pressure',
    'suction_fluid_temperature',
    'discharge_fluid_temperature',
]

In [8]:
# Pair each generic 'MeasureN' header with its stand-in reading name, by position.
measure_names = [f'Measure{i}' for i in range(1, 16)]
replacements = dict(zip(measure_names, fake_categories))
# Columns without a replacement keep their original name.
new_columns = [replacements.get(name, name) for name in raw_df.columns]

In [9]:
# Inspect the renamed column list before applying it to the frame.
new_columns

['Date',
 'Temperature',
 'Humidity',
 'Operator',
 'discharge_pressure',
 'suction_pressure',
 'differential_pressure',
 'rotor_temperature',
 'packing_leakage_rate',
 'axial_bearing_temp',
 'horizontal_bearing_temp',
 'frequency',
 'noise',
 'voltage',
 'current',
 'speed',
 'system_pressure',
 'suction_fluid_temperature',
 'discharge_fluid_temperature',
 'Hours Since Previous Failure',
 'Failure',
 '\ufeffDate.year',
 '\ufeffDate.month',
 '\ufeffDate.day-of-month',
 '\ufeffDate.day-of-week',
 '\ufeffDate.hour',
 '\ufeffDate.minute',
 '\ufeffDate.second']

The 'Date.' column names carry a stray byte-order-mark prefix (`\ufeff`). I'll clean those up.

In [10]:
def _clean_date_part(name):
    """Return `name` without its optional BOM and 'Date.' prefix.

    Names that are not derived date parts (including plain 'Date') are
    returned unchanged.
    """
    stripped = name.lstrip('\ufeff')
    if stripped.startswith('Date.'):
        return stripped[len('Date.'):]
    return name


# Rename by pattern rather than by position, so the right columns are cleaned
# even if the CSV column order or count changes. For the columns listed above this
# yields 'year', 'month', 'day-of-month', 'day-of-week', 'hour', 'minute', 'second'.
new_columns = [_clean_date_part(name) for name in new_columns]

In [11]:
# Apply the cleaned-up names to the frame (this mutates raw_df in place).
raw_df.columns = new_columns

Next, change the 'Date' column datatype to datetime objects.

In [12]:
# Convert the 'Date' column with pd.to_datetime and store it back on the frame.
raw_df['Date'] = pd.to_datetime(raw_df['Date'])

In [13]:
# Binarize the 'Failure' labels. The fitted binarizer is kept in `one_hot`.
# NOTE(review): presumably 'No' -> 0 and 'Yes' -> 1; confirm via one_hot.classes_.
one_hot = LabelBinarizer()
binary_labels = one_hot.fit_transform(raw_df['Failure'].values)

In [14]:
# Overwrite the text labels in 'Failure' with their binarized encoding.
raw_df['Failure'] = binary_labels

In [15]:
# Row counts per operator.
raw_df.Operator.value_counts()

Operator2    1952
Operator6     976
Operator7     976
Operator5     976
Operator3     976
Operator1     976
Operator4     976
Operator8     976
Name: Operator, dtype: int64

In [16]:
# One indicator column per distinct 'Operator' value.
operator_dummies = pd.get_dummies(raw_df['Operator'])

In [17]:
# Swap the categorical 'Operator' column for its indicator columns.
equipment_df = pd.concat(
    (raw_df.drop('Operator', axis='columns'), operator_dummies),
    axis='columns',
)

In [18]:
# Remove the 'minute' and 'second' date-part columns from the frame.
dropped_time_parts = ['minute', 'second']
equipment_df = equipment_df.drop(dropped_time_parts, axis='columns')

In [19]:
# Preview the frame after encoding the operators.
equipment_df.head()

Unnamed: 0,Date,Temperature,Humidity,discharge_pressure,suction_pressure,differential_pressure,rotor_temperature,packing_leakage_rate,axial_bearing_temp,horizontal_bearing_temp,...,day-of-week,hour,Operator1,Operator2,Operator3,Operator4,Operator5,Operator6,Operator7,Operator8
0,2016-01-01 00:00:00,67,82,291,1,1,1041,846,334,706,...,5,0,1,0,0,0,0,0,0,0
1,2016-01-01 01:00:00,68,77,1180,1,1,1915,1194,637,1093,...,5,1,1,0,0,0,0,0,0,0
2,2016-01-01 02:00:00,64,76,1406,1,1,511,1577,1121,1948,...,5,2,1,0,0,0,0,0,0,0
3,2016-01-01 03:00:00,63,80,550,1,1,1754,1834,1413,1151,...,5,3,1,0,0,0,0,0,0,0
4,2016-01-01 04:00:00,65,81,1928,1,2,1326,1082,233,1441,...,5,4,1,0,0,0,0,0,0,0


In [20]:
# One indicator column per distinct 'day-of-week' value.
week_dummies = pd.get_dummies(equipment_df['day-of-week'])

In [21]:
# Swap the numeric 'day-of-week' column for its indicator columns.
equipment_df = pd.concat(
    (equipment_df.drop('day-of-week', axis='columns'), week_dummies),
    axis='columns',
)

In [22]:
# Preview the frame after encoding the day of the week.
equipment_df.head()

Unnamed: 0,Date,Temperature,Humidity,discharge_pressure,suction_pressure,differential_pressure,rotor_temperature,packing_leakage_rate,axial_bearing_temp,horizontal_bearing_temp,...,Operator6,Operator7,Operator8,1,2,3,4,5,6,7
0,2016-01-01 00:00:00,67,82,291,1,1,1041,846,334,706,...,0,0,0,0,0,0,0,1,0,0
1,2016-01-01 01:00:00,68,77,1180,1,1,1915,1194,637,1093,...,0,0,0,0,0,0,0,1,0,0
2,2016-01-01 02:00:00,64,76,1406,1,1,511,1577,1121,1948,...,0,0,0,0,0,0,0,1,0,0
3,2016-01-01 03:00:00,63,80,550,1,1,1754,1834,1413,1151,...,0,0,0,0,0,0,0,1,0,0
4,2016-01-01 04:00:00,65,81,1928,1,2,1326,1082,233,1441,...,0,0,0,0,0,0,0,1,0,0


In [23]:
# Take a mutable copy of the column labels so they can be renamed by position.
column_list = equipment_df.columns.tolist()

In [24]:
# The last seven columns are the day-of-week indicators (labelled 1-7).
# NOTE(review): presumably 1 = Monday; confirm against the data source.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
column_list[-len(weekday_names):] = weekday_names

In [25]:
# Positions 1 and 2 hold 'Temperature' and 'Humidity'; mark them as ambient readings.
column_list[1] = 'ambient_temp'
column_list[2] = 'ambient_humidity'

In [26]:
# Apply the renamed labels to the frame (this mutates equipment_df in place).
equipment_df.columns = column_list

In [27]:
# Preview the frame with the weekday and ambient column names applied.
equipment_df.head()

Unnamed: 0,Date,ambient_temp,ambient_humidity,discharge_pressure,suction_pressure,differential_pressure,rotor_temperature,packing_leakage_rate,axial_bearing_temp,horizontal_bearing_temp,...,Operator6,Operator7,Operator8,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,2016-01-01 00:00:00,67,82,291,1,1,1041,846,334,706,...,0,0,0,0,0,0,0,1,0,0
1,2016-01-01 01:00:00,68,77,1180,1,1,1915,1194,637,1093,...,0,0,0,0,0,0,0,1,0,0
2,2016-01-01 02:00:00,64,76,1406,1,1,511,1577,1121,1948,...,0,0,0,0,0,0,0,1,0,0
3,2016-01-01 03:00:00,63,80,550,1,1,1754,1834,1413,1151,...,0,0,0,0,0,0,0,1,0,0
4,2016-01-01 04:00:00,65,81,1928,1,2,1326,1082,233,1441,...,0,0,0,0,0,0,0,1,0,0


In [28]:
# Labels 'hour_0' .. 'hour_23' for the hour-of-day indicator columns.
hour_dummie_headers = [f'hour_{hour}' for hour in range(24)]

In [29]:
# One indicator column per distinct 'hour' value.
hour_dummies = pd.get_dummies(equipment_df['hour'])

In [30]:
# Append the hour indicator columns to the frame.
# NOTE(review): unlike the 'Operator' and 'day-of-week' encodings, the original
# 'hour' column is not dropped here, so it stays in the frame next to its
# indicators. Confirm this is intended.
adapted_equipment_df = pd.concat((equipment_df, hour_dummies), axis='columns')

In [31]:
# Preview the frame with the hour indicators appended.
adapted_equipment_df.head()

Unnamed: 0,Date,ambient_temp,ambient_humidity,discharge_pressure,suction_pressure,differential_pressure,rotor_temperature,packing_leakage_rate,axial_bearing_temp,horizontal_bearing_temp,...,14,15,16,17,18,19,20,21,22,23
0,2016-01-01 00:00:00,67,82,291,1,1,1041,846,334,706,...,0,0,0,0,0,0,0,0,0,0
1,2016-01-01 01:00:00,68,77,1180,1,1,1915,1194,637,1093,...,0,0,0,0,0,0,0,0,0,0
2,2016-01-01 02:00:00,64,76,1406,1,1,511,1577,1121,1948,...,0,0,0,0,0,0,0,0,0,0
3,2016-01-01 03:00:00,63,80,550,1,1,1754,1834,1413,1151,...,0,0,0,0,0,0,0,0,0,0
4,2016-01-01 04:00:00,65,81,1928,1,2,1326,1082,233,1441,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Relabel the trailing hour indicator columns with their 'hour_N' headers.
hour_columns = adapted_equipment_df.columns.tolist()
hour_columns[-len(hour_dummie_headers):] = hour_dummie_headers
adapted_equipment_df.columns = hour_columns

In [33]:
# Confirm the column layout after all renames.
adapted_equipment_df.columns

Index(['Date', 'ambient_temp', 'ambient_humidity', 'discharge_pressure',
       'suction_pressure', 'differential_pressure', 'rotor_temperature',
       'packing_leakage_rate', 'axial_bearing_temp', 'horizontal_bearing_temp',
       'frequency', 'noise', 'voltage', 'current', 'speed', 'system_pressure',
       'suction_fluid_temperature', 'discharge_fluid_temperature',
       'Hours Since Previous Failure', 'Failure', 'year', 'month',
       'day-of-month', 'hour', 'Operator1', 'Operator2', 'Operator3',
       'Operator4', 'Operator5', 'Operator6', 'Operator7', 'Operator8',
       'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'Sunday', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5',
       'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18',
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],
      dtype='object')

In [34]:
# Build the frame used from here on: everything except the 'Date' column.
final = adapted_equipment_df.drop('Date', axis='columns')

In [35]:
# Preview the final frame.
final.head()

Unnamed: 0,ambient_temp,ambient_humidity,discharge_pressure,suction_pressure,differential_pressure,rotor_temperature,packing_leakage_rate,axial_bearing_temp,horizontal_bearing_temp,frequency,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,67,82,291,1,1,1041,846,334,706,1086,...,0,0,0,0,0,0,0,0,0,0
1,68,77,1180,1,1,1915,1194,637,1093,524,...,0,0,0,0,0,0,0,0,0,0
2,64,76,1406,1,1,511,1577,1121,1948,1882,...,0,0,0,0,0,0,0,0,0,0
3,63,80,550,1,1,1754,1834,1413,1151,945,...,0,0,0,0,0,0,0,0,0,0
4,65,81,1928,1,2,1326,1082,233,1441,1736,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Pairwise column correlations of `final`.
# NOTE(review): the execution count jumps from In [35] to In [40]; confirm this cell
# still works after Restart & Run All and does not rely on cells that were removed.
final.corr()

Unnamed: 0,ambient_temp,ambient_humidity,discharge_pressure,suction_pressure,differential_pressure,rotor_temperature,packing_leakage_rate,axial_bearing_temp,horizontal_bearing_temp,frequency,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
ambient_temp,1.000000,-0.045661,0.000796,0.002787,-0.007428,-0.015234,0.007868,-0.007172,-0.003100,-0.010439,...,2.647971e-03,6.619927e-03,-4.700148e-03,5.825536e-03,-2.714170e-03,1.456384e-03,-1.522583e-03,-1.264406e-02,4.633949e-04,-1.045948e-02
ambient_humidity,-0.045661,1.000000,-0.000619,-0.011132,-0.007573,0.004571,-0.029283,-0.008802,-0.020730,0.013148,...,-2.282507e-03,-6.523552e-03,-1.253170e-02,3.725641e-03,-3.931802e-03,7.731072e-03,1.326799e-02,1.503509e-02,-9.468723e-03,-1.457859e-03
discharge_pressure,0.000796,-0.000619,1.000000,-0.013846,-0.008231,0.002793,0.013409,0.010618,0.002264,0.000065,...,-2.207016e-02,-1.219003e-02,-1.427327e-03,1.951265e-03,1.157362e-02,-4.905633e-03,-5.021258e-03,-6.582730e-03,-1.461179e-02,7.589457e-04
suction_pressure,0.002787,-0.011132,-0.013846,1.000000,-0.027071,0.002475,0.000496,0.002355,-0.008543,0.010852,...,-9.341627e-03,4.447381e-03,-8.830923e-03,-3.213179e-03,-1.489553e-04,3.936677e-03,-1.291656e-02,4.958085e-03,1.383157e-03,2.915269e-03
differential_pressure,-0.007428,-0.007573,-0.008231,-0.027071,1.000000,-0.012008,-0.000126,0.012463,0.001757,0.013123,...,1.695100e-02,9.013394e-04,-1.654394e-02,1.346194e-02,-2.587716e-03,-1.654394e-02,-2.421986e-02,1.555537e-02,7.879451e-03,-1.165926e-02
rotor_temperature,-0.015234,0.004571,0.002793,0.002475,-0.012008,1.000000,-0.000696,0.016694,-0.007038,0.005685,...,-1.268318e-02,-1.686917e-03,-1.027791e-02,1.088843e-02,5.251720e-03,7.499822e-03,-1.169346e-02,-4.957655e-03,1.776017e-02,-3.133263e-03
packing_leakage_rate,0.007868,-0.029283,0.013409,0.000496,-0.000126,-0.000696,1.000000,0.001189,-0.002892,-0.009155,...,1.218529e-02,5.586565e-03,-5.243894e-03,7.535727e-03,-8.533105e-03,-1.768904e-02,-1.177530e-02,-8.423037e-03,-1.232777e-02,-3.529828e-03
axial_bearing_temp,-0.007172,-0.008802,0.010618,0.002355,0.012463,0.016694,0.001189,1.000000,0.004636,0.004837,...,1.180372e-02,-1.831561e-02,-1.952164e-03,1.537579e-02,1.239053e-02,7.236205e-03,8.005059e-04,-7.714633e-03,1.282156e-02,-1.973996e-02
horizontal_bearing_temp,-0.003100,-0.020730,0.002264,-0.008543,0.001757,-0.007038,-0.002892,0.004636,1.000000,0.002470,...,-3.986056e-03,-2.240392e-03,-1.014087e-02,-8.527534e-03,-2.137600e-02,-8.292520e-04,2.049135e-02,1.898176e-02,-1.500417e-02,6.554620e-03
frequency,-0.010439,0.013148,0.000065,0.010852,0.013123,0.005685,-0.009155,0.004837,0.002470,1.000000,...,-1.303850e-02,-9.999870e-03,7.353749e-03,4.642843e-03,-1.115063e-02,7.716344e-04,-1.049835e-02,4.714831e-04,-7.436388e-03,-8.145933e-03
