In [2]:
import numpy as np
import scipy.stats as scs
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# GridSearchCV is imported from sklearn.model_selection: the legacy
# sklearn.grid_search module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import LabelBinarizer



In [4]:
# Load the raw equipment log export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
csv_path = '/mnt/c/Users/kurtrm/Downloads/bigml_5acff779eba31d156f0000fa.csv'
raw_df = pd.read_csv(csv_path)

In [5]:
# Count rows per value of the target column to check the class balance.
raw_df['Failure'].value_counts()

No     8703
Yes      81
Name: Failure, dtype: int64

In [6]:
# List the raw column names as read from the CSV.
raw_df.columns

Index(['Date', 'Temperature', 'Humidity', 'Operator', 'Measure1', 'Measure2',
       'Measure3', 'Measure4', 'Measure5', 'Measure6', 'Measure7', 'Measure8',
       'Measure9', 'Measure10', 'Measure11', 'Measure12', 'Measure13',
       'Measure14', 'Measure15', 'Hours Since Previous Failure', 'Failure',
       '﻿Date.year', '﻿Date.month', '﻿Date.day-of-month', '﻿Date.day-of-week',
       '﻿Date.hour', '﻿Date.minute', '﻿Date.second'],
      dtype='object')

Fifteen columns have generic names (`Measure1` through `Measure15`). In the spirit of creativity, I'll rename these columns to readings that we might actually see on equipment logs, though the values themselves may not be realistic for those readings.

In [7]:
# Summarise dtypes and non-null counts for the fifteen generic MeasureN columns.
measure_columns = ['Measure' + str(n) for n in range(1, 16)]
raw_df[measure_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 15 columns):
Measure1     8784 non-null int64
Measure2     8784 non-null int64
Measure3     8784 non-null int64
Measure4     8784 non-null int64
Measure5     8784 non-null int64
Measure6     8784 non-null int64
Measure7     8784 non-null int64
Measure8     8784 non-null int64
Measure9     8784 non-null int64
Measure10    8784 non-null int64
Measure11    8784 non-null int64
Measure12    8784 non-null int64
Measure13    8784 non-null int64
Measure14    8784 non-null int64
Measure15    8784 non-null int64
dtypes: int64(15)
memory usage: 1.0 MB


In [8]:
# Invented reading names, one per generic column, in Measure1..Measure15 order.
fake_categories = [
    'discharge_pressure',
    'suction_pressure',
    'differential_pressure',
    'rotor_temperature',
    'packing_leakage_rate',
    'axial_bearing_temp',
    'horizontal_bearing_temp',
    'frequency',
    'noise',
    'voltage',
    'current',
    'speed',
    'system_pressure',
    'suction_fluid_temperature',
    'discharge_fluid_temperature',
]

In [9]:
# Map each generic 'MeasureN' name to its invented reading name, then build
# the full column list, leaving every other column name untouched.
measure_names = [f'Measure{i}' for i in range(1, 16)]
replacements = dict(zip(measure_names, fake_categories))
new_columns = [replacements.get(name, name) for name in raw_df.columns]

In [10]:
# Inspect the renamed column list before applying it to the frame.
new_columns

['Date',
 'Temperature',
 'Humidity',
 'Operator',
 'discharge_pressure',
 'suction_pressure',
 'differential_pressure',
 'rotor_temperature',
 'packing_leakage_rate',
 'axial_bearing_temp',
 'horizontal_bearing_temp',
 'frequency',
 'noise',
 'voltage',
 'current',
 'speed',
 'system_pressure',
 'suction_fluid_temperature',
 'discharge_fluid_temperature',
 'Hours Since Previous Failure',
 'Failure',
 '\ufeffDate.year',
 '\ufeffDate.month',
 '\ufeffDate.day-of-month',
 '\ufeffDate.day-of-week',
 '\ufeffDate.hour',
 '\ufeffDate.minute',
 '\ufeffDate.second']

The 'Date.' column names start with a stray byte-order mark (`\ufeff`) and a redundant 'Date.' prefix. I'll clean those up.

In [11]:
# Strip the byte-order-mark + 'Date.' prefix from the derived date-part columns.
# Matching on the prefix (instead of overwriting the last seven positions)
# keeps the rename correct even if the column order or count changes; names
# without the prefix, such as 'Date', are left as they are.
new_columns = [name.replace('\ufeffDate.', '') for name in new_columns]

In [12]:
# Apply the cleaned names; new_columns holds one entry per existing column,
# in the same order as the frame's current columns.
raw_df.columns = new_columns

Next, change the 'Date' column datatype to datetime objects.

In [13]:
# Parse the 'Date' column into pandas datetime values, replacing it in place.
raw_df['Date'] = pd.to_datetime(raw_df['Date'])

In [14]:
# Encode the two-class 'Failure' target as integers.
# NOTE(review): LabelBinarizer orders classes by sorted label, so 'No' should
# map to 0 and 'Yes' to 1 — confirm against one_hot.classes_.
failure_labels = raw_df['Failure'].values
one_hot = LabelBinarizer()
binary_labels = one_hot.fit(failure_labels).transform(failure_labels)

In [15]:
# Overwrite the original 'Failure' values with the binarized labels.
raw_df['Failure'] = binary_labels

In [16]:
# Count rows per operator.
raw_df.Operator.value_counts()

Operator2    1952
Operator5     976
Operator4     976
Operator3     976
Operator6     976
Operator1     976
Operator7     976
Operator8     976
Name: Operator, dtype: int64

In [17]:
# Build one indicator column per distinct 'Operator' value.
operator_dummies = pd.get_dummies(raw_df['Operator'])

In [18]:
# Swap the categorical 'Operator' column for its indicator columns.
features_without_operator = raw_df.drop('Operator', axis=1)
equipment_df = pd.concat([features_without_operator, operator_dummies], axis=1)

In [19]:
# Remove the 'minute' and 'second' date-part columns from the feature frame.
unused_time_columns = ['minute', 'second']
equipment_df = equipment_df.drop(unused_time_columns, axis=1)

In [20]:
# Preview the first five rows after the operator encoding and column drops.
equipment_df.head()

Unnamed: 0,Date,Temperature,Humidity,discharge_pressure,suction_pressure,differential_pressure,rotor_temperature,packing_leakage_rate,axial_bearing_temp,horizontal_bearing_temp,...,day-of-week,hour,Operator1,Operator2,Operator3,Operator4,Operator5,Operator6,Operator7,Operator8
0,2016-01-01 00:00:00,67,82,291,1,1,1041,846,334,706,...,5,0,1,0,0,0,0,0,0,0
1,2016-01-01 01:00:00,68,77,1180,1,1,1915,1194,637,1093,...,5,1,1,0,0,0,0,0,0,0
2,2016-01-01 02:00:00,64,76,1406,1,1,511,1577,1121,1948,...,5,2,1,0,0,0,0,0,0,0
3,2016-01-01 03:00:00,63,80,550,1,1,1754,1834,1413,1151,...,5,3,1,0,0,0,0,0,0,0
4,2016-01-01 04:00:00,65,81,1928,1,2,1326,1082,233,1441,...,5,4,1,0,0,0,0,0,0,0


In [21]:
# Build one indicator column per distinct 'day-of-week' value; the resulting
# columns are labelled with the raw day values.
week_dummies = pd.get_dummies(equipment_df['day-of-week'])

In [22]:
# Swap the 'day-of-week' column for its indicator columns.
features_without_weekday = equipment_df.drop('day-of-week', axis=1)
equipment_df = pd.concat([features_without_weekday, week_dummies], axis=1)

In [23]:
# Preview the first five rows with the day-of-week indicator columns appended.
equipment_df.head()

Unnamed: 0,Date,Temperature,Humidity,discharge_pressure,suction_pressure,differential_pressure,rotor_temperature,packing_leakage_rate,axial_bearing_temp,horizontal_bearing_temp,...,Operator6,Operator7,Operator8,1,2,3,4,5,6,7
0,2016-01-01 00:00:00,67,82,291,1,1,1041,846,334,706,...,0,0,0,0,0,0,0,1,0,0
1,2016-01-01 01:00:00,68,77,1180,1,1,1915,1194,637,1093,...,0,0,0,0,0,0,0,1,0,0
2,2016-01-01 02:00:00,64,76,1406,1,1,511,1577,1121,1948,...,0,0,0,0,0,0,0,1,0,0
3,2016-01-01 03:00:00,63,80,550,1,1,1754,1834,1413,1151,...,0,0,0,0,0,0,0,1,0,0
4,2016-01-01 04:00:00,65,81,1928,1,2,1326,1082,233,1441,...,0,0,0,0,0,0,0,1,0,0


In [27]:
# Take an editable copy of the current column labels.
column_list = equipment_df.columns.tolist()

In [28]:
# Rename the day-of-week indicator columns by value rather than by position,
# so a reordered or incomplete set of dummies cannot be silently mislabelled.
# Day 1 is Monday: the data encodes 2016-01-01 (a Friday) as 5.
day_names = {1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday',
             5: 'Friday', 6: 'Saturday', 7: 'Sunday'}
column_list = [day_names.get(name, name) for name in column_list]

In [30]:
# Rename the ambient readings by name rather than by position, so the rename
# stays correct if the column order ever changes.
ambient_names = {'Temperature': 'ambient_temp', 'Humidity': 'ambient_humidity'}
column_list = [ambient_names.get(name, name) for name in column_list]

In [32]:
# Apply the edited labels; column_list holds one entry per existing column,
# in the same order as the frame's current columns.
equipment_df.columns = column_list

In [33]:
# Preview the first five rows with the readable weekday and ambient names.
equipment_df.head()

Unnamed: 0,Date,ambient_temp,ambient_humidity,discharge_pressure,suction_pressure,differential_pressure,rotor_temperature,packing_leakage_rate,axial_bearing_temp,horizontal_bearing_temp,...,Operator6,Operator7,Operator8,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,2016-01-01 00:00:00,67,82,291,1,1,1041,846,334,706,...,0,0,0,0,0,0,0,1,0,0
1,2016-01-01 01:00:00,68,77,1180,1,1,1915,1194,637,1093,...,0,0,0,0,0,0,0,1,0,0
2,2016-01-01 02:00:00,64,76,1406,1,1,511,1577,1121,1948,...,0,0,0,0,0,0,0,1,0,0
3,2016-01-01 03:00:00,63,80,550,1,1,1754,1834,1413,1151,...,0,0,0,0,0,0,0,1,0,0
4,2016-01-01 04:00:00,65,81,1928,1,2,1326,1082,233,1441,...,0,0,0,0,0,0,0,1,0,0


In [43]:
# Column labels for the 24 hour-of-day indicator columns: hour_0 .. hour_23.
hour_dummie_headers = ['hour_' + str(hour) for hour in range(24)]

In [39]:
# Build one indicator column per distinct 'hour' value. The prefix labels each
# column from its own value ('hour_0', 'hour_1', ...), so the names are correct
# by construction instead of depending on a later positional rename.
hour_dummies = pd.get_dummies(equipment_df['hour'], prefix='hour')

In [44]:
# Append the hour indicator columns to the feature frame.
# NOTE(review): unlike 'Operator' and 'day-of-week', the source 'hour' column
# is kept alongside its indicator columns here — confirm that is intended.
frames_to_join = [equipment_df, hour_dummies]
adapted_equipment_df = pd.concat(frames_to_join, axis=1)

In [45]:
# Preview the first five rows with the hour indicator columns appended.
adapted_equipment_df.head()

Unnamed: 0,Date,ambient_temp,ambient_humidity,discharge_pressure,suction_pressure,differential_pressure,rotor_temperature,packing_leakage_rate,axial_bearing_temp,horizontal_bearing_temp,...,14,15,16,17,18,19,20,21,22,23
0,2016-01-01 00:00:00,67,82,291,1,1,1041,846,334,706,...,0,0,0,0,0,0,0,0,0,0
1,2016-01-01 01:00:00,68,77,1180,1,1,1915,1194,637,1093,...,0,0,0,0,0,0,0,0,0,0
2,2016-01-01 02:00:00,64,76,1406,1,1,511,1577,1121,1948,...,0,0,0,0,0,0,0,0,0,0
3,2016-01-01 03:00:00,63,80,550,1,1,1754,1834,1413,1151,...,0,0,0,0,0,0,0,0,0,0
4,2016-01-01 04:00:00,65,81,1928,1,2,1326,1082,233,1441,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# Relabel the trailing 24 columns (the hour indicators) with the prepared
# 'hour_N' headers, keeping every earlier column name unchanged.
hour_columns = [*adapted_equipment_df.columns[:-24], *hour_dummie_headers]
adapted_equipment_df.columns = hour_columns

In [48]:
# Show a bounded preview instead of rendering the whole frame, which bloats
# the saved notebook output without adding information.
adapted_equipment_df.head(10)

Unnamed: 0,Date,ambient_temp,ambient_humidity,discharge_pressure,suction_pressure,differential_pressure,rotor_temperature,packing_leakage_rate,axial_bearing_temp,horizontal_bearing_temp,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,2016-01-01 00:00:00,67,82,291,1,1,1041,846,334,706,...,0,0,0,0,0,0,0,0,0,0
1,2016-01-01 01:00:00,68,77,1180,1,1,1915,1194,637,1093,...,0,0,0,0,0,0,0,0,0,0
2,2016-01-01 02:00:00,64,76,1406,1,1,511,1577,1121,1948,...,0,0,0,0,0,0,0,0,0,0
3,2016-01-01 03:00:00,63,80,550,1,1,1754,1834,1413,1151,...,0,0,0,0,0,0,0,0,0,0
4,2016-01-01 04:00:00,65,81,1928,1,2,1326,1082,233,1441,...,0,0,0,0,0,0,0,0,0,0
5,2016-01-01 05:00:00,67,84,398,1,2,1901,1801,1153,1085,...,0,0,0,0,0,0,0,0,0,0
6,2016-01-01 06:00:00,67,83,847,0,2,1849,1141,1609,982,...,0,0,0,0,0,0,0,0,0,0
7,2016-01-01 07:00:00,67,76,1021,2,1,185,170,952,1183,...,0,0,0,0,0,0,0,0,0,0
8,2016-01-01 08:00:00,65,80,1731,2,0,1424,1176,1223,621,...,0,0,0,0,0,0,0,0,0,0
9,2016-01-01 09:00:00,63,80,415,0,0,1008,1086,1759,1946,...,0,0,0,0,0,0,0,0,0,0
