In [1]:
import pandas as pd
import datetime as dt
import matplotlib
import matplotlib.pyplot as pp
import seaborn as sns
import numpy as np

get_ipython().magic(u'matplotlib inline')


In [2]:
#inspecting sensor data, remove empty columns and change datatype
data = pd.read_csv('sensor.csv')
#data.info()
#remove old index and column without any data
data = data.drop(['Unnamed: 0','sensor_15'], axis=1)
#convert date to datetime for timeseries and set as new index
data[['timestamp']] = data[['timestamp']].apply(pd.to_datetime)
ts = data.set_index('timestamp')

In [3]:
#check at which timepoints the machine was broken
ts_broken = ts[ts.machine_status == 'BROKEN']
list(ts_broken.index)
#the machine was broken at seven timepoints during 2018

[Timestamp('2018-04-12 21:55:00'),
 Timestamp('2018-04-18 00:30:00'),
 Timestamp('2018-05-19 03:18:00'),
 Timestamp('2018-05-25 00:30:00'),
 Timestamp('2018-06-28 22:00:00'),
 Timestamp('2018-07-08 00:11:00'),
 Timestamp('2018-07-25 14:00:00')]

In [4]:
# create a new class "before breaking/broken" : 10 minutes before breakdown

In [5]:
#fill in missing values, default is 'linear', treat values as equally spaced
ts_interpolated = ts.interpolate()

# create a new class "before breaking/broken" : 10 minutes before breakdown
temp = ts_interpolated['machine_status'].copy()

temp.loc[(temp.index >= pd.to_datetime('2018-04-12 21:45:00')) & (temp.index <= pd.to_datetime('2018-04-12 21:54:00'))] = 'BEFORE_BREAKING'
temp.loc[(temp.index >= pd.to_datetime('2018-04-18 00:20:00')) & (temp.index <= pd.to_datetime('2018-04-18 00:29:00'))] = 'BEFORE_BREAKING'
temp.loc[(temp.index >= pd.to_datetime('2018-05-19 03:08:00')) & (temp.index <= pd.to_datetime('2018-05-19 03:17:00'))] = 'BEFORE_BREAKING'
temp.loc[(temp.index >= pd.to_datetime('2018-05-25 00:20:00')) & (temp.index <= pd.to_datetime('2018-05-25 00:29:00'))] = 'BEFORE_BREAKING'
temp.loc[(temp.index >= pd.to_datetime('2018-06-28 21:50:00')) & (temp.index <= pd.to_datetime('2018-06-28 21:59:00'))] = 'BEFORE_BREAKING'
temp.loc[(temp.index >= pd.to_datetime('2018-07-08 00:01:00')) & (temp.index <= pd.to_datetime('2018-07-08 00:10:00'))] = 'BEFORE_BREAKING'
temp.loc[(temp.index >= pd.to_datetime('2018-07-25 13:50:00')) & (temp.index <= pd.to_datetime('2018-07-25 13:59:00'))] = 'BEFORE_BREAKING'

ts_interpolated['machine_status'] = temp

#check result
#ts_interpolated.info()
#ts_interpolated[ts_interpolated['machine_status'] == 'BEFORE_BREAKING']

In [6]:
# unite 'before_breaking', 'broken' and 'recovering' to detect the abnormal classes : make it 3 class problem 
def make_status_abnormal(alarms): 
    if np.any(alarms == 'BEFORE_BREAKING'):
        return 1
    if np.any(alarms == 'BROKEN'):
        return 1
    if np.any(alarms == 'NORMAL'):
        return 0
    if np.any(alarms == 'RECOVERING'):
        return 2

In [7]:
#add new classes to dataframe
ts_interpolated['machine_status_new'] = ts_interpolated['machine_status'].apply(make_status_abnormal)

In [8]:
#check result
#ts_interpolated.info()

In [9]:
ts_interpolated['machine_status_new'].value_counts()

0    205766
2     14477
1        77
Name: machine_status_new, dtype: int64

In [10]:
feature_subset = list(ts_interpolated.columns.values)

feature_subset.remove('machine_status')

#feature_subset.remove('machine_status_new')

feature_subset.append('machine_status_new_t')

feature_subset

['sensor_00',
 'sensor_01',
 'sensor_02',
 'sensor_03',
 'sensor_04',
 'sensor_05',
 'sensor_06',
 'sensor_07',
 'sensor_08',
 'sensor_09',
 'sensor_10',
 'sensor_11',
 'sensor_12',
 'sensor_13',
 'sensor_14',
 'sensor_16',
 'sensor_17',
 'sensor_18',
 'sensor_19',
 'sensor_20',
 'sensor_21',
 'sensor_22',
 'sensor_23',
 'sensor_24',
 'sensor_25',
 'sensor_26',
 'sensor_27',
 'sensor_28',
 'sensor_29',
 'sensor_30',
 'sensor_31',
 'sensor_32',
 'sensor_33',
 'sensor_34',
 'sensor_35',
 'sensor_36',
 'sensor_37',
 'sensor_38',
 'sensor_39',
 'sensor_40',
 'sensor_41',
 'sensor_42',
 'sensor_43',
 'sensor_44',
 'sensor_45',
 'sensor_46',
 'sensor_47',
 'sensor_48',
 'sensor_49',
 'sensor_50',
 'sensor_51',
 'machine_status_new',
 'machine_status_new_t']

In [11]:
# convert series to supervised learning (code from https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/)
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	n_vars = 1 if type(data) is list else data.shape[1]
	df = pd.DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = pd.concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg

In [12]:
ts_interpolated_reframed = series_to_supervised(ts_interpolated, 1, 1)

In [13]:
ts_interpolated_reframed.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 220319 entries, 2018-04-01 00:01:00 to 2018-08-31 23:59:00
Columns: 106 entries, var1(t-1) to var53(t)
dtypes: float64(103), int64(1), object(2)
memory usage: 179.9+ MB


In [14]:
ts_interpolated_reframed.drop(['var1(t)', 'var2(t)',
       'var3(t)', 'var4(t)', 'var5(t)', 'var6(t)', 'var7(t)', 'var8(t)',
       'var9(t)', 'var10(t)', 'var11(t)', 'var12(t)', 'var13(t)',
       'var14(t)', 'var15(t)', 'var16(t)', 'var17(t)', 'var18(t)',
       'var19(t)', 'var20(t)', 'var21(t)', 'var22(t)', 'var23(t)',
       'var24(t)', 'var25(t)', 'var26(t)', 'var27(t)', 'var28(t)',
       'var29(t)', 'var30(t)', 'var31(t)', 'var32(t)', 'var33(t)',
       'var34(t)', 'var35(t)', 'var36(t)', 'var37(t)', 'var38(t)',
       'var39(t)', 'var40(t)', 'var41(t)', 'var42(t)', 'var43(t)',
       'var44(t)', 'var45(t)', 'var46(t)', 'var47(t)', 'var48(t)',
       'var49(t)', 'var50(t)', 'var51(t)', 'var52(t)', 'var52(t-1)'], axis=1, inplace=True)

In [15]:
ts_interpolated_reframed.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 220319 entries, 2018-04-01 00:01:00 to 2018-08-31 23:59:00
Data columns (total 53 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   var1(t-1)   220319 non-null  float64
 1   var2(t-1)   220319 non-null  float64
 2   var3(t-1)   220319 non-null  float64
 3   var4(t-1)   220319 non-null  float64
 4   var5(t-1)   220319 non-null  float64
 5   var6(t-1)   220319 non-null  float64
 6   var7(t-1)   220319 non-null  float64
 7   var8(t-1)   220319 non-null  float64
 8   var9(t-1)   220319 non-null  float64
 9   var10(t-1)  220319 non-null  float64
 10  var11(t-1)  220319 non-null  float64
 11  var12(t-1)  220319 non-null  float64
 12  var13(t-1)  220319 non-null  float64
 13  var14(t-1)  220319 non-null  float64
 14  var15(t-1)  220319 non-null  float64
 15  var16(t-1)  220319 non-null  float64
 16  var17(t-1)  220319 non-null  float64
 17  var18(t-1)  220319 non-null  float64
 18  var19(t-1)

In [16]:
ts_interpolated_reframed.columns = feature_subset

In [17]:
ts_interpolated_reframed.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 220319 entries, 2018-04-01 00:01:00 to 2018-08-31 23:59:00
Data columns (total 53 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   sensor_00             220319 non-null  float64
 1   sensor_01             220319 non-null  float64
 2   sensor_02             220319 non-null  float64
 3   sensor_03             220319 non-null  float64
 4   sensor_04             220319 non-null  float64
 5   sensor_05             220319 non-null  float64
 6   sensor_06             220319 non-null  float64
 7   sensor_07             220319 non-null  float64
 8   sensor_08             220319 non-null  float64
 9   sensor_09             220319 non-null  float64
 10  sensor_10             220319 non-null  float64
 11  sensor_11             220319 non-null  float64
 12  sensor_12             220319 non-null  float64
 13  sensor_13             220319 non-null  float64
 14  sensor_14         

In [18]:
#[Timestamp('2018-04-12 21:55:00'),
 #Timestamp('2018-04-18 00:30:00'),
# Timestamp('2018-05-19 03:18:00'),
# Timestamp('2018-05-25 00:30:00'),
# Timestamp('2018-06-28 22:00:00'),
# Timestamp('2018-07-08 00:11:00'),
# Timestamp('2018-07-25 14:00:00')]

In [19]:
from sklearn.preprocessing import MinMaxScaler

In [20]:
import tensorflow as tf
from tensorflow import keras

In [21]:
#### RNN with class weights on unsampled dataset

In [22]:
#broken sensors
#[Timestamp('2018-04-12 21:55:00'),
# Timestamp('2018-04-18 00:30:00'),
#Timestamp('2018-05-19 03:18:00'),
#Timestamp('2018-05-25 00:30:00'),
# Timestamp('2018-06-28 22:00:00'),
#Timestamp('2018-07-08 00:11:00'),
#Timestamp('2018-07-25 14:00:00')]

In [23]:
ts_interpolated_train = ts_interpolated_reframed['2018-04-01':'2018-06-21']
ts_interpolated_valid = ts_interpolated_reframed['2018-06-22':'2018-07-01']
ts_interpolated_test = ts_interpolated_reframed['2018-07-02':]

In [24]:
feature_subset.remove('machine_status_new_t')

In [25]:
feature_subset

['sensor_00',
 'sensor_01',
 'sensor_02',
 'sensor_03',
 'sensor_04',
 'sensor_05',
 'sensor_06',
 'sensor_07',
 'sensor_08',
 'sensor_09',
 'sensor_10',
 'sensor_11',
 'sensor_12',
 'sensor_13',
 'sensor_14',
 'sensor_16',
 'sensor_17',
 'sensor_18',
 'sensor_19',
 'sensor_20',
 'sensor_21',
 'sensor_22',
 'sensor_23',
 'sensor_24',
 'sensor_25',
 'sensor_26',
 'sensor_27',
 'sensor_28',
 'sensor_29',
 'sensor_30',
 'sensor_31',
 'sensor_32',
 'sensor_33',
 'sensor_34',
 'sensor_35',
 'sensor_36',
 'sensor_37',
 'sensor_38',
 'sensor_39',
 'sensor_40',
 'sensor_41',
 'sensor_42',
 'sensor_43',
 'sensor_44',
 'sensor_45',
 'sensor_46',
 'sensor_47',
 'sensor_48',
 'sensor_49',
 'sensor_50',
 'sensor_51',
 'machine_status_new']

In [26]:
X_train = ts_interpolated_train[feature_subset]
X_valid = ts_interpolated_valid[feature_subset]
X_test = ts_interpolated_test[feature_subset]
y_train = ts_interpolated_train['machine_status_new_t']
y_valid = ts_interpolated_valid['machine_status_new_t']
y_test = ts_interpolated_test['machine_status_new_t']

In [27]:
ts_interpolated_train['machine_status_new_t'].value_counts()

0    112064
2      5971
1        44
Name: machine_status_new_t, dtype: int64

In [28]:
ts_interpolated_valid['machine_status_new_t'].value_counts()

0    9950
2    4439
1      11
Name: machine_status_new_t, dtype: int64

In [29]:
ts_interpolated_test['machine_status_new_t'].value_counts()

0    83751
2     4067
1       22
Name: machine_status_new_t, dtype: int64

In [30]:
scaler = MinMaxScaler(feature_range=(-1,1))
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

In [31]:
X_train

array([[ 0.9343888 ,  0.47916635,  0.71046678, ..., -0.06265956,
        -0.64726944, -1.        ],
       [ 0.9343888 ,  0.47916635,  0.71046678, ..., -0.06265956,
        -0.64726944, -1.        ],
       [ 0.91817862,  0.49479155,  0.71046678, ..., -0.07033258,
        -0.64249478, -1.        ],
       ...,
       [ 0.96835249,  0.52864535,  0.38084601, ..., -0.47058825,
         1.        , -1.        ],
       [ 0.96140471,  0.52864535,  0.38084601, ..., -0.47058825,
         1.        , -1.        ],
       [ 0.96989583,  0.53385395,  0.38084601, ..., -0.46930926,
         1.        , -1.        ]])

In [32]:
y_train = pd.DataFrame(y_train)

#check column names
for col in y_train.columns:
    print("y_train column names: ")
    print(col)
    
y_valid = pd.DataFrame(y_valid)

#check column names
for col in y_valid.columns:
    print("y_valid column names: ")
    print(col)
    
y_test = pd.DataFrame(y_test)

for col in y_test.columns:
    print("y_test column names: ")
    print(col)

y_train column names: 
machine_status_new_t
y_valid column names: 
machine_status_new_t
y_test column names: 
machine_status_new_t


In [33]:
#for LSTM need [samples, timesteps, features]

In [34]:
X_train_reshaped2 = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))

In [35]:
X_valid_reshaped2 = X_valid.reshape((X_valid.shape[0], 1, X_valid.shape[1]))

In [36]:
X_test_reshaped2 = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

In [37]:
print(X_train_reshaped2.shape, y_train.shape )

(118079, 1, 52) (118079, 1)


In [38]:
y_train = np.array(y_train)
y_valid = np.array(y_valid)
y_test = np.array(y_test)

In [55]:
model3 = keras.models.Sequential([
    keras.layers.LSTM(6,input_shape=(X_train_reshaped2.shape[1], X_train_reshaped2.shape[2])),
    #keras.layers.LSTM(3,return_sequences=True),
    #keras.layers.LSTM(3,return_sequences=True),
    keras.layers.Dense(3,activation='softmax'),
])

In [56]:
model3.compile(loss='sparse_categorical_crossentropy',optimizer='adam', metrics=["accuracy"])

In [57]:
total = 112065+5971+44
weight_for_0 = (1 / 112065)*(total)/3.0 
weight_for_1 = (1 / 44)*(total)/3.0
weight_for_2 = (1 / 5971)*(total)/3.0
class_weight_by_hand = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))
print('Weight for class 2: {:.2f}'.format(weight_for_2))

Weight for class 0: 0.35
Weight for class 1: 894.55
Weight for class 2: 6.59


In [58]:
class_weights = {0: 0.35, 1: 895, 2: 6.6}

In [59]:
X_train.shape

(118079, 52)

In [60]:
history = model3.fit(X_train_reshaped2, y_train, 
                    epochs=20, batch_size=5000,
                    validation_data=(X_valid_reshaped2, 
                                     y_valid),
                    class_weight = class_weights,
                    verbose=0)

In [61]:
mse_test3 = model3.evaluate(X_test_reshaped2, np.array(y_test), verbose=0)

In [62]:
mse_test3

[0.3440248112657357, 0.9870674]

In [63]:
ypred3 = model3.predict(X_test_reshaped2)
y_pred_single3 = ypred3.argmax(axis = 1)[:,None]
y_pred_single_df3 = pd.DataFrame(y_pred_single3)

In [64]:
import sklearn.metrics as metrics

y_test = ts_interpolated_test['machine_status_new']
cnf_matrix_nn_down = metrics.confusion_matrix(y_test, y_pred_single_df3)
cnf_matrix_nn_down



array([[82717,   742,   291],
       [    7,    12,     3],
       [    0,    93,  3975]])

In [65]:
y_test

timestamp
2018-07-02 00:00:00    2.0
2018-07-02 00:01:00    2.0
2018-07-02 00:02:00    2.0
2018-07-02 00:03:00    2.0
2018-07-02 00:04:00    2.0
                      ... 
2018-08-31 23:55:00    0.0
2018-08-31 23:56:00    0.0
2018-08-31 23:57:00    0.0
2018-08-31 23:58:00    0.0
2018-08-31 23:59:00    0.0
Name: machine_status_new, Length: 87840, dtype: float64

In [66]:
#use resampling for timeseries
#https://link.springer.com/article/10.1007/s41060-017-0044-3
#https://datascience.stackexchange.com/questions/28200/when-should-you-balance-a-time-series-dataset
##try upsampling only the breakdowns (adding seconds)

In [67]:
#references

#https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
##try using the last step

#https://developers.google.com/machine-learning/data-prep/construct/sampling-splitting/imbalanced-data
##downsample and upweight

#https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
#https://www.tensorflow.org/tutorials/text/text_classification_rnn
#https://github.com/ovguyo/moviereview
#https://machinelearningmastery.com/what-are-word-embeddings/
#https://www.kaggle.com/rajmehra03/a-detailed-explanation-of-keras-embedding-layer
####text sentiment classification with LSTM and two classes (using word embeddings)

#https://machinelearningmastery.com/use-different-batch-sizes-training-predicting-python-keras/
#sequence prediction, not classification :)

#https://www.tensorflow.org/guide/keras/sequential_model
#basics on sequentual model

#https://github.com/keras-team/keras/issues/3653
#cant do class weights with multi-class classification

#imbalanced 2 class classifier
#https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#understanding_useful_metrics

#multiclass classifier
#https://www.tensorflow.org/tutorials/keras/classification

#https://datascience.stackexchange.com/questions/27533/keras-lstm-with-1d-time-series?rq=1
#https://datascience.stackexchange.com/questions/77588/setting-batch-size-when-performing-multi-class-classification-with-imbalanced-da