In [43]:
# imports
import pandas as pd
import numpy as np
from keras import backend as K
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

In [117]:
# retrieve data
test = pd.read_csv('/kaggle/input/equipfails/equip_failures_test_set.csv', na_values='na')
train = pd.read_csv('/kaggle/input/equipfails/equip_failures_training_set.csv', na_values='na')

# handle nan in test
test = test.interpolate(axis=1, limit=10, limit_direction='both').fillna(0)
train = train.interpolate(axis=1, limit=10, limit_direction='both').fillna(0); # ';' to suppress output

In [118]:
train.columns

Index(['id', 'target', 'sensor1_measure', 'sensor2_measure', 'sensor3_measure',
       'sensor4_measure', 'sensor5_measure', 'sensor6_measure',
       'sensor7_histogram_bin0', 'sensor7_histogram_bin1',
       ...
       'sensor105_histogram_bin2', 'sensor105_histogram_bin3',
       'sensor105_histogram_bin4', 'sensor105_histogram_bin5',
       'sensor105_histogram_bin6', 'sensor105_histogram_bin7',
       'sensor105_histogram_bin8', 'sensor105_histogram_bin9',
       'sensor106_measure', 'sensor107_measure'],
      dtype='object', length=172)

In [119]:
train.describe()

Unnamed: 0,id,target,sensor1_measure,sensor2_measure,sensor3_measure,sensor4_measure,sensor5_measure,sensor6_measure,sensor7_histogram_bin0,sensor7_histogram_bin1,...,sensor105_histogram_bin2,sensor105_histogram_bin3,sensor105_histogram_bin4,sensor105_histogram_bin5,sensor105_histogram_bin6,sensor105_histogram_bin7,sensor105_histogram_bin8,sensor105_histogram_bin9,sensor106_measure,sensor107_measure
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,...,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,30000.5,0.016667,59336.5,121889200.0,336235500.0,6080781.0,4570.921,2891.619,1987.528,3473.278,...,448958.4,216161.5,447091.2,394827.8,333566.2,345577.2,139302.2,9364.293,858.2815,845.4228
std,17320.652413,0.12802,145430.1,339076700.0,776758300.0,86656590.0,73854.34,83499.24,102168.8,126991.2,...,1359823.0,835575.3,1283248.0,1203716.0,1123527.0,1739849.0,482541.6,102874.4,27015.36,26828.15
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15000.75,0.0,834.0,12.0,18.0,12.0,0.0,0.0,0.0,0.0,...,2710.0,1050.0,2341.5,3001.955,420.0,94.0,0.0,0.0,0.0,0.0
50%,30000.5,0.0,30776.0,15626.0,180.0,86.0,0.0,0.0,0.0,0.0,...,228831.0,109338.0,216070.0,184652.0,86107.0,38932.0,3354.0,0.0,0.0,0.0
75%,45000.25,0.0,48668.0,30887.25,1454.0,428.0,0.0,0.0,0.0,0.0,...,435929.5,217002.0,463703.5,400734.0,272993.0,166706.5,137376.5,1966.0,0.0,0.0
max,60000.0,1.0,2746564.0,1065657000.0,2130707000.0,8584298000.0,13292410.0,16357450.0,19422500.0,22487550.0,...,134036900.0,117282300.0,100527700.0,83773090.0,67018470.0,119580100.0,33509240.0,16754620.0,2708070.0,2708070.0


In [120]:
train.head()

Unnamed: 0,id,target,sensor1_measure,sensor2_measure,sensor3_measure,sensor4_measure,sensor5_measure,sensor6_measure,sensor7_histogram_bin0,sensor7_histogram_bin1,...,sensor105_histogram_bin2,sensor105_histogram_bin3,sensor105_histogram_bin4,sensor105_histogram_bin5,sensor105_histogram_bin6,sensor105_histogram_bin7,sensor105_histogram_bin8,sensor105_histogram_bin9,sensor106_measure,sensor107_measure
0,1.0,0.0,76698.0,1065392000.0,2130706000.0,280.0,0.0,0.0,0.0,0.0,...,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,2.0,0.0,33058.0,16529.0,0.0,0.0,0.0,0.0,0.0,0.0,...,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,3.0,0.0,41040.0,20634.0,228.0,100.0,0.0,0.0,0.0,0.0,...,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,4.0,0.0,12.0,0.0,70.0,66.0,0.0,10.0,0.0,0.0,...,240.0,46.0,58.0,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,5.0,0.0,60874.0,31121.0,1368.0,458.0,0.0,0.0,0.0,0.0,...,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0


In [121]:
# trim cols to measures and target
cols = [col for col in cols if 'measure' in col]
train = train[cols + ['target']]
test = test[cols]

In [122]:
# split into input (X) and output (Y) variables
X = train.drop(columns='target').values
Y = train['target'].values
X_test = test.values

In [123]:
# standardize the input feature
sc = StandardScaler()
scaler = sc.fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

In [124]:
# oversampling the failures to overcome class imbalance
sm = SMOTE(random_state=42)
X, Y = sm.fit_resample(X, Y)
X_test

array([[ 0.05192574,  2.78257194,  2.31022414, ..., -0.13992824,
        -0.0317704 , -0.03151279],
       [ 0.21856393, -0.35927528, -0.43281516, ..., -0.03888876,
        -0.0317704 , -0.03151279],
       [ 1.09735796, -0.35893879, -0.43268592, ...,  0.10446626,
        -0.0317704 , -0.03151279],
       ...,
       [-0.0583825 , -0.35940057, -0.43287269, ..., -0.13979572,
        -0.0317704 , -0.03151279],
       [-0.40768047,  2.78247337,  2.31022413, ..., -0.14015676,
        -0.0317704 , -0.03151279],
       [ 0.00456237,  2.78256178,  2.31022413, ..., -0.13982928,
        -0.0317704 , -0.03151279]])

In [125]:
# def metrics
def recall_metric(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_metric(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_metric(y_true, y_pred):
    precision = precision_metric(y_true, y_pred)
    recall = recall_metric(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [126]:
# build classifier neural network
classifier = Sequential()

classifier.add(Dense(30, activation='relu', kernel_initializer='random_normal', input_dim=100)) # Input Layer
classifier.add(Dense(15, activation='relu', kernel_initializer='random_normal')) # Hidden Layer
classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal')) # Output Layer

classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_metric]);

In [127]:
#Fitting the data to the training dataset
# using mini-batch of 50
# 15 epochs to prevent over-fitting on small dataset
classifier.fit(X,Y, batch_size=50, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f9b8423f0b8>

In [128]:
# predict on test data
y_pred = classifier.predict(X_test)

array([[0.0000000e+00],
       [6.8545341e-06],
       [9.1327429e-03],
       ...,
       [0.0000000e+00],
       [3.1002462e-03],
       [2.4063766e-08]], dtype=float32)

In [1]:
# format dataframe to fit submission specification and write csv
submit = pd.DataFrame(y_pred)
submit[0] = submit[0].astype(int)
submit.index += 1
submit.rename(columns={0: 'target'})

# write csv
submit.to_csv('submission.csv', index_label='id', header=['target'])

NameError: name 'pd' is not defined