In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, f1_score

In [3]:
# reading in dataset
df_og = pd.read_csv('data/train.csv').drop('id', axis=1)
df_test = pd.read_csv('data/test.csv').drop('id', axis=1)
df_og = pd.concat([df_og, df_test], axis=0).reset_index(drop=True)
test_idx = np.arange(len(df_og)-len(df_test), len(df_og))

df_og.head()

Unnamed: 0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,L50096,L,300.6,309.6,1596,36.1,140,0.0,0,0,0,0,0
1,M20343,M,302.6,312.1,1759,29.1,200,0.0,0,0,0,0,0
2,L49454,L,299.3,308.5,1805,26.5,25,0.0,0,0,0,0,0
3,L53355,L,301.0,310.9,1524,44.3,197,0.0,0,0,0,0,0
4,M24050,M,298.0,309.0,1641,35.4,34,0.0,0,0,0,0,0


## Exploratory Data Analysis

In [4]:
# check for missing values
df_og.isnull().sum()


Product ID                     0
Type                           0
Air temperature [K]            0
Process temperature [K]        0
Rotational speed [rpm]         0
Torque [Nm]                    0
Tool wear [min]                0
Machine failure            90954
TWF                            0
HDF                            0
PWF                            0
OSF                            0
RNF                            0
dtype: int64

In [5]:
df_test['HDF'].value_counts()

0    90468
1      486
Name: HDF, dtype: int64

## Feature Engineering

In [54]:
# engineering features
df_fe = df_og.copy()
# df columns are: ['Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]','Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]','Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']

# get temperature difference between process and air
df_fe['temp_diff'] = df_fe['Process temperature [K]'] - df_fe['Air temperature [K]']

# get all Product Ids with more than a threshold number of occurrences
threshold = 1000
prod_ids = df_og['Product ID'].value_counts()
prod_ids = prod_ids[prod_ids > threshold].index.tolist()
print(len(prod_ids))
df_fe['Product ID'] = df_og['Product ID'].apply(lambda x: "popular" if x in prod_ids else 'other')

# convert categorical columns to one-hot encoding
df_fe = pd.get_dummies(df_fe)

# drop the last 5 features because these indicate type of failure
# df_fe = df_fe.drop(['TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis=1)

# split back into train and test
df_train = df_fe[~df_fe.index.isin(test_idx)]
df_test = df_fe[df_fe.index.isin(test_idx)]


df_fe = df_train

0


In [55]:
# check data imbalance
display(df_og['Machine failure'].value_counts())

df_balanced = df_fe.copy()
# fix data imbalance by undersampling from majority class
df_majority = df_fe[df_fe['Machine failure'] == 0]
df_minority = df_fe[df_fe['Machine failure'] == 1]
df_majority_downsampled = df_majority.sample(n=len(df_minority)*5, random_state=42)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled['Machine failure'].value_counts()
df_balanced = df_downsampled
print("value counts after balancing:")
display(df_balanced['Machine failure'].value_counts()) 

0.0    134281
1.0      2148
Name: Machine failure, dtype: int64

value counts after balancing:


0.0    10740
1.0     2148
Name: Machine failure, dtype: int64

## Model fitting

In [33]:
preds = np.zeros(len(df_balanced))
# split into train and test
for train_index, val_index in KFold(n_splits=3, shuffle=True, random_state=42).split(df_balanced):
    train = df_balanced.iloc[train_index]
    val = df_balanced.iloc[val_index]

    # fit a random forest model on the training data
    model = GradientBoostingClassifier(max_depth=4,n_estimators=100, random_state=42)
    model.fit(train.drop('Machine failure', axis=1), train['Machine failure'])

    # predict model on validation data
    preds[val_index] = model.predict_proba(val.drop('Machine failure', axis=1))[::,1]

# get roc auc score and f1 score
training_pred = model.predict_proba(df_balanced.drop('Machine failure', axis=1))[::,1]
print('training roc auc score: ', roc_auc_score(df_balanced['Machine failure'], training_pred))
print('roc auc score: ', roc_auc_score(df_balanced['Machine failure'], preds))
print('f1 score: ', f1_score(df_balanced['Machine failure'], preds>0.5))


training roc auc score:  0.9745125386223901
roc auc score:  0.9621564081090548
f1 score:  0.8790709416813937


In [92]:
# get horizontal table of feature importances from random forest model
feat_imp = pd.DataFrame({'feature': df_balanced.drop('Machine failure', axis=1).columns, 'importance': model.feature_importances_})
feat_imp

Unnamed: 0,feature,importance
0,Air temperature [K],0.056886
1,Process temperature [K],0.041131
2,Rotational speed [rpm],0.21026
3,Torque [Nm],0.197496
4,Tool wear [min],0.102249
5,TWF,0.067766
6,HDF,0.089458
7,PWF,0.06303
8,OSF,0.090797
9,RNF,0.000533


## Using a Neural network

In [56]:
# import tensorflow as tf
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization


# create model
class FeedForwardNN(object):
    def __init__(self,in_dims):
        super().__init__()

        self.in_dims = in_dims

    def create_net(self):
        model = Sequential()
        model.add(tf.keras.Input((self.in_dims)))
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.1))
        # model.add(BatchNormalization())
        model.add(Dense(64, activation='relu'))
        # add batch normalization
        model.add(Dropout(0.1))
        # model.add(BatchNormalization())
        # model.add(Dense(32, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        return model
    
    def compile_net(self, model):
        self.model = model
        self.model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=[tf.keras.metrics.AUC()])
        return self.model
    
init_lr = 0.01
batch_size = 64
epochs = 50
def lr_scheduler(epoch):
    new_lr = init_lr * 0.95 ** epoch
    print("Learning rate:", new_lr)
    return new_lr

net = FeedForwardNN(in_dims = len(df_train.columns)-1)

s = tf.keras.backend.clear_session()
model=net.create_net()
model.summary()

model = net.compile_net(model)
print(model)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               2048      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         


                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 10,369
Trainable params: 10,369
Non-trainable params: 0
_________________________________________________________________
<keras.engine.sequential.Sequential object at 0x7f9e186b7d90>


In [57]:
# get training and validation data by splitting the balanced dataset 80-20 using the machine failure column
df_train = df_balanced.copy()
train_idx = np.random.choice(len(df_train), int(len(df_train)*0.8), replace=False)
X_train = df_train.iloc[train_idx].drop('Machine failure', axis=1)
y_train = df_train.iloc[train_idx]['Machine failure']
X_val = df_train.iloc[~train_idx].drop('Machine failure', axis=1)
y_val = df_train.iloc[~train_idx]['Machine failure']

history = model.fit(
    X_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[tf.keras.callbacks.LearningRateScheduler(lr_scheduler)],
    shuffle=True,
    verbose=1,
    initial_epoch=0, 
    validation_data=(X_val, y_val)
)

Learning rate: 0.01
Epoch 1/50
Learning rate: 0.0095
Epoch 2/50
Learning rate: 0.009025
Epoch 3/50
Learning rate: 0.00857375
Epoch 4/50
Learning rate: 0.0081450625
Epoch 5/50
Learning rate: 0.007737809374999998
Epoch 6/50
Learning rate: 0.007350918906249998
Epoch 7/50
Learning rate: 0.006983372960937498
Epoch 8/50
Learning rate: 0.006634204312890623
Epoch 9/50
Learning rate: 0.006302494097246091
Epoch 10/50
Learning rate: 0.005987369392383787
Epoch 11/50
Learning rate: 0.005688000922764597
Epoch 12/50
Learning rate: 0.005403600876626367
Epoch 13/50
Learning rate: 0.005133420832795048
Epoch 14/50
Learning rate: 0.0048767497911552955
Epoch 15/50
Learning rate: 0.00463291230159753
Epoch 16/50
Learning rate: 0.0044012666865176535
Epoch 17/50
Learning rate: 0.004181203352191771
Epoch 18/50
Learning rate: 0.003972143184582182
Epoch 19/50
Learning rate: 0.0037735360253530726
Epoch 20/50
Learning rate: 0.0035848592240854188
Epoch 21/50
Learning rate: 0.003405616262881148
Epoch 22/50
Learning r

## Saving Model

In [93]:
# get results on test dataset
rf = RandomForestClassifier(max_depth=10,n_estimators=100, random_state=42)

rf.fit(df_balanced.drop('Machine failure', axis=1), df_balanced['Machine failure'])

preds = rf.predict_proba(df_test.drop('Machine failure', axis=1))[:,1]
preds_df = pd.DataFrame({'id': df_test.index, 'Machine Failure': preds})
# save predictions to csv
preds_df.to_csv('data/submission.csv', index=False)
