In [1]:
%store -r res2 status

In [2]:
### data preprocessing

dataset = res2.copy()

#find out which hours are the major down
def find_hours(frame,start,end):
    hours  = [] 

    #This method is trying to tell from which row to which row in the status dataframe should I pay attention to when calculating UDT
    #After you extracted the 2 indexes, you can the calculate how many UDT are there between the 2 index

    for index, row in frame.iterrows():
        a = row[start]
        b = row[end]
        
        # for each row of data in frame, find the number of major down between the input start and end time
        start_interval = max(list(status.index[a >= status['TIMESTAMP_START']])) + 1 #status row index with timestamp closest to our 'Input_Time_Window_Start'
        end_interval = min(list(status.index[b <= status['TIMESTAMP_START']])) #status row index with timestamp_end closest to our 'Input_Time_Window_End'
        count = 0
        result = status.iloc[start_interval:end_interval]
        
        filtered = result[(result['LEVEL3']=='UDT')&(result['DURATION']>3600)]
        if len(filtered) == 0:
            hours.append([0])
        else:
            hour = [] 
            for i in range(len(filtered)):
                hour.append(filtered.iloc[i]['TIMESTAMP_START'].hour - a.hour + 1) #add 1 is to prevent breakdown happening at the same time as this would give rise to class 0 again
            hours.append(hour)

    return hours

def correlation(dataset, threshold):
    col_corr = set() # Set of all the names of deleted columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (corr_matrix.iloc[i, j] >= threshold) and (corr_matrix.columns[j] not in col_corr):
                colname = corr_matrix.columns[i] # getting the name of column
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname] # deleting the column from the dataset
    return dataset

In [3]:
### removes all those with multiple breakdown
data_new = dataset[dataset['NoMajorDown']<2]
print("Removed {} rows with multiple major downs for multi-class classification".format(len(dataset)-len(data_new)))

hours = find_hours(data_new, 'Target_Time_Window_Start', 'Target_Time_Window_End')

### check is hours calculation are correct ###
no_breakdown = len([ele for ele in hours if ele==[0]])
zero = data_new['NoMajorDown'].value_counts()[0]
print(no_breakdown==zero)

data_new = data_new.drop(['Input_Time_Window_Start', 'Input_Time_Window_End', 
                     'Target_Time_Window_Start', 'Target_Time_Window_End',
                     'Small_HangUp_Past7days', 'Small_HangUp_Past5days', 'Small_HangUp_Past3days', 'NoMajorDown',
                         'Alarm4Count', 'Alarm90Count', 'Alarm19Count', 'Alarm3Count',
                         'Machine Failure Failed'], axis=1)

### copy another dataframe with highly correlated columns removed
data_rm = data_new.copy()
data_rm = correlation(data_rm, 0.8)
print("Columns removed from high correlations are {}". format([column for column in data_new if column not in data_rm]))

Removed 13 rows with multiple major downs for multi-class classification
True
Columns removed from high correlations are ['Change Setup', 'Change Setup Failed']


In [4]:
import numpy as np

randomforest_params = {"n_estimators": [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
                        "max_depth": [int(x) for x in np.linspace(10, 110, 11)],
                        "min_samples_split": [2, 5, 10],
                        "min_samples_leaf": [1, 2, 4]}

decisiontree_params = {"max_depth": [int(x) for x in np.linspace(10, 110, 11)],
                       "min_samples_split": [2, 5, 10],
                       "min_samples_leaf": [1, 2, 4]}

XGB_params = {"subsample": [0.3, 0.5, 0.7, 1],
            "n_estimators": [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
            "eta": [0.3, 0.5, 0.7, 1],
            "max_depth": [int(x) for x in np.linspace(10, 110, 11)]}

SVC_params = {'kernel':['linear', 'rbf', 'poly'],
               'C': [0.001, 0.01, 0.1, 1, 10], 
               'gamma' : [0.001, 0.01, 0.1, 1]}

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report
from collections import Counter

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

import matplotlib.pyplot as plt

class Pipeline():
    def __init__(self, data, target, classifiers, params):
        self.data = data
        self.target = target
        self.classifiers = classifiers
        self.params = params
    
    def split_test_train(self, data, target, test_size):
        x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=test_size, random_state=42)
        return x_train, y_train, x_test, y_test
        
    def balance_data(self, x_data, y_data):
        sm = SMOTE(random_state=42)
        x_resampled, y_resampled = sm.fit_resample(x_data, y_data)
        return x_resampled, y_resampled
        
    def fit(self, x_train, y_train, cv):
        model = self.classifiers
        param = self.params
        rs = RandomizedSearchCV(model, param, cv=cv, n_jobs=-1, scoring='f1') #since this is a classification task, we can use f1 as scoring
        rs.fit(x_train, y_train)
        return rs
    
    def evaluate_model(self, result, x_test, y_test):
        print(f"Evaluating {result.best_estimator_} model with score {result.best_score_} with test data ")
        model = result.best_estimator_
        pred = model.predict(x_test)
        
        print(classification_report(y_test, pred))
        print(confusion_matrix(y_test, pred))
        
    def pipeline(self, cv=3):
        params = self.params
        x_train, y_train, x_test, y_test = self.split_test_train(self.data, self.target, 0.25)
        x_balanced, y_balanced = self.balance_data(x_train, y_train)
        rs = self.fit(x_balanced,y_balanced,cv)
        self.evaluate_model(rs, x_test, y_test)
        return rs

In [6]:
data_new = np.array(data_new)
print(data_new.shape)
hours = np.reshape(hours, (len(np.array(hours)),))
print(hours.shape)

(411, 16)
(411,)


In [7]:
rf_pipeline = Pipeline(data_new, hours, RandomForestClassifier(random_state=42), randomforest_params)
rf = rf_pipeline.pipeline()

### pipeline for decisiontree ###
decisiontree_pipeline = Pipeline(data_new, hours, DecisionTreeClassifier(random_state=42), decisiontree_params)
dt = decisiontree_pipeline.pipeline()

### pipeline for XGB ###
XGB_pipeline = Pipeline(data_new, hours, XGBClassifier(random_state=42), XGB_params)
xgb = XGB_pipeline.pipeline()

### pipeline for SVC
SVC_pipeline = Pipeline(data_new, hours, SVC(random_state=42), SVC_params)
svc = SVC_pipeline.pipeline()



Evaluating RandomForestClassifier(max_depth=20, min_samples_leaf=4, n_estimators=1200,
                       random_state=42) model with score nan with test data 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.90      0.83      0.87        89
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         1
           5       0.33      0.50      0.40         2
           6       0.00      0.00      0.00         3

    accuracy                           0.73       103
   macro avg       0.18      0.19      0.18       103
weighted avg       0.79      0.73      0.76       103

[[74  1  3  8  1  1  1]
 [ 0  0  2  1  0  1  0]
 [ 0  0  0  0  0  0  0]
 [ 3  0  1  0  0  0  0]
 [ 1  0  0  0  0  0  0]
 [ 1  0  0  0  0  1  0]
 [ 3  0  0  0  0  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating DecisionTreeClassifier(max_depth=70, min_samples_leaf=2, min_samples_split=10,
                       random_state=42) model with score nan with test data 
              precision    recall  f1-score   support

           0       0.88      0.83      0.86        89
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         3

    accuracy                           0.72       103
   macro avg       0.13      0.12      0.12       103
weighted avg       0.76      0.72      0.74       103

[[74  0  2  3  3  7  0]
 [ 1  0  1  1  1  0  0]
 [ 0  0  0  0  0  0  0]
 [ 3  0  0  0  0  1  0]
 [ 1  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0]
 [ 3  0  0  0  0  0  0]]




Evaluating XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=1, max_delta_step=0, max_depth=30,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=800, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.5,
              tree_method='exact', validate_parameters=1, verbosity=None) model with score nan with test data 
              precision    recall  f1-score   support

           0       0.89      0.87      0.88        89
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating SVC(C=1, gamma=0.1, kernel='poly', random_state=42) model with score nan with test data 
              precision    recall  f1-score   support

           0       0.90      0.71      0.79        89
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         3

    accuracy                           0.61       103
   macro avg       0.13      0.10      0.11       103
weighted avg       0.78      0.61      0.68       103

[[63  6  2  6  0 10  2]
 [ 1  0  0  2  1  0  0]
 [ 0  0  0  0  0  0  0]
 [ 2  0  0  0  0  2  0]
 [ 1  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0]
 [ 1  1  1  0  0  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Multi class prediction with CNN

In [8]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import joblib

import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input, Conv1D, BatchNormalization #import in this exact format to prevent errors
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt

In [9]:
data_cnn = data_new.copy()
scaler = MinMaxScaler()
data_cnn = scaler.fit_transform(data_cnn)

TEST_SPLIT = 0.1 
VAL_SPLIT = 0.2

x_train, x_test, y_train, y_test = train_test_split(data_cnn, hours, test_size=TEST_SPLIT, random_state=42)

### balance the data ###
sm = SMOTE(random_state=42)
x_train, y_train = sm.fit_resample(x_train, y_train)

In [22]:
from collections import Counter, OrderedDict
print(OrderedDict(sorted(Counter(y_train).items())))

OrderedDict([(0, 311), (1, 311), (2, 311), (3, 311), (4, 311), (5, 311), (6, 311)])


In [15]:
print("Shape of X train is ", x_train.shape)
print("Shape of Y train is ", y_train.shape)

Shape of X train is  (2177, 16)
Shape of Y train is  (2177,)


In [11]:
def build_model(n_neurons1=32, dropout_rate=0.5):
    model = Sequential()
    
    model.add(Dense(n_neurons1, input_dim = x_train.shape[1], activation='relu'))
    model.add(Dense(32, activation='relu'))
    
    model.add(Dropout(dropout_rate))
    model.add(Dense(len(np.unique(y_train)), activation='softmax'))
                  
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [12]:
n_neurons1 = [int(x) for x in np.linspace(start = 32, stop = 256, num = 8)]
dropout_rate = [0.5, 0.6, 0.7, 0.8]
epochs = [50, 100, 150, 200]

param = dict(n_neurons1=n_neurons1, dropout_rate=dropout_rate, epochs=epochs)

estimator = KerasClassifier(build_fn = build_model, verbose = 0)
rs = RandomizedSearchCV(estimator=estimator, param_distributions=param, cv=5, n_jobs=-1)
rs_result = rs.fit(x_train, y_train)

In [24]:
rs_result.best_estimator_.model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 160)               2720      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                5152      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 231       
Total params: 8,103
Trainable params: 8,103
Non-trainable params: 0
_________________________________________________________________


In [13]:
print(rs_result.best_params_)
print(rs_result.best_estimator_.score(x_test, y_test)) #catching everything as 0
print(rs_result.best_estimator_.predict(x_test))
y_test

{'n_neurons1': 160, 'epochs': 200, 'dropout_rate': 0.6}
0.6666666865348816
[0 0 0 5 4 0 0 5 0 3 0 0 0 0 0 0 3 0 0 6 0 6 0 0 4 0 0 0 0 0 3 0 0 0 3 0 0
 0 0 0 0 0]




array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 4, 0, 0, 0, 0, 0, 5, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0])

In [25]:
np.array(rs_result.best_estimator_.predict(x_test))

array([0, 0, 0, 5, 4, 0, 0, 5, 0, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 6, 0, 6,
       0, 0, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0])

In [26]:
res2

Unnamed: 0,Input_Time_Window_Start,Input_Time_Window_End,Target_Time_Window_Start,Target_Time_Window_End,NoMajorDown,Small_HangUp_Past7days,Small_HangUp_Past5days,Small_HangUp_Past3days,Proportion of 3 days hangup over 7 days,Proportion of 5 days hangup over 7 days,...,Machine Failure Passed,Machine Failure Failed,Machine Failure Reset,Change Setup,Change Setup Passed,Change Setup Failed,Change Setup Reset,xscation error 1087242244 Count,xscation error 13500912 Count,Days since last maintenance
0,2021-01-10 00:00:00,2021-01-17 00:00:00,2021-01-17 00:00:00,2021-01-17 06:00:00,0,648,532,261,0.402778,0.820988,...,0,26,3,4,0,4,0,0,0,58
1,2021-01-10 06:00:00,2021-01-17 06:00:00,2021-01-17 06:00:00,2021-01-17 12:00:00,0,686,528,275,0.400875,0.769679,...,0,27,3,3,0,3,0,0,0,58
2,2021-01-10 12:00:00,2021-01-17 12:00:00,2021-01-17 12:00:00,2021-01-17 18:00:00,0,667,535,248,0.371814,0.802099,...,0,28,3,3,0,3,0,0,0,59
3,2021-01-10 18:00:00,2021-01-17 18:00:00,2021-01-17 18:00:00,2021-01-18 00:00:00,0,719,593,261,0.363004,0.824757,...,0,28,3,3,0,3,0,0,0,59
4,2021-01-11 00:00:00,2021-01-18 00:00:00,2021-01-18 00:00:00,2021-01-18 06:00:00,0,710,575,235,0.330986,0.809859,...,0,27,3,3,0,3,0,0,0,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,2021-04-24 18:00:00,2021-05-01 18:00:00,2021-05-01 18:00:00,2021-05-02 00:00:00,2,538,365,134,0.249071,0.678439,...,2,16,2,3,0,3,0,0,0,163
420,2021-04-25 00:00:00,2021-05-02 00:00:00,2021-05-02 00:00:00,2021-05-02 06:00:00,2,548,365,147,0.268248,0.666058,...,2,16,1,4,0,4,0,0,0,163
421,2021-04-25 06:00:00,2021-05-02 06:00:00,2021-05-02 06:00:00,2021-05-02 12:00:00,1,618,416,222,0.359223,0.673139,...,2,16,1,3,0,3,0,0,0,163
422,2021-04-25 12:00:00,2021-05-02 12:00:00,2021-05-02 12:00:00,2021-05-02 18:00:00,1,602,424,230,0.382060,0.704319,...,2,16,1,3,0,3,0,0,0,164
