# Import modules

In [1]:
import pandas as pd
import numpy as np

# our modules
import sys
sys.path.insert(0, '..')

from clean import clean_data
from imbalance import balance_data
from evaluate import evaluate_model

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier

from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV
# neural_network.MLPClassifier

from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, roc_auc_score, r2_score, recall_score, precision_score, f1_score, matthews_corrcoef, confusion_matrix

import seaborn as sns

In [2]:
def _format_six_decimals(value):
    """Render a float with exactly six digits after the decimal point."""
    return "%.6f" % value


# Show floats with six decimals wherever pandas renders them.
pd.set_option("display.precision", 6)
pd.set_option("display.float_format", _format_six_decimals)

In [3]:
# Seed for reproducible randomness (passed to train_test_split).
RANDOM_STATE = 123

# Load data

In [4]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact directory exists; consider a path relative to the project root.
DATA_FILE = '/home/hristo/Documents/WBS_Data_Science/WBS_Predictive_Maintenance/data/ai4i2020.csv'

# 'UDI' becomes the row index instead of a regular column.
data = pd.read_csv(DATA_FILE, index_col='UDI')

In [5]:
# Preview the raw table as loaded (pandas truncates the middle rows).
data

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,M14860,M,298.100000,308.600000,1551,42.800000,0,0,0,0,0,0,0
2,L47181,L,298.200000,308.700000,1408,46.300000,3,0,0,0,0,0,0
3,L47182,L,298.100000,308.500000,1498,49.400000,5,0,0,0,0,0,0
4,L47183,L,298.200000,308.600000,1433,39.500000,7,0,0,0,0,0,0
5,L47184,L,298.200000,308.700000,1408,40.000000,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,M24855,M,298.800000,308.400000,1604,29.500000,14,0,0,0,0,0,0
9997,H39410,H,298.900000,308.400000,1632,31.800000,17,0,0,0,0,0,0
9998,M24857,M,299.000000,308.600000,1645,33.400000,22,0,0,0,0,0,0
9999,H39412,H,299.000000,308.700000,1408,48.500000,25,0,0,0,0,0,0


# Clean data

In [6]:
# Project-local cleaning step. With these flags the displayed result keeps the
# five numeric sensor columns plus the five failure-mode columns; 'Product ID',
# 'Type' and 'Machine failure' are no longer present.
# NOTE(review): the exact meaning of drop_type / multi is defined in the
# `clean` module — confirm there.
data_cl = clean_data(data, drop_type=True, multi=True)
data_cl

Unnamed: 0_level_0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,298.100000,308.600000,1551,42.800000,0,0,0,0,0,0
2,298.200000,308.700000,1408,46.300000,3,0,0,0,0,0
3,298.100000,308.500000,1498,49.400000,5,0,0,0,0,0
4,298.200000,308.600000,1433,39.500000,7,0,0,0,0,0
5,298.200000,308.700000,1408,40.000000,9,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9996,298.800000,308.400000,1604,29.500000,14,0,0,0,0,0
9997,298.900000,308.400000,1632,31.800000,17,0,0,0,0,0
9998,299.000000,308.600000,1645,33.400000,22,0,0,0,0,0
9999,299.000000,308.700000,1408,48.500000,25,0,0,0,0,0


# Prepare data

In [7]:
# The five failure-mode indicator columns are the multi-output targets;
# every remaining column of the cleaned frame is a feature.
failure_modes = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
y = data_cl[failure_modes].copy()
X = data_cl.drop(columns=failure_modes)

# Split train and test

## without balancing

In [8]:
# Hold out 20 % of the rows for testing; the fixed seed makes the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=RANDOM_STATE,
)

## with balancing

In [9]:
# Optional random oversampling of the TRAINING split, keyed on the 'HDF' label.
#
# Design notes:
#   * Only X_train / y_train are resampled. This cell runs after the split, so
#     resampling the full `y` would never reach the model, and duplicating
#     rows before splitting would leak copies of training rows into the test
#     set.
#   * Features and targets are re-indexed with the same row positions, so they
#     stay aligned row by row (shuffling `y` on its own would not).
#   * Rows are gathered with positional indexing instead of the removed
#     `DataFrame.append` (gone in pandas >= 2.0).
#   * Sampling uses a generator seeded with RANDOM_STATE for reproducibility.
#
# NOTE(review): the project helper `balance_data(X, y, 'SMOTE')` from the
# `imbalance` module was previously sketched here as an alternative — confirm
# its return structure before wiring it in.
balance = False
if balance:
    rng = np.random.default_rng(RANDOM_STATE)

    hdf_train = y_train['HDF'].to_numpy()
    labels, counts = np.unique(hdf_train, return_counts=True)
    target_count = counts.max()

    # Start from every original training row, then add randomly drawn rows
    # (with replacement) of each under-represented label until every label is
    # as frequent as the most common one.
    row_positions = [np.arange(len(hdf_train))]
    for label, count in zip(labels, counts):
        if count < target_count:
            candidates = np.flatnonzero(hdf_train == label)
            row_positions.append(rng.choice(candidates, size=target_count - count))

    # One shared permutation shuffles features and targets identically.
    shuffled = rng.permutation(np.concatenate(row_positions))
    X_train = X_train.iloc[shuffled].reset_index(drop=True)
    y_train = y_train.iloc[shuffled].reset_index(drop=True)

# Baseline

## Pipeline

In [10]:
# Empty grid: the search evaluates a single candidate, i.e. the pipeline
# with its default hyper-parameters.
params = {}

In [11]:
# Baseline model: standardise the features, then fit one decision tree per
# failure-mode column through MultiOutputClassifier.
# The tree is seeded with RANDOM_STATE so repeated runs produce the same
# splits and therefore the same scores; an unseeded tree may differ from run
# to run.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', MultiOutputClassifier(DecisionTreeClassifier(random_state=RANDOM_STATE)))
])

In [12]:
# 5-fold cross-validated search over `params`, scored with 'accuracy'.
# NOTE(review): for multi-label targets scikit-learn's accuracy counts a row
# as correct only when every label matches — confirm that is the intended metric.
grid = GridSearchCV(estimator=pipe, param_grid=params, scoring='accuracy', cv=5, verbose=1)

In [13]:
# Run the cross-validated search on the training split only; the test split
# is not touched here.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [14]:
# Cross-validated score of the best candidate found by the search.
best_score = grid.best_score_
print(best_score)

0.97725


In [15]:
# Best pipeline found by the search.
# NOTE(review): presumably refit on the whole training split (GridSearchCV's
# default `refit` behaviour, which is not overridden above) — confirm.
best = grid.best_estimator_

In [16]:
# Predict the failure-mode columns for the held-out test rows.
y_pred = best.predict(X_test)

In [17]:
# Element-wise |prediction - truth|: for 0/1 labels this is 0 where a label
# was predicted correctly and 1 where it was not.
# NOTE(review): the name `y_tp` suggests "true positives", but the values are
# a per-label error mask — consider renaming.
y_tp = abs(np.subtract(y_pred, y_test))
print(y_tp)

      TWF  HDF  PWF  OSF  RNF
UDI                          
2657    0    0    0    0    0
446     0    0    0    0    0
9506    0    0    0    0    0
333     0    0    0    0    0
4169    0    0    0    0    0
...   ...  ...  ...  ...  ...
8019    0    0    0    0    0
6464    0    0    0    0    0
2884    0    0    0    0    0
7896    0    0    0    0    0
621     0    0    0    0    0

[2000 rows x 5 columns]


In [18]:
# For every failure-mode column, the percentage of test rows per value of
# y_tp (y_tp is |prediction - truth|, so the share at 0 is the share of
# correct predictions for that column).
accuracy = {
    column: 100 * y_tp[column].value_counts(normalize=True)
    for column in y_tp.columns
}

In [19]:
# Combine the per-column percentages into one table, one column per failure mode.
# NOTE(review): this rebinds `accuracy` from a dict to a DataFrame, which makes
# the cell depend on execution order — a separate name would be safer.
accuracy = pd.DataFrame(accuracy)
accuracy

Unnamed: 0,TWF,HDF,PWF,OSF,RNF
0,98.9,99.75,99.6,99.15,99.6
1,1.1,0.25,0.4,0.85,0.4


## Evaluation

In [20]:
# Project-local evaluation helper; unpacked into five aggregate scores.
# NOTE(review): how the scores are aggregated over the five labels is defined
# in the `evaluate` module — confirm there.
# NOTE(review): this rebinds `accuracy`, replacing the per-label table built
# earlier with a single number.
accuracy, recall, precision, f1s, roc = evaluate_model(y_test, y_predictions=y_pred)

In [21]:
# Report each score as a percentage with two decimals.
print(f"Accuracy: {accuracy:.2%}")
print(f"Recall: {recall:.2%}")
print(f"Precision: {precision:.2%}")
print(f"F1-Score: {f1s:.2%}")
# MCC is not among the values unpacked from evaluate_model, so this stays disabled.
# print(f"MCC: {MCC:.2%}")
print(f"ROC AUC score: {roc:.2%}")

Accuracy: 97.05%
Recall: 56.79%
Precision: 66.24%
F1-Score: 60.72%
ROC AUC score: 71.85%
