# Predictive Maintenance with Azure Dataset

## Project imports

In [1]:
import numpy as np
import pandas as pd

## Data Imports

Needs pre-processing.ipynb to be run first

In [2]:
def read(name: str, parse_dates: list[str] | None = ["datetime"]) -> pd.DataFrame:
    path = "data/"
    ext = ".csv"
    file = path + name + ext
    return pd.read_csv(file, parse_dates=parse_dates, na_values="NaN")

VARIABLES = ["volt", "rotate", "pressure", "vibration"]
DATA = read("raw_data").dropna(subset=VARIABLES)
normal_behavior_data = read("preprocessing/expected_behavior")
abnormal_data = read("preprocessing/failures_only")

## Full dataset classification

### Pre-processing

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

STATE = 42

independent_data = DATA.drop(
    columns=["datetime", "machineID", "errorID", "failure", "comp"]
)
labels = DATA[["errorID", "failure", "comp"]]

numeric_features = ["volt", "rotate", "pressure", "vibration", "age"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["model"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("ordinal", OrdinalEncoder()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessed_data = preprocessor.fit_transform(independent_data)
print(preprocessed_data.shape)

(877209, 6)


### KNN

#### Model

In [4]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X = preprocessed_data

# Run the classifier for each label
for label in ["errorID", "failure", "comp"]:
    print(f"Classification report for {label}:")
    y = labels[label].fillna("No Anomaly")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=STATE
    )

    # Train a KNN classifier
    knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
    knn.fit(X_train, y_train)

    # Evaluate the classifier
    y_pred = knn.predict(X_test)
    print(classification_report(y_test, y_pred, zero_division=1))

Classification report for errorID:
              precision    recall  f1-score   support

  No Anomaly       1.00      1.00      1.00    174705
      error1       0.00      0.00      0.00       204
      error2       0.33      0.01      0.01       179
      error3       0.00      0.00      0.00       156
      error4       0.00      0.00      0.00       134
      error5       1.00      0.00      0.00        64

    accuracy                           1.00    175442
   macro avg       0.39      0.17      0.17    175442
weighted avg       0.99      1.00      0.99    175442

Classification report for failure:
              precision    recall  f1-score   support

  No Anomaly       1.00      1.00      1.00    175216
       comp1       0.00      0.00      0.00        64
       comp2       0.00      0.00      0.00        69
       comp3       0.00      0.00      0.00        39
       comp4       0.00      0.00      0.00        54

    accuracy                           1.00    175442
   macr

#### Narrowing classification to specific cases

In [5]:
# Combine the labels into a single label
def label(row: pd.Series) -> str:
    if not pd.isna(row["failure"]):
        return "failure"
    if not pd.isna(row["errorID"]):
        return "errorID"
    if not pd.isna(row["comp"]):
        return "comp"
    return "No Anomaly"


labels_combined = DATA.apply(label, axis=1)

X = preprocessed_data
y = labels_combined

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=STATE
)

knn = KNeighborsClassifier(n_neighbors=9, n_jobs=-1)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

  No Anomaly       0.99      1.00      1.00    174104
        comp       0.00      0.00      0.00       379
     errorID       0.00      0.00      0.00       733
     failure       0.33      0.00      0.01       226

    accuracy                           0.99    175442
   macro avg       0.33      0.25      0.25    175442
weighted avg       0.99      0.99      0.99    175442



In [6]:
# Combine the labels into a single binary label
binary_labels = labels.any(axis=1).replace({False: "No Anomaly", True: "Anomaly"})

X = preprocessed_data
y = binary_labels

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=STATE
)
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Anomaly       0.42      0.02      0.04      1338
  No Anomaly       0.99      1.00      1.00    174104

    accuracy                           0.99    175442
   macro avg       0.71      0.51      0.52    175442
weighted avg       0.99      0.99      0.99    175442



### Support vector classifier

In [7]:
from sklearn.linear_model import SGDClassifier

svc_sgd = SGDClassifier(
    loss="hinge", class_weight="balanced", random_state=STATE, n_jobs=-1
)
svc_sgd.fit(X_train, y_train)

y_pred = svc_sgd.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Anomaly       0.01      0.42      0.02      1338
  No Anomaly       0.99      0.67      0.80    174104

    accuracy                           0.67    175442
   macro avg       0.50      0.55      0.41    175442
weighted avg       0.99      0.67      0.80    175442



### Single layer perceptron

In [8]:
svc_sgd = SGDClassifier(
    loss="perceptron", class_weight="balanced", random_state=STATE, n_jobs=-1
)
svc_sgd.fit(X_train, y_train)

y_pred = svc_sgd.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Anomaly       0.01      0.73      0.01      1338
  No Anomaly       0.99      0.23      0.37    174104

    accuracy                           0.23    175442
   macro avg       0.50      0.48      0.19    175442
weighted avg       0.98      0.23      0.37    175442



In [9]:
from sklearn.neural_network import MLPClassifier

slp = MLPClassifier(hidden_layer_sizes=(1,), random_state=STATE)
slp.fit(X_train, y_train)

y_pred = slp.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

     Anomaly       0.00      0.00      0.00      1338
  No Anomaly       0.99      1.00      1.00    174104

    accuracy                           0.99    175442
   macro avg       0.50      0.50      0.50    175442
weighted avg       0.98      0.99      0.99    175442



### Logistic regression

In [10]:
svc_sgd = SGDClassifier(
    loss="log_loss", class_weight="balanced", random_state=STATE, n_jobs=-1
)
svc_sgd.fit(X_train, y_train)

y_pred = svc_sgd.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Anomaly       0.01      0.70      0.02      1338
  No Anomaly       0.99      0.34      0.51    174104

    accuracy                           0.34    175442
   macro avg       0.50      0.52      0.26    175442
weighted avg       0.99      0.34      0.50    175442



In [11]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight="balanced", random_state=STATE, n_jobs=-1)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Anomaly       0.01      0.52      0.02      1338
  No Anomaly       0.99      0.56      0.72    174104

    accuracy                           0.56    175442
   macro avg       0.50      0.54      0.37    175442
weighted avg       0.99      0.56      0.71    175442



### Random forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

X = preprocessed_data
y = labels_combined

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=STATE
)

rf = RandomForestClassifier(class_weight="balanced", random_state=STATE, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  No Anomaly       0.99      1.00      1.00    174104
        comp       1.00      0.34      0.50       379
     errorID       1.00      0.12      0.21       733
     failure       1.00      0.60      0.75       226

    accuracy                           0.99    175442
   macro avg       1.00      0.51      0.62    175442
weighted avg       0.99      0.99      0.99    175442



## Failure data classification

### Pre-processing

- Data

In [29]:
independent_data = DATA.drop(
    columns=["datetime", "machineID", "failure"]
)

numeric_features = ["volt", "rotate", "pressure", "vibration", "age"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["model", "errorID", "comp"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="None")),
        ("ordinal", OrdinalEncoder()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessed_data = preprocessor.fit_transform(independent_data)
print(preprocessed_data.shape)

(877209, 8)


- Labels

In [30]:
time_before_failure = 48
interval = 6
counter = 0

labels = DATA[["failure"]].fillna("normal").reset_index(drop=True)
binary_labels = DATA[["failure"]].fillna("normal").reset_index(drop=True)
interval_labels = DATA[["failure"]].fillna("normal").reset_index(drop=True)

for i in range(len(labels) - 1, -1, -1):
    if labels.at[i, "failure"] != "normal":
        counter = time_before_failure
        binary_labels.at[i, "failure"] = "failure"
        current_failure = labels.at[i, "failure"]

    elif counter > 0:
        counter -= 1

        first_hour = (
            f"{time_before_failure - interval - counter // interval * interval + 1}"
        )
        last_hour = f"{time_before_failure - counter // interval * interval}"
        label = f"{first_hour}_to_{last_hour}_hours_before_{current_failure}"

        labels.at[i, "failure"] = label
        binary_labels.at[i, "failure"] = "pre-failure"
        interval_labels.at[i, "failure"] = current_failure


binary_labels.to_csv("data/preprocessing/binary_labels.csv", index=False)
labels.to_csv("data/preprocessing/labels.csv", index=False)
interval_labels.to_csv("data/preprocessing/interval_labels.csv", index=False)

print(labels.shape)
print(f"{labels.value_counts()}\n\n")
print(binary_labels.shape)
print(binary_labels.value_counts())
print(interval_labels.shape)
print(interval_labels.value_counts())

assert labels.shape[0] == preprocessed_data.shape[0]
assert labels.shape == binary_labels.shape
assert labels.shape == interval_labels.shape

(877209, 1)
failure                    
normal                         841624
25_to_30_hours_before_comp2      1500
7_to_12_hours_before_comp2       1500
43_to_48_hours_before_comp2      1500
19_to_24_hours_before_comp2      1500
37_to_42_hours_before_comp2      1500
13_to_18_hours_before_comp2      1500
1_to_6_hours_before_comp2        1500
31_to_36_hours_before_comp2      1500
7_to_12_hours_before_comp1       1152
37_to_42_hours_before_comp1      1152
31_to_36_hours_before_comp1      1152
13_to_18_hours_before_comp1      1152
25_to_30_hours_before_comp1      1152
1_to_6_hours_before_comp1        1152
19_to_24_hours_before_comp1      1152
43_to_48_hours_before_comp1      1149
1_to_6_hours_before_comp4         930
13_to_18_hours_before_comp4       930
25_to_30_hours_before_comp4       930
31_to_36_hours_before_comp4       930
19_to_24_hours_before_comp4       930
7_to_12_hours_before_comp4        930
37_to_42_hours_before_comp4       930
43_to_48_hours_before_comp4       930
1_to_6_hou

### KNN

- Categorical failures

In [31]:
X = preprocessed_data
y = labels["failure"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=STATE
)

knn = KNeighborsClassifier(n_neighbors=23, n_jobs=-1)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

                             precision    recall  f1-score   support

13_to_18_hours_before_comp1       0.00      0.00      0.00       244
13_to_18_hours_before_comp2       0.00      0.00      0.00       267
13_to_18_hours_before_comp3       0.00      0.00      0.00       170
13_to_18_hours_before_comp4       0.00      0.00      0.00       175
19_to_24_hours_before_comp1       0.00      0.00      0.00       210
19_to_24_hours_before_comp2       0.48      0.04      0.08       289
19_to_24_hours_before_comp3       0.75      0.12      0.21       148
19_to_24_hours_before_comp4       0.81      0.12      0.20       191
  1_to_6_hours_before_comp1       0.00      0.00      0.00       232
  1_to_6_hours_before_comp2       0.00      0.00      0.00       291
  1_to_6_hours_before_comp3       0.00      0.00      0.00       160
  1_to_6_hours_before_comp4       0.00      0.00      0.00       179
25_to_30_hours_before_comp1       0.00      0.00      0.00       222
25_to_30_hours_before_comp2      

- Binary failures

In [32]:
X = preprocessed_data
y = binary_labels["failure"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=STATE
)
knn = KNeighborsClassifier(n_neighbors=7, n_jobs=-1)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     failure       0.81      0.53      0.64       226
      normal       0.96      1.00      0.98    168451
 pre-failure       0.45      0.07      0.12      6765

    accuracy                           0.96    175442
   macro avg       0.74      0.53      0.58    175442
weighted avg       0.94      0.96      0.95    175442



### SVC

In [33]:
# from sklearn.svm import SVC

# X = preprocessed_data
# y = binary_labels["failure"]

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=STATE
# )

# svc = SVC(
#     kernel="rbf",
#     class_weight={"failure": 1, "normal": 1, "pre-failure": 3},
#     random_state=STATE,
# )
# svc.fit(X_train, y_train)

# y_pred = svc.predict(X_test)
# print(classification_report(y_test, y_pred, zero_division=0))

### Random forest

In [34]:
X = preprocessed_data
y = labels["failure"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=STATE
)

rf = RandomForestClassifier(class_weight="balanced", random_state=STATE, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

                             precision    recall  f1-score   support

13_to_18_hours_before_comp1       0.00      0.00      0.00       244
13_to_18_hours_before_comp2       0.00      0.00      0.00       267
13_to_18_hours_before_comp3       0.00      0.00      0.00       170
13_to_18_hours_before_comp4       0.00      0.00      0.00       175
19_to_24_hours_before_comp1       0.12      0.00      0.01       210
19_to_24_hours_before_comp2       0.70      0.05      0.09       289
19_to_24_hours_before_comp3       0.79      0.16      0.26       148
19_to_24_hours_before_comp4       0.85      0.12      0.20       191
  1_to_6_hours_before_comp1       0.00      0.00      0.00       232
  1_to_6_hours_before_comp2       0.00      0.00      0.00       291
  1_to_6_hours_before_comp3       0.00      0.00      0.00       160
  1_to_6_hours_before_comp4       0.00      0.00      0.00       179
25_to_30_hours_before_comp1       0.00      0.00      0.00       222
25_to_30_hours_before_comp2      

### MLP

- Model

In [42]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

num_epochs = 100
batch_size = 64


class ClassificationModel(nn.Module):
    PATH = "data/models/"
    EXT = ".pth"

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.output = nn.Linear(64, output_dim)
        self.criterion = nn.CrossEntropyLoss()
        self.state_loaded = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.output(x)
        return x if self.training else torch.softmax(x, dim=1)

    def train_model(
        self, X_train: np.ndarray, y_train: np.ndarray, file_name: str
    ) -> None:
        self.state_loaded = False
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01)
        dataset = TensorDataset(
            torch.tensor(X_train, dtype=torch.float32),
            torch.tensor(y_train, dtype=torch.long),
        )
        train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(num_epochs):
            self.train()

            for X_batch, y_batch in train_loader:
                optimizer.zero_grad()
                outputs = self(X_batch)
                loss = self.criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()

            if epoch % 10 == 0:
                print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

        self.state_loaded = True
        print(f"Epoch {num_epochs}, Loss: {loss.item()}")
        torch.save(self.state_dict(), self.PATH + file_name + self.EXT)

    def test_model(
        self, X_test: np.ndarray, y_test: np.ndarray, file_name: str | None = None
    ) -> None:
        if not self.state_loaded:
            self.load_state_dict(torch.load(self.PATH + file_name + self.EXT))
            self.state_loaded = True

        self.eval()
        dataset = TensorDataset(
            torch.tensor(X_test, dtype=torch.float32),
            torch.tensor(y_test, dtype=torch.long),
        )
        test_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        self.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for X_batch, y_batch in test_loader:
                outputs = self(X_batch)
                _, predicted = torch.max(outputs, 1)
                total += y_batch.size(0)
                correct += (predicted == y_batch).sum().item()
            print(f"Accuracy: {round(100 * correct / total, 2)}%")

- Training

In [43]:
from sklearn.preprocessing import LabelEncoder

filtered_labels = interval_labels[interval_labels["failure"] != "normal"]
filtered_indices = filtered_labels.index
filtered_data = preprocessed_data[filtered_indices]

X = filtered_data
y = filtered_labels["failure"]

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=STATE
)

model = ClassificationModel(X_train.shape[1], len(y.unique()))
model.train_model(X_train, y_train, "classification_mlp")

Epoch 1/100, Loss: 0.8780949115753174
Epoch 11/100, Loss: 0.5379189252853394
Epoch 21/100, Loss: 0.6467944383621216
Epoch 31/100, Loss: 0.39825063943862915
Epoch 41/100, Loss: 0.5953049063682556
Epoch 51/100, Loss: 0.44549211859703064
Epoch 61/100, Loss: 0.42788803577423096
Epoch 71/100, Loss: 0.5345834493637085
Epoch 81/100, Loss: 0.3950938582420349
Epoch 91/100, Loss: 0.5104618668556213
Epoch 100, Loss: 0.3816201388835907


- Testing

In [44]:
model.test_model(X_test, y_test, "classification_mlp")

Accuracy: 78.5%
