# Predictive Maintenance with Azure Dataset

## Project imports

In [1]:
import pandas as pd

## Data Imports

Needs pre-processing.ipynb to be run first

In [2]:
def read(name: str, parse_dates: list[str] | None = ["datetime"]) -> pd.DataFrame:
    path = "data/"
    ext = ".csv"
    file = path + name + ext
    return pd.read_csv(file, parse_dates=parse_dates, na_values="NaN")

VARIABLES = ["volt", "rotate", "pressure", "vibration"]
DATA = read("raw_data").dropna(subset=VARIABLES)
normal_behavior_data = read("preprocessing/expected_behavior")
abnormal_data = read("preprocessing/failures_only")

## Full dataset classification

### KNN

#### Pre-processing

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

STATE = 42

independent_data = DATA.drop(
    columns=["datetime", "machineID", "errorID", "failure", "comp"]
)
labels = DATA[["errorID", "failure", "comp"]]

numeric_features = ["volt", "rotate", "pressure", "vibration", "age"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["model"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("ordinal", OrdinalEncoder()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessed_data = preprocessor.fit_transform(independent_data)

#### Model

In [4]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X = preprocessed_data

# Run the classifier for each label
for label in ["errorID", "failure", "comp"]:
    print(f"Classification report for {label}:")
    y = labels[label].fillna("No Anomaly")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=STATE
    )

    # Train a KNN classifier
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)

    # Evaluate the classifier
    y_pred = knn.predict(X_test)
    print(classification_report(y_test, y_pred, zero_division=1))

Classification report for errorID:
              precision    recall  f1-score   support

  No Anomaly       1.00      1.00      1.00    174705
      error1       0.00      0.00      0.00       204
      error2       0.33      0.01      0.01       179
      error3       0.00      0.00      0.00       156
      error4       0.00      0.00      0.00       134
      error5       1.00      0.00      0.00        64

    accuracy                           1.00    175442
   macro avg       0.39      0.17      0.17    175442
weighted avg       0.99      1.00      0.99    175442

Classification report for failure:
              precision    recall  f1-score   support

  No Anomaly       1.00      1.00      1.00    175216
       comp1       0.00      0.00      0.00        64
       comp2       0.00      0.00      0.00        69
       comp3       0.00      0.00      0.00        39
       comp4       0.00      0.00      0.00        54

    accuracy                           1.00    175442
   macr

#### Broadening classification to only two cases

In [5]:
# Combine the labels into a single binary label
labels_combined = labels.any(axis=1).replace({False: "No Anomaly", True: "Anomaly"})

X = preprocessed_data
y = labels_combined

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=STATE
)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     Anomaly       0.14      0.04      0.06      1338
  No Anomaly       0.99      1.00      1.00    174104

    accuracy                           0.99    175442
   macro avg       0.57      0.52      0.53    175442
weighted avg       0.99      0.99      0.99    175442

