# Predictive Maintenance with Azure Dataset

## Project imports

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd

## Data Imports

Run `data_extraction.ipynb` before this notebook: the cells below read CSV files from `data/`, which that notebook is expected to populate.

In [5]:
def read(name: str, parse_dates: list[str] | None = ["datetime"]) -> pd.DataFrame:
    path = "data/"
    ext = ".csv"
    file = path + name + ext
    return pd.read_csv(file, parse_dates=parse_dates, na_values="NaN")


# Combined raw table plus the error, failure and maintenance logs, read in that
# order from data/ with their `datetime` column parsed.
data, errors, failures, maint = (
    read(name) for name in ("raw_data", "PdM_errors", "PdM_failures", "PdM_maint")
)

## Pre-processing

Pre-processing the data to prepare it for predictive maintenance.

### Removing suspected abnormal data

In [7]:
# Hours of telemetry treated as suspect before and after each logged event
time_before = 48
time_after = 6

variables = ["volt", "rotate", "pressure", "vibration"]
# Keep only rows with all four sensor readings and drop the event-label
# columns. dropna/drop return new frames, so `data` itself is left untouched.
normal_behavior_data = data.dropna(subset=variables).drop(
    columns=["errorID", "failure", "comp"]
)
anomalies = [failures, errors, maint]

# One row per logged failure, error or maintenance record, carrying the
# [start, end] bounds of the window around it
windows = pd.concat(
    [
        anomaly.assign(
            start=anomaly["datetime"] - pd.Timedelta(hours=time_before),
            end=anomaly["datetime"] + pd.Timedelta(hours=time_after),
        )
        for anomaly in anomalies
    ]
)

# True for every row that falls inside at least one window of its own machine
mask = pd.Series(False, index=normal_behavior_data.index)
machine_ids = normal_behavior_data["machineID"]
timestamps = normal_behavior_data["datetime"]

# Work machine by machine: each window is only compared with the timestamps of
# the machine it belongs to, instead of with every row of the table.
for machine_id, machine_windows in windows.groupby("machineID"):
    machine_times = timestamps[machine_ids == machine_id]
    if machine_times.empty:
        continue
    in_window = pd.Series(False, index=machine_times.index)
    for start, end in zip(machine_windows["start"], machine_windows["end"]):
        # between() is inclusive on both ends, matching `>= start` and `<= end`
        in_window |= machine_times.between(start, end)
    mask.loc[in_window[in_window].index] = True

# to_csv does not create missing directories, so make sure the target exists
Path("data/preprocessing").mkdir(parents=True, exist_ok=True)

# Rows inside a window, de-duplicated, are saved separately
removed_data = normal_behavior_data[mask].drop_duplicates()
removed_data.to_csv("data/preprocessing/failures_only.csv", index=False)

# Everything outside the windows is kept as the expected-behavior data
normal_behavior_data = normal_behavior_data[~mask]
normal_behavior_data.to_csv("data/preprocessing/expected_behavior.csv", index=False)
print(normal_behavior_data.head())

               datetime  machineID        volt      rotate    pressure  \
4   2015-01-01 06:00:00          1  176.217853  418.504078  113.077935   
108 2015-01-05 13:00:00          1  180.511003  429.058686  107.314608   
109 2015-01-05 14:00:00          1  174.675215  396.757832  111.505227   
110 2015-01-05 15:00:00          1  181.406935  575.505189  102.008082   
111 2015-01-05 16:00:00          1  197.636954  448.467915   78.721961   

     vibration   model  age  
4    45.087686  model3   18  
108  39.232469  model3   18  
109  33.156011  model3   18  
110  38.054036  model3   18  
111  34.367747  model3   18  


### Identifying machine expected behavior

In [8]:
from scipy.stats import kurtosis, skew, t


def calculate_statistics(
    machine_data: pd.DataFrame, machine_id: str | int
) -> list[dict]:
    variables = ["volt", "rotate", "pressure", "vibration"]
    results = []
    for var in variables:
        machine_var = machine_data[var]
        mean = machine_var.mean()
        std = machine_var.std()
        n = machine_var.count()
        confidence_interval = t.interval(0.95, df=n - 1, loc=mean, scale=std / np.sqrt(n))
        q1 = machine_var.quantile(0.25)
        q3 = machine_var.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        outliers = machine_var[(machine_var < lower_fence) | (machine_var > upper_fence)]

        results.append({
            "machineID": machine_id,
            "variable": var,
            "mean": mean,
            "median": machine_var.median(),
            "std": std,
            "confidence_interval_95": confidence_interval,
            "skew": skew(machine_var),
            "kurtosis": kurtosis(machine_var),
            "min": machine_var.min(),
            "lower_fence": lower_fence,
            "q1": q1,
            "q3": q3,
            "upper_fence": upper_fence,
            "max": machine_var.max(),
            "outliers": outliers.values.tolist()
        })
    return results


# Start with the statistics of all machines pooled together ...
statistics = calculate_statistics(normal_behavior_data, "all")

# ... then append the same statistics computed on each machine's own rows
for machine_id in normal_behavior_data["machineID"].unique():
    machine_rows = normal_behavior_data[normal_behavior_data["machineID"] == machine_id]
    statistics.extend(calculate_statistics(machine_rows, machine_id))

# Tabulate the result rows, save them and preview the pooled statistics
statistics_df = pd.DataFrame(statistics)
statistics_df.to_csv("data/preprocessing/statistics.csv", index=False)
print(statistics_df.head(4))

  machineID   variable        mean      median        std  \
0       all       volt  170.585667  170.453691  15.378671   
1       all     rotate  447.774725  448.413383  51.820495   
2       all   pressure  100.673857  100.335225  10.832220   
3       all  vibration   40.292168   40.181184   5.278982   

                     confidence_interval_95      skew  kurtosis         min  \
0   (170.5473486438166, 170.62398550059842)  0.066006  0.079516   97.333604   
1    (447.6456062965298, 447.9038444620059) -0.095101  0.136754  160.258190   
2  (100.64686638533253, 100.70084681439623)  0.334694  0.729312   51.237106   
3    (40.27901420978728, 40.30532107051222)  0.192482  0.355528   14.877054   

   lower_fence          q1          q3  upper_fence         max  \
0   129.315043  160.203042  180.795042   211.683041  250.870453   
1   309.901260  413.610414  482.749849   586.459003  695.020984   
2    72.543442   93.446370  107.381656   128.284584  182.111770   
3    26.322213   36.741937   4

### Identifying machine failure behavior

In [9]:
# Start with the statistics of all removed (in-window) rows pooled together ...
failure_stats = calculate_statistics(removed_data, "all")

# ... then append the same statistics computed on each machine's own rows
for machine_id in removed_data["machineID"].unique():
    machine_rows = removed_data[removed_data["machineID"] == machine_id]
    failure_stats.extend(calculate_statistics(machine_rows, machine_id))

# Tabulate the result rows, save them and preview the pooled statistics
failure_stats_df = pd.DataFrame(failure_stats)
failure_stats_df.to_csv("data/preprocessing/failures_statistics.csv", index=False)
print(failure_stats_df.head(4))

  machineID   variable        mean      median        std  \
0       all       volt  171.239549  170.989856  15.808833   
1       all     rotate  443.792910  445.451349  54.568890   
2       all   pressure  101.303030  100.649051  11.540464   
3       all  vibration   40.608230   40.378852   5.577645   

                     confidence_interval_95      skew  kurtosis         min  \
0   (171.17846972551618, 171.3006282430816)  0.127922  0.177492  100.194137   
1   (443.5820770580944, 444.00374352779164) -0.224588  0.282028  138.432075   
2  (101.25844165071973, 101.34761749075201)  0.525895  1.075901   54.170030   
3    (40.58667980246432, 40.62977956050882)  0.350834  0.621969   16.222680   

   lower_fence          q1          q3  upper_fence         max  \
0   129.070249  160.551789  181.539482   213.021022  255.124717   
1   301.612288  409.115521  480.784343   588.287576  675.685691   
2    72.107364   93.629721  107.977959   129.500316  185.951998   
3    26.122543   36.864514   4