In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import graphviz
from sklearn.linear_model import LassoCV

In [26]:
# Count how many records of each event log fall on each hour of the day,
# to see at which clock hours maintenance, failures and errors get stamped.
for header, csv_path in (
    ("Maint hours:\n", "./data/PdM_maint.csv"),
    ("Fail hours:\n", "./data/PdM_failures.csv"),
    ("Err hours:\n", "./data/PdM_errors.csv"),
):
    timestamps = pd.read_csv(csv_path, parse_dates=["datetime"])["datetime"]
    hour_counts = timestamps.dt.hour.value_counts().sort_index()
    print(header, hour_counts)

Maint hours:
 6    3286
Name: datetime, dtype: int64
Fail hours:
 3     18
6    743
Name: datetime, dtype: int64
Err hours:
 0      126
1      132
2      142
3      112
4      127
5      117
6     1122
7      119
8      135
9      113
10     124
11      98
12     120
13     130
14     128
15     129
16     113
17     124
18     127
19     108
20     127
21     108
22     116
23     122
Name: datetime, dtype: int64


In [34]:
def load_data_daily(
    telemetry_path: str,
    errors_path: str,
    failures_path: str,
    maint_path: str,
    machines_path: str,
    machine_id: int = 0,
    scale_sensors: bool = True,
    telemetry_agg: str = "mean",  # could be "mean", "median", etc.
    day_start_hour: int = 6,
) -> pd.DataFrame:
    """Build one row per machine per operational day from the raw CSV files.

    Telemetry readings are aggregated per (machineID, day); error, failure and
    maintenance records are turned into 0/1 indicator columns per
    (machineID, day) and left-joined onto the telemetry days, so days without
    telemetry never appear in the result. Machine metadata is joined on
    ``machineID``.

    Parameters
    ----------
    telemetry_path : str
        CSV with ``datetime``, ``machineID`` and the sensor columns
        ``volt``, ``rotate``, ``pressure``, ``vibration``.
    errors_path, failures_path, maint_path : str
        Event CSVs with ``datetime`` and ``machineID`` plus one categorical
        column. The code looks for the dummy columns by the prefixes
        ``errorID_``, ``failure_`` and ``comp_``, i.e. it expects the
        categorical column to be named ``errorID``, ``failure`` and ``comp``
        respectively.
    machines_path : str
        CSV with one row of metadata per ``machineID``.
    machine_id : int, default 0
        ``0`` keeps every machine; any other value keeps only that machine.
    scale_sensors : bool, default True
        If true, the daily sensor aggregates are standardized with
        ``StandardScaler``, fitted separately for each machine.
    telemetry_agg : str, default "mean"
        Aggregation passed to ``GroupBy.agg`` for the sensor columns.
    day_start_hour : int, default 6
        Clock hour at which an operational day begins. A timestamp at exactly
        this hour opens a new day.

    Returns
    -------
    pd.DataFrame
        Sorted and indexed by ``(machineID, day)``. Columns are the sensor
        aggregates, integer ``error_*`` / ``failure_*`` / ``maint_*``
        indicators (0 where no event was recorded), then the machine metadata.
    """
    sensor_cols = ["volt", "rotate", "pressure", "vibration"]
    keys = ["machineID", "day"]
    day_offset = pd.Timedelta(hours=day_start_hour)

    def add_operational_day(df: pd.DataFrame) -> pd.DataFrame:
        # Shift back by the day-start offset and floor to midnight: with the
        # default of 6 h an operational day covers 06:00 .. 05:59 of the next
        # calendar day. A record stamped exactly 06:00 therefore starts its
        # own day and is grouped with the hours that FOLLOW it.
        # NOTE(review): if boundary-stamped events (maintenance/failures at
        # 06:00) are meant to be attributed to the preceding 24 h instead,
        # the boundary has to become exclusive -- confirm the intent.
        df = df.copy()
        df["day"] = (df["datetime"] - day_offset).dt.floor("D")
        return df

    def process_events(path: str, col_prefix_map: dict) -> pd.DataFrame:
        # Read one event log and reduce it to 0/1 flags per (machineID, day).
        df = pd.read_csv(path, parse_dates=["datetime"])
        df = add_operational_day(df)

        # One indicator column per category of the event column.
        df = pd.get_dummies(df, dtype=int)

        prefixes = tuple(col_prefix_map)
        event_cols = [c for c in df.columns if c.startswith(prefixes)]

        # max() over the day yields 1 if the event occurred at least once.
        # (A prior per-timestamp max is unnecessary: max of maxes is the max.)
        daily = df.groupby(keys, as_index=False)[event_cols].max()

        # Swap only the LEADING prefix; str.replace would also rewrite any
        # later occurrence of the prefix inside the category name itself.
        rename_map = {}
        for prefix, pretty in col_prefix_map.items():
            for c in event_cols:
                if c.startswith(prefix):
                    rename_map[c] = pretty + c[len(prefix):]
        daily = daily.rename(columns=rename_map)

        flag_cols = [c for c in daily.columns if c not in keys]
        return daily.astype({c: int for c in flag_cols})

    tele = pd.read_csv(telemetry_path, parse_dates=["datetime"])
    tele = add_operational_day(tele)
    tele_daily = tele.groupby(keys, as_index=False)[sensor_cols].agg(telemetry_agg)

    err_daily = process_events(errors_path, {"errorID_": "error_"})
    fail_daily = process_events(failures_path, {"failure_": "failure_"})
    maint_daily = process_events(maint_path, {"comp_": "maint_"})

    # Left joins: telemetry days define the rows, events only add columns.
    df = (
        tele_daily
        .merge(err_daily, on=keys, how="left")
        .merge(fail_daily, on=keys, how="left")
        .merge(maint_daily, on=keys, how="left")
    )

    # Days with no matching event row come out of the join as NaN -> 0.
    event_cols = [c for c in df.columns if c.startswith(("error_", "failure_", "maint_"))]
    df[event_cols] = df[event_cols].fillna(0).astype(int)

    machines = pd.read_csv(machines_path)
    df = df.merge(machines, on="machineID", how="left")

    if scale_sensors:
        # Make sure scaled floats can be written back even if the chosen
        # aggregation produced a non-float dtype.
        df[sensor_cols] = df[sensor_cols].astype(float)
        scaler = StandardScaler()
        # Re-fit per machine so each machine is standardized on its own data.
        for mid in df["machineID"].unique():
            m = df["machineID"] == mid
            df.loc[m, sensor_cols] = scaler.fit_transform(df.loc[m, sensor_cols])

    df = df.sort_values(keys).set_index(keys)

    if machine_id != 0:
        # Tuple + slice keeps both index levels in the result.
        df = df.loc[(machine_id, slice(None)), :]

    return df

In [35]:
# Daily table for the whole fleet: machine_id=0 applies no machine filter,
# sensors are aggregated with the daily mean and then scaled.
data = load_data_daily(
    "./data/PdM_telemetry.csv",  # telemetry_path
    "./data/PdM_errors.csv",     # errors_path
    "./data/PdM_failures.csv",   # failures_path
    "./data/PdM_maint.csv",      # maint_path
    "./data/PdM_machines.csv",   # machines_path
    machine_id=0,
    scale_sensors=True,
    telemetry_agg="mean",
)
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,volt,rotate,pressure,vibration,error_error1,error_error2,error_error3,error_error4,error_error5,failure_comp1,failure_comp2,failure_comp3,failure_comp4,maint_comp1,maint_comp2,maint_comp3,maint_comp4,model,age
machineID,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,2015-01-01,-0.257097,-0.056867,-0.898499,-0.08073,0,0,0,0,0,0,0,0,0,0,0,0,0,model3,18
1,2015-01-02,-0.118429,0.463672,0.027563,-0.876025,0,0,0,0,0,0,0,0,0,0,0,0,0,model3,18
1,2015-01-03,0.373405,0.271524,-0.099438,4.893198,1,0,1,0,0,0,0,0,0,0,0,0,0,model3,18
1,2015-01-04,0.098945,-0.279084,-0.599137,4.584068,0,0,0,0,1,0,0,0,0,0,0,0,0,model3,18
1,2015-01-05,0.11228,1.274486,0.877956,-0.241107,0,0,0,0,0,0,0,0,1,1,0,0,1,model3,18
