In [7]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import graphviz
from sklearn.linear_model import LassoCV

In [8]:
# Hour-of-day histogram of the timestamps in each event log.
for heading, csv_path in (
    ("Maint hours:\n", "./data/PdM_maint.csv"),
    ("Fail hours:\n", "./data/PdM_failures.csv"),
    ("Err hours:\n", "./data/PdM_errors.csv"),
):
    stamps = pd.read_csv(csv_path, parse_dates=["datetime"])["datetime"]
    print(heading, stamps.dt.hour.value_counts().sort_index())

Maint hours:
 6    3286
Name: datetime, dtype: int64
Fail hours:
 3     18
6    743
Name: datetime, dtype: int64
Err hours:
 0      126
1      132
2      142
3      112
4      127
5      117
6     1122
7      119
8      135
9      113
10     124
11      98
12     120
13     130
14     128
15     129
16     113
17     124
18     127
19     108
20     127
21     108
22     116
23     122
Name: datetime, dtype: int64


In [9]:
def load_data_daily(
    telemetry_path: str,
    errors_path: str,
    failures_path: str,
    maint_path: str,
    machines_path: str,
    machine_id: int = 0,
    scale_sensors: bool = True,
    telemetry_agg: str = "mean",  # could be "mean", "median", etc.
    day_start_hour: int = 6,
) -> pd.DataFrame:
    """Build one row per (machineID, operational day) from the five CSV inputs.

    An "operational day" is the calendar date obtained after subtracting
    ``day_start_hour`` hours from a timestamp. With the default of 6, a stamp
    at 06:00 on date D is the *first* hour of operational day D, and stamps
    from 00:00 to 05:59 on date D belong to operational day D-1.

    Parameters
    ----------
    telemetry_path, errors_path, failures_path, maint_path : str
        CSV files with a ``datetime`` column and a ``machineID`` column.
        Telemetry must also carry ``volt``, ``rotate``, ``pressure`` and
        ``vibration``; the event files carry one categorical column each,
        which is one-hot encoded.
    machines_path : str
        CSV keyed by ``machineID``; its columns are left-joined onto the result.
    machine_id : int, default 0
        ``0`` keeps every machine; any other value keeps only that machine.
    scale_sensors : bool, default True
        Standardise the four sensor columns separately for each machine.
    telemetry_agg : str, default "mean"
        Aggregation passed to ``GroupBy.agg`` for the daily sensor values.
    day_start_hour : int, default 6
        Hour of the calendar day at which an operational day begins.

    Returns
    -------
    pd.DataFrame
        Indexed by (``machineID``, ``day``) and sorted on that index. Event
        columns are 0/1 integers; days without a matching event are 0.

    Raises
    ------
    ValueError
        If ``day_start_hour`` is outside ``0..23``.
    """
    if not 0 <= day_start_hour < 24:
        raise ValueError(f"day_start_hour must be in 0..23, got {day_start_hour!r}")

    key_cols = ["machineID", "day"]
    shift = pd.Timedelta(hours=day_start_hour)

    def add_operational_day(df: pd.DataFrame) -> pd.DataFrame:
        # Subtracting the shift moves `day_start_hour` onto midnight, so the
        # floor assigns that hour (and the following 23) to its own date.
        df = df.copy()
        df["day"] = (df["datetime"] - shift).dt.floor("D")
        return df

    def process_events(path: str, col_prefix_map: dict) -> pd.DataFrame:
        events = add_operational_day(pd.read_csv(path, parse_dates=["datetime"]))
        events = pd.get_dummies(events, dtype=int)

        # Swap only the leading prefix; str.replace would also rewrite any
        # later occurrence of the prefix inside the category value.
        rename_map = {}
        for prefix, pretty in col_prefix_map.items():
            for col in events.columns:
                if col.startswith(prefix):
                    rename_map[col] = pretty + col[len(prefix):]
        event_cols = [col for col in events.columns if col in rename_map]

        # A daily flag is 1 if the event occurred at any timestamp of that day.
        daily = events.groupby(key_cols, as_index=False)[event_cols].max()
        return daily.rename(columns=rename_map)

    tele = add_operational_day(pd.read_csv(telemetry_path, parse_dates=["datetime"]))

    sensor_cols = ["volt", "rotate", "pressure", "vibration"]
    tele_daily = tele.groupby(key_cols, as_index=False)[sensor_cols].agg(telemetry_agg)

    err_daily = process_events(errors_path, {"errorID_": "error_"})
    fail_daily = process_events(failures_path, {"failure_": "failure_"})
    maint_daily = process_events(maint_path, {"comp_": "maint_"})

    # Telemetry defines the set of (machine, day) rows; events are attached to it.
    df = tele_daily
    for daily_events in (err_daily, fail_daily, maint_daily):
        df = df.merge(daily_events, on=key_cols, how="left")

    # fill missing events with 0
    event_cols = [c for c in df.columns if c.startswith(("error_", "failure_", "maint_"))]
    df[event_cols] = df[event_cols].fillna(0).astype(int)

    rename = {
        "error_error1": "error_1", "error_error2": "error_2", "error_error3": "error_3",
        "error_error4": "error_4", "error_error5": "error_5",
        "failure_comp1": "failure_c1", "failure_comp2": "failure_c2",
        "failure_comp3": "failure_c3", "failure_comp4": "failure_c4",
        "maint_comp1": "maint_c1", "maint_comp2": "maint_c2",
        "maint_comp3": "maint_c3", "maint_comp4": "maint_c4",
    }
    df = df.rename(columns=rename)

    machines = pd.read_csv(machines_path)
    df = df.merge(machines, on="machineID", how="left")

    if scale_sensors:
        # Ensure float storage so the scaled values are not written into an
        # integer column (possible with aggregations such as "max").
        df[sensor_cols] = df[sensor_cols].astype(float)
        for mid in df["machineID"].unique():
            rows = df["machineID"] == mid
            df.loc[rows, sensor_cols] = StandardScaler().fit_transform(df.loc[rows, sensor_cols])

    df = df.sort_values(key_cols).set_index(key_cols)

    if machine_id != 0:
        df = df.loc[(machine_id, slice(None)), :]

    return df

In [10]:
# Input files for the daily panel, keyed by the loader's parameter names.
pdm_csv_paths = {
    "telemetry_path": "./data/PdM_telemetry.csv",
    "errors_path": "./data/PdM_errors.csv",
    "failures_path": "./data/PdM_failures.csv",
    "maint_path": "./data/PdM_maint.csv",
    "machines_path": "./data/PdM_machines.csv",
}

data = load_data_daily(
    **pdm_csv_paths,
    machine_id=0,  # all machines
    scale_sensors=True,
    telemetry_agg="mean",
)
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,volt,rotate,pressure,vibration,error_1,error_2,error_3,error_4,error_5,failure_c1,failure_c2,failure_c3,failure_c4,maint_c1,maint_c2,maint_c3,maint_c4,model,age
machineID,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,2015-01-01,-0.257097,-0.056867,-0.898499,-0.08073,0,0,0,0,0,0,0,0,0,0,0,0,0,model3,18
1,2015-01-02,-0.118429,0.463672,0.027563,-0.876025,0,0,0,0,0,0,0,0,0,0,0,0,0,model3,18
1,2015-01-03,0.373405,0.271524,-0.099438,4.893198,1,0,1,0,0,0,0,0,0,0,0,0,0,model3,18
1,2015-01-04,0.098945,-0.279084,-0.599137,4.584068,0,0,0,0,1,0,0,0,0,0,0,0,0,model3,18
1,2015-01-05,0.11228,1.274486,0.877956,-0.241107,0,0,0,0,0,0,0,0,1,1,0,0,1,model3,18


In [14]:
from tigramite import data_processing as pp
from tigramite.pcmci import PCMCI
from tigramite.independence_tests.regressionCI import RegressionCI


def run_pcmciplus(
    df_daily: pd.DataFrame,
    tau_max: int = 7,
    tau_min: int = 0,
    pc_alpha: float = 0.05,
    fdr_method: str = "fdr_bh",
    max_machines: int = 5,
    n_days: int = 100,
    verbosity: int = 1,
    model_filter: str | None = None,
):

    d = df_daily.reset_index().copy()
    d["day"] = pd.to_datetime(d["day"])
    d = d.sort_values(["machineID", "day"])

    if model_filter is not None:
        d = d[d["model"] == model_filter].copy()

    static_cols = {"machineID", "day", "model", "age"}
    var_names = [c for c in d.columns if c not in static_cols]

    continuous = {"volt", "rotate", "pressure", "vibration"}
    var_type = np.array([0 if c in continuous else 1 for c in var_names], dtype=int)

    start_day = d["day"].min()
    days = pd.date_range(start_day, periods=n_days, freq="D")

    data_dict = {}
    data_type_dict = {}

    machine_ids = d["machineID"].drop_duplicates().head(max_machines).tolist()
    kept_machine_ids = []

    for mid in machine_ids:
        dm = (d[d["machineID"] == mid]
              .set_index("day")[var_names]
              .sort_index())

        X = dm.loc[days].to_numpy()
        if np.isnan(X).any():
            continue

        data_dict[mid] = X
        data_type_dict[mid] = np.tile(var_type, (X.shape[0], 1))
        kept_machine_ids.append(mid)


    X_all = np.vstack([data_dict[mid] for mid in kept_machine_ids])
    nonconstant = (X_all.max(axis=0) != X_all.min(axis=0))
    var_names = [v for v, keep in zip(var_names, nonconstant) if keep]
    var_type = var_type[nonconstant]

    for mid in kept_machine_ids:
        data_dict[mid] = data_dict[mid][:, nonconstant]
        data_type_dict[mid] = np.tile(var_type, (data_dict[mid].shape[0], 1))

    dataframe = pp.DataFrame(
        data=data_dict,
        data_type=data_type_dict,
        var_names=var_names,
        analysis_mode="multiple",
    )

    pcmci = PCMCI(dataframe=dataframe, cond_ind_test=RegressionCI(), verbosity=verbosity)

    results = pcmci.run_pcmciplus(
        tau_min=tau_min,
        tau_max=tau_max,
        pc_alpha=pc_alpha,
        conflict_resolution=True,
        fdr_method=fdr_method,
    )

    graph = results["graph"]
    N = len(var_names)

    Gs = nx.DiGraph()
    Gs.add_nodes_from(var_names)

    for i in range(N):
        for j in range(N):
            if i == j:
                continue
            lags = []
            for tau in range(tau_min, tau_max + 1):
                if graph[i, j, tau] == "-->":
                    lags.append(tau)
            if lags:
                Gs.add_edge(var_names[i], var_names[j], lags=sorted(lags), method="PCMCI+")

    return Gs, results, var_names


In [None]:
# Settings for this PCMCI+ run (lags 1..7, first five "model1" machines, 120 days).
pcmci_settings = dict(
    tau_max=7,
    tau_min=1,
    max_machines=5,
    n_days=120,
    model_filter="model1",
    verbosity=1,
)
Gs, results, var_names = run_pcmciplus(df_daily=data, **pcmci_settings)

print("Nodes:", Gs.number_of_nodes())
print("Edges:", Gs.number_of_edges())
print("First 15 edges:")
first_edges = list(Gs.edges(data=True))[:15]
for u, v, d in first_edges:
    print(f"{u} -> {v}  lags={d['lags']}")



##
## Step 1: PC1 algorithm for selecting lagged conditions
##

Parameters:
independence test = regression_ci
tau_min = 1
tau_max = 7
pc_alpha = [0.05]
max_conds_dim = None
max_combinations = 1



## Resulting lagged parent (super)sets:

    Variable volt has 3 link(s):
        (volt -1): max_pval = 0.00000, |min_val| =  91.815
        (rotate -1): max_pval = 0.03376, |min_val| =  4.507
        (rotate -2): max_pval = 0.04686, |min_val| =  3.950

    Variable rotate has 2 link(s):
        (rotate -1): max_pval = 0.00000, |min_val| =  89.930
        (failure_c1 -6): max_pval = 0.03006, |min_val| =  7.009

    Variable pressure has 2 link(s):
        (pressure -1): max_pval = 0.00000, |min_val| =  165.957
        (vibration -3): max_pval = 0.03979, |min_val| =  4.227

    Variable vibration has 3 link(s):
        (vibration -1): max_pval = 0.00000, |min_val| =  154.306
        (maint_c4 -2): max_pval = 0.00378, |min_val| =  11.153
        (vibration -2): max_pval = 0.00106, |min_val| = 