In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import graphviz
from sklearn.linear_model import LassoCV

In [11]:
def load_data(telemetry_path: str, errors_path: str, failures_path: str, maint_path: str, machines_path: str, machine_id: int = 0) -> pd.DataFrame:
    def process_data(path: str, one_hot: bool = False) -> pd.DataFrame:
        df = pd.read_csv(path)
        df["datetime"] = pd.to_datetime(df["datetime"])

        if one_hot:
            df = pd.get_dummies(df,dtype=int)
            df = df.groupby(["datetime", "machineID"], as_index=False).max()
        
        df = df.set_index("datetime")
        df = df.sort_index()

        return df

    df_telemetry = process_data(telemetry_path)
    # print(df_telemetry.head())
    df_errors = process_data(errors_path, one_hot=True)
    df_failures = process_data(failures_path, one_hot=True)
    df_maint = process_data(maint_path, one_hot=True)
    # df_machine_info = pd.read_csv(machines_path, index_col="machineID")

    # df_merged = df_telemetry.reset_index().merge(right=df_machine_info, how="left", left_on="machineID", right_on="machineID", )
    df_merged = df_telemetry
    df_merged = df_merged.merge(right=df_errors, how="left", left_on=["datetime", "machineID"], right_on=["datetime", "machineID"])
    df_merged = df_merged.merge(right=df_failures, how="left", left_on=["datetime", "machineID"], right_on=["datetime", "machineID"])
    df_merged = df_merged.merge(right=df_maint, how="left", left_on=["datetime", "machineID"], right_on=["datetime", "machineID"])
    df_merged = df_merged.fillna(0)

    event_cols = [c for c in df_merged.columns if c.startswith(("errorID_", "failure_", "comp_"))]
    df_merged[event_cols] = df_merged[event_cols].astype(int)


    scaler = StandardScaler()
    for each in df_merged["machineID"].unique():
        mask = df_merged["machineID"] == each
        df_merged.loc[mask, ["volt", "rotate", "pressure", "vibration"]] = scaler.fit_transform(df_merged.loc[mask, ["volt", "rotate", "pressure", "vibration"]])

    rename_map = {"errorID_error1": "error_1", "errorID_error2": "error_2", "errorID_error3": "error_3", "errorID_error4": "error_4", "errorID_error5": "error_5",
                  "failure_comp1": "failure_c1", "failure_comp2": "failure_c2", "failure_comp3": "failure_c3", "failure_comp4": "failure_c4",
                  "comp_comp1": "maint_c1", "comp_comp2": "maint_c2", "comp_comp3": "maint_c3", "comp_comp4": "maint_c4"}
    df_merged = df_merged.rename(columns=rename_map)

    df_merged = df_merged.reset_index()

    if machine_id != 0:
        df_merged = df_merged[df_merged["machineID"] == machine_id]

    return df_merged

In [12]:
data = load_data(
    telemetry_path="./data/PdM_telemetry.csv",
    errors_path="./data/PdM_errors.csv",
    failures_path="./data/PdM_failures.csv",
    maint_path="./data/PdM_maint.csv",
    machines_path="./data/PdM_machines.csv"
)
data.head()

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,error_1,error_2,error_3,error_4,error_5,failure_c1,failure_c2,failure_c3,failure_c4,maint_c1,maint_c2,maint_c3,maint_c4
0,2015-01-01 06:00:00,1,0.351582,-0.532971,1.142729,0.812255,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2015-01-01 06:00:00,53,0.769744,-0.486197,0.794046,1.042461,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2015-01-01 06:00:00,99,-0.136049,-1.121576,0.814706,0.254328,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2015-01-01 06:00:00,12,0.029746,2.438481,-0.315594,1.410844,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2015-01-01 06:00:00,6,-2.178076,0.852968,4.603447,-3.326364,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
fail_df = pd.read_csv("./data/PdM_failures.csv", parse_dates=["datetime"])
error_df = pd.read_csv("./data/PdM_errors.csv", parse_dates=["datetime"])
maint_df = pd.read_csv("./data/PdM_maint.csv", parse_dates=["datetime"])

In [14]:
fail_df["datetime"] = pd.to_datetime(fail_df["datetime"])
error_df["datetime"] = pd.to_datetime(error_df["datetime"])
maint_df["datetime"] = pd.to_datetime(maint_df["datetime"])

In [15]:
error_df[error_df["datetime"].duplicated()]

Unnamed: 0,datetime,machineID,errorID
13,2015-04-19 06:00:00,1,error3
32,2015-10-16 06:00:00,1,error3
40,2015-03-18 06:00:00,2,error2
41,2015-03-18 06:00:00,2,error3
45,2015-04-17 06:00:00,2,error3
...,...,...,...
3911,2015-11-05 02:00:00,100,error3
3913,2015-11-12 01:00:00,100,error1
3915,2015-12-04 02:00:00,100,error1
3916,2015-12-08 06:00:00,100,error2


In [16]:
error_one_hot = pd.get_dummies(error_df,dtype=int)
error_one_hot[error_one_hot["datetime"].duplicated()]

Unnamed: 0,datetime,machineID,errorID_error1,errorID_error2,errorID_error3,errorID_error4,errorID_error5
13,2015-04-19 06:00:00,1,0,0,1,0,0
32,2015-10-16 06:00:00,1,0,0,1,0,0
40,2015-03-18 06:00:00,2,0,1,0,0,0
41,2015-03-18 06:00:00,2,0,0,1,0,0
45,2015-04-17 06:00:00,2,0,0,1,0,0
...,...,...,...,...,...,...,...
3911,2015-11-05 02:00:00,100,0,0,1,0,0
3913,2015-11-12 01:00:00,100,1,0,0,0,0
3915,2015-12-04 02:00:00,100,1,0,0,0,0
3916,2015-12-08 06:00:00,100,0,1,0,0,0


In [17]:
data[data["datetime"] == "2015-12-08 06:00:00"][data["machineID"] == 100]

  data[data["datetime"] == "2015-12-08 06:00:00"][data["machineID"] == 100]


Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,error_1,error_2,error_3,error_4,error_5,failure_c1,failure_c2,failure_c3,failure_c4,maint_c1,maint_c2,maint_c3,maint_c4
818424,2015-12-08 06:00:00,100,-2.134944,-2.715345,-1.001133,0.653162,0,1,1,0,0,0,0,0,0,0,0,0,0


In [18]:
# 1) Should be exactly 8761 rows per machine in 2015 telemetry (if filtering a single machine)
print("rows:", len(data), "unique datetimes:", data.index.nunique())

# 2) No duplicated timestamps within a machine
assert not data.index.duplicated().any()


rows: 876100 unique datetimes: 876100
