In [17]:
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
from dataclasses import dataclass
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler
# from sklearn.ensemble import IsolationForest

import warnings
# NOTE(review): with no category/module filter this hides *every* warning for the rest of
# the kernel session (including deprecation and pandas warnings) -- consider narrowing it.
warnings.filterwarnings("ignore")

# hv.renderer('bokeh').theme = 'dark_minimal'


In [18]:
# Root folder of the dataset files, relative to the notebook's working directory.
# To read from a machine-specific location use an absolute path instead,
# e.g. Path(r"C:\path\to\Dataset") -- a raw string works without escaping the backslashes.
dataset_root = Path("./Dataset")

@dataclass
class Case:
    """One loaded dataset case: a measurements table plus its optional description table."""

    # Table read from the case's ``*_info.csv`` file. RawDataset stores ``None`` here
    # for the anomaly cases, which have no info file.
    info: pd.DataFrame
    # Table read from the case's parquet file.
    measurements: pd.DataFrame


class RawDataset():
    """Load the parquet measurements and CSV description tables of one unit.

    After construction ``data_dict`` maps a case id to a ``Case``:

    * ``"test"`` -- always loaded (real test measurements).
    * ``"train"`` -- loaded when ``load_training`` is true.
    * ``"test_s01"`` / ``"test_s02"`` -- loaded when ``load_synthetic`` is true.
    * ``"anomaly_<NN>_type_<t>"`` (NN in 01/02, t in a/b/c) -- loaded when
      ``load_anomalies`` is true *and* the file exists; these cases have ``info=None``.
    """

    def __init__(self, root, unit = "VG4", load_training=False, load_synthetic=False, load_anomalies=False) -> None:
        """Read every requested case of ``unit`` found under ``root``.

        Args:
            root: Dataset folder, as a ``pathlib.Path`` or a plain string.
            unit: Unit prefix used in the file names (e.g. ``"VG4"``).
            load_training: Also load the training case.
            load_synthetic: Also load the two synthetic test cases.
            load_anomalies: Also load the anomaly files found in ``root / "synthetic_anomalies"``.
                Files that do not exist are skipped without an error, so the
                corresponding keys may be absent from ``data_dict``.
        """
        # A plain string used to fail on ``root / f``; accept it by converting to a Path.
        if isinstance(root, str):
            root = Path(root)

        def generator_files(split):
            # (measurements parquet, info csv) paths of one "generator_data" split.
            stem = f"{unit}_generator_data_{split}"
            return [root / f"{stem}_measurements.parquet", root / f"{stem}_info.csv"]

        # case id -> [measurements path, info path or None]; insertion order is load order.
        cases = {"test": generator_files("testing_real")}

        if load_training:
            cases["train"] = generator_files("training")

        if load_synthetic:
            cases["test_s01"] = generator_files("testing_synthetic_01")
            cases["test_s02"] = generator_files("testing_synthetic_02")

        if load_anomalies:
            anomaly_folder = root / "synthetic_anomalies"
            for anomaly in ["01", "02"]:
                for subtype in ["a", "b", "c"]:
                    anomaly_path = anomaly_folder / f"{unit}_anomaly_{anomaly}_type_{subtype}.parquet"
                    # Best effort: an anomaly file that is not on disk is silently left out.
                    if anomaly_path.exists():
                        cases[f"anomaly_{anomaly}_type_{subtype}"] = [anomaly_path, None]

        self.data_dict = dict()

        for case_id, (measurements_path, info_path) in cases.items():
            # if you need to verify the parquet header:
            # pq_rows = RawDataset.read_parquet_schema_df(measurements_path)
            measurements = pq.read_table(measurements_path).to_pandas()
            info = pd.read_csv(info_path) if info_path is not None else None
            self.data_dict[case_id] = Case(info, measurements)

    @staticmethod
    def read_parquet_schema_df(uri: str) -> pd.DataFrame:
        """Return a Pandas dataframe corresponding to the schema of a local URI of a parquet file.

        The returned dataframe has the columns: column, pa_dtype
        """
        # Ref: https://stackoverflow.com/a/64288036/
        schema = pq.read_schema(uri, memory_map=True)
        schema_df = pd.DataFrame(
            [{"column": name, "pa_dtype": str(pa_dtype)} for name, pa_dtype in zip(schema.names, schema.types)]
        )
        # Ensures columns in case the parquet file has an empty dataframe.
        return schema_df.reindex(columns=["column", "pa_dtype"], fill_value=pd.NA)
    

# Unit 4: training and real test data only; units 5 and 6 also load the synthetic
# test sets and their anomaly label files.
rds_u4 = RawDataset(dataset_root, unit="VG4", load_training=True, load_synthetic=False)
rds_u5 = RawDataset(dataset_root, unit="VG5", load_training=True, load_synthetic=True, load_anomalies=True)
rds_u6 = RawDataset(dataset_root, unit="VG6", load_training=True, load_synthetic=True, load_anomalies=True)

In [20]:
def add_anomaly_ground_truth(rds, subdataset=("01", "02"), anomaly_types=("a", "b", "c")):
    """Attach anomaly labels to the synthetic test sets held by ``rds``.

    For every id ``NN`` in ``subdataset`` the frame
    ``rds.data_dict["test_sNN"].measurements`` receives:

    * one column ``anomaly_NN_type_<t>`` per entry ``t`` of ``anomaly_types``, set to 1
      on the rows where the case ``anomaly_NN_type_<t>`` has ``ground_truth == 1``
      (other rows are left unset, i.e. NaN);
    * an integer column ``anomaly`` that is 1 where any of those columns is 1, else 0.

    The columns are added in place: the returned frames are the very objects stored
    in ``rds.data_dict``.

    Args:
        rds: Object whose ``data_dict`` maps case ids to objects with a
            ``measurements`` DataFrame.
        subdataset: Ids of the synthetic test sets to label.
        anomaly_types: Anomaly type suffixes to merge for each test set.

    Returns:
        List with one labeled DataFrame per entry of ``subdataset``, in order.

    Raises:
        KeyError: If a required case is missing from ``rds.data_dict`` (for example
            because the dataset was loaded without its synthetic or anomaly files).
            The check runs before the corresponding test set is modified.
    """
    labeled_frames = []
    for anomaly in subdataset:
        test_key = f'test_s{anomaly}'
        # Single source of truth for the per-type column names used below.
        type_keys = [f"anomaly_{anomaly}_type_{subtype}" for subtype in anomaly_types]

        missing = [key for key in [test_key, *type_keys] if key not in rds.data_dict]
        if missing:
            raise KeyError(
                f"cases {missing} are not loaded in rds.data_dict; "
                "load the dataset with load_synthetic=True and load_anomalies=True"
            )

        test_s012 = rds.data_dict[test_key].measurements

        for anomaly_key in type_keys:
            labeled_df = rds.data_dict[anomaly_key].measurements
            # The boolean mask is matched to the test set by index label.
            test_s012.loc[labeled_df['ground_truth'] == 1, anomaly_key] = 1

        # Row-wise max ignores the NaN of unlabeled rows; all-NaN rows compare False -> 0.
        test_s012['anomaly'] = (test_s012[type_keys].max(axis=1) == 1).astype(int)
        labeled_frames.append(test_s012)

    return labeled_frames

# Labeled synthetic test sets (s01, s02) for the two units that have anomaly files.
(u5_s01, u5_s02), (u6_s01, u6_s02) = (
    add_anomaly_ground_truth(rds) for rds in (rds_u5, rds_u6)
)

In [21]:
def get_control_vars(df):
    """Return the ``attribute_name`` values of the rows flagged as control signal or input feature.

    ``df`` needs the columns ``control_signal``, ``input_feature`` and
    ``attribute_name``; the result is a NumPy array in row order.
    """
    is_control = df['control_signal'] == True
    is_input = df['input_feature'] == True
    selected = df[is_control | is_input]
    return selected.attribute_name.values

# Show the control/input signal names of unit 5; swap in one of the commented
# calls to inspect another unit instead.
# get_control_vars(rds_u4.data_dict["train"].info)
get_control_vars(rds_u5.data_dict["train"].info)
# get_control_vars(rds_u6.data_dict["train"].info)

array(['tot_activepower', 'charge', 'coupler_position',
       'injector_01_opening', 'injector_02_opening',
       'injector_03_opening', 'injector_04_opening',
       'injector_05_opening', 'pump_calculated_flow',
       'pump_pressure_diff', 'pump_rotspeed', 'turbine_pressure',
       'turbine_rotspeed', 'water_primary_pump_01_opening',
       'water_primary_pump_02_opening', 'timer_turbine_on_off',
       'timer_injector_opening'], dtype=object)

In [23]:
def get_operating_modes(df):
    """Split ``df`` by equilibrium operating mode.

    Returns a ``(turbine, pump)`` tuple: the rows where ``equilibrium_turbine_mode``
    is True and the rows where ``equilibrium_pump_mode`` is True.
    """
    turbine_rows = df['equilibrium_turbine_mode'] == True
    pump_rows = df['equilibrium_pump_mode'] == True
    return df[turbine_rows], df[pump_rows]

# Training sets of every unit, split into equilibrium turbine / pump mode.
(
    (u4_train_equil_turbine, u4_train_equil_pump),
    (u5_train_equil_turbine, u5_train_equil_pump),
    (u6_train_equil_turbine, u6_train_equil_pump),
) = (
    get_operating_modes(rds.data_dict["train"].measurements)
    for rds in (rds_u4, rds_u5, rds_u6)
)

In [24]:
# Synthetic test sets (with anomaly labels), split into equilibrium turbine / pump mode.
(
    (u5_s01_equil_turbine, u5_s01_equil_pump),
    (u5_s02_equil_turbine, u5_s02_equil_pump),
    (u6_s01_equil_turbine, u6_s01_equil_pump),
    (u6_s02_equil_turbine, u6_s02_equil_pump),
) = (
    get_operating_modes(labeled) for labeled in (u5_s01, u5_s02, u6_s01, u6_s02)
)

# Real test sets of every unit, split the same way.
(
    (u4_test_equil_turbine, u4_test_equil_pump),
    (u5_test_equil_turbine, u5_test_equil_pump),
    (u6_test_equil_turbine, u6_test_equil_pump),
) = (
    get_operating_modes(rds.data_dict["test"].measurements)
    for rds in (rds_u4, rds_u5, rds_u6)
)


Some examples:

For Unit 4:
- train on u4_train_equil_pump, test on u4_test_equil_pump

For Unit 5:
- train on u5_train_equil_turbine, test on u5_s01_equil_turbine, u5_s02_equil_turbine, u5_test_equil_turbine

In [25]:
# Description table loaded from unit 4's training info CSV (signal names and role flags).
rds_u4.data_dict["train"].info

Unnamed: 0.1,Unnamed: 0,signal_name,attribute_name,asset,unit,element,signal_type,control_signal,input_feature,output_feature,core_attribute
0,0,FMHL_SUP_VEY.MHL_VG4_G4_P,tot_activepower,Veytaux 1,VG4,Generator,Measurement,False,True,True,False
1,1,FMHL_SUP_VEY.MHL_VGE_PRAP_TEMP_USINE,plant_tmp,Veytaux 1,General,Plant,Measurement,False,False,True,False
2,2,FMHL_SUP_VEY.MHL_V2_EAU0_TEMP_EXT,ext_tmp,Veytaux 1,General,Plant,Measurement,False,False,True,False
3,3,FMHL_SUP_VEY.MHL_VGE_PRAP_CLI_TEMP_DEP,water_primary_cold_tmp,Veytaux 1,General,Cooling,Measurement,False,False,True,False
4,4,FMHL_SUP_VEY.MHL_VGE_PRAP_CLI_TEMP_RET,water_primary_hot_tmp,Veytaux 1,General,Cooling,Measurement,False,False,True,False
5,5,FMHL_SUP_VEY.MHL_VGE_PRAP_CLI_OUV_VANNE,valve_opening,Veytaux 1,General,Cooling,Measurement,False,False,True,False
6,6,FMHL_SUP_VEY.MHL_VGE_PRAP_NIV_REFRI,refri_bath_level,Veytaux 1,General,Cooling,Measurement,False,False,True,False
7,7,FMHL_SUP_VEY.MHL_VGE_PRAP_NIV_ASPI,aspi_bath_level,Veytaux 1,General,Cooling,Measurement,False,False,True,False
8,8,FMHL_SUP_VEY.MHL_VGE_PRAP_NIV_CAN_FUITE,canal_level,Veytaux 1,General,Cooling,Measurement,False,False,True,False
9,9,FMHL_SUP_VEY.MHL_VGA_PRAP_TEMP_CAN_FUITE,canal_tmp,Veytaux 1,General,Cooling,Measurement,False,False,True,False


In [26]:
# Measurements table loaded from unit 4's training parquet file.
rds_u4.data_dict["train"].measurements

Unnamed: 0,tot_activepower,plant_tmp,ext_tmp,water_primary_cold_tmp,water_primary_hot_tmp,valve_opening,refri_bath_level,aspi_bath_level,canal_level,canal_tmp,...,water_circ_hot_02_tmp,water_circ_cold_tmp,machine_on,machine_off,turbine_mode,all,equilibrium_turbine_mode,dyn_only_on,pump_mode,equilibrium_pump_mode
2020-01-02 00:00:00+01:00,-39.830002,16.516650,5.961580,10.900000,12.826471,99.875000,2.956812,5.932592,371.846035,9.000000,...,16.882689,12.220884,True,False,False,True,False,True,False,False
2020-01-02 00:00:30+01:00,-62.433334,16.554980,5.992895,10.850000,12.914706,99.873437,2.955667,5.927700,371.836559,8.912023,...,16.797366,12.095122,True,False,False,True,False,True,True,True
2020-01-02 00:01:00+01:00,-62.571904,16.593311,6.024210,10.800000,13.002564,99.871875,2.960449,5.922808,371.711225,8.824047,...,16.712043,11.969360,True,False,False,True,False,True,True,True
2020-01-02 00:01:30+01:00,-62.507894,16.631641,6.055526,10.850000,13.079487,99.870312,2.968310,5.917917,371.765866,8.736070,...,16.626720,11.825357,True,False,False,True,False,True,True,True
2020-01-02 00:02:00+01:00,-62.397368,16.669971,6.086841,10.900000,13.126829,99.868750,2.979270,5.913025,371.842773,8.648094,...,16.369280,11.608393,True,False,False,True,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-30 23:58:00+01:00,0.000000,15.461263,9.937729,13.200000,16.400000,99.882243,3.285375,6.218800,372.293601,7.000000,...,15.755860,15.035156,False,True,False,True,False,False,False,False
2020-12-30 23:58:30+01:00,0.000000,15.514974,9.946683,13.295652,16.400000,99.883762,3.273071,6.218850,372.295560,7.000000,...,15.762891,15.030469,False,True,False,True,False,False,False,False
2020-12-30 23:59:00+01:00,0.000000,15.568685,9.955638,13.230435,16.400000,99.885282,3.259483,6.218900,372.239923,7.000000,...,15.769922,15.025781,False,True,False,True,False,False,False,False
2020-12-30 23:59:30+01:00,0.000000,15.622396,9.964592,13.222857,16.400000,99.886801,3.247857,6.218950,372.177756,7.000000,...,15.776954,15.021094,False,True,False,True,False,False,False,False
