## Data Loading

In [68]:
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
from dataclasses import dataclass
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
# from sklearn.decomposition import PCA
# from sklearn.ensemble import IsolationForest

import warnings
warnings.filterwarnings("ignore")

# hv.renderer('bokeh').theme = 'dark_minimal'


In [2]:
# Root folder of the dataset files, given relative to the notebook's working directory.
dataset_root = Path("./Dataset")

@dataclass
class Case:
    """Pair of tables that together describe one loaded dataset case."""

    # Table read from the case's info CSV; RawDataset stores None here for
    # cases that come without an info file.
    info: pd.DataFrame
    # Table read from the case's measurements parquet file.
    measurements: pd.DataFrame


class RawDataset:
    """Load the measurement and info tables of one generator unit into memory.

    Every loaded case is stored in ``self.data_dict`` as a ``Case`` keyed by a
    short identifier: ``"test"`` (always loaded), ``"train"``, ``"test_s01"`` /
    ``"test_s02"`` and ``"anomaly_<set>_type_<type>"`` (optional, see the flags
    of ``__init__``).
    """

    def __init__(self, root, unit="VG4", load_training=False, load_synthetic=False, load_anomalies=False) -> None:
        """Read the requested cases for ``unit`` from the dataset folder ``root``.

        Parameters:
        root (str | Path): Folder that contains the parquet / CSV files.
        unit (str): Prefix of the file names, e.g. "VG4".
        load_training (bool): Also load the training case ("train").
        load_synthetic (bool): Also load the two synthetic test cases.
        load_anomalies (bool): Also load the anomaly parquet files found under
            ``root / "synthetic_anomalies"``. Files that do not exist are
            skipped, and these cases are stored with ``info=None``.
        """
        # Coerce once so a plain string works with the `/` joins below.
        root = Path(root)

        def case_files(split):
            """Return [measurements file (relative to root), info CSV path] for one split."""
            stem = f"{unit}_generator_data_{split}"
            return [f"{stem}_measurements.parquet", root / f"{stem}_info.csv"]

        # The real test case is always loaded; the others are opt-in.
        cases = {"test": case_files("testing_real")}

        if load_training:
            cases["train"] = case_files("training")

        if load_synthetic:
            cases["test_s01"] = case_files("testing_synthetic_01")
            cases["test_s02"] = case_files("testing_synthetic_02")

        if load_anomalies:
            anomaly_folder = Path("synthetic_anomalies")  # Relative to root
            for subdataset in ("01", "02"):
                for anomaly_type in ("a", "b", "c"):
                    anomaly_file = anomaly_folder / f"{unit}_anomaly_{subdataset}_type_{anomaly_type}.parquet"
                    # Only register the anomaly files that are actually present.
                    if (root / anomaly_file).exists():
                        cases[f"anomaly_{subdataset}_type_{anomaly_type}"] = [anomaly_file, None]

        self.data_dict = {}

        for case_id, (measurements_file, info_file) in cases.items():
            # If you need to verify the parquet header first:
            # RawDataset.read_parquet_schema_df(root / measurements_file)
            measurements = pq.read_table(root / measurements_file).to_pandas()
            info = pd.read_csv(info_file) if info_file is not None else None
            self.data_dict[case_id] = Case(info, measurements)

    @staticmethod
    def read_parquet_schema_df(uri: str) -> pd.DataFrame:
        """Return a Pandas dataframe corresponding to the schema of a local URI of a parquet file.

        The returned dataframe has the columns: column, pa_dtype
        """
        # Ref: https://stackoverflow.com/a/64288036/
        schema = pq.read_schema(uri, memory_map=True)
        schema = pd.DataFrame(({"column": name, "pa_dtype": str(pa_dtype)} for name, pa_dtype in zip(schema.names, schema.types)))
        schema = schema.reindex(columns=["column", "pa_dtype"], fill_value=pd.NA)  # Ensures columns in case the parquet file has an empty dataframe.
        return schema
    

# Unit VG4: real test case plus the training case.
rds_u4 = RawDataset(root=dataset_root, unit="VG4", load_training=True, load_synthetic=False)
# Units VG5 and VG6: additionally the synthetic test cases and any anomaly files present.
rds_u5 = RawDataset(root=dataset_root, unit="VG5", load_training=True, load_synthetic=True, load_anomalies=True)
rds_u6 = RawDataset(root=dataset_root, unit="VG6", load_training=True, load_synthetic=True, load_anomalies=True)

## Manual Thresholding

In [75]:
# Load the "*_equil_{turbine,pump}_raw" CSV files from the working directory.
# Each file ends up in a variable that carries the file's own name.

# Unit 5, files tagged s01 and s02.
u5_s01_equil_turbine_raw, u5_s01_equil_pump_raw = map(
    pd.read_csv, ('u5_s01_equil_turbine_raw.csv', 'u5_s01_equil_pump_raw.csv')
)
u5_s02_equil_turbine_raw, u5_s02_equil_pump_raw = map(
    pd.read_csv, ('u5_s02_equil_turbine_raw.csv', 'u5_s02_equil_pump_raw.csv')
)

# Unit 6, files tagged s01 and s02.
u6_s01_equil_turbine_raw, u6_s01_equil_pump_raw = map(
    pd.read_csv, ('u6_s01_equil_turbine_raw.csv', 'u6_s01_equil_pump_raw.csv')
)
u6_s02_equil_turbine_raw, u6_s02_equil_pump_raw = map(
    pd.read_csv, ('u6_s02_equil_turbine_raw.csv', 'u6_s02_equil_pump_raw.csv')
)

# Units 4, 5 and 6, files tagged test.
(
    u4_test_equil_turbine_raw, u4_test_equil_pump_raw,
    u5_test_equil_turbine_raw, u5_test_equil_pump_raw,
    u6_test_equil_turbine_raw, u6_test_equil_pump_raw,
) = map(pd.read_csv, (
    'u4_test_equil_turbine_raw.csv', 'u4_test_equil_pump_raw.csv',
    'u5_test_equil_turbine_raw.csv', 'u5_test_equil_pump_raw.csv',
    'u6_test_equil_turbine_raw.csv', 'u6_test_equil_pump_raw.csv',
))


In [93]:
# Show the maximum of every column of u6_s01; lifting the row limit keeps
# pandas from truncating the listing.
# NOTE(review): u6_s01 is not defined in the cells visible here — confirm it is
# created earlier so the notebook survives Restart & Run All.
with pd.option_context('display.max_rows', None):
    display(u6_s01.max())

Unnamed: 0                        2021-07-31 00:00:00+02:00
tot_activepower                                  124.874138
ext_tmp                                           39.514061
plant_tmp                                         21.090532
charge                                           122.702652
coupler_position                                 185.696976
injector_01_opening                                81.88051
injector_02_opening                               81.752243
injector_03_opening                               81.909982
injector_04_opening                               82.022994
injector_05_opening                               81.762417
pump_calculated_flow                              16.094669
pump_pressure_diff                               439.216675
pump_rotspeed                                    504.960002
tot_current                                       5739.0625
turbine_pressure                                  90.405334
turbine_rotspeed                        

In [78]:
# Threshold rules for the manual anomaly detection.
# Each rule names the sensor columns it covers and the limits that apply to them.
# detect_anomalies only compares against "high"; "too_high" and "duration" are
# stored here but not evaluated by it.
# NOTE(review): the limits are presumably temperatures in °C and the unit of
# "duration" is not stated anywhere in this notebook section — confirm.
anomaly_rules = {
    "magnetic_circuit_temperature": {
        # stat_magn_01_tmp ... stat_magn_12_tmp
        "columns": [f"stat_magn_{sensor:02d}_tmp" for sensor in range(1, 13)],
        "high": 100, "too_high": 105, "duration": 10,
    },
    "stator_coil_temperature": {
        # stat_coil_ph01_01_tmp ... stat_coil_ph03_06_tmp (3 phases x 6 sensors)
        "columns": [
            f"stat_coil_ph{phase:02d}_{sensor:02d}_tmp"
            for phase in range(1, 4)
            for sensor in range(1, 7)
        ],
        "high": 103, "too_high": 108, "duration": 10,
    },
    "hot_air_temperature": {
        # air_circ_hot_01_tmp ... air_circ_hot_06_tmp
        "columns": [f"air_circ_hot_{sensor:02d}_tmp" for sensor in range(1, 7)],
        "high": 72, "too_high": 74, "duration": 10,
    },
    "cold_air_temperature": {
        # air_circ_cold_01_tmp ... air_circ_cold_06_tmp
        "columns": [f"air_circ_cold_{sensor:02d}_tmp" for sensor in range(1, 7)],
        "high": 35, "too_high": 37, "duration": 10,
    },
}

def detect_anomalies(df, rules):
    """
    Flag the rows in which any monitored column exceeds its "high" threshold.

    Parameters:
    df (pd.DataFrame): The input data. It is copied, never modified.
    rules (dict): Maps a rule name to a dict holding a "columns" list and a
        "high" limit. Rules without "columns" and listed columns that are
        missing from df are skipped. Any other key (e.g. "too_high",
        "duration") is ignored.

    Returns:
    pd.DataFrame: A copy of df with a boolean 'anomaly_detected' column that is
        True where at least one monitored column is strictly greater than the
        "high" limit of its rule.
    """
    df = df.copy()
    df['anomaly_detected'] = False
    for thresholds in rules.values():
        if 'columns' not in thresholds:
            continue
        # Columns absent from this frame are ignored rather than raising.
        monitored = [col for col in thresholds['columns'] if col in df.columns]
        if not monitored:
            continue
        # One vectorised comparison per rule; NaN readings compare as False.
        df['anomaly_detected'] |= df[monitored].gt(thresholds["high"]).any(axis=1)

    return df

# Run the threshold rules on u6_s01 and keep the flagged copy.
df_with_anomalies = detect_anomalies(u6_s01, anomaly_rules)

# Preview the first rows of every column whose name contains "anomaly".
df_with_anomalies.filter(like="anomaly").head()


Unnamed: 0,anomaly_01_type_a,anomaly_01_type_b,anomaly_01_type_c,anomaly,anomaly_detected
0,0.0,0.0,0.0,0,False
1,0.0,0.0,0.0,0,False
2,0.0,0.0,0.0,0,False
3,0.0,0.0,0.0,0,False
4,0.0,0.0,0.0,0,False


In [88]:
def _report_detection(name, frame, with_accuracy):
    """
    Apply anomaly_rules to one dataset and print a short summary.

    Parameters:
    name (str): Label used in the printed lines.
    frame (pd.DataFrame): Dataset to check.
    with_accuracy (bool): If True, also print the accuracy of
        'anomaly_detected' against the frame's 'anomaly' column.

    Returns:
    pd.DataFrame: The output of detect_anomalies for this frame.
    """
    flagged = detect_anomalies(frame, anomaly_rules)
    if with_accuracy:
        print(f'Accuracy {name}:', accuracy_score(flagged['anomaly'], flagged['anomaly_detected']))
    print(f'Number anomalies detected {name}:', flagged['anomaly_detected'].sum())
    return flagged


# Datasets scored against their 'anomaly' column.
# NOTE(review): accuracy is the only metric reported; a detector that never
# fires still gets a non-zero accuracy, so consider precision / recall / F1
# (already imported) as well.
scored_sets = {
    'u5_s01_equil_turbine_raw': u5_s01_equil_turbine_raw,
    'u5_s01_equil_pump_raw': u5_s01_equil_pump_raw,
    'u5_s02_equil_turbine_raw': u5_s02_equil_turbine_raw,
    'u5_s02_equil_pump_raw': u5_s02_equil_pump_raw,
    'u6_s01_equil_turbine_raw': u6_s01_equil_turbine_raw,
    'u6_s01_equil_pump_raw': u6_s01_equil_pump_raw,
    'u6_s02_equil_turbine_raw': u6_s02_equil_turbine_raw,
    'u6_s02_equil_pump_raw': u6_s02_equil_pump_raw,
}

# Datasets for which only the number of flagged rows is reported.
# NOTE(review): presumably these have no 'anomaly' ground-truth column — confirm.
count_only_sets = {
    'u4_test_equil_turbine_raw': u4_test_equil_turbine_raw,
    'u4_test_equil_pump_raw': u4_test_equil_pump_raw,
    'u5_test_equil_turbine_raw': u5_test_equil_turbine_raw,
    'u5_test_equil_pump_raw': u5_test_equil_pump_raw,
    'u6_test_equil_turbine_raw': u6_test_equil_turbine_raw,
    'u6_test_equil_pump_raw': u6_test_equil_pump_raw,
}

# df_with_anomalies is rebound on every iteration, so after this cell it holds
# the result for the last dataset (u6_test_equil_pump_raw), as before.
for dataset_name, dataset_frame in scored_sets.items():
    df_with_anomalies = _report_detection(dataset_name, dataset_frame, with_accuracy=True)

for dataset_name, dataset_frame in count_only_sets.items():
    df_with_anomalies = _report_detection(dataset_name, dataset_frame, with_accuracy=False)


Accuracy u5_s01_equil_turbine_raw: 0.33783634393862344
Number anomalies detected: 0
Accuracy u5_s01_equil_pump_raw: 0.09549979728816613
Number anomalies detected: 0
Accuracy u5_s02_equil_turbine_raw: 0.16170707703319318
Number anomalies detected: 0
Accuracy u5_s02_equil_pump: 0.3643812182055553
Number anomalies detected: 0
Accuracy u6_s01_equil_turbine_raw: 0.24049129091806334
Number anomalies detected: 0
Accuracy u6_s01_equil_pump: 0.06786669909997567
Number anomalies detected: 0
Accuracy u6_s02_equil_turbine_raw: 0.1744948312502251
Number anomalies detected: 0
Accuracy u6_s02_equil_pump_raw: 0.48687038988408854
Number anomalies detected: 0
Number anomalies detected: 0
Number anomalies detected: 0
Number anomalies detected: 0
Number anomalies detected: 0
Number anomalies detected: 0
Number anomalies detected: 0
