In [13]:
import glob
import os
import random
import sys

In [14]:
import numpy as np
import pandas as pd

In [15]:
# Put the parent of the current working directory on the import path
# (presumably where the project-local `utils` package imported below lives).
# The membership check keeps re-running this cell from adding duplicates.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from utils.npz_files import open_npz, save_npz
from utils.features import compute_cwru_features
from utils.transform import extract_sequences

In [16]:
# One seed shared by both global random number generators used in this
# notebook: NumPy's legacy global generator and the stdlib `random` module.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)
random.seed(a=RANDOM_SEED)

In [17]:
# Label vocabularies for the recordings.
# NOTE(review): meanings below are inferred from the names and from how file
# names are parsed later in this notebook -- confirm against the dataset docs.

# Presumably the rotation speeds (rpm) that prefix each file name.
RPMS_LIST = [
    1730,
    1750,
    1772,
    1797,
]

# Presumably the fault-type tags used in file names.
FAULT_LIST = [
    "IR",
    "B",
    "OR@6",
    "OR@3",
    "OR@12",
]

# Presumably the fault diameters used in file names (units not shown here).
DIAMETER_LIST = [
    7,
    14,
    21,
    28,
]

# Presumably the sensor-end / sampling tags used in file names.
END_LIST = [
    "FE",
    "DE12",
    "DE48",
]

## Create dataframes

In [None]:
# Convert every `.npz` archive found under ROOT_DATA_DIR into one parquet file
# of features, tagged with the labels encoded in the archive's file name.
ROOT_DATA_DIR = '../datasets/step1'
ROOT_DATA_SERIES_DIR = '../datasets/step2'

# All parquet files are written directly into ROOT_DATA_SERIES_DIR (the
# sub-directory layout of ROOT_DATA_DIR is not mirrored), so this is the only
# directory that must exist before the first write.
# NOTE(review): because the output is flat, two archives with the same stem in
# different sub-directories overwrite each other -- confirm stems are unique.
os.makedirs(ROOT_DATA_SERIES_DIR, exist_ok=True)

for input_directory, _subdirs, filenames in os.walk(ROOT_DATA_DIR):
    for basefile in filenames:
        base, ext = os.path.splitext(basefile)
        if ext != '.npz':
            continue

        # The file stem is a '_'-separated list of labels. A stem with exactly
        # two fields is treated as a normal (fault-free) recording; otherwise
        # fields 1..3 are read as fault type, fault diameter and sensor tag.
        classification_labels = base.split('_')
        is_normal = len(classification_labels) == 2

        rpm_label = int(classification_labels[0])
        if is_normal:
            # Normal recordings carry no fault metadata.
            anomaly_type = np.nan
            diameter_fault = np.nan
            sampling_value = np.nan
            sampling_label = np.nan
        else:
            anomaly_type = classification_labels[1]
            diameter_fault = int(classification_labels[2])
            end_label = classification_labels[3]
            # 'FE' and any tag containing '12' map to 12; everything else to 48.
            sampling_value = 12 if end_label == 'FE' or '12' in end_label else 48
            # NOTE(review): [:3] turns 'DE12' into 'DE1' and 'DE48' into 'DE4';
            # [:2] ('DE'/'FE') may have been intended. Left unchanged so the
            # stored values stay compatible with existing consumers.
            sampling_label = end_label[:3]

        input_filepath = os.path.join(input_directory, basefile)
        output_filepath = os.path.join(ROOT_DATA_SERIES_DIR, base + ".parquet")

        data = open_npz(input_filepath)

        # Build one feature frame per array stored in the archive and
        # concatenate once at the end instead of growing a frame in the loop.
        frames = []
        for key_data in data.keys():
            df = compute_cwru_features(data[key_data])
            # Scalars are broadcast to every row of the frame.
            df["rpm"] = rpm_label
            df["anomaly_type"] = anomaly_type
            df["diameter_fault"] = diameter_fault
            df["sampling_value"] = sampling_value
            df["sampling_label"] = sampling_label
            df["accelerometer"] = key_data
            frames.append(df)

        if not frames:
            # An archive without arrays has nothing to write; skip it instead
            # of failing on a missing frame.
            print(f"Skipping {input_filepath}: archive contains no arrays")
            continue

        output_df = pd.concat(frames)
        output_df.to_parquet(output_filepath, index=False)
