In [1]:
import glob
import os
import random
import sys

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Put the parent of the current working directory on the import path so the
# project-local imports that follow can be resolved.
# NOTE(review): assumes the notebook is launched from a direct subdirectory
# of the project root - confirm.
parent_dir = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_dir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from utils.npz_files import open_npz, open_npz_key, save_npz
from utils.features import compute_cwru_features
from utils.transform import extract_sequences

In [4]:
# Seed both the standard-library and the NumPy global generators with the
# same value so that a fresh "Restart & Run All" reproduces the same draws.
RANDOM_SEED = 42
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(RANDOM_SEED)

In [5]:
# Enumerations of the dataset's experimental conditions.
# NOTE(review): meanings and units are inferred from the names only (speeds,
# fault kinds, fault sizes, sensor ends) - confirm against the data source.
# None of these are referenced by the cells visible in this notebook.
RPMS_LIST = [
    1730,
    1750,
    1772,
    1797,
]
FAULT_LIST = [
    "IR",
    "B",
    "OR@6",
    "OR@3",
    "OR@12",
]
DIAMETER_LIST = [
    7,
    14,
    21,
    28,
]
END_LIST = [
    "FE",
    "DE12",
    "DE48",
]

## Create dataframes

In [6]:
from tqdm import tqdm 

In [7]:
# Number of rows drawn for each class when the dataset is sampled; both
# classes use the same size.
NORMAL_ROWS = ANOMALY_ROWS = 8192

In [8]:
# Input directory that is scanned for parquet/npz files, and the output
# directory for the sampled dataset (created up front if it is missing).
ROOT_DATA_DIR = "../datasets/step2"
ROOT_DATA_SERIES_DIR = "../datasets/step3"
os.makedirs(name=ROOT_DATA_SERIES_DIR, exist_ok=True)

In [9]:
def _load_group(root_dir, want_normal):
    """Load every parquet/npz pair of one class found under ``root_dir``.

    A file belongs to the "normal" class when its lower-cased name contains
    ``"normal"``; every other ``.parquet`` file belongs to the anomaly class.
    For each selected parquet file the sibling ``.npz`` file (same path,
    different extension) is read through ``open_npz_key(..., "a")``.

    Parameters
    ----------
    root_dir : str
        Directory that is walked recursively.
    want_normal : bool
        ``True`` to load the normal files, ``False`` to load the others.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        All frames concatenated row-wise (original indexes kept) and all
        arrays stacked vertically, in the order the files were visited.

    Raises
    ------
    FileNotFoundError
        If no matching parquet file exists under ``root_dir``.
    """
    frames, arrays = [], []
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for fn in tqdm(filenames):
            is_normal = "normal" in fn.lower()
            if is_normal != want_normal or not fn.endswith(".parquet"):
                continue

            parquet_filepath = os.path.join(dirpath, fn)
            # Swap only the extension; a plain str.replace would also rewrite
            # a ".parquet" occurring elsewhere in the path.
            npz_filepath = os.path.splitext(parquet_filepath)[0] + ".npz"
            frames.append(pd.read_parquet(parquet_filepath))
            arrays.append(open_npz_key(npz_filepath, "a"))

    if not frames:
        kind = "normal" if want_normal else "anomaly"
        raise FileNotFoundError(
            f"no {kind} .parquet files found under {root_dir!r}"
        )

    # Concatenate once at the end: growing the frame/array inside the loop
    # re-copies everything accumulated so far on every file.
    frame = pd.concat(frames)
    array = arrays[0] if len(arrays) == 1 else np.vstack(arrays)
    return frame, array


def _sample_rows(frame, array, k, anomaly_label):
    """Draw ``k`` aligned rows from ``frame`` and ``array`` and label them.

    Rows are chosen by position with ``random.choices``, i.e. WITH
    replacement, so the result may contain duplicated rows.
    NOTE(review): the code this replaces kept a commented-out
    ``DataFrame.sample`` call (without replacement); confirm which of the
    two behaviours is intended.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray, list[int])
        The sampled frame with an ``"anomaly"`` column set to
        ``anomaly_label``, the matching array rows, and the chosen positions.

    Raises
    ------
    ValueError
        If ``frame`` and ``array`` do not have the same number of rows, in
        which case positional sampling would silently misalign them.
    """
    num_rows = frame.shape[0]
    if len(array) != num_rows:
        raise ValueError(
            f"row count mismatch: dataframe has {num_rows} rows, "
            f"array has {len(array)}"
        )
    selected = random.choices(range(num_rows), k=k)
    # .assign returns a new frame, avoiding assignment on an .iloc slice.
    sampled_frame = frame.iloc[selected].assign(anomaly=anomaly_label)
    return sampled_frame, array[selected], selected


# select normal
df_normal, np_normal = _load_group(ROOT_DATA_DIR, want_normal=True)

# sampling normal
print("df_normal.shape", df_normal.shape)
num_normal = df_normal.shape[0]
df_normal, np_normal, index_normal_selected = _sample_rows(
    df_normal, np_normal, NORMAL_ROWS, anomaly_label=0
)

# select anomaly
df_anomaly, np_anomaly = _load_group(ROOT_DATA_DIR, want_normal=False)

# sampling anomaly
print("df_anomaly.shape", df_anomaly.shape)
num_anomaly = df_anomaly.shape[0]
df_anomaly, np_anomaly, index_anomaly_selected = _sample_rows(
    df_anomaly, np_anomaly, ANOMALY_ROWS, anomaly_label=1
)

  0%|          | 0/322 [00:00<?, ?it/s]

100%|██████████| 322/322 [00:04<00:00, 72.85it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


df_normal.shape (65536, 17)


100%|██████████| 322/322 [01:12<00:00,  4.46it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]

df_anomaly.shape (313344, 17)





In [10]:
# Stack the two sampled classes: metadata rows in `df`, matching array rows
# in `np_data`, both in the same (normal first, anomaly second) order.
df = pd.concat([df_normal, df_anomaly])
np_data = np.vstack([np_normal, np_anomaly])

# Shuffle with one shared permutation of row positions so that row i of `df`
# still corresponds to row i of `np_data` afterwards.
index_all = list(range(len(np_data)))
random.shuffle(index_all)

df, np_data = df.iloc[index_all], np_data[index_all]

### Dataset 03

In [11]:
# Columns kept for this dataset variant, in output order. The final entry,
# "anomaly", is the 0/1 class label assigned during sampling.
SELECTED_COLUMNS = [
    "maximum", "minimum", "mean", "std", "rms",
    "skewness", "kurtosis", "crest_factor", "form_factor",
    "rpm", "anomaly_type", "diameter_fault",
    "sampling_value", "sampling_label", "accelerometer",
    "anomaly",
]

# Independent copy restricted to the selected columns; `df` is left untouched.
df3 = df.loc[:, SELECTED_COLUMNS].copy()

In [12]:
# Persist the tabular part as parquet (without the index) and the array part
# as an .npz file with the same base name, stored under key "a".
output_filename = os.path.join(ROOT_DATA_SERIES_DIR, "dataset.parquet")
npz_filename = output_filename.replace(".parquet", ".npz")

df3.to_parquet(output_filename, index=False)
save_npz(npz_filename, a=np_data)

In [13]:
# Read the parquet file at `output_filename` back into `df3`, replacing the
# in-memory frame with what is stored on disk.
df3 = pd.read_parquet(output_filename)


In [14]:
# Display five randomly chosen rows as a quick visual check of the dataset.
df3.sample(5)

Unnamed: 0,maximum,minimum,mean,std,rms,skewness,kurtosis,crest_factor,form_factor,rpm,anomaly_type,diameter_fault,sampling_value,sampling_label,accelerometer,anomaly
9858,1.175579,-0.86827,0.007074,0.183589,0.18368,0.276186,3.249519,6.400133,1.361418,1797,IR,14.0,12.0,FE,FE,1
2932,0.206738,-0.244497,0.011519,0.061864,0.062913,-0.273648,0.024957,3.286099,1.23306,1750,,,,,DE,0
12843,0.301813,-0.180184,0.02831,0.075886,0.080977,0.170158,-0.179512,3.727141,1.253164,1797,,,,,FE,0
5156,0.20799,-0.232815,0.01347,0.065289,0.066648,-0.2852,-0.025748,3.120694,1.235312,1730,,,,,DE,0
2677,0.908987,-0.920033,0.003088,0.23816,0.238122,0.035524,1.354609,3.817315,1.35051,1750,OR@3,14.0,12.0,FE,DE,1


### Dataset 05

In [15]:
# Columns kept for this dataset variant, in output order. Compared with the
# "Dataset 03" selection this one additionally keeps "thd" and "f0".
SELECTED_COLUMNS = [
    "maximum", "minimum", "mean", "std", "rms",
    "skewness", "kurtosis", "crest_factor", "form_factor",
    "thd", "f0",
    "rpm", "anomaly_type", "diameter_fault",
    "sampling_value", "sampling_label", "accelerometer",
    "anomaly",
]

# Independent copy restricted to the selected columns; `df` is left untouched.
df5 = df.loc[:, SELECTED_COLUMNS].copy()

In [17]:
# Output directory for this dataset variant; created if it does not exist.
OUTPUT_STEP4_DIR = "../datasets/step4"
os.makedirs(name=OUTPUT_STEP4_DIR, exist_ok=True)

# Write the table as parquet (without the index) and the array as an .npz
# file with the same base name, stored under key "a".
output_filename = os.path.join(OUTPUT_STEP4_DIR, "dataset.parquet")
npz_filename = output_filename.replace(".parquet", ".npz")

df5.to_parquet(output_filename, index=False)
save_npz(npz_filename, a=np_data)