In [23]:
import glob
import os
import random
import sys

In [24]:
import numpy as np
import pandas as pd

In [25]:
# Resolve the parent of the current working directory and make sure it is on
# sys.path, so that the project-local `utils` package imported right after
# this can be found when the notebook is run from its own sub-directory.
_cwd_parent = os.path.join(os.getcwd(), "..")
project_root = os.path.abspath(_cwd_parent)
_root_already_listed = project_root in sys.path
if not _root_already_listed:
    # Appended (not inserted first) so already-importable modules keep priority.
    sys.path.append(project_root)

from utils.npz_files import open_npz, open_npz_key, save_npz
from utils.features import compute_cwru_features
from utils.transform import extract_sequences

In [26]:
# Single seed shared by every random number generator used in this notebook.
RANDOM_SEED = 42

# Seed NumPy's legacy global generator and the stdlib `random` module; the
# two generators are independent, so the order of these calls does not matter.
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [27]:
# Enumerations of the recording conditions. Presumably these describe the
# CWRU bearing dataset (the notebook imports `compute_cwru_features`); the
# units of each list are not stated here — TODO confirm.
RPMS_LIST = [
    1730,
    1750,
    1772,
    1797,
]
FAULT_LIST = [
    "IR",
    "B",
    "OR@6",
    "OR@3",
    "OR@12",
]
DIAMETER_LIST = [
    7,
    14,
    21,
    28,
]
END_LIST = [
    "FE",
    "DE12",
    "DE48",
]

## Create dataframes

In [28]:
from tqdm import tqdm 

In [29]:
# Number of rows sampled for each class; both are the same size, so the
# resulting dataset holds as many normal rows as anomaly rows.
NORMAL_ROWS = 8192
ANOMALY_ROWS = 8192

In [30]:
# Input directory that is walked for the per-recording .parquet/.npz pairs.
ROOT_DATA_DIR = '../datasets/step2'

# Output directory for the assembled dataset; created up front (together with
# any missing parents) and left untouched when it already exists.
ROOT_DATA_SERIES_DIR = '../datasets/step3'
os.makedirs(name=ROOT_DATA_SERIES_DIR, exist_ok=True)

In [31]:
def _load_split(root_dir, want_normal):
    """Load and stack every file pair of one class found under ``root_dir``.

    A ``.parquet`` file belongs to the normal class when its name contains
    "normal" (case-insensitive) and to the anomaly class otherwise. Each
    selected file is read together with the array stored under key ``"a"`` of
    the ``.npz`` file that has the same name next to it.

    Parameters
    ----------
    root_dir : str
        Directory that is walked recursively.
    want_normal : bool
        ``True`` to load the normal files, ``False`` to load the anomaly files.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        All selected frames concatenated row-wise (original indexes kept) and
        all selected arrays stacked with ``np.vstack``, in the same file order.

    Raises
    ------
    FileNotFoundError
        If no matching ``.parquet`` file exists under ``root_dir``.
    ValueError
        If the stacked array and the concatenated frame differ in row count,
        which would silently misalign the two when they are indexed together.
    """
    frames = []
    arrays = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # sub-directories in place (which steers the walk) and the file names
        # makes the row order, and therefore the seeded sampling, reproducible.
        dirnames.sort()
        for fn in tqdm(sorted(filenames)):
            if not fn.endswith(".parquet"):
                continue
            if ("normal" in fn.lower()) != want_normal:
                continue

            parquet_filepath = os.path.join(dirpath, fn)
            # splitext only swaps the extension; str.replace would also
            # rewrite a ".parquet" appearing earlier in the path.
            npz_filepath = os.path.splitext(parquet_filepath)[0] + ".npz"
            frames.append(pd.read_parquet(parquet_filepath))
            arrays.append(open_npz_key(npz_filepath, "a"))

    if not frames:
        wanted = "normal" if want_normal else "anomaly"
        raise FileNotFoundError(
            f"no {wanted} .parquet files found under {root_dir!r}"
        )

    # Concatenate once at the end: growing the frame/array inside the loop
    # re-copies everything accumulated so far on every file (quadratic cost).
    frame = pd.concat(frames)
    array = np.vstack(arrays)
    if len(array) != len(frame):
        raise ValueError(
            f"row mismatch: {len(frame)} parquet rows vs {len(array)} npz rows"
        )
    return frame, array


def _sample_rows(frame, array, n_rows, label):
    """Draw ``n_rows`` aligned rows from ``frame``/``array`` and label them.

    Returns the sampled frame (with an ``"anomaly"`` column set to ``label``),
    the matching rows of ``array``, and the list of selected positions.
    """
    # NOTE(review): random.choices samples WITH replacement, so the result can
    # contain duplicate rows. Kept as in the original; switch to random.sample
    # if unique rows are required.
    selected = random.choices(range(len(frame)), k=n_rows)
    # .assign builds a new frame, avoiding the SettingWithCopyWarning raised
    # when a column is set directly on the result of .iloc[...].
    sampled = frame.iloc[selected].assign(anomaly=label)
    return sampled, array[selected], selected


# select normal
df_normal, np_normal = _load_split(ROOT_DATA_DIR, want_normal=True)

# sampling normal
print("df_normal.shape", df_normal.shape)
num_normal = df_normal.shape[0]
df_normal, np_normal, index_normal_selected = _sample_rows(
    df_normal, np_normal, NORMAL_ROWS, label=0
)

# select anomaly
df_anomaly, np_anomaly = _load_split(ROOT_DATA_DIR, want_normal=False)

# sampling anomaly
print("df_anomaly.shape", df_anomaly.shape)
num_anomaly = df_anomaly.shape[0]
df_anomaly, np_anomaly, index_anomaly_selected = _sample_rows(
    df_anomaly, np_anomaly, ANOMALY_ROWS, label=1
)

100%|██████████| 322/322 [00:01<00:00, 262.04it/s]


df_normal.shape (65536, 15)


100%|██████████| 322/322 [01:45<00:00,  3.05it/s]

df_anomaly.shape (412672, 15)





In [32]:
# Stack the two classes, normal rows first and anomaly rows after, in the
# same order for the feature table and for the raw array.
df = pd.concat([df_normal, df_anomaly])
np_data = np.vstack([np_normal, np_anomaly])

# Shuffle through one explicit permutation instead of df.sample(frac=1):
# applying the same positions to both objects keeps row i of `df` paired
# with row i of `np_data`.
index_all = list(range(len(np_data)))
random.shuffle(index_all)

df, np_data = df.iloc[index_all], np_data[index_all]

In [33]:
# Persist the shuffled dataset as two sibling files that share a base name:
# the feature table as parquet (index dropped) and the array as npz key "a".
output_filename = os.path.join(ROOT_DATA_SERIES_DIR, "dataset.parquet")
_npz_output_filename = output_filename.replace(".parquet", ".npz")

df.to_parquet(output_filename, index=False)
save_npz(_npz_output_filename, a=np_data)

In [34]:
# Read the file that was just written back from disk, replacing the in-memory
# frame, so the following cells inspect exactly what was saved.
df = pd.read_parquet(path=output_filename)


In [35]:
# Display five randomly chosen rows of the saved dataset as a spot check.
df.sample(n=5)

Unnamed: 0,maximum,minimum,mean,std,rms,skewness,kurtosis,crest_factor,form_factor,rpm,anomaly_type,diameter_fault,sampling_value,sampling_label,accelerometer,anomaly
9858,9.612606,-8.644184,0.018901,2.334692,2.334199,0.036867,0.464266,4.118161,1.290133,1730,B,28.0,12.0,DE1,DE,1
2932,0.222593,-0.238447,0.011302,0.065588,0.066539,-0.198584,0.154071,3.345287,1.255593,1730,,,,,DE,0
12843,0.280651,-0.182444,0.03372,0.06605,0.074145,0.1419,-0.107158,3.785149,1.261062,1772,,,,,FE,0
5156,0.220715,-0.200479,0.012389,0.070482,0.071546,0.024511,-0.2724,3.084943,1.236751,1797,,,,,DE,0
2677,0.586716,-0.571447,0.002786,0.130161,0.130159,0.093741,2.467234,4.507689,1.375545,1772,OR@6,7.0,12.0,FE,DE,1
