In [1]:
import glob
import os
import random
import sys

In [2]:
import numpy as np
import pandas as pd

In [None]:
# Put the parent of the current working directory on sys.path (once) so the
# project-local `utils` package imported below can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from utils.npz_files import open_npz, open_npz_key, save_npz
from utils.features import compute_cwru_features
from utils.transform import extract_sequences

In [4]:
# Seed both global RNGs (NumPy's legacy global state and the stdlib `random`
# module) with the same value so sampling and shuffling repeat across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [5]:
# Descriptors of the recordings.
# NOTE(review): none of these lists is referenced in the cells visible here;
# the meanings below are inferred from the names — confirm against the data docs.
RPMS_LIST = [
    1730,
    1750,
    1772,
    1797,
]  # presumably the shaft speeds stored in the `rpm` column
FAULT_LIST = [
    "IR",
    "B",
    "OR@6",
    "OR@3",
    "OR@12",
]  # presumably fault locations (`anomaly_type`) — TODO confirm
DIAMETER_LIST = [
    7,
    14,
    21,
    28,
]  # presumably fault diameters (`diameter_fault`); unit not shown here
END_LIST = [
    "FE",
    "DE12",
    "DE48",
]  # presumably accelerometer position / sampling variants — TODO confirm

## Create dataframes

In [6]:
# Number of rows drawn from each class (normal / anomaly) when the balanced
# dataset is assembled in the next cell.
NORMAL_ROWS = 8192
ANOMALY_ROWS = 8192

In [None]:
ROOT_DATA_DIR = '../datasets/step2'
ROOT_DATA_SERIES_DIR = '../datasets/step3'


def _load_and_sample(root_dir, want_normal, n_rows):
    """Load all parquet/npz pairs of one class under ``root_dir`` and sample rows.

    A file belongs to the "normal" class when its name contains ``"normal"``
    (case-insensitive); every other ``.parquet`` file is treated as anomaly.
    For each parquet file, the sibling ``.npz`` file is read through
    ``open_npz_key(..., "a")``. The DataFrame rows and the array rows are
    selected with the same positional indices, so both are assumed to be
    aligned row-for-row (this is checked on the row count).

    Parameters
    ----------
    root_dir : str
        Directory that is walked recursively.
    want_normal : bool
        ``True`` to load the "normal" files, ``False`` to load the others.
    n_rows : int
        Number of rows to draw.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The sampled feature rows and the matching array rows, in draw order.

    Raises
    ------
    FileNotFoundError
        If no matching parquet file is found under ``root_dir``.
    ValueError
        If the parquet rows and the npz rows do not have the same count.
    """
    label = "normal" if want_normal else "anomaly"
    frames = []
    arrays = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # os.walk yields entries in arbitrary order; sorting makes the row
        # order (and therefore the seeded draw below) reproducible.
        dirnames.sort()
        for fn in sorted(filenames):
            if not fn.endswith(".parquet"):
                continue
            if ("normal" in fn.lower()) != want_normal:
                continue

            parquet_filepath = os.path.join(dirpath, fn)
            # Swap only the extension; str.replace could also hit a directory name.
            npz_filepath = os.path.splitext(parquet_filepath)[0] + ".npz"
            frames.append(pd.read_parquet(parquet_filepath))
            arrays.append(open_npz_key(npz_filepath, "a"))

    if not frames:
        raise FileNotFoundError(
            f"no {label} .parquet files found under {root_dir!r}"
        )

    # Concatenate once instead of growing the frame/array inside the loop.
    df_all = pd.concat(frames, ignore_index=True)
    np_all = np.vstack(arrays)
    if df_all.shape[0] != np_all.shape[0]:
        raise ValueError(
            f"{label}: parquet rows ({df_all.shape[0]}) and npz rows "
            f"({np_all.shape[0]}) differ"
        )

    print(f"df_{label}.shape", df_all.shape)

    # NOTE(review): random.choices draws WITH replacement, so a row can be
    # selected more than once; use random.sample if duplicates are unwanted.
    selected = random.choices(range(df_all.shape[0]), k=n_rows)

    # The indices are positions, so the frame must be indexed with .iloc
    # (label-based .loc can return different rows than np_all[selected]).
    df_selected = df_all.iloc[selected].reset_index(drop=True)
    np_selected = np_all[selected]
    return df_selected, np_selected


# select + sample normal
df_normal, np_normal = _load_and_sample(ROOT_DATA_DIR, True, NORMAL_ROWS)

# select + sample anomaly
df_anomaly, np_anomaly = _load_and_sample(ROOT_DATA_DIR, False, ANOMALY_ROWS)

df = pd.concat((df_normal, df_anomaly), ignore_index=True)
np_data = np.vstack((np_normal, np_anomaly))

# Shuffle the features and the arrays with the same positional permutation.
index_all = list(range(np_data.shape[0]))
random.shuffle(index_all)

df = df.iloc[index_all].reset_index(drop=True)
np_data = np_data[index_all]

# Make sure the destination exists before writing the two output files.
os.makedirs(ROOT_DATA_SERIES_DIR, exist_ok=True)
output_filename = os.path.join(ROOT_DATA_SERIES_DIR, "dataset.parquet")
df.to_parquet(output_filename, index=False)

save_npz(output_filename.replace(".parquet", ".npz"), a=np_data)

df_normal.shape (65536, 15)
df_anomaly.shape (3301376, 15)


In [27]:
# Read the dataset back from the parquet file written by the previous cell
# (`output_filename` comes from that cell); this rebinds `df` to the on-disk copy.
df = pd.read_parquet(output_filename)


In [37]:
# Show five randomly chosen rows of the reloaded dataset.
df.sample(n=5)

Unnamed: 0,maximum,minimum,mean,std,rms,skewness,kurtosis,crest_factor,form_factor,rpm,anomaly_type,diameter_fault,sampling_value,sampling_label,accelerometer,anomaly
9709,0.262982,-0.21244,0.027879,0.075861,0.080805,-0.047581,-0.081599,3.254541,1.254324,1730,,,,,FE,0
10506,0.2486,-0.136011,0.033431,0.062643,0.070992,0.24468,-0.034349,3.501783,1.26125,1750,,,,,FE,0
9662,0.224679,-0.197142,0.01105,0.072615,0.073433,-0.097792,-0.217563,3.059643,1.243939,1797,,,,,DE,0
3428,0.204443,-0.2155,0.016326,0.064552,0.066569,-0.320879,0.058369,3.071149,1.240844,1750,,,,,DE,0
6319,0.228225,-0.231354,0.014486,0.061628,0.063293,-0.241741,0.265263,3.605857,1.243073,1730,,,,,DE,0
