In [1]:
import glob
import os
import random
import sys

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Make the parent of the current working directory importable so that the
# project-local `utils` package can be imported from this notebook.
parent_dir = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_dir)
already_importable = project_root in sys.path
if not already_importable:
    # Append (rather than prepend) so installed packages keep precedence.
    sys.path.append(project_root)

from utils.npz_files import open_npz, save_npz
from utils.features import compute_cwru_features
from utils.transform import extract_sequences

In [4]:
# Single seed shared by every stochastic step in this notebook.
RANDOM_SEED = 42

# Seed both global generators: NumPy's legacy RNG and the stdlib `random` module.
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [5]:
# Values that describe the recordings.
# NOTE(review): the meanings below are inferred from the names and from the
# `rpm` / `anomaly_type` / `diameter_fault` / `accelerometer` columns of the
# dataset — confirm against the upstream preprocessing step.

# Motor speeds, matching the values seen in the `rpm` column.
RPMS_LIST = [
    1730,
    1750,
    1772,
    1797,
]

# Fault identifiers (presumably inner race, ball, and outer race at a given
# position — TODO confirm).
FAULT_LIST = [
    "IR",
    "B",
    "OR@6",
    "OR@3",
    "OR@12",
]

# Fault diameters (units not visible here — TODO confirm).
DIAMETER_LIST = [
    7,
    14,
    21,
    28,
]

# Accelerometer / measurement-end identifiers (presumably fan end and drive
# end at two sampling rates — TODO confirm).
END_LIST = [
    "FE",
    "DE12",
    "DE48",
]

## Create dataframes

In [6]:
# Number of rows sampled from each class when building the dataset.
# Both classes use the same count, so the resulting dataset is balanced.
NORMAL_ROWS = 8_192
ANOMALY_ROWS = 8_192

In [11]:
ROOT_DATA_DIR = '../datasets/step2'
ROOT_DATA_SERIES_DIR = '../datasets/step3'


def _load_class_frames(root_dir, normal):
    """Read and concatenate every parquet file of one class under ``root_dir``.

    A file belongs to the "normal" class when its file name contains
    ``normal`` (case-insensitive); every other file belongs to the anomaly
    class.

    Parameters
    ----------
    root_dir : str
        Directory that is walked recursively.
    normal : bool
        ``True`` selects only the "normal" files, ``False`` only the others.

    Returns
    -------
    pandas.DataFrame
        The rows of all selected files, concatenated in sorted path order
        (original per-file indexes are kept).

    Raises
    ------
    FileNotFoundError
        If no file under ``root_dir`` matches the selection.
    """
    frames = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # os.walk yields entries in a filesystem-dependent order. Sorting
        # (in place for the directories, so the traversal itself is ordered)
        # makes the concatenated frame, and therefore the seeded sample drawn
        # from it, identical on every machine.
        dirnames.sort()
        for fn in sorted(filenames):
            if ("normal" in fn.lower()) != normal:
                continue
            frames.append(pd.read_parquet(os.path.join(dirpath, fn)))

    if not frames:
        kind = "normal" if normal else "anomaly"
        raise FileNotFoundError(f"no {kind} files found under {root_dir!r}")

    # One concat over the collected list instead of re-concatenating per file.
    return pd.concat(frames)


# select normal
df_normal = _load_class_frames(ROOT_DATA_DIR, normal=True)
print("df_normal.shape", df_normal.shape)
# sampling normal (label 0)
df_normal = (
    df_normal
    .assign(anomaly=0)
    .sample(NORMAL_ROWS, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# select anomaly
df_anomaly = _load_class_frames(ROOT_DATA_DIR, normal=False)
print("df_anomaly.shape", df_anomaly.shape)
# sampling anomaly (label 1)
df_anomaly = (
    df_anomaly
    .assign(anomaly=1)
    .sample(ANOMALY_ROWS, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# Merge both classes and shuffle. The shuffle is seeded explicitly so that
# re-running this cell produces the same row order instead of depending on
# how far the global NumPy RNG has advanced.
df = pd.concat((df_normal, df_anomaly))
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Create the output directory on first run so the write cannot fail on a
# fresh checkout.
os.makedirs(ROOT_DATA_SERIES_DIR, exist_ok=True)
output_filename = os.path.join(ROOT_DATA_SERIES_DIR, "dataset.parquet")
df.to_parquet(output_filename, index=False)

df_normal.shape (65536, 15)
df_anomaly.shape (3301376, 15)


In [None]:
# Reload the dataset from disk to verify that the written file is readable.
df = pd.read_parquet(output_filename)
# Seeded preview: the same five rows are shown on every run, and the global
# NumPy RNG state is left untouched for any later cells.
df.sample(5, random_state=RANDOM_SEED)

Unnamed: 0,maximum,minimum,mean,std,rms,skewness,kurtosis,crest_factor,form_factor,rpm,anomaly_type,diameter_fault,sampling_value,sampling_label,accelerometer,anomaly
0,0.179409,-0.208407,0.013639,0.063569,0.065,-0.157703,-0.115043,2.760138,1.23936,1730,,,,,DE,0
1,0.260105,-0.187169,0.033857,0.073629,0.081024,0.189077,-0.3989,3.210223,1.252071,1797,,,,,FE,0
2,0.260516,-0.197442,0.029447,0.076546,0.081997,0.04639,-0.225414,3.17713,1.246046,1730,,,,,FE,0
3,0.264831,-0.167035,0.03188,0.062409,0.070066,0.255523,0.000302,3.779729,1.269137,1750,,,,,FE,0
4,0.219672,-0.172316,0.011776,0.062428,0.063514,-0.217052,-0.215444,3.458636,1.233507,1750,,,,,DE,0
