In [1]:
import numpy as np
import pandas as pd
import os
import json_module, dirs_module
import txt_module
import importlib
from frequency_domain_module import wavelet_transform, fft_transform, histogram_sums
from sklearn.preprocessing import StandardScaler


In [2]:
# Directories, metadata tables and the bearing-code split used by the cells below.
project_dir = 'D:/FINKI/40_diploma_thesis'

healthy_metadata = pd.read_csv(f'{project_dir}/metadata/dataset_paper_tables/DatasetPaper_Table7.csv')
real_damage_metadata = pd.read_csv(f'{project_dir}/metadata/dataset_paper_tables/DatasetPaper_Table5.csv')
artificial_damage_metadata = pd.read_csv(f'{project_dir}/metadata/dataset_paper_tables/DatasetPaper_Table4.csv')
y_map = json_module.read_json(f'{project_dir}/metadata/y_map.json')

# Windows paths are raw strings so backslashes are never read as escape
# sequences (the previous non-raw f-string contained the invalid escape '\s').
data_dir = r'E:\40_diploma_thesis\datasets_numpy_1S_OffsetHealthy001_OffsetDamaged005'
testing_data_dir = r'F:\s3_test_datasets_numpy_1S_OffsetHealthy001_OffsetDamaged005'
write_data_dir = 'G:/data/splits/s3_1s_bayesian'
dirs_module.create_directory(write_data_dir)

# Bearing codes used for training.
healthy_train = ['K001','K002','K003']
real_damage_train = ['KA04','KA15','KA22','KA30','KB23','KB27','KI04','KI17']
artificial_damage_train = ['KA01','KA05','KA07','KI01','KI03']
train_bearing_codes = healthy_train + artificial_damage_train + real_damage_train

# Bearing codes used for validation.
# NOTE(review): 'KB27' is also listed in real_damage_train. include_file_in_val
# rejects every train code, so KB27 files never reach the validation split --
# confirm which bearing code was intended here.
healthy_val = ['K004']
real_damage_val = ['KB27', 'KI14']
artificial_damage_val = ['KI05']
validation_bearing_codes = healthy_val + artificial_damage_val + real_damage_val

# Persist the train codes next to the generated splits.
txt_module.list_to_txt(train_bearing_codes, f'{write_data_dir}/train_bearing_codes.txt')

In [3]:
def include_file_in_train(file_name_, regime_):
    """Return True when a file belongs to the training split for a regime.

    A file qualifies when its name contains `regime_` and contains at least
    one of the codes in the module-level `train_bearing_codes` list (plain
    substring checks on the file name).
    """
    if regime_ not in file_name_:
        return False
    return any(code in file_name_ for code in train_bearing_codes)

def include_file_in_test(file_name_, regime_):
    """Return True when a file belongs to the test split for a regime.

    The bearing code is the fourth '_'-separated field of the file name.
    A file is accepted when that code is in neither `train_bearing_codes`
    nor `validation_bearing_codes` and the name contains `regime_`.

    Names with fewer than four '_'-separated fields (e.g. stray non-data
    files in the directory) are rejected instead of raising IndexError,
    which matches how include_file_in_train treats such names.
    """
    name_fields = file_name_.split('_')
    if len(name_fields) < 4:
        return False
    bearing_code = name_fields[3]
    if bearing_code in train_bearing_codes:
        return False
    if bearing_code in validation_bearing_codes:
        return False
    if regime_ not in file_name_:
        return False
    return True

def include_file_in_val(file_name_, regime_):
    """Return True when a file belongs to the validation split for a regime.

    The bearing code is the fourth '_'-separated field of the file name.
    A file is accepted when that code is in `validation_bearing_codes`, is
    not in `train_bearing_codes`, and the name contains `regime_`.

    Names with fewer than four '_'-separated fields (e.g. stray non-data
    files in the directory) are rejected instead of raising IndexError,
    which matches how include_file_in_train treats such names.
    """
    name_fields = file_name_.split('_')
    if len(name_fields) < 4:
        return False
    bearing_code = name_fields[3]
    # Train codes win over validation codes when a code is listed in both.
    if bearing_code in train_bearing_codes:
        return False
    if bearing_code not in validation_bearing_codes:
        return False
    if regime_ not in file_name_:
        return False
    return True

700 3 N09_M07_F10
700 3 N15_M07_F10
700 3 N15_M01_F10
700 3 N15_M07_F04


In [None]:
# Build the wavelet feature matrices (X) and label vectors (y) for every
# (bin_width, levels, regime) combination and save them under write_data_dir.
for bin_width in [700]:
    for levels in [3]:
        for regime in ['N09_M07_F10', 'N15_M07_F10', 'N15_M01_F10', 'N15_M07_F04']:
            print(bin_width, levels, regime)

            train_files = [file for file in os.listdir(data_dir)
                           if include_file_in_train(file, regime)]
            test_files = [file for file in os.listdir(testing_data_dir)
                          if include_file_in_test(file, regime)]
            val_files = [file for file in os.listdir(testing_data_dir)
                         if include_file_in_val(file, regime)]

            # Each split is paired with the directory its files were listed
            # from, so a file is always loaded from where it was found
            # (previously the two directories were swapped at load time).
            # 'train' must stay first: the scaler is fit on it.
            splits = [
                ('train', train_files, data_dir),
                ('test', test_files, testing_data_dir),
                ('val', val_files, testing_data_dir),
            ]

            # One scaler per combination: fit on train, reused for test/val so
            # all three splits share the same normalisation statistics.
            scaler = StandardScaler()

            for set_type, set_type_files, source_dir in splits:

                file_names = list()
                y = list()
                arrays = list()
                # Labels, index names and features are built in one pass so
                # that row i of X always corresponds to y[i] and file_names[i].
                for file in set_type_files:
                    bearing_code = file.split('_')[3]
                    # splitext drops only the extension; str.strip('.npy')
                    # removes any leading/trailing '.', 'n', 'p', 'y' chars.
                    file_names.append(os.path.splitext(file)[0])
                    y.append(y_map[bearing_code])

                    feature_array = np.load(f'{source_dir}/{file}', allow_pickle = True).astype('float32').ravel()
                    feature_array = np.array(wavelet_transform(feature_array, levels_ = levels, bin_width_ = bin_width), dtype = 'float32')
                    # Row vector so the per-file features stack along axis 0.
                    arrays.append(np.reshape(feature_array, (1, feature_array.shape[0])))

                np.save(f'{write_data_dir}/y_{set_type}_{regime}_{levels}_{bin_width}.npy',y)
                # NOTE(review): this index file name does not include the
                # regime, so every regime overwrites the previous one.
                txt_module.list_to_txt(file_names, f'{write_data_dir}/x_{set_type}_index.txt')

                X = np.concatenate(arrays, axis = 0); del arrays
                if set_type == 'train':
                    X = scaler.fit_transform(X)
                else:
                    X = scaler.transform(X)
                np.save(f'{write_data_dir}/x_{set_type}_{regime}_{levels}_{bin_width}.npy', X); del X


In [4]:
# Build the FFT feature matrices (X) and label vectors (y) for every
# (bin_width, zero_padding_multiple, regime) combination and save them
# under write_data_dir.
NUM_COEF_TO_SAVE = 12500
pairwise_bin_width_zero_padding_multiple =  [
    (15, 1),
]

for bin_width, zero_padding_multiple in pairwise_bin_width_zero_padding_multiple:
    for regime in ['N09_M07_F10', 'N15_M07_F10', 'N15_M01_F10', 'N15_M07_F04']:
        print(bin_width, zero_padding_multiple, regime)

        train_files = [file for file in os.listdir(data_dir)
                       if include_file_in_train(file, regime)]
        test_files = [file for file in os.listdir(testing_data_dir)
                      if include_file_in_test(file, regime)]
        val_files = [file for file in os.listdir(testing_data_dir)
                     if include_file_in_val(file, regime)]

        # Each split is paired with the directory its files were listed from,
        # so a file is always loaded from where it was found (previously the
        # two directories were swapped at load time).
        # 'train' must stay first: the scaler is fit on it.
        splits = [
            ('train', train_files, data_dir),
            ('test', test_files, testing_data_dir),
            ('val', val_files, testing_data_dir),
        ]

        # Output names use zero_padding_multiple (not `levels`, which only
        # existed as a leftover variable from the wavelet cell). This is the
        # '{REGIME}_{zero_padding_multiple}_{bin_width}' pattern the combining
        # cell reads back.
        # NOTE(review): bin_width is not passed to fft_transform here; it only
        # labels the output files -- confirm that is intended.
        file_suffix = f'{regime}_{zero_padding_multiple}_{bin_width}.npy'

        # One scaler per combination: fit on train, reused for test/val so
        # all three splits share the same normalisation statistics.
        scaler = StandardScaler()

        for set_type, set_type_files, source_dir in splits:

            file_names = list()
            y = list()
            arrays = list()
            # Labels, index names and features are built in one pass so that
            # row i of X always corresponds to y[i] and file_names[i].
            for file in set_type_files:
                bearing_code = file.split('_')[3]
                # splitext drops only the extension; str.strip('.npy')
                # removes any leading/trailing '.', 'n', 'p', 'y' chars.
                file_names.append(os.path.splitext(file)[0])
                y.append(y_map[bearing_code])

                feature_array = np.load(f'{source_dir}/{file}', allow_pickle = True).astype('float32').ravel()
                feature_array = fft_transform(feature_array,
                          zero_padding_multiple_ = zero_padding_multiple,
                          num_coefs_to_save_ = NUM_COEF_TO_SAVE * zero_padding_multiple)
                # Row vector so the per-file features stack along axis 0.
                arrays.append(np.reshape(feature_array, (1, feature_array.shape[0])))

            np.save(f'{write_data_dir}/y_{set_type}_{file_suffix}',y)
            # NOTE(review): this index file name does not include the regime,
            # so every regime overwrites the previous one.
            txt_module.list_to_txt(file_names, f'{write_data_dir}/x_{set_type}_index.txt')

            X = np.concatenate(arrays, axis = 0); del arrays
            if set_type == 'train':
                X = scaler.fit_transform(X)
            else:
                X = scaler.transform(X)
            np.save(f'{write_data_dir}/x_{set_type}_{file_suffix}', X); del X

15 1 N09_M07_F10
15 1 N15_M07_F10
15 1 N15_M01_F10
15 1 N15_M07_F04


In [5]:
def read_npy(file_path_):
    """Load the array stored at `file_path_` with numpy, allowing pickled objects."""
    loaded = np.load(file=file_path_, allow_pickle=True)
    return loaded

# Join the saved FFT and wavelet feature matrices column-wise for every regime,
# standardise the combined features with a scaler fit on the train split, and
# save the combined X / y arrays under write_data_dir.
SET_TYPES = ['train', 'test', 'val']  # 'train' first: the scaler is fit on it

for REGIME in ['N09_M07_F10', 'N15_M07_F10', 'N15_M01_F10', 'N15_M07_F04']:
    for fft_bin_width, fft_zero_padding_multiple in [(15, 1)]:

        fft_file_suffix = f'{REGIME}_{fft_zero_padding_multiple}_{fft_bin_width}.npy'
        # Keys are 'x_train', 'x_test', 'x_val', 'y_train', 'y_test', 'y_val'.
        fft_sets = {
            f'{kind}_{set_type}': read_npy(f'{write_data_dir}/{kind}_{set_type}_{fft_file_suffix}')
            for kind in ['x', 'y']
            for set_type in SET_TYPES
        }

        for wavelet_levels, wavelet_bin_width in [(3,700)]:

            wavelet_file_suffix = f'{REGIME}_{wavelet_levels}_{wavelet_bin_width}.npy'
            wavelet_sets = {
                f'{kind}_{set_type}': read_npy(f'{write_data_dir}/{kind}_{set_type}_{wavelet_file_suffix}')
                for kind in ['x', 'y']
                for set_type in SET_TYPES
            }

            combined_id = f'fft_{fft_bin_width}_{fft_zero_padding_multiple}_' \
                          f'wavelet_{wavelet_bin_width}_{wavelet_levels}'

            scaler = StandardScaler()

            for set_type in SET_TYPES:

                # The two feature sets must describe the same samples in the
                # same order. The previous check compared the FFT labels with
                # themselves and could never fail.
                assert fft_sets[f'y_{set_type}'].tolist() == wavelet_sets[f'y_{set_type}'].tolist(), f'diff in  y_{set_type}'

                # features
                x_set = np.concatenate([fft_sets[f'x_{set_type}'], wavelet_sets[f'x_{set_type}']], axis = 1)

                # Fit on train only; test and val reuse the train statistics.
                if set_type == 'train':
                    x_set_scaled = scaler.fit_transform(x_set)
                else:
                    x_set_scaled = scaler.transform(x_set)
                del x_set

                np.save(f'{write_data_dir}/x_{set_type}_{REGIME}_{combined_id}.npy', x_set_scaled)
                del x_set_scaled

                # target
                np.save(f'{write_data_dir}/y_{set_type}_{REGIME}_{combined_id}.npy', fft_sets[f'y_{set_type}'])