In [5]:

import numpy as np
import os
import txt_module, json_module, dirs_module

# Input directory: one .npy file per measurement plus an index file.
data_dir = r'D:\FINKI\40_diploma_thesis\data\datasets_numpy'
# Feature (channel) names; the oversampling loop pairs them by position with
# the arrays stored in each measurement file. The path is derived from
# data_dir so the index and the data it describes cannot drift apart.
arrays_index = txt_module.read_list_from_txt(os.path.join(data_dir, 'index.txt'))
# Per-feature sample count used to truncate every recording to a common length
# (presumably the shortest recording of that feature -- confirm against the
# script that produced this JSON).
min_measurement_lengths = json_module.read_json(r'D:\FINKI\40_diploma_thesis\metadata\min_measurement_lengths.json')

# Output directory for the generated sub-signals.
# NOTE(review): the folder name says "Offset001" while the offsets configured
# in this notebook are 0.05 s -- confirm the name is intentional.
write_data_dir = r'D:\FINKI\40_diploma_thesis\data\datasets_numpy_1S_Offset001_Debug'
dirs_module.create_directory(write_data_dir)

In [6]:
# Bearing codes whose recordings are split into overlapping sub-signals.
# Healthy bearings are currently excluded; to include them use:
# healthy_train = ['K001','K002']
healthy_train = list()
real_damage_train = ['KB24', 'KI16']
datasets_to_oversample = [*healthy_train, *real_damage_train]

# Length of every generated sub-signal, in seconds.
NEW_SIGNAL_DURATION_SECONDS = 1
# Shift between the starts of consecutive sub-signals, in seconds, chosen
# separately for healthy and damaged bearings.
OFFSET_SECONDS_HEALTHY = 0.05
OFFSET_SECONDS_DAMAGED = 0.05

In [7]:
def number_of_samples_per_second_from_string(hz_string):
    """Parse the sampling rate encoded in a feature name.

    The rate is read from the second '_'-separated token of ``hz_string``,
    e.g. ``'vibration_64kHz'`` -> 64000 and ``'speed_100Hz'`` -> 100.

    Parameters
    ----------
    hz_string : str
        Feature name of the form ``<name>_<rate>Hz`` or ``<name>_<rate>kHz``.

    Returns
    -------
    int
        Samples per second.

    Raises
    ------
    IndexError
        If ``hz_string`` contains no '_'.
    ValueError
        If the rate token is not an integer once the unit is removed.
    """
    # Drop the 'Hz' unit characters; a remaining 'k' marks kilohertz.
    rate_token = hz_string.split('_')[1].strip('Hz')
    if not rate_token.endswith('k'):
        return int(rate_token)
    return int(rate_token.strip('k')) * 1000

# Sampling rate (samples per second) of every feature, parsed from its name.
samples_per_second_dict = dict(
    zip(arrays_index, map(number_of_samples_per_second_from_string, arrays_index))
)
# Window shift per feature, expressed in samples. The values are floats
# (rate * seconds); they are truncated with int() where they are applied.
healthy_offset_samples_dict = {
    name: rate * OFFSET_SECONDS_HEALTHY for name, rate in samples_per_second_dict.items()
}
damaged_offset_samples_dict = {
    name: rate * OFFSET_SECONDS_DAMAGED for name, rate in samples_per_second_dict.items()
}

In [8]:
# Split every selected recording into overlapping fixed-length sub-signals and
# save each sub-signal as "<original name>_<running id>.npy" in write_data_dir.

# Sub-signal length in samples for every feature (loop-invariant).
window_samples_dict = {feature: samples_per_second_dict[feature] * NEW_SIGNAL_DURATION_SECONDS
                       for feature in arrays_index}

for file in os.listdir(data_dir):
    if file.startswith('index'): continue
    # The bearing code is the 4th '_'-separated token of the file name. Names
    # with fewer tokens cannot belong to a selected dataset, so they are
    # skipped instead of raising IndexError.
    file_name_parts = file.split('_')
    if len(file_name_parts) < 4 or file_name_parts[3] not in datasets_to_oversample: continue
    bearing_code = file_name_parts[3]

    # NOTE: allow_pickle=True can execute arbitrary code on load; only use it
    # on files produced by this project.
    file_ndarray = np.load(os.path.join(data_dir, file), allow_pickle = True)

    # Remove the extension properly: str.strip(".npy") strips the *characters*
    # '.', 'n', 'p', 'y' from both ends and can eat parts of the file name.
    output_stem = os.path.splitext(file)[0]

    # Bearings whose code starts with 'K00' use the healthy offset, all others
    # the damaged one.
    if bearing_code.startswith('K00'):
        offset_samples_dict = healthy_offset_samples_dict
    else:
        offset_samples_dict = damaged_offset_samples_dict

    # Truncate and convert every channel once per file instead of once per
    # generated sub-signal.
    prepared_features = []
    for feature_position, feature_array in enumerate(file_ndarray):
        feature = arrays_index[feature_position]
        # ignore temperature since it is measured in 1Hz
        if feature.startswith('Temp'): continue
        prepared_features.append(
            (feature, feature_array[:min_measurement_lengths[feature]].astype(np.float32)))

    current_start_position = {feature: 0 for feature in arrays_index}

    new_signal_id = 0
    more_signals_to_generate = True
    while more_signals_to_generate:
        new_signal_id += 1

        # generating subsignals
        new_file_list_to_ndarray = list()
        for feature, feature_array in prepared_features:
            start = int(current_start_position[feature])
            end = int(current_start_position[feature] + window_samples_dict[feature])
            new_file_list_to_ndarray.append(feature_array[start:end])

        # new start position
        current_start_position = {feature: int(current_start_pos + offset_samples_dict[feature])
                                  for feature, current_start_pos in current_start_position.items()}

        # break condition: stop once the next window of any feature would run
        # past that feature's usable length (the current window is still saved)
        more_signals_to_generate = not any(
            new_start_position + window_samples_dict[feature] > min_measurement_lengths[feature]
            for feature, new_start_position in current_start_position.items())

        # Channels sampled at different rates give windows of different
        # lengths. np.array() on such a ragged list raises ValueError on
        # NumPy >= 1.24, so build the 1-D object array explicitly in that case;
        # equal-length windows keep the regular 2-D array as before.
        if len({len(sub_signal) for sub_signal in new_file_list_to_ndarray}) > 1:
            new_signal = np.empty(len(new_file_list_to_ndarray), dtype = object)
            for sub_signal_position, sub_signal in enumerate(new_file_list_to_ndarray):
                new_signal[sub_signal_position] = sub_signal
        else:
            new_signal = np.array(new_file_list_to_ndarray)

        np.save(os.path.join(write_data_dir, f'{output_stem}_{new_signal_id}.npy'), new_signal)


In [5]:
# Feature names of the generated files: the original index without the
# temperature channels, which the oversampling loop leaves out.
new_arrays_index = list(filter(lambda name: not name.startswith('Temp'), arrays_index))