In [7]:
import os
import shutil
import random
from pathlib import Path
import h5py
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

In [3]:
def load_data(file_path):
    """
    Read the 'vibration_data' dataset of an HDF5 file into a DataFrame.

    Args:
        file_path: Path of the .h5 file to open (read-only).

    Returns:
        pd.DataFrame: Columns 'X', 'Y' and 'Z', taken from dataset
        columns 0, 1 and 2 respectively.
    """
    with h5py.File(file_path, 'r') as h5_handle:
        # [:] copies the dataset into memory so it outlives the file handle
        samples = h5_handle['vibration_data'][:]

    axis_columns = {}
    for column_index, axis_name in enumerate(('X', 'Y', 'Z')):
        axis_columns[axis_name] = samples[:, column_index]
    return pd.DataFrame(axis_columns)

In [4]:
def find_all_h5s_in_dir(s_dir):
    """
    List all .h5 files found under a directory (searched recursively).

    Args:
        s_dir (str or Path): Directory to search.

    Returns:
        list[str]: Sorted file paths relative to `s_dir`. A file directly
        inside `s_dir` is returned as its bare name; a file in a
        sub-directory keeps its relative sub-path, so that
        `os.path.join(s_dir, entry)` always points at the real file.
        A missing directory yields an empty list.
    """
    fileslist = []
    for root, _dirs, files in os.walk(s_dir):
        for file in files:
            if file.endswith(".h5"):
                # Keep the sub-directory part; a bare name would be ambiguous
                # (and not re-openable) for files found in nested folders.
                fileslist.append(os.path.relpath(os.path.join(root, file), s_dir))
    # os.walk order depends on the filesystem; sort for reproducible results
    return sorted(fileslist)

In [5]:
# Define a function to load H5 files and calculate the duration in seconds
def load_h5_files_and_calculate_duration(data_root, machines=('M01', 'M02', 'M03'), sample_rate=2000):
    """
    Collect one metadata record per .h5 file under `data_root`.

    The expected layout is `<data_root>/<machine>/<operation>/<good|bad>/*.h5`.

    Args:
        data_root (str or Path): Root directory containing the machine folders.
        machines (iterable of str): Machine folder names to scan.
        sample_rate (int or float): Sampling frequency in Hz used to convert
            the number of samples into seconds (defaults to 2000 Hz).

    Returns:
        list[dict]: Records with the keys 'Machine', 'Operation',
        'Sample Type', 'File Name' and 'Duration (s)'.

    Raises:
        FileNotFoundError: If a machine folder does not exist.
    """
    data_list = []

    for machine in machines:
        machine_dir = os.path.join(data_root, machine)
        # Sorted so the record order does not depend on the filesystem
        for operation in sorted(os.listdir(machine_dir)):
            operation_dir = os.path.join(machine_dir, operation)
            if not os.path.isdir(operation_dir):
                continue

            for label in ['good', 'bad']:
                data_path = os.path.join(operation_dir, label)

                for file in find_all_h5s_in_dir(data_path):
                    file_path = os.path.join(data_path, file)
                    with h5py.File(file_path, 'r') as f:
                        # Only the length is needed, so read the dataset's
                        # shape instead of loading the samples into memory.
                        n_samples = f['vibration_data'].shape[0]

                    data_list.append({
                        'Machine': machine,
                        'Operation': operation,
                        'Sample Type': label,
                        'File Name': file,
                        'Duration (s)': n_samples / sample_rate
                    })

    return data_list

In [41]:
def balance_dataset(machines, selected_operations, data_root, good_per_bad=2, seed=None):
    """
    Copy a class-balanced subset of the raw recordings into `<data_root>/Selected_data`.

    For every machine/operation that has at least one 'bad' .h5 file, all bad
    files are copied, together with a random selection of
    `good_per_bad * number_of_bad_files` good files. If fewer good files are
    available, all of them are used instead of failing.

    Args:
        machines (iterable of str): Machine folder names (e.g. 'M01').
        selected_operations (iterable of str): Operation folder names (e.g. 'OP01').
        data_root (str or Path): Root of the raw data; must already exist.
        good_per_bad (int): Number of good files to draw per bad file.
        seed (int or None): Seed for a private random generator. With None the
            module-level `random` state is used.

    Returns:
        list[tuple]: `(machine, operation, label, file_name)` for every file
        that was copied successfully.
    """
    balanced_data = []
    balanced_data_dir = Path(data_root) / "Selected_data"
    balanced_data_dir.mkdir(exist_ok=True)
    print(f"Directory '{balanced_data_dir}' created")

    # A private generator makes a seeded run reproducible without touching global state
    rng = random.Random(seed) if seed is not None else random

    for machine in machines:
        for operation in selected_operations:
            operation_dir = Path(data_root) / machine / operation
            bad_folder = operation_dir / 'bad'
            good_folder = operation_dir / 'good'

            # Operations without any bad sample are left out entirely
            if not bad_folder.exists():
                continue
            bad_files = sorted(f for f in bad_folder.iterdir() if f.is_file() and f.suffix == '.h5')
            if not bad_files:
                continue

            if good_folder.exists():
                # Sorted so that a seeded draw does not depend on directory listing order
                good_files = sorted(f for f in good_folder.iterdir() if f.is_file() and f.suffix == '.h5')
            else:
                print(f"Good folder not found: {good_folder}")
                good_files = []

            # random.sample raises if asked for more items than exist, so cap the request
            wanted = good_per_bad * len(bad_files)
            if wanted > len(good_files):
                print(f"Only {len(good_files)} good files in '{good_folder}' (wanted {wanted}); using all of them")
                wanted = len(good_files)
            random_good_files = rng.sample(good_files, wanted)

            for label, files in (('bad', bad_files), ('good', random_good_files)):
                if not files:
                    continue
                dest_dir = balanced_data_dir / machine / operation / label
                dest_dir.mkdir(parents=True, exist_ok=True)  # Create the directory structure

                for file in files:
                    dest = dest_dir / file.name
                    try:
                        shutil.copy2(file, dest)
                    except shutil.Error as e:
                        print(f"Error occurred while copying file: {e}")
                        continue
                    except OSError as e:
                        print(f"Error occurred while accessing file: {e.strerror or e}")
                        continue

                    print(f"Copied '{file}' to '{dest}'")
                    # Only files that really were copied are reported back
                    balanced_data.append((machine, operation, label, file.name))

    return balanced_data

In [26]:
def trim_and_window(time_series, label, window_len=None):
    """
    Trim a recording symmetrically and cut it into non-overlapping, fixed-length windows.

    * BAD: only the leftover that does not fill a whole window is trimmed
      (half from each end).
    * GOOD: if there is a leftover, one extra window length is trimmed as
      well, so a GOOD recording with a leftover yields one window fewer than
      it has full windows.

    Args:
        time_series (np.array): Samples along axis 0; a 1D array or a 2D
            array such as (n_samples, n_axes). Windows are cut along axis 0.
        label (str): "GOOD" or "BAD" (case-insensitive).
        window_len (int or None): Window length in samples. When None, the
            notebook-level `window_size` variable is used.

    Returns:
        list: Windows of exactly `window_len` samples each (possibly empty).

    Raises:
        ValueError: If `label` is not GOOD/BAD or the window length is not positive.
    """
    # Fall back to the notebook-level setting when no explicit length is given
    size = window_size if window_len is None else window_len
    if size <= 0:
        raise ValueError(f"window length must be positive, got {size}")

    label_key = str(label).upper()
    if label_key not in ('GOOD', 'BAD'):
        raise ValueError(f"label must be 'GOOD' or 'BAD', got {label!r}")

    total_len = time_series.shape[0]  # Total number of data points
    n_windows = total_len // size  # Number of full windows
    remainder = total_len % size  # Remaining points after full windows

    print(f"Total length of time series: {total_len} data points")
    print(f"Number of full windows: {n_windows}, Remainder: {remainder} data points")

    if label_key == 'GOOD' and remainder > 0:
        # Despite the message, this enlarges the amount that is trimmed away
        # by one window length.
        print("Adding another window for GOOD class due to leftover")
        remainder += size

    trim_amount = remainder // 2
    print(f"Trimming {trim_amount} data points from each side for {label_key} class")
    trimmed_series = time_series[trim_amount:total_len - trim_amount]

    print(f"Trimmed time series length: {len(trimmed_series)} data points")

    # An odd remainder leaves one surplus point at the end; the range below
    # only yields full windows, so that point is dropped.
    windows = [trimmed_series[i:i + size] for i in range(0, len(trimmed_series) - size + 1, size)]
    print(f"Number of 5-second windows created: {len(windows)}")

    return windows

In [27]:
def process_and_save_windows(machine, operation, label, folder):
    """
    Cut every recording in `folder` into windows and store each window as its own .h5 file.

    Output files are written to `window_save_root / machine / operation / label`
    and are named `<source stem>_window_<index>.h5`.

    Args:
        machine (str): Machine ID (e.g., 'M01').
        operation (str): Operation ID (e.g., 'OP01').
        label (str): 'good' or 'bad'; passed upper-cased to trim_and_window.
        folder (Path): Folder holding the source .h5 files for this label.
    """
    # Destination folder mirrors the machine/operation/label layout of the input
    out_dir = window_save_root / machine / operation / label
    out_dir.mkdir(parents=True, exist_ok=True)
    print(f"Processing {label.upper()} files in {folder}")

    for source_path in folder.glob("*.h5"):
        print(f"Processing file: {source_path}")
        source_stem = source_path.stem

        with h5py.File(source_path, 'r') as source:
            # Files without the expected dataset are reported and skipped
            if 'vibration_data' not in source:
                print(f"No 'vibration_data' dataset in file: {source_path}")
                continue

            signal = source['vibration_data'][:]
            print(f"Loaded time series data of shape {signal.shape}")

            segments = trim_and_window(signal, label.upper())

            # One output file per window, numbered in window order
            for position in range(len(segments)):
                target_path = out_dir / f"{source_stem}_window_{position}.h5"
                with h5py.File(target_path, 'w') as target:
                    target.create_dataset('vibration_data', data=segments[position])
                    print(f"Saved window {position} for {source_stem} to {target_path}")

In [36]:
# Define a function to group the dataset into 'good' and 'bad' folders
def group_dataset(machines, selected_operations, data_root, windowed_root):
    """
    Flatten the per-machine/per-operation windowed files into two folders, 'bad' and 'good'.

    Files are copied from `<windowed_root>/<machine>/<operation>/<label>/*.h5`
    to `<data_root>/Selected_data_windowed_grouped/<label>/`. Source folders
    that do not exist are reported and skipped.

    Args:
        machines (iterable of str): Machine folder names.
        selected_operations (iterable of str): Operation folder names.
        data_root (str or Path): Directory in which the grouped folder is created.
        windowed_root (str or Path): Root of the windowed data.

    Returns:
        list[tuple]: `(machine, operation, label, file_name)` for every file
        that was copied successfully.
    """
    grouped_data = []
    windowed_data_dir = Path(data_root) / "Selected_data_windowed_grouped"
    windowed_data_dir.mkdir(parents=True, exist_ok=True)
    print(f"Directory '{windowed_data_dir}' created")

    # One destination folder per class, created up front
    label_dirs = {}
    for label in ('bad', 'good'):
        label_dir = windowed_data_dir / label
        label_dir.mkdir(exist_ok=True)
        print(f"Directory '{label_dir}' created")
        label_dirs[label] = label_dir

    for machine in machines:
        for operation in selected_operations:
            for label, dest_dir in label_dirs.items():
                source_folder = Path(windowed_root) / machine / operation / label
                # Not every machine/operation has both classes; skip instead of crashing
                if not source_folder.exists():
                    print(f"{label.capitalize()} folder not found: {source_folder}")
                    continue

                # Sorted so the result does not depend on directory listing order
                files = sorted(f for f in source_folder.iterdir() if f.is_file() and f.suffix == '.h5')

                for file in files:
                    dest = dest_dir / file.name
                    try:
                        shutil.copy2(file, dest)
                    except shutil.Error as e:
                        print(f"Error occurred while copying file: {e}")
                        continue
                    except OSError as e:
                        print(f"Error occurred while accessing file: {e.strerror or e}")
                        continue

                    print(f"Copied '{file}' to '{dest}'")
                    # Only files that really were copied are reported back
                    grouped_data.append((machine, operation, label, file.name))

    return grouped_data

In [45]:
# Step 1: Balance Data
# Copies the bad files and a random selection of good files for the chosen
# machines/operations into data_root / "Selected_data".
# The good files are drawn with random.sample and no seed is set in this
# notebook, so every run may select different files.
# NOTE(review): the copies land in ../data/Selected_data, but the windowing
# step reads ../data/final/Selected_data -- presumably the folder is moved by
# hand in between; confirm.
selected_operations = ['OP01', 'OP02', 'OP04', 'OP07','OP10']
machines = ['M01', 'M02', 'M03']
data_root = Path("../data")

balanced_data = balance_dataset(machines, selected_operations, data_root)
# Prints the number of entries followed by the full list of (machine, operation, label, file name) tuples
print(len(balanced_data), balanced_data)

Directory '..\data\Selected_data' created
Copied '..\data\M01\OP01\bad\M01_Aug_2019_OP01_000.h5' to '..\data\Selected_data\M01\OP01\bad\M01_Aug_2019_OP01_000.h5'
Copied '..\data\M01\OP01\bad\M01_Feb_2019_OP01_000.h5' to '..\data\Selected_data\M01\OP01\bad\M01_Feb_2019_OP01_000.h5'
Copied '..\data\M01\OP01\good\M01_Aug_2019_OP01_004.h5' to '..\data\Selected_data\M01\OP01\good\M01_Aug_2019_OP01_004.h5'
Copied '..\data\M01\OP01\good\M01_Feb_2019_OP01_000.h5' to '..\data\Selected_data\M01\OP01\good\M01_Feb_2019_OP01_000.h5'
Copied '..\data\M01\OP01\good\M01_Feb_2019_OP01_004.h5' to '..\data\Selected_data\M01\OP01\good\M01_Feb_2019_OP01_004.h5'
Copied '..\data\M01\OP01\good\M01_Feb_2020_OP01_000.h5' to '..\data\Selected_data\M01\OP01\good\M01_Feb_2020_OP01_000.h5'
Copied '..\data\M01\OP02\bad\M01_Feb_2019_OP02_000.h5' to '..\data\Selected_data\M01\OP02\bad\M01_Feb_2019_OP02_000.h5'
Copied '..\data\M01\OP02\good\M01_Feb_2021_OP02_006.h5' to '..\data\Selected_data\M01\OP02\good\M01_Feb_2021_O

In [30]:
# Define the root of balanced data and configurations for windowing
# trim_and_window reads `window_size` and process_and_save_windows reads
# `window_save_root` as notebook-level globals, so this cell has to run
# before either function is called.
balanced_root = Path("../data/final/Selected_data/")
window_save_root = Path("../data/final/Selected_data_windowed")
window_size = 10000  # 5 seconds * 2000Hz = 10000 data points

In [31]:
# Window every balanced recording: walk machine -> operation -> class and
# hand each existing class folder to process_and_save_windows.
for machine in machines:
    for operation in selected_operations:
        operation_dir = Path(balanced_root) / machine / operation

        # 'bad' is handled before 'good' for each operation
        for label in ('bad', 'good'):
            label_dir = operation_dir / label
            if not label_dir.exists():
                print(f"{label.capitalize()} folder not found: {label_dir}")
                continue
            process_and_save_windows(machine, operation, label, label_dir)

Processing BAD files in ..\data\final\Selected_data\M01\OP01\bad
Processing file: ..\data\final\Selected_data\M01\OP01\bad\M01_Aug_2019_OP01_000.h5
Loaded time series data of shape (38983, 3)
Total length of time series: 38983 data points
Number of full windows: 3, Remainder: 8983 data points
Trimming 4491 data points from each side for BAD class
Trimmed time series length: 30001 data points
Number of 5-second windows created: 3
Saved window 0 for M01_Aug_2019_OP01_000 to ..\data\final\Selected_data_windowed\M01\OP01\bad\M01_Aug_2019_OP01_000_window_0.h5
Saved window 1 for M01_Aug_2019_OP01_000 to ..\data\final\Selected_data_windowed\M01\OP01\bad\M01_Aug_2019_OP01_000_window_1.h5
Saved window 2 for M01_Aug_2019_OP01_000 to ..\data\final\Selected_data_windowed\M01\OP01\bad\M01_Aug_2019_OP01_000_window_2.h5
Processing file: ..\data\final\Selected_data\M01\OP01\bad\M01_Feb_2019_OP01_000.h5
Loaded time series data of shape (47081, 3)
Total length of time series: 47081 data points
Number of

In [32]:
# Check the windowed data by listing every saved window file with its duration.
# Nothing is asserted here: the durations are only printed for visual inspection.
# load_h5_files_and_calculate_duration converts sample counts to seconds
# assuming a 2000 Hz sampling rate.
# Define the root directory where your data is stored
window_root = "../data/final/Selected_data_windowed/"

# Create a DataFrame (one row per window file)
data_list = load_h5_files_and_calculate_duration(window_root)
df = pd.DataFrame(data_list)

# Print the DataFrame
print(df)

    Machine Operation Sample Type                          File Name  \
0       M01      OP01        good  M01_Aug_2019_OP01_004_window_0.h5   
1       M01      OP01        good  M01_Aug_2019_OP01_004_window_1.h5   
2       M01      OP01        good  M01_Aug_2019_OP01_004_window_2.h5   
3       M01      OP01        good  M01_Aug_2019_OP01_004_window_3.h5   
4       M01      OP01        good  M01_Aug_2019_OP01_004_window_4.h5   
..      ...       ...         ...                                ...   
682     M03      OP10         bad  M03_Aug_2021_OP10_000_window_4.h5   
683     M03      OP10         bad  M03_Aug_2021_OP10_000_window_5.h5   
684     M03      OP10         bad  M03_Aug_2021_OP10_000_window_6.h5   
685     M03      OP10         bad  M03_Aug_2021_OP10_000_window_7.h5   
686     M03      OP10         bad  M03_Aug_2021_OP10_000_window_8.h5   

     Duration (s)  
0             5.0  
1             5.0  
2             5.0  
3             5.0  
4             5.0  
..            .

In [33]:
# save the DataFrame to a CSV file
csv_filename = '../data/data_info/dataset_windowed_info.csv'

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing.
Path(csv_filename).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_filename, index=False)

# Print a message to confirm the file has been saved
print(f"DataFrame saved to {csv_filename}")


DataFrame saved to ../data/data_info/dataset_windowed_info.csv


In [34]:
# print the number of samples for each class
# The summary is re-read from the CSV written earlier; this rebinds `df`
# to the frame loaded from disk.
csv_filename = '../data/data_info/dataset_windowed_info.csv'
df = pd.read_csv(csv_filename)

# Count the number of "good" and "bad" samples (one row = one window file)
sample_counts = df['Sample Type'].value_counts()

# Print the counts
print("Sample Type Counts:")
print(sample_counts)

Sample Type Counts:
Sample Type
good    468
bad     219
Name: count, dtype: int64


In [35]:
# Break the counts down further: number of rows per (Machine, Sample Type) pair
grouped_counts = df.groupby(['Machine', 'Sample Type']).size()

print("\nSample counts per machine:")
print(grouped_counts)


Sample counts per machine:
Machine  Sample Type
M01      bad             78
         good           179
M02      bad             86
         good           177
M03      bad             55
         good           112
dtype: int64


In [38]:
# group the dataset into 'good' and 'bad' folders
# group_dataset copies the windowed files into
# data_root / "Selected_data_windowed_grouped" / <bad|good>.
# Note: `data_root` is rebound here to ../data/final (the Step 1 cell sets it to ../data).
windowed_root = Path("../data/final/Selected_data_windowed")
data_root = Path("../data/final")

grouped_data = group_dataset(machines, selected_operations, data_root, windowed_root)
# Prints the number of entries followed by the full list of (machine, operation, label, file name) tuples
print(len(grouped_data), grouped_data)

Directory '..\data\final\Selected_data_windowed_grouped' created
Directory '..\data\final\Selected_data_windowed_grouped\bad' created
Directory '..\data\final\Selected_data_windowed_grouped\good' created
Copied '..\data\final\Selected_data_windowed\M01\OP01\bad\M01_Aug_2019_OP01_000_window_0.h5' to '..\data\final\Selected_data_windowed_grouped\bad\M01_Aug_2019_OP01_000_window_0.h5'
Copied '..\data\final\Selected_data_windowed\M01\OP01\bad\M01_Aug_2019_OP01_000_window_1.h5' to '..\data\final\Selected_data_windowed_grouped\bad\M01_Aug_2019_OP01_000_window_1.h5'
Copied '..\data\final\Selected_data_windowed\M01\OP01\bad\M01_Aug_2019_OP01_000_window_2.h5' to '..\data\final\Selected_data_windowed_grouped\bad\M01_Aug_2019_OP01_000_window_2.h5'
Copied '..\data\final\Selected_data_windowed\M01\OP01\bad\M01_Feb_2019_OP01_000_window_0.h5' to '..\data\final\Selected_data_windowed_grouped\bad\M01_Feb_2019_OP01_000_window_0.h5'
Copied '..\data\final\Selected_data_windowed\M01\OP01\bad\M01_Feb_2019_O

In [45]:
# Normalize the data: z-score every window with the per-axis mean and std of
# its machine-operation group (statistics are computed over the good and bad
# windows of that group together).

# Define directories
input_root = Path("../data/final/Selected_data_windowed_grouped")
output_root = Path("../data/final/Selected_data_windowed_grouped_normalized")

# Ensure output directory exists
output_root.mkdir(exist_ok=True)

# Group the input files by machine-operation combination
machine_operations = {}

for label in ['bad', 'good']:
    input_folder = input_root / label
    if not input_folder.exists():
        print(f"Skipping missing folder: {input_folder}")
        continue

    for file_path in input_folder.glob("*.h5"):
        file_name = file_path.name
        parts = file_name.split('_')  # Example: M01_Aug_2019_OP01_000_window_0.h5
        if len(parts) < 4:
            # The group key needs fields 0 and 3 of the file name
            print(f"Skipping file with unexpected name: {file_path}")
            continue
        machine = parts[0]  # e.g., M01
        operation = parts[3]  # e.g., OP01
        key = f"{machine}_{operation}"

        machine_operations.setdefault(key, []).append(file_path)

# Step 1: Compute mean & std per machine-operation
machine_operation_stats = {}

for key, files in machine_operations.items():
    all_data = []

    for file_path in files:
        with h5py.File(file_path, 'r') as file:
            vibration_data = file['vibration_data'][:]  # Shape: (time_steps, 3)
            all_data.append(vibration_data)

    all_data = np.vstack(all_data)  # Combine all time steps
    mean_vals = np.mean(all_data, axis=0)
    std_vals = np.std(all_data, axis=0)

    std_vals[std_vals == 0] = 1  # Avoid division by zero

    machine_operation_stats[key] = (mean_vals, std_vals)
    print(f"Computed Mean & Std for {key} → Mean: {mean_vals}, Std: {std_vals}")

# Step 2: Normalize each file using its machine-operation group stats
for key, files in machine_operations.items():
    mean_vals, std_vals = machine_operation_stats[key]

    for file_path in files:
        with h5py.File(file_path, 'r') as file:
            vibration_data = file['vibration_data'][:]

        # Debug: Print mean/std of a sample before normalization
        print(f"\nBefore Normalization ({file_path.name}):")
        print("Mean:", np.mean(vibration_data, axis=0))
        print("Std:", np.std(vibration_data, axis=0))

        # Normalize using machine-operation-specific mean & std
        normalized_data = (vibration_data - mean_vals) / std_vals

        # Debug: Print mean/std after normalization. A single window is not
        # expected to reach exactly 0 / 1, because the statistics belong to
        # the whole machine-operation group.
        print(f"After Normalization ({file_path.name}):")
        print("Mean:", np.mean(normalized_data, axis=0))
        print("Std:", np.std(normalized_data, axis=0))

        # The class is the name of the folder the file was read from. A
        # substring test on the whole path would mislabel every file as soon
        # as any parent directory name contains "bad".
        label = file_path.parent.name
        output_folder = output_root / label
        output_folder.mkdir(parents=True, exist_ok=True)

        output_file_path = output_folder / file_path.name

        # Save the normalized data
        with h5py.File(output_file_path, 'w') as new_file:
            new_file.create_dataset('vibration_data', data=normalized_data)

        print(f"✅ Saved normalized file: {output_file_path}")

print("\n✅ Normalization complete! All files saved in:", output_root)


Computed Mean & Std for M01_OP01 → Mean: [ 3.42948148e-01  3.23442111e+01 -1.02897044e+03], Std: [434.4355121  154.12715324 187.91984521]
Computed Mean & Std for M01_OP04 → Mean: [   -1.59996212    33.08841061 -1020.28683485], Std: [440.91395592 197.63331634 210.56282007]
Computed Mean & Std for M01_OP07 → Mean: [   -1.37118667    32.00749111 -1018.33340667], Std: [474.90197891 241.18713304 235.92104658]
Computed Mean & Std for M01_OP10 → Mean: [-3.23483673e-01  3.28462122e+01 -1.01927697e+03], Std: [437.57569471 228.00417256 278.38184551]
Computed Mean & Std for M01_OP02 → Mean: [   -3.1969        32.40517619 -1021.14022857], Std: [391.99500611 257.61986495 177.47244471]
Computed Mean & Std for M02_OP01 → Mean: [    3.38152973     5.77617297 -1036.17444865], Std: [430.82131946 193.21615213 210.24663215]
Computed Mean & Std for M02_OP04 → Mean: [    5.00936154     6.15398942 -1038.27358654], Std: [452.21923    270.56797662 303.63555488]
Computed Mean & Std for M02_OP07 → Mean: [    6.6

In [6]:
# Check the normalized data by listing every saved file with its duration

normalized_root = "../data/final/Selected_data_windowed_grouped_normalized"

# List to store file metadata
data_list_normed = []

# Sampling rate (2 kHz)
SAMPLE_RATE = 2000

# Set to True to also write the summary table to a CSV file
SAVE_NORMALIZED_SUMMARY = False

# Iterate through both 'bad' and 'good' folders
for sample_type in ["bad", "good"]:
    folder_path = Path(normalized_root) / sample_type
    if not folder_path.exists():
        print(f"Skipping missing folder: {folder_path}")
        continue

    # Iterate through all .h5 files
    for file_path in folder_path.glob("*.h5"):
        file_name = file_path.name  # Extract file name

        # Only the length is needed, so read the dataset's shape instead of
        # loading all samples into memory.
        with h5py.File(file_path, 'r') as file:
            num_samples = file['vibration_data'].shape[0]  # Number of time steps
        duration = num_samples / SAMPLE_RATE  # Compute duration in seconds

        # Append data to the list
        data_list_normed.append({
            "Sample Type": sample_type,
            "File Name": file_name,
            "Duration (s)": duration
        })

# Convert to Pandas DataFrame
df_normed = pd.DataFrame(data_list_normed)

# Print the result
print(df_normed)

# Report a saved file only when one was actually written
if SAVE_NORMALIZED_SUMMARY:
    df_normed.to_csv("normalized_data_summary.csv", index=False)
    print("\n✅ Data saved to normalized_data_summary.csv")
else:
    print(f"\n✅ Checked {len(df_normed)} normalized files (summary CSV not written)")


    Sample Type                          File Name  Duration (s)
0           bad  M01_Aug_2019_OP01_000_window_0.h5           5.0
1           bad  M01_Aug_2019_OP01_000_window_1.h5           5.0
2           bad  M01_Aug_2019_OP01_000_window_2.h5           5.0
3           bad  M01_Aug_2019_OP04_000_window_0.h5           5.0
4           bad  M01_Aug_2019_OP04_000_window_1.h5           5.0
..          ...                                ...           ...
682        good  M03_Feb_2021_OP07_003_window_2.h5           5.0
683        good  M03_Feb_2021_OP07_004_window_0.h5           5.0
684        good  M03_Feb_2021_OP07_004_window_1.h5           5.0
685        good  M03_Feb_2021_OP07_004_window_2.h5           5.0
686        good  M03_Feb_2021_OP07_004_window_3.h5           5.0

[687 rows x 3 columns]

✅ Data saved to normalized_data_summary.csv


In [8]:


# Define paths
normalized_root = Path("../data/final/Selected_data_windowed_grouped_normalized")
downsampled_root = Path("../data/final/Selected_data_windowed_grouped_normalized_downsampled")

# Sampling rate
SAMPLE_RATE = 2000

# Original and target number of samples per window
original_samples = 10000  # 5 seconds at 2000 Hz
target_samples = 2000     # Downsample to 2000 samples per window (effective rate 2000 Hz / 5 = 400 Hz)

# Downsampling factor (using average pooling)
downsampling_factor = original_samples // target_samples  # 5

# Create the downsampled root directory (the message is printed whether or not it already existed)
downsampled_root.mkdir(exist_ok=True)
print(f"Directory '{downsampled_root}' created")

# Function to downsample a single time series
def downsample_time_series(vibration_data, original_length=10000, target_length=2000):
    """
    Downsample the vibration data along time using non-overlapping average pooling.

    The pooling kernel (and stride) is `original_length // target_length`,
    i.e. 5 for the defaults. The output has `n_samples // kernel` rows, which
    equals `target_length` when the input has `original_length` rows and
    `original_length` is a multiple of `target_length`.

    Args:
        vibration_data (np.array): Shape (n_samples, n_axes), e.g. (10000, 3).
        original_length (int): Nominal number of input samples (default 10000).
        target_length (int): Desired number of output samples (default 2000).

    Returns:
        np.array: float32 array of shape (n_samples // kernel, n_axes),
        e.g. (2000, 3) for the defaults.

    Raises:
        ValueError: If `target_length` is not in the range 1..original_length.
    """
    if target_length <= 0 or target_length > original_length:
        # Would otherwise produce a pooling kernel of size 0 (or a division by zero)
        raise ValueError(
            f"target_length must be between 1 and original_length ({original_length}), got {target_length}"
        )

    # AvgPool1d expects (batch, channels, length): transpose and add a batch axis
    data_tensor = torch.tensor(vibration_data.T, dtype=torch.float32).unsqueeze(0)  # Shape: (1, n_axes, n_samples)

    # Define average pooling over non-overlapping blocks
    kernel_size = original_length // target_length  # 5 for the defaults
    pool = nn.AvgPool1d(kernel_size=kernel_size, stride=kernel_size)

    # Apply pooling
    downsampled_tensor = pool(data_tensor)  # Shape: (1, n_axes, n_samples // kernel_size)

    # Drop the batch axis and go back to (time, axes) layout
    downsampled_data = downsampled_tensor.squeeze(0).T.numpy()

    return downsampled_data

# Function to process and save downsampled files
def downsample_and_save_files():
    """
    Downsample every .h5 file below `normalized_root` and write the result below `downsampled_root`.

    The 'bad' and 'good' sub-folders are processed in that order. Each output
    file keeps the source stem with "_downsampled" appended and stores the
    pooled data in a 'vibration_data' dataset. Paths and sample counts are
    read from the notebook-level variables `normalized_root`,
    `downsampled_root`, `original_samples` and `target_samples`.
    """
    for class_name in ("bad", "good"):
        source_dir = normalized_root / class_name
        target_dir = downsampled_root / class_name

        # The output folder is created even when the input folder is missing
        target_dir.mkdir(exist_ok=True)
        print(f"Directory '{target_dir}' created")

        if source_dir.exists():
            for source_path in source_dir.glob("*.h5"):
                print(f"Processing file: {source_path}")

                # Read the full window into memory
                with h5py.File(source_path, 'r') as source:
                    signal = source['vibration_data'][:]
                    print(f"Loaded time series data of shape {signal.shape}")

                # Average-pool along time
                pooled = downsample_time_series(signal, original_samples, target_samples)
                print(f"Downsampled time series data to shape {pooled.shape}")

                # Same stem as the source, marked as downsampled
                target_path = target_dir / f"{source_path.stem}_downsampled.h5"

                with h5py.File(target_path, 'w') as target:
                    target.create_dataset('vibration_data', data=pooled)
                    print(f"Saved downsampled file to {target_path}")
        else:
            print(f"Skipping missing folder: {source_dir}")

# Run the downsampling process
downsample_and_save_files()

# Verify the downsampled data: collect name, duration and shape of every written file
data_list_downsampled = []
for sample_type in ["bad", "good"]:
    folder_path = downsampled_root / sample_type
    if not folder_path.exists():
        print(f"Skipping missing folder: {folder_path}")
        continue

    for file_path in folder_path.glob("*.h5"):
        file_name = file_path.name
        with h5py.File(file_path, 'r') as file:
            vibration_data = file['vibration_data'][:]
            num_samples = vibration_data.shape[0]
            # Dividing by the reduced rate (SAMPLE_RATE / downsampling_factor) gives the duration in seconds
            duration = num_samples / (SAMPLE_RATE / downsampling_factor)  # Effective sampling rate after downsampling
            data_list_downsampled.append({
                "Sample Type": sample_type,
                "File Name": file_name,
                "Duration (s)": duration,
                "Shape": vibration_data.shape
            })

Directory '..\data\final\Selected_data_windowed_grouped_normalized_downsampled' created
Directory '..\data\final\Selected_data_windowed_grouped_normalized_downsampled\bad' created
Processing file: ..\data\final\Selected_data_windowed_grouped_normalized\bad\M01_Aug_2019_OP01_000_window_0.h5
Loaded time series data of shape (10000, 3)
Downsampled time series data to shape (2000, 3)
Saved downsampled file to ..\data\final\Selected_data_windowed_grouped_normalized_downsampled\bad\M01_Aug_2019_OP01_000_window_0_downsampled.h5
Processing file: ..\data\final\Selected_data_windowed_grouped_normalized\bad\M01_Aug_2019_OP01_000_window_1.h5
Loaded time series data of shape (10000, 3)
Downsampled time series data to shape (2000, 3)
Saved downsampled file to ..\data\final\Selected_data_windowed_grouped_normalized_downsampled\bad\M01_Aug_2019_OP01_000_window_1_downsampled.h5
Processing file: ..\data\final\Selected_data_windowed_grouped_normalized\bad\M01_Aug_2019_OP01_000_window_2.h5
Loaded time ser

In [9]:
# Convert to DataFrame and print (one row per downsampled file)
df_downsampled = pd.DataFrame(data_list_downsampled)
print("\nDownsampled Data Summary:")
print(df_downsampled)

# Save to CSV for reference
# Note: the relative file name means the CSV is written to the current working
# directory, not to ../data/data_info like the windowed summary.
df_downsampled.to_csv("downsampled_data_summary.csv", index=False)
print("\n✅ Downsampled data summary saved to downsampled_data_summary.csv")


Downsampled Data Summary:
    Sample Type                                      File Name  Duration (s)  \
0           bad  M01_Aug_2019_OP01_000_window_0_downsampled.h5           5.0   
1           bad  M01_Aug_2019_OP01_000_window_1_downsampled.h5           5.0   
2           bad  M01_Aug_2019_OP01_000_window_2_downsampled.h5           5.0   
3           bad  M01_Aug_2019_OP04_000_window_0_downsampled.h5           5.0   
4           bad  M01_Aug_2019_OP04_000_window_1_downsampled.h5           5.0   
..          ...                                            ...           ...   
682        good  M03_Feb_2021_OP07_003_window_2_downsampled.h5           5.0   
683        good  M03_Feb_2021_OP07_004_window_0_downsampled.h5           5.0   
684        good  M03_Feb_2021_OP07_004_window_1_downsampled.h5           5.0   
685        good  M03_Feb_2021_OP07_004_window_2_downsampled.h5           5.0   
686        good  M03_Feb_2021_OP07_004_window_3_downsampled.h5           5.0   

         Sha