In [2]:
import os
import dask.dataframe as dd
import pandas as pd
import h5py
from dask import delayed

In [19]:
import os
import dask.dataframe as dd
import h5py
import pandas as pd
import re

# Define the base folder path
base_folder = 'Bosch_CNC_Machining_Data'

# Initialize an empty list to hold dataframes
df_list = []

# Define a mapping for machine and operation labels
machine_mapping = {'M01': 'Machine 1', 'M02': 'Machine 2', 'M03': 'Machine 3'}
operation_mapping = {f'OP{i:02d}': f'Operation {i}' for i in range(1, 15)}

# Regular expression to match the expected file name format
filename_pattern = re.compile(r"^M(\d{2})_(\w+)_(OP\d{2})_(\d+)\.h5$")

# Function to extract data from an h5 file and convert to DataFrame
def extract_data_from_h5(file_path, machine, operation, label):
    with h5py.File(file_path, 'r') as f:
        # Extract the dataset
        dataset = f[list(f.keys())[0]][:]
    
    # Convert to DataFrame
    df = pd.DataFrame(dataset, columns=['X-axis', 'Y-axis', 'Z-axis'])
    
    # Create a Time column representing row numbers
    df['Time'] = df.index
    
    # Extract metadata from file name
    file_name = os.path.basename(file_path)
    match = filename_pattern.match(file_name)
    
    if match:
        machine_no, timeframe, process_no, example_no = match.groups()
        # Split the timeframe into month and year
        month, year = timeframe.split('_')
        
        # Add metadata to the DataFrame
        df['Machine'] = machine
        df['TimeFrame_Month'] = month
        df['TimeFrame_Year'] = year
        df['Process'] = operation
        df['Example'] = example_no
        df['Label'] = label
    else:
        print(f"Filename does not match expected format: {file_name}")
        return None  # Return None if filename format is incorrect

    return df

# Traverse through the folder structure
for machine_folder in os.listdir(base_folder):
    machine_path = os.path.join(base_folder, machine_folder)
    if os.path.isdir(machine_path) and machine_folder in machine_mapping:
        machine = machine_mapping[machine_folder]
        
        for operation_folder in os.listdir(machine_path):
            operation_path = os.path.join(machine_path, operation_folder)
            if os.path.isdir(operation_path) and operation_folder in operation_mapping:
                operation = operation_mapping[operation_folder]
                
                for label_folder in ['good', 'bad']:
                    label_path = os.path.join(operation_path, label_folder)
                    if os.path.isdir(label_path):
                        label = 'good' if label_folder == 'good' else 'bad'
                        
                        # Process all .h5 files in the label folder
                        for file in os.listdir(label_path):
                            if file.endswith('.h5'):
                                file_path = os.path.join(label_path, file)
                                df = extract_data_from_h5(file_path, machine, operation, label)
                                if df is not None:  # Only append if df is valid
                                    df_list.append(df)

# Concatenate all dataframes into a single Dask DataFrame
dask_df = dd.from_pandas(pd.concat(df_list, ignore_index=True), npartitions=4)

# Save to CSV
dask_df.to_csv('extracted_data.csv', single_file=True, index=False)

print("Data extraction and conversion completed successfully!")

Data extraction and conversion completed successfully!


In [20]:
df = dd.read_csv('extracted_data.csv')

In [22]:
print(df.head())
print(df.tail())

   X-axis  Y-axis  Z-axis  Time    Machine TimeFrame_Month  TimeFrame_Year  \
0   -15.0    23.0 -1018.0     0  Machine 1             Aug            2019   
1    -7.0    15.0 -1016.0     1  Machine 1             Aug            2019   
2    -7.0    19.0 -1009.0     2  Machine 1             Aug            2019   
3   -15.0    27.0 -1016.0     3  Machine 1             Aug            2019   
4   -17.0    35.0 -1015.0     4  Machine 1             Aug            2019   

       Process  Example Label  
0  Operation 1        0  good  
1  Operation 1        0  good  
2  Operation 1        0  good  
3  Operation 1        0  good  
4  Operation 1        0  good  
        X-axis  Y-axis  Z-axis   Time    Machine TimeFrame_Month  \
956541    21.0    -3.0 -1022.0  63483  Machine 3             Feb   
956542    27.0    25.0 -1011.0  63484  Machine 3             Feb   
956543    21.0    -5.0 -1015.0  63485  Machine 3             Feb   
956544     9.0    17.0 -1034.0  63486  Machine 3             Feb   

In [None]:
print(df['Machine'].unique().compute())
print(df['Process'].unique().compute())
print(df['Example'].unique().compute())
print(df['Label'].unique().compute())