In [2]:
import numpy as np
import pandas as pd

import os
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Folder that holds the per-test / per-channel result CSVs.
# Alternative location (currently disabled): '../data/results/'
results_folder = '../data/undersampled_4khz_results'

# Channel numbers available for each test number.
tests_channels = {
    1: list(range(1, 9)),
    2: list(range(1, 5)),
    3: list(range(1, 5)),
}

In [6]:
def read_test_channel_data(test, channel):
    """
    Load the result CSV of one test/channel pair and tag it with the test number.

    The file is looked up as ``Test_{test}_Channel_{channel}.csv`` inside the
    module-level ``results_folder``. Valid combinations are taken from the
    module-level ``tests_channels`` mapping.

    Parameters:
    test (int): Test number; must be a key of ``tests_channels``.
    channel (int): Channel number; must be listed in ``tests_channels[test]``.

    Returns:
    pd.DataFrame: The CSV contents plus a 'test' column holding ``test``.

    Raises:
    ValueError: If the test/channel combination is unknown, or the file is empty.
    FileNotFoundError: If the CSV file does not exist.
    Exception: For any other failure while reading the file.
    """
    # Both module-level names are only read here, so no ``global`` statement is needed.
    if test not in tests_channels:
        raise ValueError(f"Invalid test number: {test}. Available tests: {list(tests_channels.keys())}")

    if channel not in tests_channels[test]:
        raise ValueError(f"Invalid channel number: {channel}. Available channels for test {test}: {tests_channels[test]}")

    # Construct the file name
    test_file_name = f'Test_{test}_Channel_{channel}.csv'
    file_path = os.path.join(results_folder, test_file_name)

    # Only the read itself is guarded; ``from e`` keeps the original error as the
    # explicit cause so the underlying traceback is not lost.
    try:
        df = pd.read_csv(file_path, index_col=False, header=0)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"File not found: {file_path}. Please check the file path and name.") from e
    except pd.errors.EmptyDataError as e:
        raise ValueError(f"The file {file_path} is empty.") from e
    except Exception as e:
        raise Exception(f"An error occurred while reading the file: {e}") from e

    # Add a new column 'test' with the test number
    df['test'] = test
    return df

In [8]:
def create_unique_id(df, test, channel):
    """
    Add an 'id' column of the form ``"{test}_{channel}_{time}"`` to ``df``.

    Parameters:
    df (pd.DataFrame): Frame with a 'time' column. It is modified in place.
    test: Test number placed first in every ID.
    channel: Channel number placed second in every ID.

    Returns:
    pd.DataFrame: The same ``df`` object, now carrying the 'id' column.

    Raises:
    ValueError: If ``df`` has no 'time' column.
    """
    if 'time' not in df.columns:
        raise ValueError("The DataFrame must contain a 'time' column to create unique IDs.")

    # Build the IDs straight from the 'time' column instead of a row-wise
    # ``df.apply(axis=1)``: that was slow, converted an integer 'time' to float
    # when the other columns were float (giving '1.0'), and failed on an
    # empty frame.
    df['id'] = [f"{test}_{channel}_{time_value}" for time_value in df['time']]
    return df


In [10]:
def coalesce_data():
    """
    Load every test/channel combination listed in ``tests_channels`` and stack
    the results into one DataFrame.

    A combination that fails to load is reported with ``print`` and skipped, so
    the result may cover only part of ``tests_channels``. If nothing could be
    loaded, an empty DataFrame is returned.
    """
    frames = []

    for test in tests_channels:
        for channel in tests_channels[test]:
            try:
                frame = create_unique_id(
                    read_test_channel_data(test, channel), test, channel
                )
            except Exception as e:
                print(f"Error processing Test {test}, Channel {channel}: {e}")
            else:
                frames.append(frame)

    # Nothing could be read: hand back an empty frame.
    if not frames:
        return pd.DataFrame()

    # Stack all per-channel frames with a fresh 0..n-1 index.
    return pd.concat(frames, ignore_index=True)

In [12]:
def parse_unique_id(unique_id):
    """
    Split a unique ID of the form ``"{test}_{channel}_{time}"`` into its parts.

    Parameters:
    unique_id (str): ID such as ``"1_2_2023-10-01 12:30:00"``.

    Returns:
    tuple: ``(test, channel, time)`` where ``test`` and ``channel`` are ints and
    ``time`` is a ``datetime`` parsed with ``'%Y-%m-%d %H:%M:%S'``.

    Raises:
    ValueError: If ``unique_id`` is not a string, has fewer than three
        underscore-separated parts, or any part cannot be converted.
    """
    try:
        # maxsplit=2 keeps everything after the second underscore as the time
        # component, so trailing text is rejected by strptime rather than dropped.
        test_str, channel_str, time_str = unique_id.split('_', 2)

        test = int(test_str)
        channel = int(channel_str)

        # Adjust format as needed, e.g. '%Y.%m.%d.%H.%M.%S'
        time = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
    except (AttributeError, IndexError, TypeError, ValueError) as e:
        # AttributeError/TypeError cover non-string input (e.g. None or NaN).
        raise ValueError(f"Invalid unique ID format: {unique_id}. Error: {e}") from e

    return test, channel, time

# Example usage:
# unique_id = "1_2_2023-10-01 12:30:00"
# test, channel, time = parse_unique_id(unique_id)
# print(test, channel, time)

In [14]:
def mark_failure(df, test_number, channel_number, from_time, to_time=None):
    """
    Update the 'class' value to 1 for rows in the dataframe that fall within the specified time range
    and match the given test_number and channel_number.

    Rows are matched on the ``"{test_number}_{channel_number}_"`` prefix of the
    'id' column (the layout written by ``create_unique_id``). Rows whose 'id'
    does not start with that prefix are left untouched. ``df`` is modified in
    place and nothing is returned.

    Parameters:
    df (pd.DataFrame): The dataframe containing the data; needs 'id' and 'time' columns.
    test_number (int): The test number to match.
    channel_number (int): The channel number to match.
    from_time (datetime): The start of the time range (inclusive).
    to_time (datetime, optional): The end of the time range (inclusive). If not specified,
        the maximum of the 'time' column is used.

    Raises:
    ValueError: If from_time is greater than to_time.
    KeyError: If the 'id' column is not found in the dataframe or if 'time' column is missing.
    """
    # Check the required columns once, up front, rather than inside a per-row loop.
    if not {'id', 'time'}.issubset(df.columns):
        raise KeyError("The 'id' column is not found in the dataframe or 'time' column is missing.")

    if to_time is None:
        to_time = df['time'].max()  # Set to_time to the maximum time if not provided

    if from_time > to_time:
        raise ValueError("from_time must be less than or equal to to_time.")

    # Rows inside the inclusive time window.
    in_window = (df['time'] >= from_time) & (df['time'] <= to_time)

    # Rows of the requested test/channel. The trailing underscore stops
    # test 1 from matching IDs of test 11, and so on.
    id_prefix = f"{test_number}_{channel_number}_"
    is_target = df['id'].astype(str).str.startswith(id_prefix)

    # One vectorised assignment replaces the row loop that parsed (and then
    # discarded) a datetime for every single row.
    df.loc[in_window & is_target, 'class'] = 1

In [16]:
def mark_pre_fault(df, test_number, channel_number, from_time, to_time=None):
    """
    Update the 'class' value to 2 for rows in the dataframe that fall within the specified time range
    and match the given test_number and channel_number.

    Rows are matched on the ``"{test_number}_{channel_number}_"`` prefix of the
    'id' column (the layout written by ``create_unique_id``). Rows whose 'id'
    does not start with that prefix are left untouched. ``df`` is modified in
    place and nothing is returned.

    Parameters:
    df (pd.DataFrame): The dataframe containing the data; needs 'id' and 'time' columns.
    test_number (int): The test number to match.
    channel_number (int): The channel number to match.
    from_time (datetime): The start of the time range (inclusive).
    to_time (datetime, optional): The end of the time range (inclusive). If not specified,
        the maximum of the 'time' column is used.

    Raises:
    ValueError: If from_time is greater than to_time.
    KeyError: If the 'id' column is not found in the dataframe or if 'time' column is missing.
    """
    # Check the required columns once, up front, rather than inside a per-row loop.
    if not {'id', 'time'}.issubset(df.columns):
        raise KeyError("The 'id' column is not found in the dataframe or 'time' column is missing.")

    if to_time is None:
        to_time = df['time'].max()  # Set to_time to the maximum time if not provided

    if from_time > to_time:
        raise ValueError("from_time must be less than or equal to to_time.")

    # Rows inside the inclusive time window.
    in_window = (df['time'] >= from_time) & (df['time'] <= to_time)

    # Rows of the requested test/channel. The trailing underscore stops
    # test 1 from matching IDs of test 11, and so on.
    id_prefix = f"{test_number}_{channel_number}_"
    is_target = df['id'].astype(str).str.startswith(id_prefix)

    # One vectorised assignment replaces the row loop that parsed (and then
    # discarded) a datetime for every single row.
    df.loc[in_window & is_target, 'class'] = 2

In [18]:
# Build the combined frame from every test/channel CSV listed in tests_channels.
# coalesce_data() prints and skips combinations that fail to load, so the
# result can be partial (or empty) without raising.
full_data_frame = coalesce_data()

In [None]:
# Rebind full_data_frame to the contents of 'full_data.csv' in results_folder.
# NOTE(review): this cell sits before the cell that writes 'full_data.csv', so
# on a fresh top-to-bottom run the file must already exist — confirm the
# intended order.
file_name_full_data = 'full_data.csv'
full_data_frame = pd.read_csv(
    os.path.join(results_folder, file_name_full_data), index_col=False, header=0
)


In [None]:
# Write the combined frame to 'full_data.csv' in results_folder, without the index column.
file_name_full_data = 'full_data.csv'
full_data_frame.to_csv(os.path.join(results_folder, file_name_full_data), index=False)

In [None]:
# Preview the first rows whose 'channel' value is 1.
# NOTE(review): no visible cell creates a 'channel' column (read_test_channel_data
# only adds 'test'), so this presumably relies on the CSVs providing it — confirm.
full_data_frame[full_data_frame['channel'] == 1].head()

In [20]:
# Convert 'time' to datetimes so it can be compared with the datetime bounds
# passed to mark_failure / mark_pre_fault.
full_data_frame['time'] = pd.to_datetime(full_data_frame['time'])

In [22]:
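# Setting the 'time' column as the index is intentionally disabled for now:
# mark_failure / mark_pre_fault read df['time'] as a regular column.
#full_data_frame.set_index('time', inplace=True)
###  DO NOT set_index for now ###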

In [24]:
# Start every row at class 0; mark_pre_fault later writes 2 and mark_failure writes 1.
full_data_frame['class'] = 0

In [26]:
# Labelling schedule, applied strictly in the order listed.
# Each entry is (test_number, channel_number, marking function, from_time string).
# No to_time is passed, so every window runs to the latest 'time' in the frame.
# Where a channel has both a pre-fault and a failure entry, the pre-fault entry
# comes first so the later failure window overwrites the tail of the pre-fault one.
label_schedule = [
    # Test 1 - Bearing 1 (Channels 1, 2)
    (1, 1, mark_pre_fault, '2003-11-25 00:00:00'),
    (1, 2, mark_pre_fault, '2003-11-25 00:00:00'),
    # Test 1 - Bearing 2 (Channels 3, 4)
    (1, 3, mark_pre_fault, '2003-11-25 00:00:00'),
    (1, 4, mark_pre_fault, '2003-11-25 00:00:00'),
    # Test 1 - Bearing 3 (Channels 5, 6)
    (1, 5, mark_pre_fault, '2003-11-22 00:00:00'),
    (1, 5, mark_failure, '2003-11-22 19:30:00'),
    (1, 6, mark_pre_fault, '2003-11-22 00:00:00'),
    (1, 6, mark_failure, '2003-11-22 19:30:00'),
    # Test 1 - Bearing 4 (Channels 7, 8)
    # NOTE(review): the original header comment for the channel 7 pre-fault cell
    # said 2003-11-20 23:40:00, but the executed value was 2003-11-19 00:00:00
    # (kept here) — confirm which is intended.
    (1, 7, mark_pre_fault, '2003-11-19 00:00:00'),
    (1, 7, mark_failure, '2003-11-20 23:40:00'),
    (1, 8, mark_pre_fault, '2003-11-19 00:00:00'),
    (1, 8, mark_failure, '2003-11-20 23:40:00'),
    # Test 2 - Bearing 1 (Channel 1)
    (2, 1, mark_pre_fault, '2004-02-17 18:00:00'),
    (2, 1, mark_failure, '2004-02-18 17:00:00'),
    # Test 2 - Bearing 2 (Channel 2)
    (2, 2, mark_pre_fault, '2004-02-18 12:00:00'),
    # Test 2 - Bearing 3 (Channel 3)
    (2, 3, mark_pre_fault, '2004-02-18 12:00:00'),
    # Test 2 - Bearing 4 (Channel 4)
    (2, 4, mark_pre_fault, '2004-02-18 16:00:00'),
    # Test 3 - Bearing 1 (Channel 1)
    (3, 1, mark_pre_fault, '2004-04-17 00:00:00'),
    # Test 3 - Bearing 2 (Channel 2)
    (3, 2, mark_pre_fault, '2004-04-17 00:00:00'),
    # Test 3 - Bearing 3 (Channel 3)
    # NOTE(review): the original header comment said 2004-04-16 23:22:55, but the
    # executed value was 2004-04-16 23:22:50 (kept here) — confirm which is intended.
    (3, 3, mark_pre_fault, '2004-04-16 23:22:50'),
    (3, 3, mark_failure, '2004-04-17 12:00:00'),
    # Test 3 - Bearing 4 (Channel 4)
    (3, 4, mark_pre_fault, '2004-04-17 00:00:00'),
]

for test_number, channel_number, mark, from_time_str in label_schedule:
    from_time = datetime.strptime(from_time_str, '%Y-%m-%d %H:%M:%S')
    mark(full_data_frame, test_number = test_number, channel_number = channel_number, from_time = from_time)

In [70]:
# Save the labelled frame (including the 'class' column) to results_folder, without the index column.
file_name_full_data = 'full_data_with_failure_and_all_pre-fault_marking.csv'
full_data_frame.to_csv(os.path.join(results_folder, file_name_full_data), index=False)

### Scratch pad area

In [None]:
#full_data_frame[full_data_frame['class'] == 1].head()