In [10]:
import numpy as np
import scipy.io as sio

In [31]:
def find_first_occurrence(full_data, trimmed_first_row, exclude_cols=None, chunk_size=65536):
    """
    Find the row index in full_data that best matches the first row of trimmed_data,
    excluding specified columns.

    The matching is determined by the smallest mean squared error (MSE) computed
    over only the channels where both full_data and trimmed_first_row have valid (non-NaN) data,
    excluding the columns provided in exclude_cols. Rows that share no valid channel
    with trimmed_first_row are never selected. When several rows tie for the smallest
    error, the earliest row wins, and the scan stops as soon as an exact (zero-error)
    match has been found.

    The rows are scanned in vectorised chunks rather than one Python iteration per row,
    so memory use is bounded by chunk_size rows while avoiding per-row interpreter overhead.

    Parameters:
        full_data (np.ndarray): Array of shape (n1, channels)
        trimmed_first_row (np.ndarray): Array of shape (channels,)
        exclude_cols (list or np.ndarray): Column indices to exclude from comparison
        chunk_size (int): Number of rows of full_data evaluated per vectorised step
            (values below 1 are treated as 1). Does not affect the result.

    Returns:
        best_index (int): The index in full_data where the first row of trimmed_data best matches,
            or -1 if no row shares a valid channel with trimmed_first_row.
        best_error (float): The corresponding mean squared error (np.inf if there is no match).
    """
    full_data = np.asarray(full_data)
    trimmed_first_row = np.asarray(trimmed_first_row)

    # If there are columns to exclude, create a boolean mask of the columns to keep.
    if exclude_cols is not None:
        include_mask = np.ones(full_data.shape[1], dtype=bool)
        include_mask[exclude_cols] = False
    else:
        include_mask = slice(None)

    # The reference row and its NaN mask do not depend on the scanned row: compute them once.
    target = trimmed_first_row[include_mask]
    target_valid = ~np.isnan(target)

    best_error = np.inf
    best_index = -1

    # With no usable reference channel, no row can ever be compared.
    if not target_valid.any():
        return best_index, best_error

    chunk_size = max(1, int(chunk_size))
    n_rows = full_data.shape[0]

    for start in range(0, n_rows, chunk_size):
        block = full_data[start:start + chunk_size][:, include_mask]

        # A channel counts only if it is non-NaN in both the scanned row and the reference row.
        valid = ~np.isnan(block) & target_valid
        n_valid = valid.sum(axis=1)

        # Zero out unusable channels so they contribute nothing to the sum of squares.
        diff = block - target
        diff[~valid] = 0
        sq_sum = (diff ** 2).sum(axis=1)

        # Rows without any valid channel keep an infinite error and therefore never win.
        errors = np.full(block.shape[0], np.inf)
        comparable = n_valid > 0
        errors[comparable] = sq_sum[comparable] / n_valid[comparable]
        # A NaN error (e.g. inf - inf) must not be selected; argmin would otherwise pick it.
        errors[np.isnan(errors)] = np.inf

        # argmin returns the first minimum in the chunk; the strict '<' keeps the earliest
        # chunk on ties, so the overall result is the first best-matching row.
        local_best = int(np.argmin(errors))
        if errors[local_best] < best_error:
            best_error = errors[local_best]
            best_index = start + local_best

        # An exact match cannot be improved upon.
        if best_error == 0:
            break

    return best_index, best_error

In [32]:
# ---------------------------
# 1. Load the data
# ---------------------------
og_path = 'Group 1/synced_EEG_data_1.mat'
trimmed_path = 'Group 1/sync_to_actual_task/task_cutoff_EEG_data_1.mat'

full_mat = sio.loadmat(og_path)
trimmed_mat = sio.loadmat(trimmed_path)

# Each .mat file stores its array under a variable named after the file's stem
# (not a generic 'data' key), so the keys below must match the file names above.
full_data = full_mat['synced_EEG_data_1']             # shape: (n1, 32)
trimmed_data = trimmed_mat['task_cutoff_EEG_data_1']  # shape: (n2, 32)

# Check the shapes
print("Full data shape:", full_data.shape)
print("Trimmed data shape:", trimmed_data.shape)

# The row matching compares the two arrays column by column, so fail fast
# here if the recordings do not have the same number of channels.
assert full_data.shape[1] == trimmed_data.shape[1], "full and trimmed data must have the same number of columns"

Full data shape: (978765, 32)
Trimmed data shape: (893558, 32)


In [33]:
def find_alignment(exclude_columns=None):
    """
    Locate the first row of the notebook-level `trimmed_data` inside `full_data` and print the result.

    Parameters:
        exclude_columns (list or np.ndarray): Column indices left out of the comparison;
            None compares every column.

    Returns:
        None. The best-matching row index and its mean squared error are printed.
    """
    # Search full_data for the row closest to the very first trimmed row.
    match_row, match_mse = find_first_occurrence(
        full_data,
        trimmed_data[0, :],
        exclude_cols=exclude_columns,
    )

    print("\nThe first row of the trimmed data best appears in the full data at row index:", match_row)
    print("Corresponding mean squared error:", match_mse)

### Best MSE with Full Data

In [34]:
# Baseline: no columns are excluded, so every column takes part in the comparison.
find_alignment()


The first row of the trimmed data best appears in the full data at row index: 40508
Corresponding mean squared error: 116.5336540043354


### Best MSE without HCILab 2

In [None]:
# Exclude HCILab 2 (columns 8 to 15 inclusive); all other columns are compared.
exclude_columns = list(range(8, 16))
find_alignment(exclude_columns)


The first row of the trimmed data best appears in the full data at row index: 40519
Corresponding mean squared error: 129.95735013484955


### Best MSE with only HCILab 1

Index: 40561

In [35]:
# Keep only HCILab 1 (columns 0 to 7): exclude columns 8 to 31 inclusive.
exclude_columns = list(range(8, 32))
find_alignment(exclude_columns)


The first row of the trimmed data best appears in the full data at row index: 40561
Corresponding mean squared error: 0.0


### Best MSE with only HCILab 2

Index: 40512

In [36]:
# Keep only HCILab 2 (columns 8 to 15): exclude columns 0 to 7 and 16 to 31 inclusive.
exclude_columns = list(range(0, 8)) + list(range(16, 32))
find_alignment(exclude_columns)


The first row of the trimmed data best appears in the full data at row index: 40512
Corresponding mean squared error: 0.0


### Best MSE with only CSL_Laptop

Index: 40530

In [37]:
# Keep only CSL_Laptop (columns 16 to 23): exclude columns 0 to 15 and 24 to 31 inclusive.
exclude_columns = list(range(0, 16)) + list(range(24, 32))
find_alignment(exclude_columns)


The first row of the trimmed data best appears in the full data at row index: 40530
Corresponding mean squared error: 0.0


### Best MSE with only CSL_LabPC

Index: 40452

In [38]:
# Keep only CSL_LabPC (columns 24 to 31): exclude columns 0 to 23 inclusive.
exclude_columns = list(range(0, 24))
find_alignment(exclude_columns)


The first row of the trimmed data best appears in the full data at row index: 40452
Corresponding mean squared error: 0.0


### Analysis of effects of different sampling rates

Stream 1 with Hz = 1 (1 timestamp / sec)

Timestamps: 1, 2, 3, 4, 5, 6

UNIX Time:  1, 2, 3, 4, 5, 6

Stream 2 with Hz = 2 (2 timestamps / sec)

Timestamps: 1  , 2, 3  , 4, 5  , 6

UNIX Time:  0.5, 1, 1.5, 2, 2.5, 3

Suppose cutoff of one stream is Unix Time = 1, then:

Stream 1: 1, 2, 3, 4, 5, 6

Stream 2: 2, 3, 4, 5, 6

Suppose the cutoff of another stream is Unix Time = 2, then:

Stream 1: 2, 3, 4, 5, 6

Stream 2: 4, 5, 6

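We see that the relative alignment of streams 1 and 2 depends on the cutoff Unix timestamp: with a cutoff at Unix Time = 1, stream 1 loses no samples and stream 2 loses 1, whereas with a cutoff at Unix Time = 2, stream 1 loses 1 sample and stream 2 loses 3. Because streams with different sampling rates lose different numbers of samples, the row offset between them changes with the cutoff.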