 Imports

In [235]:
from statistics import LinearRegression
import json
import numpy as np
import matplotlib.pyplot as plt
from math import ceil
from scipy import stats

import pandas as pd
import sklearn
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model

import os
from PIL import Image, ImageDraw, ImageFont
from ultralytics import YOLO
import random


 analysis functions

In [236]:
# the simulated signals were presented in randomized order to each human labeler. The ordering was recorded as a list of indices.
# the selections the labelers made were recorded in the order they were made.
# This function finds the index at which the desired signal index appears in the ordering of signal indices.
def reverse_order_search(j, order):
    for i in range(len(order)):
        if j == order[i]:
            return i


# the linear coefficients come from a linear regression model searching for the best fit line between
# signal-noise ratio and _ratio parameter used to generate signals
def ratio_to_snr_converter(_ratio):
    snr = -19.65 * (_ratio) + 9.668
    return snr


# this decodes ONE entry in the list of params used to generate one signal.
# the function returns a dictionary with named parameters for visual inspection
# this function is only used when investigating the signal with the worst f1 score.
def decode_params(param_list) -> dict | None:
    if len(param_list) != 5:
        return None
    retDict = {}
    retDict["freq"] = 2 * param_list[0]
    retDict["n_cycles"] = param_list[1]
    retDict["rise-decay asymmetry"] = param_list[2]
    retDict["aperiodic exponent"] = param_list[3]
    retDict["signal-noise ratio"] = ratio_to_snr_converter(param_list[4])
    return retDict


# this decodes ONE entry in the list of params used to generate one signal.
# the function returns a numpy array with the same parameters as decode_params.
# this function is used when preparing data for regression analysis.
# It differs from decode_params in that it returns a numpy array instead of a dictionary.
def decode_params_np(param_list) -> dict | None:
    if len(param_list) != 5:
        return None
    retArray = np.zeros(5)
    retArray[0] = 2 * param_list[0]
    retArray[1] = param_list[1]
    retArray[2] = param_list[2]
    retArray[3] = param_list[3]
    retArray[4] = ratio_to_snr_converter(param_list[4])
    return retArray


# this function takes a list of parameters and returns a dictionary with named parameters.
# the function is used when preparing data for regression analysis.
# this function calls decode_params to ensure frequency and snr are accurate.
def param_list_to_training_data(param_list):
    num_samples = len(param_list)
    num_features = 5
    retArray = np.zeros((num_samples, num_features))
    for i in range(len(param_list)):
        row = decode_params_np(param_list[i])
        retArray[i][0:num_features] = row[:num_features]
    return retArray


def create_signal_images(signal_data, output_directory):
    """
    Save signal data as cropped images with specific requirements.

    Parameters:
        signal_data (list or array-like): A list of signals, where each signal is an array of amplitude values.
        output_directory (str): Directory where the images will be saved.
    """
    dpi = 100
    figsize_width = 1000.0 / float(dpi)
    figsize_height = 1.0

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    else:
        print(f"Directory {output_directory} already exists. Skipping image creation.")
        return

    for i, signal in enumerate(signal_data):
        if i % 100 == 0:
            print(i)

        # Normalize signal only if its full y-axis isn't in range [-3, 3]
        signal_min, signal_max = np.min(signal), np.max(signal)
        if signal_min < -3 or signal_max > 3:
            signal = (signal - signal_min) / (
                signal_max - signal_min
            ) * 5.8 - 2.9  # Normalize to [-3, 3]

        filename = f"sig_{i}.png"
        filepath = os.path.join(output_directory, filename)

        # Create the plot
        fig = plt.figure(figsize=(figsize_width, figsize_height), dpi=dpi)
        plt.ylim(-3, 3)

        # Remove axes and internal padding
        plt.gca().set_axis_off()
        plt.subplots_adjust(left=0, right=1, top=1, bottom=0)

        # Plot the signal
        plt.plot(signal)
        plt.savefig(
            filepath,
            bbox_inches="tight",  # Crop tightly to the plot content
            pad_inches=0,  # Remove any padding
            transparent=False,  # Optional: Save with a transparent background
        )
        plt.close(fig)

        # Crop the image
        img = Image.open(filepath)
        box = (45, 0, 955, 100)  # Define the cropping box
        img = img.crop(box)
        img.save(filepath)

        print(f"Saved cropped signal image to: {filepath}")



 Import File Data

In [237]:

# Load data from results json exported from firebase
with open("./voyteklabstudy-default-rtdb-export.json") as f:
    results = json.load(f)


 Set constants

In [238]:


# this is the number of real recorded EEG signal we used in the study platform we hosted. The
# signals were arranged in (real signals, simulated signals) order. Thus, num_real_sigs is used
# as an array offset in this analysis.
num_real_sigs = 49

# List of names of human collaborators who labeled data
# length of which gives us number of labelings. Allows us to iterate through labelers
who = list(results["selections"].keys())
print(who)

"""
y_pred and y_true are variables for auc_roc
  we want to analyze humans as a whole.
  so y_pred is the interval the humans select (on average?)
  let's start with just one lab person.
"""
y_pred = np.zeros((num_real_sigs, len(results["sigs"]["sig_" + str(0)])))
y_true = np.zeros((num_real_sigs, len(results["sigs"]["sig_" + str(0)])))

# the classes are "non-bursting" and "bursting"
num_classes = 2


threshold = 0.7

onsets = np.zeros((num_real_sigs, len(who)))
offsets = np.zeros((num_real_sigs, len(who)))

# Here we want to find the average selection onset and offset. We can handle outliers and such later -- now we need to scaffold.
for curr_sig_idx in range(num_real_sigs):
    eeg_signal_profiled_in_this_loop = results["sigs"]["sig_" + str(curr_sig_idx)]
    no_labels = 0

    for person_index in range(len(who)):
        order = np.array(results["selections"][who[person_index]]["indices"])
        selections = np.array(results["selections"][who[person_index]]["selections"])
        reverse_search_sig_idx = reverse_order_search(curr_sig_idx, order)
        selections_indexed_by_labeler = selections[reverse_search_sig_idx]

        len_curr_sig = len(eeg_signal_profiled_in_this_loop)

        if selections_indexed_by_labeler[0] == -1 and selections_indexed_by_labeler[1] == -1:
            print(f"nosel made by {who[person_index]}: {selections_indexed_by_labeler}")
            no_labels += 1
        else:
            onsets[curr_sig_idx][person_index] = selections_indexed_by_labeler[0]
            offsets[curr_sig_idx][person_index] = selections_indexed_by_labeler[1]
            print(selections_indexed_by_labeler[0], selections_indexed_by_labeler[1])

    for person_index in range(len(who)):
        print(f"onset: {onsets[curr_sig_idx][person_index]}, offset: {offsets[curr_sig_idx][person_index]}")


onsets= onsets * (910./1000.)
offsets = offsets * (910./1000.)
print(onsets)
print(offsets)

# Here we want to generate an image dataset from the signals.

test_dir = "signal_images"
collection_real_sigs = [results["sigs"]["sig_" + str(i)] for i in range(num_real_sigs)]
create_signal_images(collection_real_sigs, test_dir)

# now we want to predict the onset and offset of the signals with yolo.

output_collage = "collage_with_boxes_and_borders_2.png"

# Load the model
model = YOLO(
    "/Users/kenton/HOME/coding/python/publish_the_paper/runs/detect/train50/weights/best.pt"
)

# Get all image files in the directory
all_images = [
    os.path.join(test_dir, f)
    for f in os.listdir(test_dir)
    if f.lower().endswith((".png", ".jpg", ".jpeg"))
]

# Optional: Load a font for better text rendering
try:
    font = ImageFont.truetype(
        "arial.ttf", size=16
    )  # Use a font installed on your system
except IOError:
    font = ImageFont.load_default()


['Andrew Bender@1714089263343', 'Bradley Voytek Apr22 2024@1713819121072', 'Dillan@1713909205994', 'Eena Kosik@1713821677039', 'MJ@1714513556139', 'Quirine@1714514427397', 'Ryan Hammonds@1713819289745', 'Sydney Smith@1714416232441', 'rgao@1715689500559']
593 922
590 973
602 950
588 976
609 921
610 988
594 920
598 939
596 943
onset: 593.0, offset: 922.0
onset: 590.0, offset: 973.0
onset: 602.0, offset: 950.0
onset: 588.0, offset: 976.0
onset: 609.0, offset: 921.0
onset: 610.0, offset: 988.0
onset: 594.0, offset: 920.0
onset: 598.0, offset: 939.0
onset: 596.0, offset: 943.0
420 896
250 992
395 933
63 943
420 902
27 999
419 931
205 938
311 991
onset: 420.0, offset: 896.0
onset: 250.0, offset: 992.0
onset: 395.0, offset: 933.0
onset: 63.0, offset: 943.0
onset: 420.0, offset: 902.0
onset: 27.0, offset: 999.0
onset: 419.0, offset: 931.0
onset: 205.0, offset: 938.0
onset: 311.0, offset: 991.0
561 961
510 996
557 979
545 963
575 943
560 999
554 965
518 982
555 996
onset: 561.0, offset: 961.0
o

In [239]:
#visualize responses
# for i in range(len(onsets)):
#     plt.figure()
#     plt.boxplot(onsets[i])

# plt.show()

# process responses
center_onsets = np.zeros(num_real_sigs)
center_offsets = np.zeros(num_real_sigs)
valid_index_array = np.full((num_real_sigs,len(who)),np.nan)
for i in range(num_real_sigs):
    spot_index = 0
    # std_on = np.std(onsets[i])
    # mean_on = np.mean(onsets[i])
    # lower_on = mean_on -  std_on
    # upper_on = mean_on +  std_on

    # std_off = np.std(offsets[i])
    # mean_off = np.mean(offsets[i])
    # lower_off = mean_off - std_off
    # upper_off = mean_off + std_off

    ## latest version
    # lower_on = np.percentile(onsets[i], 25)
    # upper_on = np.percentile(onsets[i], 75)
    # lower_off = np.percentile(offsets[i], 25)
    # upper_off = np.percentile(offsets[i], 75)

    # for person_index in range(len(who)):
    #     if onsets[i][person_index] >= lower_on and onsets[i][person_index] <= upper_on and offsets[i][person_index] >= lower_off and offsets[i][person_index] <= upper_off:
    #         valid_index_array[i][spot_index] = person_index
    #         spot_index += 1

    # print(f"valid indices for signal {i}: {valid_index_array}")

    ## newest version
    iqr_on = stats.iqr(onsets[i])
    iqr_off = stats.iqr(offsets[i])
    lower_on = np.percentile(onsets[i], 25) - iqr_on
    upper_on = np.percentile(onsets[i], 75) + iqr_on
    lower_off = np.percentile(offsets[i], 25) - iqr_off
    upper_off = np.percentile(offsets[i], 75) + iqr_off

    for person_index in range(len(who)):
        if onsets[i][person_index] >= lower_on and onsets[i][person_index] <= upper_on and offsets[i][person_index] >= lower_off and offsets[i][person_index] <= upper_off:
            valid_index_array[i][spot_index] = person_index
            spot_index += 1

    print(f"valid indices for signal {i}: {valid_index_array}")

    center_onsets[i] = np.mean(onsets[i][~np.isnan(valid_index_array[i])])
    center_offsets[i] = np.mean(offsets[i][~np.isnan(valid_index_array[i])])
    print(f"center onset {onsets[i][~np.isnan(valid_index_array[i])]}: {center_onsets[i]}\tspread: {np.std(onsets[i][~np.isnan(valid_index_array[i])])}")
    print(f"center offset {offsets[i][~np.isnan(valid_index_array[i])]}: {center_offsets[i]}\tspread: {np.std(offsets[i][~np.isnan(valid_index_array[i])])}")

print(center_onsets)
print(center_offsets)

    

        

    




valid indices for signal 0: [[          0           1           2           3           4           5           6           7           8]
 [        nan         nan         nan         nan         nan         nan         nan         nan         nan]
 [        nan         nan         nan         nan         nan         nan         nan         nan         nan]
 [        nan         nan         nan         nan         nan         nan         nan         nan         nan]
 [        nan         nan         nan         nan         nan         nan         nan         nan         nan]
 [        nan         nan         nan         nan         nan         nan         nan         nan         nan]
 [        nan         nan         nan         nan         nan         nan         nan         nan         nan]
 [        nan         nan         nan         nan         nan         nan         nan         nan         nan]
 [        nan         nan         nan         nan         nan         nan         na

In [240]:

# Annotate each selected image and record the predicted onsets and offsets
pred_onsets = np.zeros(num_real_sigs)
pred_offsets = np.zeros(num_real_sigs)
annotated_images = []
for i in range(len(all_images)):
    image_path = all_images[i]
    # Predict pred_results for the image
    pred_results = model.predict(source=image_path, conf=0.001)

    # Load the image using PIL
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)

    # Process model predictions
    for r in pred_results:
        found = False

        draw.rectangle([center_onsets[i], 0, center_offsets[i], 100], outline="blue", width=3)
        for box_index in range(len(r.boxes.data)):
            box=r.boxes.data[box_index]
            # Extract bounding box and class information
            x1, y1, x2, y2, confidence, class_id = box.tolist()
            class_name = model.names[
                int(class_id)
            ]  # Get class name using model's class names
            if class_id == 1:
                print(f"box bounds: {x1}, {x2}")
                pred_onsets[i] = x1
                pred_offsets[i] = x2
                found = True
            elif box_index == len(r.boxes.data) - 1:
                draw.rectangle([center_onsets[i], 0, center_offsets[i], 100], outline="blue", width=3)
                continue
            else:
                continue

            # Draw the bounding box
            draw.rectangle([x1, y1, x2, y2], outline="red", width=3)

            # Create a label
            label = f"{class_name} ({confidence:.2f})"

            # Draw label inside the bounding box
            text_bbox = draw.textbbox((x1, y1), label, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]

            # Position text inside the bounding box, adjusted to fit
            label_x = max(x1, 0) + 2
            label_y = max(y1, 0) + 2

            # Draw label background and text
            draw.rectangle(
                [label_x, label_y, label_x + text_width, label_y + text_height],
                fill="red",
            )
            draw.text((label_x, label_y), label, fill="white", font=font)
        if not found:
            pred_onsets[i] = np.nan
            pred_offsets[i] = np.nan

    # Add a black border around the image
    border_size = 5
    bordered_image = Image.new(
        "RGB",
        (image.width + 2 * border_size, image.height + 2 * border_size),
        color="black",
    )
    bordered_image.paste(image, (border_size, border_size))

    # Ensure the image is resized to 910x100
    resized_image = bordered_image.resize((910, 100))  # Natural resolution
    # resized_image.show(f"Image {i}")
    annotated_images.append(resized_image)




image 1/1 /Users/kenton/HOME/coding/python/publish_the_paper/signal_images/sig_16.png: 64x416 7 non-bursts, 8 bursts, 7.9ms
Speed: 0.2ms preprocess, 7.9ms inference, 0.4ms postprocess per image at shape (1, 3, 64, 416)
box bounds: 69.64598846435547, 351.0574951171875
box bounds: 204.99314880371094, 358.37762451171875
box bounds: 57.19611740112305, 218.66773986816406
box bounds: 69.27925872802734, 259.1919860839844
box bounds: 255.18540954589844, 368.5364074707031
box bounds: 44.93610763549805, 157.93101501464844
box bounds: 246.87890625, 436.1355285644531
box bounds: 0.9195303916931152, 255.14892578125

image 1/1 /Users/kenton/HOME/coding/python/publish_the_paper/signal_images/sig_17.png: 64x416 17 non-bursts, 10 bursts, 7.5ms
Speed: 0.2ms preprocess, 7.5ms inference, 0.3ms postprocess per image at shape (1, 3, 64, 416)
box bounds: 412.0537414550781, 709.7371826171875
box bounds: 400.03131103515625, 573.0982055664062
box bounds: 442.430419921875, 789.81298828125
box bounds: 657.603027

In [None]:
# Determine collage dimensions
collage_width = 910  # Each image's width
collage_images_per_row = 3  # Number of images per row
collage_rows = ceil(len(annotated_images) / collage_images_per_row)
collage_height = collage_rows * 100  # 100 pixels per image height

# Create the blank collage canvas
collage = Image.new(
    "RGB", (collage_width * collage_images_per_row, collage_height), color="white"
)

# Paste each image into the collage
for i, annotated_image in enumerate(annotated_images):
    row = i // collage_images_per_row
    col = i % collage_images_per_row
    x_offset = col * 910
    y_offset = row * 100
    collage.paste(annotated_image, (x_offset, y_offset))

# Save the collage
collage.save(output_collage)
print(f"Collage saved to {output_collage}")
# collage.show()

print("onsets", onsets)
print("predicted onsets", onsets)
print("offsets", offsets)
print("predicted offsets", offsets)

valid_indices = [None]*num_real_sigs
for i in range(num_real_sigs):
    real_sel_indices = np.where(~np.isnan(valid_index_array[i]))
    prediction_indices = list(map(lambda x: x[0], filter(lambda x: x[1] != np.nan, enumerate(pred_onsets))))
    valid_indices[i] = np.intersect1d(real_sel_indices, prediction_indices)
np.intersect1d

print(valid_index_array)
diff_onsets = center_onsets[valid_indices[i]] - pred_onsets[valid_indices[i]]
diff_offsets = center_offsets[valid_indices[i]] - pred_offsets[valid_indices[i]]
# print("diff onsets", diff_onsets)

avg_diff_onsets = np.mean(diff_onsets)
avg_diff_offsets = np.mean(diff_offsets)

print("average onset error, not counting missed onsets", avg_diff_onsets)
print("average offset error, not counting missed onsets", avg_diff_offsets)

# print((~np.isnan(pred_onsets)))
find_rate = np.count_nonzero(~np.isnan(pred_onsets)) / num_real_sigs
print("find rate", find_rate)

find_rate_relative_to_labels = np.count_nonzero(~np.isnan(pred_onsets)) / np.count_nonzero(~np.isnan(center_onsets))
print("find rate relative to labels", find_rate_relative_to_labels)



# for i in range(len(onsets)):
#     print(f"real vs predicted onset for signal {center_onsets[i]} vs {pred_onsets[i]}")
#     print(f"real vs predicted offset for signal {center_offsets[i]} vs {pred_offsets[i]}")
#     print(f"diff between onsets for signal {i}: {center_onsets[i] - pred_onsets[i]}")
#     print(f"diff between offsets for signal {i}: {center_offsets[i] - pred_offsets[i]}")
#     print() # newline

# print("average onset error, not counting missed onsets", np.mean(onsets[onsets > 0] - onsets[onsets > 0]))


Collage saved to collage_with_boxes_and_borders_2.png
onsets [[     539.63       536.9      547.82      535.08      554.19       555.1      540.54      544.18      542.36]
 [      382.2       227.5      359.45       57.33       382.2       24.57      381.29      186.55      283.01]
 [     510.51       464.1      506.87      495.95      523.25       509.6      504.14      471.38      505.05]
 [      22.75           0           0        1.82      121.94           0           0        7.28        8.19]
 [     497.77      493.22      485.94      325.78       536.9      545.09      494.13      493.22      506.87]
 [      98.28       87.36       89.18      102.83      105.56      107.38       99.19       94.64      101.92]
 [     412.23      333.97      338.52       336.7      421.33      401.31      350.35      339.43      357.63]
 [     508.69      332.15      487.76      331.24      522.34      466.83      506.87      447.72      485.94]
 [     586.04      401.31      441.35      446.81  