In [22]:
import os
import json
import re
from io import BytesIO
from pathlib import Path

import numpy as np
import pandas as pd
from google.cloud import storage


In [23]:
def load_environment(env_path: str = ".env") -> tuple[str, str]:
    """Load GCP credentials and bucket name from a .env file (supports running from notebooks/)."""
    env_file = Path(env_path)

    # If running from notebooks/, fall back to parent .env
    if not env_file.is_file() and env_path == ".env":
        parent_env = Path.cwd().parent / ".env"
        if parent_env.is_file():
            env_file = parent_env

    if not env_file.is_file():
        raise FileNotFoundError(f".env file not found at {env_file}")

    for line in env_file.read_text().splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue
        key, value = stripped.split("=", 1)
        key = key.strip()
        value = value.strip()
        if key in {"GOOGLE_APPLICATION_CREDENTIALS", "GCP_BUCKET_NAME"}:
            os.environ[key] = value

    credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    bucket_name = os.environ.get("GCP_BUCKET_NAME")

    if not credentials_path or not bucket_name:
        raise EnvironmentError(
            "GOOGLE_APPLICATION_CREDENTIALS and GCP_BUCKET_NAME must be set in the .env file."
        )

    credentials_file = Path(credentials_path)
    if not credentials_file.is_file():
        raise FileNotFoundError(f"Credentials file not found at {credentials_file}")

    print(f"Loaded credentials from: {credentials_file}")
    print(f"Using bucket: {bucket_name}")
    return str(credentials_file), bucket_name


In [24]:
def load_emg_dataframe_from_csv(env_path: str = ".env") -> pd.DataFrame:
    """Download the EMG CSV from GCS and return the raw DataFrame."""
    _, bucket_name = load_environment(env_path=env_path)

    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob_path = "EMG-nature/Clean_df/emg_trial_level_df.csv"

    print(f"Accessing blob: {blob_path}")
    blob = bucket.blob(blob_path)

    if not blob.exists(client=client):
        raise FileNotFoundError(f"Blob {blob_path} not found in bucket {bucket_name}")

    data_bytes = blob.download_as_bytes()
    print(f"Downloaded {len(data_bytes)} bytes from GCS")

    df = pd.read_csv(BytesIO(data_bytes))
    print(f"Loaded DataFrame with shape: {df.shape}")
    return df


In [27]:
def parse_signal_cell(cell: str | np.ndarray | list | tuple) -> np.ndarray:
    """Parse a corrupted/loosely formatted signal into a (16, N) float32 array."""
    # If already array-like, trust and reshape as needed.
    if isinstance(cell, (np.ndarray, list, tuple)):
        arr = np.asarray(cell, dtype=np.float32)
        if arr.shape == (16, 10000):
            return arr
        if arr.shape == (10000, 16):
            return arr.T
        if arr.ndim == 1 and arr.size == 160000:
            return arr.reshape(16, 10000)
        if arr.ndim == 2 and arr.shape[0] == 16:
            return arr

    raw = str(cell)
    tokens = re.findall(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", raw)
    if not tokens:
        raise ValueError(f"No numeric tokens found in signal cell: {raw[:120]}")

    numbers = np.asarray([float(t) for t in tokens], dtype=np.float32)
    if numbers.size < 16:
        raise ValueError(f"Too few numeric tokens ({numbers.size}); sample: {raw[:120]}")

    usable = (numbers.size // 16) * 16
    numbers = numbers[:usable]
    channel_len = usable // 16
    if channel_len == 0:
        raise ValueError(f"Not enough data to form 16 channels; tokens={numbers.size}")

    arr = numbers.reshape(16, channel_len)
    return arr


def fix_length(arr: np.ndarray, target_len: int = 10000) -> np.ndarray:
    """Trim or pad signals to exactly (16, target_len)."""
    if arr.ndim != 2 or arr.shape[0] != 16:
        raise ValueError(f"Signal must have shape (16, N); got {arr.shape}")

    current_len = arr.shape[1]
    if current_len == target_len:
        return arr
    if current_len < target_len:
        pad_width = target_len - current_len
        return np.pad(arr, ((0, 0), (0, pad_width)), mode="constant")
    return arr[:, :target_len]


In [28]:
emg_df = load_emg_dataframe_from_csv()

emg_df["signal"] = emg_df["signal"].apply(parse_signal_cell)
emg_df["signal_fixed"] = emg_df["signal"].apply(fix_length)

print(emg_df["signal_fixed"].apply(lambda x: x.shape).value_counts())
print(emg_df.head())
print(emg_df.shape)

emg_df


Loaded credentials from: /Users/rayanhasan/code/hildieleyser/Inkling/keys/inkling-479911-fd24b9bdf83e.json
Using bucket: inkling-ssvep-emg
Accessing blob: EMG-nature/Clean_df/emg_trial_level_df.csv
Downloaded 3078852 bytes from GCS
Loaded DataFrame with shape: (4800, 8)
signal_fixed
(16, 10000)    4800
Name: count, dtype: int64
   Unnamed: 0  participant  day  block  trial_id  position  grasp  \
0           0            1    1      1         1         2      3   
1           1            1    1      1         2         2      3   
2           2            1    1      1         3         2      3   
3           3            1    1      1         4         2      3   
4           4            1    1      1         5         2      3   

                                              signal  \
0  [[3.763498e-05, 1.9842508e-05], [9.071698e-06,...   
1  [[1.0537988e-05, 1.153949e-05], [1.18090165e-0...   
2  [[1.6977565e-05, 1.9937088e-05], [2.1830994e-0...   
3  [[3.6807487e-06, 3.2587977e-

Unnamed: 0.1,Unnamed: 0,participant,day,block,trial_id,position,grasp,signal,signal_fixed
0,0,1,1,1,1,2,3,"[[3.763498e-05, 1.9842508e-05], [9.071698e-06,...","[[3.763498e-05, 1.9842508e-05, 0.0, 0.0, 0.0, ..."
1,1,1,1,1,2,2,3,"[[1.0537988e-05, 1.153949e-05], [1.18090165e-0...","[[1.0537988e-05, 1.153949e-05, 0.0, 0.0, 0.0, ..."
2,2,1,1,1,3,2,3,"[[1.6977565e-05, 1.9937088e-05], [2.1830994e-0...","[[1.6977565e-05, 1.9937088e-05, 0.0, 0.0, 0.0,..."
3,3,1,1,1,4,2,3,"[[3.6807487e-06, 3.2587977e-06], [2.339907e-06...","[[3.6807487e-06, 3.2587977e-06, 0.0, 0.0, 0.0,..."
4,4,1,1,1,5,2,3,"[[1.5383765e-05, 1.8471881e-05], [1.6300444e-0...","[[1.5383765e-05, 1.8471881e-05, 0.0, 0.0, 0.0,..."
...,...,...,...,...,...,...,...,...,...
4795,4795,8,2,2,146,9,2,"[[1.59362e-05, 1.7032327e-05], [1.913987e-05, ...","[[1.59362e-05, 1.7032327e-05, 0.0, 0.0, 0.0, 0..."
4796,4796,8,2,2,147,9,2,"[[1.3964933e-06, 1.4381011e-06], [5.3344643e-0...","[[1.3964933e-06, 1.4381011e-06, 0.0, 0.0, 0.0,..."
4797,4797,8,2,2,148,9,2,"[[9.107072e-06, 1.3961595e-05], [1.9139401e-05...","[[9.107072e-06, 1.3961595e-05, 0.0, 0.0, 0.0, ..."
4798,4798,8,2,2,149,9,2,"[[4.061428e-05, 3.564699e-05], [3.1583953e-05,...","[[4.061428e-05, 3.564699e-05, 0.0, 0.0, 0.0, 0..."


In [30]:
emg_df = emg_df.drop(columns = "Unnamed: 0")

In [31]:
emg_df

Unnamed: 0,participant,day,block,trial_id,position,grasp,signal,signal_fixed
0,1,1,1,1,2,3,"[[3.763498e-05, 1.9842508e-05], [9.071698e-06,...","[[3.763498e-05, 1.9842508e-05, 0.0, 0.0, 0.0, ..."
1,1,1,1,2,2,3,"[[1.0537988e-05, 1.153949e-05], [1.18090165e-0...","[[1.0537988e-05, 1.153949e-05, 0.0, 0.0, 0.0, ..."
2,1,1,1,3,2,3,"[[1.6977565e-05, 1.9937088e-05], [2.1830994e-0...","[[1.6977565e-05, 1.9937088e-05, 0.0, 0.0, 0.0,..."
3,1,1,1,4,2,3,"[[3.6807487e-06, 3.2587977e-06], [2.339907e-06...","[[3.6807487e-06, 3.2587977e-06, 0.0, 0.0, 0.0,..."
4,1,1,1,5,2,3,"[[1.5383765e-05, 1.8471881e-05], [1.6300444e-0...","[[1.5383765e-05, 1.8471881e-05, 0.0, 0.0, 0.0,..."
...,...,...,...,...,...,...,...,...
4795,8,2,2,146,9,2,"[[1.59362e-05, 1.7032327e-05], [1.913987e-05, ...","[[1.59362e-05, 1.7032327e-05, 0.0, 0.0, 0.0, 0..."
4796,8,2,2,147,9,2,"[[1.3964933e-06, 1.4381011e-06], [5.3344643e-0...","[[1.3964933e-06, 1.4381011e-06, 0.0, 0.0, 0.0,..."
4797,8,2,2,148,9,2,"[[9.107072e-06, 1.3961595e-05], [1.9139401e-05...","[[9.107072e-06, 1.3961595e-05, 0.0, 0.0, 0.0, ..."
4798,8,2,2,149,9,2,"[[4.061428e-05, 3.564699e-05], [3.1583953e-05,...","[[4.061428e-05, 3.564699e-05, 0.0, 0.0, 0.0, 0..."


In [32]:
emg_df['signal_fixed'].apply(lambda x: x.shape).value_counts()

signal_fixed
(16, 10000)    4800
Name: count, dtype: int64

In [33]:
import keras
from keras import Sequential
from keras.layers import (
    BatchNormalization,
    Conv2D,
    Dense,
    Dropout,
    GlobalAveragePooling2D,
    MaxPooling2D,
)

In [34]:
input_shape = (10000, 16, 1)

model = Sequential([
    Conv2D(32, (25, 3), padding="same", activation="relu", input_shape=input_shape),
    BatchNormalization(),
    MaxPooling2D(pool_size=(4, 1)),
    Conv2D(64, (15, 3), padding="same", activation="relu"),
    BatchNormalization(),
    MaxPooling2D(pool_size=(4, 1)),
    Conv2D(128, (9, 3), padding="same", activation="relu"),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 1)),
    GlobalAveragePooling2D(),
    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(6, activation="softmax"),
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
