In [2]:
import os
import json
import re
from io import BytesIO
from pathlib import Path

import numpy as np
import pandas as pd
from google.cloud import storage
from keras.utils import to_categorical
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, Sequential
import tensorflow as tf
from tensorflow.keras.layers import (
    Conv2D,
    BatchNormalization,
    MaxPooling2D,
    GlobalAveragePooling2D,
    Dense,
    Dropout
)



In [3]:
def load_environment(env_path: str = ".env") -> tuple[str, str]:
    """Load GCP credentials and bucket name from a .env file (supports running from notebooks/)."""
    env_file = Path(env_path)

    if not env_file.is_file() and env_path == ".env":
        parent_env = Path.cwd().parent / ".env"
        if parent_env.is_file():
            env_file = parent_env

    if not env_file.is_file():
        raise FileNotFoundError(f".env file not found at {env_file}")

    for line in env_file.read_text().splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue
        key, value = stripped.split("=", 1)
        key = key.strip()
        value = value.strip()
        if key in {"GOOGLE_APPLICATION_CREDENTIALS", "GCP_BUCKET_NAME"}:
            os.environ[key] = value

    credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    bucket_name = os.environ.get("GCP_BUCKET_NAME")

    if not credentials_path or not bucket_name:
        raise EnvironmentError(
            "GOOGLE_APPLICATION_CREDENTIALS and GCP_BUCKET_NAME must be set in the .env file."
        )

    credentials_file = Path(credentials_path)
    if not credentials_file.is_file():
        raise FileNotFoundError(f"Credentials file not found at {credentials_file}")

    print(f"Loaded credentials from: {credentials_file}")
    print(f"Using bucket: {bucket_name}")
    return str(credentials_file), bucket_name


In [4]:
def load_emg_dataframe_from_csv(env_path: str = ".env") -> pd.DataFrame:
    """Download the EMG CSV from GCS and return the raw DataFrame."""
    _, bucket_name = load_environment(env_path=env_path)

    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob_path = "EMG-nature/Clean_df/emg_trial_level_df.csv"

    print(f"Accessing blob: {blob_path}")
    blob = bucket.blob(blob_path)

    if not blob.exists(client=client):
        raise FileNotFoundError(f"Blob {blob_path} not found in bucket {bucket_name}")

    data_bytes = blob.download_as_bytes()
    print(f"Downloaded {len(data_bytes)} bytes from GCS")

    df = pd.read_csv(BytesIO(data_bytes))
    print(f"Loaded DataFrame with shape: {df.shape}")
    return df


In [5]:
def parse_signal_cell(cell: str | np.ndarray | list | tuple) -> np.ndarray:
    """Parse a corrupted/loosely formatted signal into a (16, N) float32 array."""
    if isinstance(cell, (np.ndarray, list, tuple)):
        arr = np.asarray(cell, dtype=np.float32)
        if arr.shape == (16, 10000):
            return arr
        if arr.shape == (10000, 16):
            return arr.T
        if arr.ndim == 1 and arr.size == 160000:
            return arr.reshape(16, 10000)
        if arr.ndim == 2 and arr.shape[0] == 16:
            return arr

    raw = str(cell)
    tokens = re.findall(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", raw)
    if not tokens:
        raise ValueError(f"No numeric tokens found in signal cell: {raw[:120]}")

    numbers = np.asarray([float(t) for t in tokens], dtype=np.float32)
    if numbers.size < 16:
        raise ValueError(f"Too few numeric tokens ({numbers.size}); sample: {raw[:120]}")

    usable = (numbers.size // 16) * 16
    numbers = numbers[:usable]
    channel_len = usable // 16
    if channel_len == 0:
        raise ValueError(f"Not enough data to form 16 channels; tokens={numbers.size}")

    arr = numbers.reshape(16, channel_len)
    return arr


def fix_length(arr: np.ndarray, target_len: int = 10000) -> np.ndarray:
    """Trim or pad signals to exactly (16, target_len)."""
    if arr.ndim != 2 or arr.shape[0] != 16:
        raise ValueError(f"Signal must have shape (16, N); got {arr.shape}")

    current_len = arr.shape[1]
    if current_len == target_len:
        return arr
    if current_len < target_len:
        pad_width = target_len - current_len
        return np.pad(arr, ((0, 0), (0, pad_width)), mode="constant")
    return arr[:, :target_len]


In [6]:
emg_df = load_emg_dataframe_from_csv()

if "Unnamed: 0" in emg_df.columns:
    emg_df = emg_df.drop(columns=["Unnamed: 0"])

emg_df["signal"] = emg_df["signal"].apply(parse_signal_cell)
emg_df["signal_fixed"] = emg_df["signal"].apply(fix_length)

print(emg_df["signal_fixed"].apply(lambda x: x.shape).value_counts())
print(emg_df.head())
print(emg_df.shape)

emg_df


Loaded credentials from: /Users/rayanhasan/code/hildieleyser/Inkling/keys/inkling-479911-fd24b9bdf83e.json
Using bucket: inkling-ssvep-emg
Accessing blob: EMG-nature/Clean_df/emg_trial_level_df.csv
Downloaded 3078852 bytes from GCS
Loaded DataFrame with shape: (4800, 8)
signal_fixed
(16, 10000)    4800
Name: count, dtype: int64
   participant  day  block  trial_id  position  grasp  \
0            1    1      1         1         2      3   
1            1    1      1         2         2      3   
2            1    1      1         3         2      3   
3            1    1      1         4         2      3   
4            1    1      1         5         2      3   

                                              signal  \
0  [[3.763498e-05, 1.9842508e-05], [9.071698e-06,...   
1  [[1.0537988e-05, 1.153949e-05], [1.18090165e-0...   
2  [[1.6977565e-05, 1.9937088e-05], [2.1830994e-0...   
3  [[3.6807487e-06, 3.2587977e-06], [2.339907e-06...   
4  [[1.5383765e-05, 1.8471881e-05], [1.6300444e

Unnamed: 0,participant,day,block,trial_id,position,grasp,signal,signal_fixed
0,1,1,1,1,2,3,"[[3.763498e-05, 1.9842508e-05], [9.071698e-06,...","[[3.763498e-05, 1.9842508e-05, 0.0, 0.0, 0.0, ..."
1,1,1,1,2,2,3,"[[1.0537988e-05, 1.153949e-05], [1.18090165e-0...","[[1.0537988e-05, 1.153949e-05, 0.0, 0.0, 0.0, ..."
2,1,1,1,3,2,3,"[[1.6977565e-05, 1.9937088e-05], [2.1830994e-0...","[[1.6977565e-05, 1.9937088e-05, 0.0, 0.0, 0.0,..."
3,1,1,1,4,2,3,"[[3.6807487e-06, 3.2587977e-06], [2.339907e-06...","[[3.6807487e-06, 3.2587977e-06, 0.0, 0.0, 0.0,..."
4,1,1,1,5,2,3,"[[1.5383765e-05, 1.8471881e-05], [1.6300444e-0...","[[1.5383765e-05, 1.8471881e-05, 0.0, 0.0, 0.0,..."
...,...,...,...,...,...,...,...,...
4795,8,2,2,146,9,2,"[[1.59362e-05, 1.7032327e-05], [1.913987e-05, ...","[[1.59362e-05, 1.7032327e-05, 0.0, 0.0, 0.0, 0..."
4796,8,2,2,147,9,2,"[[1.3964933e-06, 1.4381011e-06], [5.3344643e-0...","[[1.3964933e-06, 1.4381011e-06, 0.0, 0.0, 0.0,..."
4797,8,2,2,148,9,2,"[[9.107072e-06, 1.3961595e-05], [1.9139401e-05...","[[9.107072e-06, 1.3961595e-05, 0.0, 0.0, 0.0, ..."
4798,8,2,2,149,9,2,"[[4.061428e-05, 3.564699e-05], [3.1583953e-05,...","[[4.061428e-05, 3.564699e-05, 0.0, 0.0, 0.0, 0..."


In [7]:
class SignalExtractor(BaseEstimator, TransformerMixin):
    """Extract EMG signals and reshape to (time, channels, 1)."""

    def __init__(self, signal_col: str = "signal_fixed"):
        self.signal_col = signal_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        signals = []
        for idx, sig in enumerate(X[self.signal_col].to_list()):
            arr = np.asarray(sig, dtype=np.float32)
            if arr.shape != (16, 10000):
                raise ValueError(f"Row {idx} has shape {arr.shape}; expected (16, 10000)")
            signals.append(arr.T[:, :, None])  # (10000, 16, 1)
        return np.stack(signals, axis=0)


class EMGNormalizer(BaseEstimator, TransformerMixin):
    """Z-score normalize EMG signals using training statistics."""

    def __init__(self, epsilon: float = 1e-8):
        self.epsilon = epsilon
        self.mean_ = None
        self.std_ = None

    def fit(self, X, y=None):
        self.mean_ = X.mean(axis=(0, 1), keepdims=True)
        self.std_ = X.std(axis=(0, 1), keepdims=True)
        self.std_ = np.where(self.std_ < self.epsilon, self.epsilon, self.std_)
        return self

    def transform(self, X):
        if self.mean_ is None or self.std_ is None:
            raise RuntimeError("EMGNormalizer must be fitted before transform().")
        return (X - self.mean_) / self.std_


class LabelExtractor(BaseEstimator, TransformerMixin):
    """Convert grasp labels 1-6 to one-hot encoded 0-5 classes."""

    def __init__(self, label_col: str = "grasp"):
        self.label_col = label_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        labels = X[self.label_col].astype(int).to_numpy() - 1
        if labels.min() < 0 or labels.max() > 5:
            raise ValueError("Labels must be in range 1-6 before conversion to 0-5.")
        return to_categorical(labels, num_classes=6)


In [8]:
X_pipeline = Pipeline(
    steps=[
        ("signals", SignalExtractor(signal_col="signal_fixed")),
        ("normalize", EMGNormalizer()),
    ]
)

y_pipeline = Pipeline(
    steps=[
        ("labels", LabelExtractor(label_col="grasp")),
    ]
)

X = X_pipeline.fit_transform(emg_df)
y_cat = y_pipeline.fit_transform(emg_df)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_cat,
    test_size=0.3,
    random_state=42,
    stratify=y_cat.argmax(axis=1),
)

print("X:", X.shape)
print("y:", y_cat.shape)
print("Train:", X_train.shape, y_train.shape)
print("Test:", X_test.shape, y_test.shape)


X: (4800, 10000, 16, 1)
y: (4800, 6)
Train: (3360, 10000, 16, 1) (3360, 6)
Test: (1440, 10000, 16, 1) (1440, 6)


In [9]:


# Your data input shape
input_shape = (10000, 16, 1)   # (timesteps, channels, 1)

model = Sequential([
    Conv2D(32, (25, 3), padding="same", activation="relu", input_shape=input_shape),
    BatchNormalization(),
    MaxPooling2D(pool_size=(4, 1)),

    Conv2D(64, (15, 3), padding="same", activation="relu"),
    BatchNormalization(),
    MaxPooling2D(pool_size=(4, 1)),

    Conv2D(128, (9, 3), padding="same", activation="relu"),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 1)),

    GlobalAveragePooling2D(),   # <<< REPLACES FLATTEN

    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(6, activation="softmax"),
])

model.summary()

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
import matplotlib.pyplot as plt
from keras import callbacks

early_stop = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)
checkpoint = callbacks.ModelCheckpoint(
    filepath="best_emg_cnn.keras",
    monitor="val_loss",
    save_best_only=True,
)

history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stop, checkpoint],
    verbose=1,
)

train_acc = history.history.get("accuracy", [None])[-1]
val_acc = history.history.get("val_accuracy", [None])[-1]
print(f"Final train accuracy: {train_acc:.4f}" if train_acc is not None else "Final train accuracy unavailable")
print(f"Final validation accuracy: {val_acc:.4f}" if val_acc is not None else "Final validation accuracy unavailable")


Epoch 1/50
[1m  4/105[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10:22[0m 6s/step - accuracy: 0.1738 - loss: 1.7920

KeyboardInterrupt: 

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(history.history.get("loss", []), label="train")
axes[0].plot(history.history.get("val_loss", []), label="val")
axes[0].set_title("Loss")
axes[0].set_xlabel("Epoch")
axes[0].set_ylabel("Loss")
axes[0].legend()

axes[1].plot(history.history.get("accuracy", []), label="train")
axes[1].plot(history.history.get("val_accuracy", []), label="val")
axes[1].set_title("Accuracy")
axes[1].set_xlabel("Epoch")
axes[1].set_ylabel("Accuracy")
axes[1].legend()

plt.tight_layout()
plt.show()