In [1]:
import os
import json
import re
from io import BytesIO
from pathlib import Path

import numpy as np
import pandas as pd
from google.cloud import storage



In [2]:
def load_environment(env_path: str = ".env") -> tuple[str, str]:
    """Load GCP credentials and bucket name from a .env file (supports running from notebooks/)."""
    env_file = Path(env_path)

    # If running from notebooks/, fall back to parent .env
    if not env_file.is_file() and env_path == ".env":
        parent_env = Path.cwd().parent / ".env"
        if parent_env.is_file():
            env_file = parent_env

    if not env_file.is_file():
        raise FileNotFoundError(f".env file not found at {env_file}")

    for line in env_file.read_text().splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue
        key, value = stripped.split("=", 1)
        key = key.strip()
        value = value.strip()
        if key in {"GOOGLE_APPLICATION_CREDENTIALS", "GCP_BUCKET_NAME"}:
            os.environ[key] = value

    credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    bucket_name = os.environ.get("GCP_BUCKET_NAME")

    if not credentials_path or not bucket_name:
        raise EnvironmentError(
            "GOOGLE_APPLICATION_CREDENTIALS and GCP_BUCKET_NAME must be set in the .env file."
        )

    credentials_file = Path(credentials_path)
    if not credentials_file.is_file():
        raise FileNotFoundError(f"Credentials file not found at {credentials_file}")

    print(f"Loaded credentials from: {credentials_file}")
    print(f"Using bucket: {bucket_name}")
    return str(credentials_file), bucket_name


In [3]:
def load_emg_dataframe_from_csv(env_path: str = ".env") -> pd.DataFrame:
    """Download the EMG CSV from GCS and return the raw DataFrame."""
    _, bucket_name = load_environment(env_path=env_path)

    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob_path = "EMG-nature/Clean_df/emg_trial_level_df.csv"

    print(f"Accessing blob: {blob_path}")
    blob = bucket.blob(blob_path)

    if not blob.exists(client=client):
        raise FileNotFoundError(f"Blob {blob_path} not found in bucket {bucket_name}")

    data_bytes = blob.download_as_bytes()
    print(f"Downloaded {len(data_bytes)} bytes from GCS")

    df = pd.read_csv(BytesIO(data_bytes))
    print(f"Loaded DataFrame with shape: {df.shape}")
    return df


In [4]:
def parse_signal_cell(cell: str | np.ndarray | list | tuple) -> np.ndarray:
    """Parse a corrupted/loosely formatted signal into a (16, N) float32 array."""
    # If already array-like, trust and reshape as needed.
    if isinstance(cell, (np.ndarray, list, tuple)):
        arr = np.asarray(cell, dtype=np.float32)
        if arr.shape == (16, 10000):
            return arr
        if arr.shape == (10000, 16):
            return arr.T
        if arr.ndim == 1 and arr.size == 160000:
            return arr.reshape(16, 10000)
        if arr.ndim == 2 and arr.shape[0] == 16:
            return arr

    raw = str(cell)
    tokens = re.findall(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", raw)
    if not tokens:
        raise ValueError(f"No numeric tokens found in signal cell: {raw[:120]}")

    numbers = np.asarray([float(t) for t in tokens], dtype=np.float32)
    if numbers.size < 16:
        raise ValueError(f"Too few numeric tokens ({numbers.size}); sample: {raw[:120]}")

    usable = (numbers.size // 16) * 16
    numbers = numbers[:usable]
    channel_len = usable // 16
    if channel_len == 0:
        raise ValueError(f"Not enough data to form 16 channels; tokens={numbers.size}")

    arr = numbers.reshape(16, channel_len)
    return arr


def fix_length(arr: np.ndarray, target_len: int = 10000) -> np.ndarray:
    """Trim or pad signals to exactly (16, target_len)."""
    if arr.ndim != 2 or arr.shape[0] != 16:
        raise ValueError(f"Signal must have shape (16, N); got {arr.shape}")

    current_len = arr.shape[1]
    if current_len == target_len:
        return arr
    if current_len < target_len:
        pad_width = target_len - current_len
        return np.pad(arr, ((0, 0), (0, pad_width)), mode="constant")
    return arr[:, :target_len]


In [5]:
emg_df = load_emg_dataframe_from_csv()

emg_df["signal"] = emg_df["signal"].apply(parse_signal_cell)
emg_df["signal_fixed"] = emg_df["signal"].apply(fix_length)

print(emg_df["signal_fixed"].apply(lambda x: x.shape).value_counts())
print(emg_df.head())
print(emg_df.shape)


Loaded credentials from: /Users/loso/code/hildieleyser/inkling/keys/inkling-479911-b140632c44c5.json
Using bucket: inkling-ssvep-emg
Accessing blob: EMG-nature/Clean_df/emg_trial_level_df.csv
Downloaded 3078852 bytes from GCS
Loaded DataFrame with shape: (4800, 8)
signal_fixed
(16, 10000)    4800
Name: count, dtype: int64
   Unnamed: 0  participant  day  block  trial_id  position  grasp  \
0           0            1    1      1         1         2      3   
1           1            1    1      1         2         2      3   
2           2            1    1      1         3         2      3   
3           3            1    1      1         4         2      3   
4           4            1    1      1         5         2      3   

                                              signal  \
0  [[3.763498e-05, 1.9842508e-05], [9.071698e-06,...   
1  [[1.0537988e-05, 1.153949e-05], [1.18090165e-0...   
2  [[1.6977565e-05, 1.9937088e-05], [2.1830994e-0...   
3  [[3.6807487e-06, 3.2587977e-06], [

In [6]:
emg_df = emg_df.drop(columns = "Unnamed: 0")

In [7]:
emg_df = emg_df[emg_df['grasp'].isin([1, 5])]

In [8]:
emg_df['grasp'].value_counts()

grasp
1    800
5    800
Name: count, dtype: int64

In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score


In [10]:
emg_df.loc[4790]['grasp']

5

In [11]:
(emg_df['grasp'] == 1).astype(int)

5       1
6       1
7       1
8       1
9       1
       ..
4790    0
4791    0
4792    0
4793    0
4794    0
Name: grasp, Length: 1600, dtype: int64

In [12]:
# Optional: drop the raw 'signal' column - not needed anymore
emg_df = emg_df.drop(columns=['signal'])

# Binary labels: 1 = power(1), 0 = open(5)
emg_df['label'] = (emg_df['grasp'] == 1).astype(int)

# Build X from *signal_fixed* -> shape (N, 16, 10000)
X = np.stack(emg_df['signal_fixed'].values)
y = emg_df['label'].values

print("X shape:", X.shape)
print("Label distribution:", np.unique(y, return_counts=True))


X shape: (1600, 16, 10000)
Label distribution: (array([0, 1]), array([800, 800]))


In [13]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    stratify=y,
    random_state=42
)

print("Train shape:", X_train.shape, np.unique(y_train, return_counts=True))
print("Test shape:",  X_test.shape,  np.unique(y_test, return_counts=True))


Train shape: (1120, 16, 10000) (array([0, 1]), array([560, 560]))
Test shape: (480, 16, 10000) (array([0, 1]), array([240, 240]))


# Channel wise normalisation - z_score/standardisation

In [14]:
# X_train: (n_train, 16, 10000)
# Compute mean & std over (batch, time) per channel
train_mean = X_train.mean(axis=(0, 2), keepdims=True)  # (1, 16, 1)
train_std  = X_train.std(axis=(0, 2), keepdims=True) + 1e-8

X_train_norm = (X_train - train_mean) / train_std
X_test_norm  = (X_test  - train_mean) / train_std

print("Normalized train mean (approx):", X_train_norm.mean(), "std:", X_train_norm.std())


Normalized train mean (approx): -1.1884953e-09 std: 0.96461254


# Reshape for Conv1D

In [15]:
# Swap axes: (N, 16, 10000) -> (N, 10000, 16)
X_train = np.transpose(X_train_norm, (0, 2, 1))
X_test  = np.transpose(X_test_norm,  (0, 2, 1))

print("Conv1D input shape:", X_train.shape)  # (n_train, 10000, 16)


Conv1D input shape: (1120, 10000, 16)


# Build the Model (Conv1D)

In [17]:
input_shape = X_train.shape[1:]  # (10000, 16)
#good model end of day 4
model= Sequential([
    layers.Input(shape=input_shape),

    # Block 1
    layers.Conv1D(32, kernel_size=7, padding='same', activation='relu'),
    layers.MaxPool1D(pool_size=4),

    # Block 2
    layers.Conv1D(64, kernel_size=5, padding='same', activation='relu'),
    layers.MaxPool1D(pool_size=4),

    # Block 3
    layers.Conv1D(128, kernel_size=3, padding='same', activation='relu'),
    layers.MaxPool1D(pool_size=4),

    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # binary output: P(power grasp)
])

model.summary()

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'recall', 'precision']
)


In [18]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]
)


In [19]:
es = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True
)

In [20]:
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[es],
    verbose=1
)


Epoch 1/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 69ms/step - accuracy: 0.5190 - loss: 0.6923 - precision: 0.5200 - recall: 0.3521 - val_accuracy: 0.6205 - val_loss: 0.6679 - val_precision: 0.6739 - val_recall: 0.5299
Epoch 2/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 63ms/step - accuracy: 0.6205 - loss: 0.6593 - precision: 0.6189 - recall: 0.6050 - val_accuracy: 0.6518 - val_loss: 0.6436 - val_precision: 0.6089 - val_recall: 0.9316
Epoch 3/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 62ms/step - accuracy: 0.6641 - loss: 0.6227 - precision: 0.6614 - recall: 0.6569 - val_accuracy: 0.5848 - val_loss: 0.6790 - val_precision: 0.6818 - val_recall: 0.3846
Epoch 4/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 62ms/step - accuracy: 0.7020 - loss: 0.5831 - precision: 0.6803 - recall: 0.7494 - val_accuracy: 0.6562 - val_loss: 0.6558 - val_precision: 0.6639 - val_recall: 0.6923
Epoch 5/100
[1m28/28[0

# Evaluation Block (Confusion Matrix + Report)

In [21]:
print("\n===== Test Performance =====")
results = model.evaluate(X_test, y_test, verbose=0)

for name, value in zip(model.metrics_names, results):
    print(f"{name}: {value:.4f}")

y_prob = model.predict(X_test).ravel()
y_pred = (y_prob >= 0.5).astype(int)  # keep threshold=0.5

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    target_names=["open (0)", "power (1)"]
))

prec = precision_score(y_test, y_pred)
rec  = recall_score(y_test, y_pred)
print(f"\nPrecision (power=1): {prec:.4f}")
print(f"Recall    (power=1): {rec:.4f}")



===== Test Performance =====
loss: 0.5792
compile_metrics: 0.7188
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step

Confusion Matrix:
[[181  59]
 [ 76 164]]

Classification Report:
              precision    recall  f1-score   support

    open (0)       0.70      0.75      0.73       240
   power (1)       0.74      0.68      0.71       240

    accuracy                           0.72       480
   macro avg       0.72      0.72      0.72       480
weighted avg       0.72      0.72      0.72       480


Precision (power=1): 0.7354
Recall    (power=1): 0.6833
