In [1]:
# use stratify argument for train_test_split to maintain class balance

In [2]:
# binary_classifier model

In [1]:
import os
from io import BytesIO
from pathlib import Path
import pandas as pd
import numpy as np
from google.cloud import storage
from sklearn.model_selection import train_test_split



In [1]:
from tensorflow.keras.models import Sequential

: 

In [1]:
from tensorflow.keras import callbacks

: 

In [4]:
def load_environment(env_path: str = ".env") -> tuple[str, str]:
    """Load the GCP credentials path and bucket name, optionally from a .env file.

    If ``env_path`` points to an existing file, each ``KEY=VALUE`` line is
    exported into ``os.environ`` unless the key is already set there (values
    already present in the process environment win). Blank lines, ``#`` comment
    lines and lines without ``=`` are ignored. A single pair of matching
    surrounding quotes (``"..."`` or ``'...'``) is removed from the value, so
    ``KEY="/path/to/creds.json"`` and ``KEY=/path/to/creds.json`` are equivalent.

    Args:
        env_path: Path to the .env file. A missing file is not an error; the
            variables are then expected to be set in the environment already.

    Returns:
        A ``(credentials_path, bucket_name)`` tuple of strings.

    Raises:
        EnvironmentError: If GOOGLE_APPLICATION_CREDENTIALS or GCP_BUCKET_NAME
            is unset or empty after the .env file has been processed.
        FileNotFoundError: If the credentials path does not point to a file.
    """
    env_file = Path(env_path)
    if env_file.is_file():
        for raw_line in env_file.read_text().splitlines():
            entry = raw_line.strip()
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            key, value = entry.split("=", 1)
            key = key.strip()
            if not key:
                # A line such as "=value" has no variable name to export.
                continue
            value = value.strip()
            # .env files commonly quote values; keeping the quotes would make
            # the credentials path point at a non-existent file.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
                value = value[1:-1]
            os.environ.setdefault(key, value)
    credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    bucket_name = os.environ.get("GCP_BUCKET_NAME")
    if not credentials_path or not bucket_name:
        raise EnvironmentError(
            "GOOGLE_APPLICATION_CREDENTIALS and GCP_BUCKET_NAME must be set in the .env file."
        )
    credentials_file = Path(credentials_path)
    if not credentials_file.is_file():
        raise FileNotFoundError(f"Credentials file not found at {credentials_file}")
    # Ensure the env var is exported for the Google client to pick up.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(credentials_file)
    return str(credentials_file), bucket_name
def load_emg_dataframe(env_path=".env", blob_path="EMG-nature/Clean_df/emg_trial_level_df.pkl"):
    """Download a pickled object from the configured GCS bucket and unpickle it.

    Args:
        env_path: Path to the .env file handed to ``load_environment`` to
            resolve the credentials and the bucket name.
        blob_path: Object path inside the bucket. Defaults to the trial-level
            EMG pickle this notebook works with.

    Returns:
        Whatever ``pd.read_pickle`` yields for the blob (presumably the
        trial-level EMG DataFrame -- confirm against the stored file).
    """
    _, bucket_name = load_environment(env_path)
    client = storage.Client()
    blob = client.bucket(bucket_name).blob(blob_path)
    # Download into memory so nothing is written to local disk.
    buffer = BytesIO()
    blob.download_to_file(buffer)
    buffer.seek(0)  # rewind so read_pickle starts at the first byte
    # SECURITY: unpickling can execute arbitrary code. Only point this at
    # buckets/objects whose contents are trusted.
    return pd.read_pickle(buffer)
# Runs on cell execution: resolves credentials from the default ".env",
# downloads the pickle from the bucket and loads it into memory.
emg_df = load_emg_dataframe()

In [10]:
# Count how many trials have each array shape in 'signal_fixed'.
# NOTE(review): 'signal_fixed' is (re)assigned by a later cell in this notebook;
# on a fresh kernel this cell only works if the loaded pickle already contains
# that column -- confirm, or move this check below the fix_length cell.
emg_df['signal_fixed'].apply(lambda x: x.shape).value_counts()

signal_fixed
(16, 9840)    4800
Name: count, dtype: int64

In [11]:
# Preview the first rows of the loaded trial-level frame.
emg_df.head()

Unnamed: 0,participant,day,block,trial_id,position,grasp,signal,signal_fixed
0,1,1,1,1,2,3,"[[3.763498e-05, 1.9842508e-05, 9.071698e-06, 1...","[[3.763498e-05, 1.9842508e-05, 9.071698e-06, 1..."
1,1,1,1,2,2,3,"[[1.0537988e-05, 1.153949e-05, 1.18090165e-05,...","[[1.0537988e-05, 1.153949e-05, 1.18090165e-05,..."
2,1,1,1,3,2,3,"[[1.6977565e-05, 1.9937088e-05, 2.1830994e-05,...","[[1.6977565e-05, 1.9937088e-05, 2.1830994e-05,..."
3,1,1,1,4,2,3,"[[3.6807487e-06, 3.2587977e-06, 2.339907e-06, ...","[[3.6807487e-06, 3.2587977e-06, 2.339907e-06, ..."
4,1,1,1,5,2,3,"[[1.5383765e-05, 1.8471881e-05, 1.6300444e-05,...","[[1.5383765e-05, 1.8471881e-05, 1.6300444e-05,..."


In [12]:
# Shortest trial, measured along axis 1 of each raw 'signal' array.
min_len = min(map(lambda sig: sig.shape[1], emg_df['signal'].values))
min_len

9840

Pad or trim every trial to the same length

This function:

trims longer signals

zero-pads shorter ones (not expected to happen here, because the target length is the shortest trial in the dataset)

In [13]:
def fix_length(sig, target_len):
    """Return ``sig`` with exactly ``target_len`` columns along axis 1.

    Args:
        sig: 2-D array; axis 1 is the one being resized.
        target_len: Desired size of axis 1.

    Returns:
        ``sig`` itself when it already has ``target_len`` columns, a slice
        keeping the first ``target_len`` columns when it is longer, or a new
        array with zeros appended at the end of axis 1 when it is shorter.
    """
    n_cols = sig.shape[1]

    # Already the right size: hand back the input unchanged.
    if n_cols == target_len:
        return sig

    # Too long: drop the trailing columns.
    if n_cols > target_len:
        return sig[:, :target_len]

    # Too short: append zeros on the right of axis 1 only.
    missing = target_len - n_cols
    return np.pad(sig, ((0, 0), (0, missing)), mode='constant')

# Use the shortest trial as the common length: every trial is then trimmed
# (or left as-is), and the zero-padding branch of fix_length is never reached.
target_len = min_len

# Adds (or overwrites) the 'signal_fixed' column on emg_df in place.
emg_df['signal_fixed'] = emg_df['signal'].apply(lambda s: fix_length(s, target_len))

In [15]:
# Sanity check: after fix_length the longest trial should equal target_len.
max_len = max(map(lambda sig: sig.shape[1], emg_df['signal_fixed'].values))
max_len

9840

In [16]:
# Expected columns of the trial-level frame emg_df:
# ['participant', 'day', 'block', 'trial_id', 'position', 'grasp', 'signal_fixed']

# Stack the fixed-length trials and append a trailing singleton axis in one step.
X = np.stack(emg_df['signal_fixed'].values)[..., np.newaxis]  # shape: (n_trials, 16, T, 1)

# Binary target derived from the grasp label: 0 = rest, 1 = active.
REST_GRASP_ID = 6  # adjust if needed
y = (emg_df['grasp'].values != REST_GRASP_ID).astype(int)

# Split row positions instead of the arrays themselves, so each split can be
# mapped back to its rows in emg_df.
indices = np.arange(len(emg_df))
train_idx, test_idx = train_test_split(
    indices,
    test_size=0.30,
    random_state=42,
    stratify=emg_df['grasp'].values,  # stratify on the original grasp labels, not the binary y
)

# Materialise the train/test arrays from the index split.
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [39]:
# full_dist = emg_df['grasp'].value_counts().sort_index()

In [17]:
# Original grasp labels of the training rows (y_train only carries the binary rest/active label).
train_grasps = emg_df['grasp'].values[train_idx]

In [33]:
# Original grasp labels of the test rows (y_test only carries the binary rest/active label).
test_grasps  = emg_df['grasp'].values[test_idx]

In [35]:
# Per-grasp trial counts in the training split, ordered by grasp id, to check
# that the stratified split kept the grasp labels balanced.
print(pd.Series(train_grasps).value_counts().sort_index())

1    560
2    560
3    560
4    560
5    560
6    560
Name: count, dtype: int64


In [40]:
# Shape check on the training array: (n_train_trials, 16, T, 1).
X_train.shape

(3360, 16, 9840, 1)

In [None]:


# # Infer input shape from X_train
# input_shape = X_train.shape[1:]   # (16, T, 1)

# model = Sequential()
# model.add(layers.Input(shape=input_shape))

# # Conv block 1
# model.add(layers.Conv2D(16, (3, 3), padding='same', activation="relu"))
# model.add(layers.MaxPool2D(pool_size=(1, 4)))   # pool over time dimension

# # Conv block 2
# model.add(layers.Conv2D(32, (3, 3), padding='same', activation="relu"))
# model.add(layers.MaxPool2D(pool_size=(1, 4)))

# # Conv block 3 (optional but helps compression)
# model.add(layers.Conv2D(64, (3, 3), padding='same', activation="relu"))
# model.add(layers.MaxPool2D(pool_size=(1, 4)))

# # Flatten + dense
# model.add(layers.Flatten())
# model.add(layers.Dense(64, activation='relu'))
# model.add(layers.Dense(1, activation='sigmoid'))   # binary output

# model.summary()

# model.compile(
#     loss='binary_crossentropy',
#     optimizer='adam',
#     metrics=['accuracy']
# )

# # Simple training (you can add EarlyStopping later)
# history = model.fit(
#     X_train, y_train,
#     epochs=10,           # start small; tune later
#     batch_size=16,       # adjust according to GPU/CPU memory
#     validation_data=(X_test, y_test),
#     verbose=1
# )