In [1]:
import os

GCP_PROJECT_ID = os.getenv("GCP_PROJECT_ID")
GCP_BUCKET_NAME = os.getenv("GCP_BUCKET_NAME")
GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

In [2]:
from google.cloud import storage
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APPLICATION_CREDENTIALS

storage_client = storage.Client(project=GCP_PROJECT_ID)
bucket = storage_client.bucket(GCP_BUCKET_NAME)



In [8]:
DATA_PREFIX = "EMG-nature/data/"

blobs = list(bucket.list_blobs(prefix=DATA_PREFIX))

for b in blobs[:30]:
    print(b.name)

EMG-nature/data/participant_1/.DS_Store
EMG-nature/data/participant_1/participant1_day1_block1/.DS_Store
EMG-nature/data/participant_1/participant1_day1_block1/emg_data.hdf5
EMG-nature/data/participant_1/participant1_day1_block1/finger_data.hdf5
EMG-nature/data/participant_1/participant1_day1_block1/glove_data.hdf5
EMG-nature/data/participant_1/participant1_day1_block1/recording_parameters.txt
EMG-nature/data/participant_1/participant1_day1_block1/trials.csv
EMG-nature/data/participant_1/participant1_day1_block2/.DS_Store
EMG-nature/data/participant_1/participant1_day1_block2/emg_data.hdf5
EMG-nature/data/participant_1/participant1_day1_block2/finger_data.hdf5
EMG-nature/data/participant_1/participant1_day1_block2/glove_data.hdf5
EMG-nature/data/participant_1/participant1_day1_block2/recording_parameters.txt
EMG-nature/data/participant_1/participant1_day1_block2/trials.csv
EMG-nature/data/participant_1/participant1_day2_block1/.DS_Store
EMG-nature/data/participant_1/participant1_day2_b

In [9]:
import h5py
import pandas as pd
import numpy as np
from io import BytesIO

participant_id = 1
day = 1
block = 1  # "block1" in the folder name

base_path = "EMG-nature/data/"
participant_dir = f"participant_{participant_id}/"
block_dir = f"participant{participant_id}_day{day}_block{block}/"

gcs_prefix = base_path + participant_dir + block_dir
print("Using prefix:", gcs_prefix)


Using prefix: EMG-nature/data/participant_1/participant1_day1_block1/


In [10]:
emg_blob = bucket.blob(gcs_prefix + "emg_data.hdf5")

# this comes back as bytes in RAM, not a file
emg_bytes = emg_blob.download_as_bytes()

# wrap the bytes in a file-like buffer and open with h5py
emg_file = h5py.File(BytesIO(emg_bytes), "r")

print("Trial keys:", list(emg_file.keys())[:10])


Trial keys: ['0', '1', '10', '100', '101', '102', '103', '104', '105', '106']


In [11]:
labels_blob = bucket.blob(gcs_prefix + "trials.csv")
labels_bytes = labels_blob.download_as_bytes()

labels_df = pd.read_csv(BytesIO(labels_bytes))
print(labels_df.head())
print(labels_df.columns)


   Unnamed: 0  row_number  target_position  grasp  trial_no  block
0           0           0                2      3         0      0
1           1           1                2      3         1      0
2           2           2                2      3         2      0
3           3           3                2      3         3      0
4           4           4                2      3         4      0
Index(['Unnamed: 0', 'row_number', 'target_position', 'grasp', 'trial_no',
       'block'],
      dtype='object')


In [18]:
rows = []

for trial_key in emg_file.keys():            # '0'..'149'
    trial_index = int(trial_key)             # 0..149
    emg_matrix = emg_file[trial_key][()]     # shape (16, N_samples)

    # assume trials.csv row 0 corresponds to trial '0', row 1 -> '1', etc.
    label_row = labels_df.iloc[trial_index]

    # adjust these names if your CSV headers differ slightly
    # e.g. 'position' vs 'target_position'
    pos   = label_row.get("target_position", label_row.get("position"))
    grasp = label_row.get("grasp")
    trial_number  = label_row.get("trial_number")
    block_number  = label_row.get("block_number")

    for ch in range(emg_matrix.shape[0]):    # 16 channels
        rows.append({
            "participant": participant_id,
            "day": day,
            "block": block,
            "trial_id": trial_index + 1,     # make this 1..150 for convenience
            "channel": ch,
            "position": pos,
            "grasp": grasp,
            "trial_number": trial_number,
            "block_number": block_number,
            "signal": emg_matrix[ch, :],     # numpy array
        })

emg_df = pd.DataFrame(rows)
print(emg_df.head())
print(emg_df.columns)


   participant  day  block  trial_id  channel  position  grasp trial_number  \
0            1    1      1         1        0         2      3         None   
1            1    1      1         1        1         2      3         None   
2            1    1      1         1        2         2      3         None   
3            1    1      1         1        3         2      3         None   
4            1    1      1         1        4         2      3         None   

  block_number                                             signal  
0         None  [3.763498e-05, 1.9842508e-05, 9.071698e-06, 1....  
1         None  [2.4820081e-05, 2.6200492e-05, 2.5797071e-05, ...  
2         None  [7.703583e-06, 9.893e-06, 1.0767478e-05, 9.083...  
3         None  [8.394901e-06, 7.721386e-06, 7.572158e-06, 6.6...  
4         None  [-1.6433607e-05, -1.6947637e-05, -2.4275832e-0...  
Index(['participant', 'day', 'block', 'trial_id', 'channel', 'position',
       'grasp', 'trial_number', 'block_numbe

In [43]:
emg_df['block'].value_counts()

block
1    2400
Name: count, dtype: int64

In [36]:
import os
from google.cloud import storage
import h5py
import pandas as pd
import numpy as np
from io import BytesIO

GCP_PROJECT_ID = os.getenv("GCP_PROJECT_ID")
GCP_BUCKET_NAME = os.getenv("GCP_BUCKET_NAME")
GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APPLICATION_CREDENTIALS

client = storage.Client(project=GCP_PROJECT_ID)
bucket = client.bucket(GCP_BUCKET_NAME)

DATA_PREFIX = "EMG-nature/data/"

all_rows = []

for participant_id in range(1, 9):      # 1..8
    for day in (1, 2):                  # day1, day2
        for block in (1, 2):            # block1, block2
            participant_dir = f"participant_{participant_id}/"
            block_dir = f"participant{participant_id}_day{day}_block{block}/"
            gcs_prefix = DATA_PREFIX + participant_dir + block_dir

            emg_path = gcs_prefix + "emg_data.hdf5"
            labels_path = gcs_prefix + "trials.csv"

            emg_blob = bucket.blob(emg_path)
            labels_blob = bucket.blob(labels_path)

            # skip missing combos (just in case)
            if not emg_blob.exists() or not labels_blob.exists():
                print("Skipping missing:", emg_path)
                continue

            print("Loading:", emg_path)

            # --- load EMG from GCS into memory ---
            emg_bytes = emg_blob.download_as_bytes()
            with h5py.File(BytesIO(emg_bytes), "r") as emg_file:

                # --- load labels from GCS into memory ---
                labels_bytes = labels_blob.download_as_bytes()
                labels_df = pd.read_csv(BytesIO(labels_bytes))

                # build rows: one per (trial, channel)
                for trial_key in emg_file.keys():            # '0'..'149'
                    trial_index = int(trial_key)
                    emg_matrix = emg_file[trial_key][()]     # (16, N_samples)

                    # assume row order in trials.csv matches trial index
                    label_row = labels_df.iloc[trial_index]

                    pos   = label_row.get("target_position",
                                          label_row.get("position"))
                    grasp = label_row.get("grasp")
                    trial_number = label_row.get("trial_number")
                    block_number = label_row.get("block_number")

                    for ch in range(emg_matrix.shape[0]):    # 16 channels
                        all_rows.append({
                            "participant": participant_id,
                            "day": day,
                            "block": block,
                            "trial_id": trial_index + 1,     # 1..150
                            "channel": ch,
                            "position": pos,
                            "grasp": grasp,
                            "trial_number": trial_number,
                            "block_number": block_number,
                            "signal": emg_matrix[ch, :],
                        })

# final big DataFrame
emg_all_df = pd.DataFrame(all_rows)


Loading: EMG-nature/data/participant_1/participant1_day1_block1/emg_data.hdf5
Loading: EMG-nature/data/participant_1/participant1_day1_block2/emg_data.hdf5
Loading: EMG-nature/data/participant_1/participant1_day2_block1/emg_data.hdf5
Loading: EMG-nature/data/participant_1/participant1_day2_block2/emg_data.hdf5
Loading: EMG-nature/data/participant_2/participant2_day1_block1/emg_data.hdf5
Loading: EMG-nature/data/participant_2/participant2_day1_block2/emg_data.hdf5
Loading: EMG-nature/data/participant_2/participant2_day2_block1/emg_data.hdf5
Loading: EMG-nature/data/participant_2/participant2_day2_block2/emg_data.hdf5
Loading: EMG-nature/data/participant_3/participant3_day1_block1/emg_data.hdf5
Loading: EMG-nature/data/participant_3/participant3_day1_block2/emg_data.hdf5
Loading: EMG-nature/data/participant_3/participant3_day2_block1/emg_data.hdf5
Loading: EMG-nature/data/participant_3/participant3_day2_block2/emg_data.hdf5
Loading: EMG-nature/data/participant_4/participant4_day1_block1/

In [40]:
emg_all_df.head(1)

Unnamed: 0,participant,day,block,trial_id,channel,position,grasp,trial_number,block_number,signal
0,1,1,1,1,0,2,3,,,"[3.763498e-05, 1.9842508e-05, 9.071698e-06, 1...."


In [47]:
150*

1200

In [54]:
emg_all_df = emg_all_df.drop(columns=["trial_number", "block_number"])

In [77]:
emg_all_df.sample(4)

Unnamed: 0,participant,day,block,trial_id,channel,position,grasp,signal
64766,7,2,1,98,14,9,6,"[-9.171049e-06, -9.551792e-06, -8.877214e-06, ..."
21191,3,1,1,77,7,2,6,"[-7.0633437e-06, -6.2931563e-06, -5.273978e-06..."
35095,4,2,1,49,7,9,2,"[-1.9681195e-06, -4.362981e-06, -5.7025245e-06..."
11713,2,1,1,84,1,4,6,"[2.047827e-05, 2.084838e-05, 2.0927155e-05, 2...."


In [93]:
emg_all_df.head(1)

Unnamed: 0,participant,day,block,trial_id,channel,position,grasp,signal
0,1,1,1,1,0,2,3,"[3.763498e-05, 1.9842508e-05, 9.071698e-06, 1...."


In [114]:
emg_all_df.head(20)


Unnamed: 0,participant,day,block,trial_id,channel,position,grasp,signal
0,1,1,1,1,0,2,3,"[3.763498e-05, 1.9842508e-05, 9.071698e-06, 1...."
1,1,1,1,1,1,2,3,"[2.4820081e-05, 2.6200492e-05, 2.5797071e-05, ..."
2,1,1,1,1,2,2,3,"[7.703583e-06, 9.893e-06, 1.0767478e-05, 9.083..."
3,1,1,1,1,3,2,3,"[8.394901e-06, 7.721386e-06, 7.572158e-06, 6.6..."
4,1,1,1,1,4,2,3,"[-1.6433607e-05, -1.6947637e-05, -2.4275832e-0..."
5,1,1,1,1,5,2,3,"[-1.6854447e-06, -1.5601315e-06, -4.3219047e-0..."
6,1,1,1,1,6,2,3,"[-5.0224203e-06, -8.243137e-06, -1.3146786e-05..."
7,1,1,1,1,7,2,3,"[-1.0069972e-05, -6.803054e-06, -4.768144e-06,..."
8,1,1,1,1,8,2,3,"[1.7126942e-05, 1.4402639e-05, 1.3872751e-05, ..."
9,1,1,1,1,9,2,3,"[6.147569e-05, 4.3410964e-05, 2.4775052e-05, 1..."


In [96]:
len(emg_all_df['signal'][0])


9980

In [113]:
600*8

4800

In [98]:
4.99 *2000

9980.0

In [None]:
# confirm the length of the trial in seconds = 4.99, hence 9980 samples at 2000 Hz
signal = emg_all_df.loc[0, 'signal']
duration_seconds = len(signal) / 2000
duration_seconds


4.99

In [72]:
#^^one row = one (trial Ã— channel)
#and signal = the entire EMG waveform of that channel for that trial.

In [80]:
# (2400 * 2)*8 <- trials per block (150) * channels per trial (16) = 38400 per day

In [None]:
# 2400 * 4 -> one block (150 * 16) * 4 (blocks per participants -> data points PER participant across 2 days

In [103]:
emg_all_df[ emg_all_df['trial_id'] == 1 ]['channel'].value_counts()

channel
0     32
1     32
2     32
3     32
4     32
5     32
6     32
7     32
8     32
9     32
10    32
11    32
12    32
13    32
14    32
15    32
Name: count, dtype: int64

In [105]:
emg_all_df['position'].value_counts()

position
5    15360
2     7680
4     7680
6     7680
8     7680
1     7680
3     7680
7     7680
9     7680
Name: count, dtype: int64

In [106]:
emg_all_df['position'].unique()

array([2, 5, 4, 6, 8, 1, 3, 7, 9])

In [None]:
# confirm whether all 9 positions were indeed covered for each participant
#also shows that central position is used in ALL blocks
emg_all_df.groupby(
    ['participant', 'day', 'block']
)['position'].unique()

participant  day  block
1            1    1        [2, 5, 4, 6, 8]
                  2        [2, 5, 4, 6, 8]
             2    1        [1, 5, 3, 7, 9]
                  2        [1, 5, 3, 7, 9]
2            1    1        [2, 5, 4, 6, 8]
                  2        [2, 5, 4, 6, 8]
             2    1        [1, 5, 3, 7, 9]
                  2        [1, 5, 3, 7, 9]
3            1    1        [2, 5, 4, 6, 8]
                  2        [2, 5, 4, 6, 8]
             2    1        [1, 5, 3, 7, 9]
                  2        [1, 5, 3, 7, 9]
4            1    1        [2, 5, 4, 6, 8]
                  2        [2, 5, 4, 6, 8]
             2    1        [1, 5, 3, 7, 9]
                  2        [1, 5, 3, 7, 9]
5            1    1        [2, 5, 4, 6, 8]
                  2        [2, 5, 4, 6, 8]
             2    1        [1, 5, 3, 7, 9]
                  2        [1, 5, 3, 7, 9]
6            1    1        [2, 5, 4, 6, 8]
                  2        [2, 5, 4, 6, 8]
             2    1        [1,

Purpose of using the same arm positions across both blocks per day 1? *create a consistent 'within-day' training and testing set*

Purpose of using the same arm positions for the corresponding day2: create a different translation condition

In [None]:
#same check as above, but cleaner
(
    emg_all_df
    .groupby(['participant', 'day', 'block'])['position']
    .unique()                      # list of positions per block
    .groupby('participant')        # now regroup by participant only
    .apply(lambda x: sorted(set().union(*x)))   # flatten + union positions
)


participant
1    [1, 2, 3, 4, 5, 6, 7, 8, 9]
2    [1, 2, 3, 4, 5, 6, 7, 8, 9]
3    [1, 2, 3, 4, 5, 6, 7, 8, 9]
4    [1, 2, 3, 4, 5, 6, 7, 8, 9]
5    [1, 2, 3, 4, 5, 6, 7, 8, 9]
6    [1, 2, 3, 4, 5, 6, 7, 8, 9]
7    [1, 2, 3, 4, 5, 6, 7, 8, 9]
8    [1, 2, 3, 4, 5, 6, 7, 8, 9]
Name: position, dtype: object