In [45]:
!pip install pyriemann
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
from pyriemann.estimation import Covariances
from pyriemann.tangentspace import TangentSpace
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import KFold



## Step 1 : Dataset preparation and augmentation through overlapping window

### Handling of the guided gestures training/validation data

#### Guided training/validation data loading, shape and size checking

In [46]:
# Read the guided-gesture sEMG recordings from disk
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
X_guided = np.load(
    file=r"C:\Users\pavel\OneDrive\Documents\BIG DATA 2024-2025\Session2\Statistical foundations of machine learning\project\F422 EMG project data\guided\guided_dataset_X.npy"
)

# Report the array shape and its total number of elements
print(X_guided.shape, X_guided.size)

(5, 8, 230000) 9200000


The dataset holds the recordings of 5 sessions and 8 electrodes over 230 000 time points, i.e. 9 200 000 values in total.

#### Guided training/validation data augmentation, shape and size checking

In [47]:
# Cut the time axis into 500-sample windows; keeping every 125th window start gives 75% overlap
X_guided_augmented = sliding_window_view(X_guided, window_shape=500, axis=2)[:, :, ::125, :]

# Report the windowed shape and its total number of elements
print(X_guided_augmented.shape, X_guided_augmented.size)

(5, 8, 1837, 500) 36740000


The augmented dataset holds, for each of the 5 sessions and 8 electrodes, 1837 time windows (with 75% overlap between consecutive windows). Each window contains 500 time points, so the dataset now contains 36 740 000 values.

In [48]:
# How many times larger the windowed array is than the raw recording
print(X_guided_augmented.size / X_guided.size)

3.993478260869565


The augmented dataset is about 4 times the size of the original dataset.

### Handling of the guided gestures training/validation target (hand pose estimation) data

#### Guided training/validation target data  loading, shape and size checking

In [49]:
# Read the guided-gesture joint-angle targets from disk
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
Y_guided = np.load(
    file=r"C:\Users\pavel\OneDrive\Documents\BIG DATA 2024-2025\Session2\Statistical foundations of machine learning\project\F422 EMG project data\guided\guided_dataset_y.npy"
)

In [50]:
# Report the target array shape and its total number of elements
print(Y_guided.shape, Y_guided.size)

(5, 51, 230000) 58650000


The dataset holds the recordings of 51 joint angles for 5 sessions (5 predefined hand postures) over 230 000 time points, i.e. 58 650 000 values in total.

#### Guided training/validation target data segmentation, keeping only the last time point of each window

In [51]:
# Window the targets exactly like the sEMG signal: 500-sample windows, start every 125 samples (75% overlap)
Y_guided_slided = sliding_window_view(Y_guided, window_shape=500, axis=2)[:, :, ::125, :]
# Report the windowed target shape
print(np.shape(Y_guided_slided))

(5, 51, 1837, 500)


In [52]:
# Keep only the last sample of every window (trailing axis stays, with length 1)
Y_guided_slided = Y_guided_slided[..., -1:]

In [53]:
# Confirm that each window was reduced to a single target sample
print(np.shape(Y_guided_slided))

(5, 51, 1837, 1)


### Handling of the free gestures training/validation data

#### Free moves training/validation data loading, shape and size checking

In [54]:
# Read the free-moves sEMG recordings from disk
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
X_freemoves = np.load(
    file=r"C:\Users\pavel\OneDrive\Documents\BIG DATA 2024-2025\Session2\Statistical foundations of machine learning\project\F422 EMG project data\freemoves\freemoves_dataset_X.npy"
)

# Report the array shape and its total number of elements
print(X_freemoves.shape, X_freemoves.size)

(5, 8, 270000) 10800000


The dataset holds the recordings of 5 sessions and 8 electrodes over 270 000 time points, i.e. 10 800 000 values in total.

#### Free moves training/validation data augmentation, shape and size checking

In [55]:
# Cut the time axis into 500-sample windows; keeping every 125th window start gives 75% overlap
X_freemoves_augmented = sliding_window_view(X_freemoves, window_shape=500, axis=2)[:, :, ::125, :]

# Report the windowed shape and its total number of elements
print(X_freemoves_augmented.shape, X_freemoves_augmented.size)

(5, 8, 2157, 500) 43140000


The augmented dataset shows the records of 5 sessions, 8 electrodes and 2157 intervals of time (with an overlap of 75%). Each interval of time contains records for 500 points of time, resulting then in a number of 43 140 000 records for the dataset.

In [56]:
# How many times larger the windowed array is than the raw recording
print(X_freemoves_augmented.size / X_freemoves.size)

3.9944444444444445


The augmented dataset is about 4 times the size of the original dataset.

### Handling of the free gestures training/validation target (hand pose estimation) data

#### Free moves training/validation target data  loading, shape and size checking

In [57]:
# Read the free-moves joint-angle targets from disk
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
Y_freemoves = np.load(
    file=r"C:\Users\pavel\OneDrive\Documents\BIG DATA 2024-2025\Session2\Statistical foundations of machine learning\project\F422 EMG project data\freemoves\freemoves_dataset_y.npy"
)

In [58]:
# Report the target array shape and its total number of elements
print(Y_freemoves.shape, Y_freemoves.size)

(5, 51, 270000) 68850000


The dataset holds the recordings of 51 joint angles for 5 sessions (5 predefined hand postures) over 270 000 time points, i.e. 68 850 000 values in total.

#### Free moves training/validation target data segmentation, keeping only the last time point of each window

In [59]:
# Window the targets exactly like the sEMG signal: 500-sample windows, start every 125 samples (75% overlap)
Y_freemoves_slided = sliding_window_view(Y_freemoves, window_shape=500, axis=2)[:, :, ::125, :]
# Report the windowed target shape
print(np.shape(Y_freemoves_slided))

(5, 51, 2157, 500)


In [60]:
# Keep only the last sample of every window (trailing axis stays, with length 1)
Y_freemoves_slided = Y_freemoves_slided[..., -1:]

In [61]:
# Confirm that each window was reduced to a single target sample
print(np.shape(Y_freemoves_slided))

(5, 51, 2157, 1)


# 2) Determine and implement an adequate cross-validation strategy to validate your regression models, specifying how you organized your data partitions for training and validation

 Step 1 — Window extraction with session labels

In [62]:
def create_overlapping_windows(X, y, window_size=500, overlap=0.75):
    """
    Segments raw sEMG and joint-angle signals into overlapping windows.
    Also records the session index for each window.

    Parameters:
        X (np.ndarray): shape (num_sessions, num_electrodes, num_timesteps)
        y (np.ndarray): shape (num_sessions, num_joints, num_timesteps)
        window_size (int): number of time points per window (default = 500)
        overlap (float): fraction of overlap between consecutive windows (e.g. 0.75 for 75%)

    Returns:
        X_windows (np.ndarray): shape (n_windows, num_electrodes, window_size)
        y_windows (np.ndarray): shape (n_windows, num_joints, window_size)
        session_labels (np.ndarray): shape (n_windows,), session index for each window

        When no window fits (e.g. window_size > num_timesteps) the three arrays
        are empty but keep the shapes above with n_windows == 0.

    Raises:
        ValueError: if window_size and overlap yield a step smaller than one sample.
    """
    # Distance between window starts. `1 - overlap` is inexact in binary floating
    # point (500 * (1 - 0.9) evaluates to 49.99999999999999), so a bare int() would
    # silently shorten the step by one sample, or even give 0. The small tolerance
    # absorbs that error while still truncating genuinely fractional steps.
    step_size = int(window_size * (1 - overlap) + 1e-9)
    if step_size < 1:
        raise ValueError(
            f"window_size={window_size} with overlap={overlap} gives a step of "
            f"{step_size} samples; the step must be at least 1"
        )

    X_windows, y_windows, session_labels = [], [], []

    for session_idx in range(X.shape[0]):
        emg = X[session_idx]     # shape (num_electrodes, T)
        joints = y[session_idx]  # shape (num_joints, T)
        T = emg.shape[1]

        # Slide window over time axis; a trailing partial window is dropped
        for start in range(0, T - window_size + 1, step_size):
            end = start + window_size
            X_windows.append(emg[:, start:end])        # Extract EMG window
            y_windows.append(joints[:, start:end])     # Extract joint-angle window
            session_labels.append(session_idx)         # Label the window by session

    if not X_windows:
        # np.array([]) would collapse to shape (0,); keep the documented ranks instead
        return (
            np.empty((0, X.shape[1], window_size), dtype=X.dtype),
            np.empty((0, y.shape[1], window_size), dtype=y.dtype),
            np.empty((0,), dtype=int),
        )

    return np.array(X_windows), np.array(y_windows), np.array(session_labels)


Step 2 -  Leave-One-Session-Out Cross-Validation

In [63]:
def leave_one_session_out_cv(X_windows, y_windows, session_labels):
    """
    Generate Leave-One-Session-Out train/validation splits.

    Each distinct value in `session_labels` is held out once, in sorted order:
    windows carrying that label form the validation set and all remaining
    windows form the training set.

    Parameters:
        X_windows (np.ndarray): EMG data windows, indexed by window on axis 0
        y_windows (np.ndarray): joint-angle windows, indexed by window on axis 0
        session_labels (np.ndarray): session index for each window

    Yields:
        X_train, X_val, y_train, y_val (each np.ndarray)
    """
    for held_out in np.unique(session_labels):
        # Row positions of the held-out session and of every other session
        val_rows = np.nonzero(session_labels == held_out)[0]
        train_rows = np.nonzero(session_labels != held_out)[0]

        yield (
            X_windows[train_rows],
            X_windows[val_rows],
            y_windows[train_rows],
            y_windows[val_rows],
        )


Step 3: Apply to Guided Gestures Dataset

In [64]:
# Window the guided recordings; each window carries the index of its source session
guided_X_win, guided_y_win, guided_labels = create_overlapping_windows(
    X=X_guided, y=Y_guided
)

# Hold out one session per fold and report the resulting split sizes
for fold, (X_tr, X_val, y_tr, y_val) in enumerate(
    leave_one_session_out_cv(
        X_windows=guided_X_win,
        y_windows=guided_y_win,
        session_labels=guided_labels,
    )
):
    print(f"[Guided Fold {fold+1}] Train shape: {X_tr.shape}, Validation shape: {X_val.shape}")


[Guided Fold 1] Train shape: (7348, 8, 500), Validation shape: (1837, 8, 500)
[Guided Fold 2] Train shape: (7348, 8, 500), Validation shape: (1837, 8, 500)
[Guided Fold 3] Train shape: (7348, 8, 500), Validation shape: (1837, 8, 500)
[Guided Fold 4] Train shape: (7348, 8, 500), Validation shape: (1837, 8, 500)
[Guided Fold 5] Train shape: (7348, 8, 500), Validation shape: (1837, 8, 500)


Step 4: Apply to Free Gestures Dataset

In [65]:
# Window the free-moves recordings; each window carries the index of its source session
free_X_win, free_y_win, free_labels = create_overlapping_windows(
    X=X_freemoves, y=Y_freemoves
)

# Hold out one session per fold and report the resulting split sizes
for fold, (X_tr, X_val, y_tr, y_val) in enumerate(
    leave_one_session_out_cv(
        X_windows=free_X_win,
        y_windows=free_y_win,
        session_labels=free_labels,
    )
):
    print(f"[Free Fold {fold+1}] Train shape: {X_tr.shape}, Validation shape: {X_val.shape}")


[Free Fold 1] Train shape: (8628, 8, 500), Validation shape: (2157, 8, 500)
[Free Fold 2] Train shape: (8628, 8, 500), Validation shape: (2157, 8, 500)


[Free Fold 3] Train shape: (8628, 8, 500), Validation shape: (2157, 8, 500)
[Free Fold 4] Train shape: (8628, 8, 500), Validation shape: (2157, 8, 500)
[Free Fold 5] Train shape: (8628, 8, 500), Validation shape: (2157, 8, 500)


We apply Leave-One-Session-Out Cross-Validation separately to both the guided and free gesture datasets. This ensures that training and validation sets are fully separated in time, avoiding any overlap-induced leakage. Each validation set consists of data from a completely unseen recording session, providing a fair and realistic estimate of the model’s generalization performance.