In [1]:
# Downsample ch1..ch16 from 125 Hz to 25 Hz using block means

# Parameters
INPUT_DIR = "/home/jupyter-yin10/EEG_HAR/PLHI-HAR_EEG-2025_new"
SOURCE_FS = 125  # sampling rate of the input recordings, in Hz
FACTOR = 5  # number of input rows averaged into one output row (125 to 25)

# The output rate is derived instead of typed in by hand, so that changing
# FACTOR or SOURCE_FS can never leave TARGET_FS silently out of sync.
if FACTOR < 1 or SOURCE_FS % FACTOR != 0:
    raise ValueError(
        f"FACTOR={FACTOR} must be a positive divisor of SOURCE_FS={SOURCE_FS}"
    )
TARGET_FS = SOURCE_FS // FACTOR  # 25
EEG_COLS = [f"ch{i}" for i in range(1, 17)]  # ch1..ch16


In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
from statistics import mode, StatisticsError

inp = Path(INPUT_DIR)

# Collect the regular files whose name ends in "_labeled.csv", ordered by path.
# (A name with that ending always has the ".csv" suffix, so no separate
# suffix test is needed.)
files = sorted(
    entry
    for entry in inp.iterdir()
    if entry.is_file() and entry.name.endswith("_labeled.csv")
)
print(f"Found {len(files)} labeled CSV files in {inp}")

def block_mean_downsample(x: np.ndarray, factor: int = FACTOR) -> np.ndarray:
    """Average consecutive, non-overlapping groups of ``factor`` samples.

    The input is flattened to 1-D float64 first. Samples left over after the
    last complete group are discarded. When there is not even one complete
    group, an empty float64 array is returned.
    """
    flat = np.asarray(x, dtype=np.float64).ravel()
    full_blocks = len(flat) // factor
    if not full_blocks:
        return np.array([], dtype=np.float64)
    # One row per block, then collapse each row to its mean.
    usable = flat[: full_blocks * factor]
    as_rows = usable.reshape(full_blocks, factor)
    return as_rows.mean(axis=1)

def block_mode(arr) -> object:
    """Return the most frequent non-NaN value in ``arr``.

    Ties are broken in favour of the tied value that occurs first in ``arr``.
    Returns ``np.nan`` when nothing remains after dropping NaN/None.

    Occurrences are counted explicitly instead of relying on
    ``statistics.mode``: before Python 3.8 that function raises on ties
    (which made the old fallback return the first element of the block even
    when it was not one of the tied modes), while from 3.8 on it never raises
    for ties. Counting by hand gives the same answer on every version.
    """
    s = pd.Series(arr).dropna()
    if s.empty:
        return np.nan
    counts = {}
    for value in s.tolist():
        counts[value] = counts.get(value, 0) + 1
    # dict keeps insertion order and max() returns the first maximum it meets,
    # so among equally frequent values the earliest one wins.
    return max(counts, key=counts.get)

def is_numeric_series(s: pd.Series) -> bool:
    """Report whether pandas classifies the dtype of ``s`` as numeric."""
    numeric = pd.api.types.is_numeric_dtype(s)
    return numeric

# Downsample every labeled recording and record one (file, status) pair each.
# Per block of FACTOR rows: numeric columns are averaged, all other columns
# take the block mode, and 'Sample Index' is regenerated as 0..n_blocks-1.
results = []
for i, f in enumerate(files, start=1):
    try:
        df = pd.read_csv(f)

        # Check EEG columns exist
        missing = [c for c in EEG_COLS if c not in df.columns]
        if missing:
            print(f"[{i:02d}] SKIP {f.name}: missing EEG columns {missing}")
            results.append((f.name, 'skipped_missing_cols'))
            continue

        n_in = len(df)
        if n_in == 0:
            print(f"[{i:02d}] SKIP {f.name}: empty file")
            results.append((f.name, 'skipped_empty'))
            continue

        # Trim to a multiple of FACTOR so blocks are complete
        n_blocks = n_in // FACTOR
        if n_blocks == 0:
            print(f"[{i:02d}] SKIP {f.name}: too few rows for factor={FACTOR}")
            results.append((f.name, 'skipped_too_short'))
            continue

        # A non-numeric EEG column would be dropped silently by
        # mean(numeric_only=True); report it explicitly instead. The error is
        # caught below and recorded in this file's status.
        non_numeric = [ch for ch in EEG_COLS if not is_numeric_series(df[ch])]
        if non_numeric:
            raise ValueError(f"non-numeric EEG columns {non_numeric}")

        n_used = n_blocks * FACTOR
        df_used = df.iloc[:n_used].copy()

        # Block ids, one id per FACTOR consecutive input rows
        df_used['_block'] = np.repeat(np.arange(n_blocks), FACTOR)

        # Split the non-EEG columns into numeric and categorical.
        # 'Sample Index' is regenerated further down, so it is left out here.
        other_cols = [
            c for c in df.columns
            if c not in EEG_COLS and c != 'Sample Index'
        ]
        num_cols = [c for c in other_cols if is_numeric_series(df_used[c])]
        cat_cols = [c for c in other_cols if c not in num_cols]

        # Group once and reuse the grouping for every aggregation
        grouped = df_used.groupby('_block', sort=True)

        # EEG and other numeric columns: mean over each block
        out_df = grouped[EEG_COLS + num_cols].mean(numeric_only=True)

        # Categorical columns (labels, activities, ...): per-block mode.
        # Both sides are indexed by block id, so the assignment aligns.
        for c in cat_cols:
            out_df[c] = grouped[c].apply(lambda s: block_mode(s.values))

        # Reset index to 0..n_blocks-1
        out_df = out_df.reset_index(drop=True)

        # Reset or add Sample Index
        out_df['Sample Index'] = np.arange(len(out_df))

        # Set Sampling Rate to the target rate if present, else leave untouched
        if 'Sampling Rate' in out_df.columns:
            out_df['Sampling Rate'] = TARGET_FS

        # Ensure EEG columns are plain floats
        out_df = out_df.astype({ch: float for ch in EEG_COLS})

        # Restore the input column order. This runs after 'Sample Index' has
        # been regenerated so that column keeps its original position; columns
        # that did not exist in the input are appended at the end.
        original_cols = [c for c in df.columns if c in out_df.columns]
        extra_cols = [c for c in out_df.columns if c not in df.columns]
        out_df = out_df[original_cols + extra_cols]

        # Output is written next to the input; the suffix follows TARGET_FS
        out_path = f.with_name(f.stem.replace('_labeled', '') + f'_ds{TARGET_FS}.csv')
        out_df.to_csv(out_path, index=False)

        print(f"[{i:02d}] OK {f.name}: rows_in={n_in}, rows_used={n_used}, rows_out={len(out_df)} -> {out_path.name}")
        results.append((f.name, 'ok'))

    except Exception as e:
        # Best effort: one bad file must not stop the batch; its error is
        # kept in the summary table.
        print(f"[{i:02d}] ERROR {f.name}: {e}")
        results.append((f.name, f'error: {e}'))

pd.DataFrame(results, columns=['file','status'])


Found 54 labeled CSV files in /home/jupyter-yin10/EEG_HAR/PLHI-HAR_EEG-2025_new
[01] OK OpenBCISession_s1-chair squats_stacked_labeled.csv: rows_in=57013, rows_used=57010, rows_out=11402 -> OpenBCISession_s1-chair squats_stacked_ds25.csv
[02] OK OpenBCISession_s1-light stationary cycling_stacked_labeled.csv: rows_in=57177, rows_used=57175, rows_out=11435 -> OpenBCISession_s1-light stationary cycling_stacked_ds25.csv
[03] OK OpenBCISession_s1-marching in place_stacked_labeled.csv: rows_in=51235, rows_used=51235, rows_out=10247 -> OpenBCISession_s1-marching in place_stacked_ds25.csv
[04] OK OpenBCISession_s1-seated boxing hooks_stacked_labeled.csv: rows_in=58126, rows_used=58125, rows_out=11625 -> OpenBCISession_s1-seated boxing hooks_stacked_ds25.csv
[05] OK OpenBCISession_s1-seated leg extensions_stacked_labeled.csv: rows_in=77573, rows_used=77570, rows_out=15514 -> OpenBCISession_s1-seated leg extensions_stacked_ds25.csv
[06] OK OpenBCISession_s1-seated medicine ball twists_stacked_la

[49] OK OpenBCISession_s6-seated boxing hooks_stacked_labeled.csv: rows_in=58312, rows_used=58310, rows_out=11662 -> OpenBCISession_s6-seated boxing hooks_stacked_ds25.csv
[50] OK OpenBCISession_s6-seated leg extensions_stacked_labeled.csv: rows_in=73235, rows_used=73235, rows_out=14647 -> OpenBCISession_s6-seated leg extensions_stacked_ds25.csv
[51] OK OpenBCISession_s6-seated medicine ball twists_stacked_labeled.csv: rows_in=57794, rows_used=57790, rows_out=11558 -> OpenBCISession_s6-seated medicine ball twists_stacked_ds25.csv
[52] OK OpenBCISession_s6-seated side bends_stacked_labeled.csv: rows_in=57356, rows_used=57355, rows_out=11471 -> OpenBCISession_s6-seated side bends_stacked_ds25.csv
[53] OK OpenBCISession_s6-side-stepping_stacked_labeled.csv: rows_in=38086, rows_used=38085, rows_out=7617 -> OpenBCISession_s6-side-stepping_stacked_ds25.csv
[54] OK OpenBCISession_s6-wallpushups_stacked_labeled.csv: rows_in=59320, rows_used=59320, rows_out=11864 -> OpenBCISession_s6-wallpushup

Unnamed: 0,file,status
0,OpenBCISession_s1-chair squats_stacked_labeled...,ok
1,OpenBCISession_s1-light stationary cycling_sta...,ok
2,OpenBCISession_s1-marching in place_stacked_la...,ok
3,OpenBCISession_s1-seated boxing hooks_stacked_...,ok
4,OpenBCISession_s1-seated leg extensions_stacke...,ok
5,OpenBCISession_s1-seated medicine ball twists_...,ok
6,OpenBCISession_s1-seated side bends_stacked_la...,ok
7,OpenBCISession_s1-side-stepping_stacked_labele...,ok
8,OpenBCISession_s1-standing heel to toe walk_st...,ok
9,OpenBCISession_s1-wall push-ups_stacked_labele...,ok
