In [1]:
from pathlib import Path
PATH1 = Path("EEG-HAR/eeg-har-downsampled")
PATH2 = Path("/data0/HAR-datasets/PLHI Data for All Trial Subjects-20250212T053126Z-001/PLHI Data for All Trial Subjects")

In [4]:
import pandas as pd
from pathlib import Path
import re

DOWNSAMPLED_DIR = Path("eeg-har-downsampled") 

def load_downsampled_summary(downsampled_dir: Path) -> pd.DataFrame:
    """
    Create a summary dataframe from the downsampled files.
    Files are named: s{subject}_{activity}_stacked_ds25.csv
    """
    if not downsampled_dir.exists():
        print(f"ERROR: Directory does not exist: {downsampled_dir}")
        print(f"Absolute path: {downsampled_dir.absolute()}")
        return pd.DataFrame()
    
    csv_files = sorted(downsampled_dir.glob("s*_*_stacked_ds25.csv"))
    print(f"Found {len(csv_files)} CSV files")
    
    if len(csv_files) == 0:
        print(f"No matching files found in {downsampled_dir}")
        return pd.DataFrame()
    
    rows = []
    pat = re.compile(r"s(\d+)_(.+)_stacked_ds25\.csv")
    
    for csv_file in csv_files:
        m = pat.match(csv_file.name)
        if not m:
            print(f"Skipping file (doesn't match pattern): {csv_file.name}")
            continue
            
        subj = int(m.group(1))
        activity = m.group(2).replace("_", " ").title()
        
        rows.append({
            "subject": subj,
            "activity": activity,
            "n_files": 1,
            "rows": len(pd.read_csv(csv_file, header=None)),
            "output": str(csv_file)
        })
    
    return pd.DataFrame(rows)

# Create summary from downsampled files
summary = load_downsampled_summary(DOWNSAMPLED_DIR)
display(summary)

Found 54 CSV files


  "rows": len(pd.read_csv(csv_file, header=None)),
  "rows": len(pd.read_csv(csv_file, header=None)),
  "rows": len(pd.read_csv(csv_file, header=None)),


Unnamed: 0,subject,activity,n_files,rows,output
0,1,Chair Squats,1,11404,eeg-har-downsampled/s1_chair_squats_stacked_ds...
1,1,Light Stationary Cycling,1,11437,eeg-har-downsampled/s1_light_stationary_cyclin...
2,1,Marching In Place,1,10248,eeg-har-downsampled/s1_marching_in_place_stack...
3,1,Seated Boxing Hooks,1,11627,eeg-har-downsampled/s1_seated_boxing_hooks_sta...
4,1,Seated Leg Extensions,1,15516,eeg-har-downsampled/s1_seated_leg_extensions_s...
5,1,Seated Medicine Ball Twists,1,20832,eeg-har-downsampled/s1_seated_medicine_ball_tw...
6,1,Seated Side Bends,1,21073,eeg-har-downsampled/s1_seated_side_bends_stack...
7,1,Side Stepping,1,11121,eeg-har-downsampled/s1_side_stepping_stacked_d...
8,1,Standing Heel To Toe Walk,1,19676,eeg-har-downsampled/s1_standing_heel_to_toe_wa...
9,1,Wall Push Ups,1,14677,eeg-har-downsampled/s1_wall_push_ups_stacked_d...


In [5]:
test_file = Path("eeg-har-downsampled/s1_chair_squats_stacked_ds25.csv")
df_test = pd.read_csv(test_file, header=None)
print(f"Shape: {df_test.shape}")
print(f"\nColumn 30 (first 5 values):")
print(df_test.iloc[:5, 30])

Shape: (11404, 32)

Column 30 (first 5 values):
0              timestamp
1    2025-02-08 15:27:56
2    2025-02-08 15:27:56
3    2025-02-08 15:27:56
4    2025-02-08 15:27:56
Name: 30, dtype: object


In [6]:
import re, pandas as pd
from pathlib import Path

HAR_ROOT = PATH2
OUT_DIR = Path.cwd() / "EEG_clipped_final_ds25"  
OUT_DIR.mkdir(parents=True, exist_ok=True)

activity_order = [
    "Seated Leg Extensions","Marching In Place","Wall Push-ups","Seated Boxing Hooks",  
    "Standing Heel-to-Toe Walk","Side-Stepping","Seated Side Bends",
    "Seated Medicine Ball Twists","Chair Squats","Light Stationary Cycling"
]
def norm_name(s): return re.sub(r"[\s_\-]+","", str(s).strip().lower())
name2num = {norm_name(n): i+1 for i,n in enumerate(activity_order)}

PAD_OVERRIDES = { 1: { 1: 90, 2: 90 } }
DEFAULT_PAD_S = 0
def pad_seconds(subj:int, act_num:int): 
    return PAD_OVERRIDES.get(subj, {}).get(act_num, DEFAULT_PAD_S)

def har_window_acg_dominant_day(subj:int, act_num:int):
    p = Path(HAR_ROOT) / f"Subject {subj}" / f"Activity {act_num}" / "WEAR_ACG.csv"
    acg = pd.read_csv(p, sep=";")
    ms = pd.to_numeric(acg["t_unix"], errors="coerce")
    dt = pd.to_datetime(ms, unit="ms", errors="coerce").dropna()
    if dt.empty: return None, None, None
    day = dt.dt.floor("D")
    dom_day = day.value_counts().idxmax()
    dt = dt[day == dom_day]
    if dt.empty: return None, None, None
    return dt.min(), dt.max(), dom_day

def clip_one(eeg_path: Path, subj:int, act_num:int, act_label:str):
    t0, t1, har_day = har_window_acg_dominant_day(subj, act_num)
    if t0 is None: 
        return None, dict(status="skip", reason="bad_acg_window", original_rows=0, clipped_rows=0)


    eeg = pd.read_csv(eeg_path, header=0)
    original_rows = len(eeg)

  
    et = pd.to_datetime(eeg['timestamp'], errors="coerce")
    estart_all, eend_all = et.min(), et.max()
    if pd.isna(estart_all):
        return None, dict(status="skip", reason="bad_eeg_time", original_rows=original_rows, clipped_rows=0)

    if estart_all.normalize() != har_day:
        return None, dict(status="no_output", reason="different_day", har_start=t0, har_end=t1,
                          eeg_start_all=estart_all, eeg_end_all=eend_all, original_rows=original_rows, clipped_rows=0)

    pad = pd.Timedelta(seconds=pad_seconds(subj, act_num))
    mask = ((et >= (t0 - pad)) & (et <= (t1 + pad))).fillna(False)
    clipped_rows = int(mask.sum())
    if clipped_rows == 0:
        return None, dict(status="no_output", reason="no_overlap", har_start=t0, har_end=t1,
                          eeg_start_all=estart_all, eeg_end_all=eend_all, original_rows=original_rows, clipped_rows=0)

    clipped = eeg.loc[mask].reset_index(drop=True)
    out_path = OUT_DIR / f"subj{subj}_act{act_num}_{norm_name(act_label)}_clipped_ds25.csv"  
  
    clipped.to_csv(out_path, index=False)

    es, ee = et.loc[mask].min(), et.loc[mask].max()
    return out_path, dict(status="ok", reason="", har_start=t0, har_end=t1,
                          eeg_start_all=estart_all, eeg_end_all=eend_all, eeg_start_in=es, eeg_end_in=ee,
                          original_rows=original_rows, clipped_rows=clipped_rows)

def clip_all(summary_df: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for r in summary_df.itertuples(index=False):
        subj = int(r.subject)
        act_raw = str(r.activity)
        eeg_path = Path(r.output)
        act_num = name2num.get(norm_name(act_raw))

        if act_num is None:
            rows.append(dict(subject=subj, har_activity_num=None, eeg_activity=act_raw,
                             status="skip", reason="name_not_in_mapping", output=None, original_rows=0, clipped_rows=0))
            continue
        if not eeg_path.exists():
            rows.append(dict(subject=subj, har_activity_num=act_num, eeg_activity=act_raw,
                             status="skip", reason="missing_eeg_file", output=None, original_rows=0, clipped_rows=0))
            continue

        out_path, info = clip_one(eeg_path, subj, act_num, act_raw)
        rows.append(dict(subject=subj, har_activity_num=act_num, eeg_activity=act_raw,
                         output=str(out_path) if out_path else None, **info))
    return pd.DataFrame(rows).sort_values(["subject","har_activity_num"], na_position="last").reset_index(drop=True)

def calculate_statistics(results_df):
    total_original = results_df['original_rows'].sum()
    total_clipped = results_df['clipped_rows'].sum()
    total_lost = total_original - total_clipped
    ok = results_df[results_df['status']=="ok"]
    
    print(f"Original rows: {total_original:,}")
    print(f"Clipped rows: {total_clipped:,}")
    print(f"Row loss: {(total_lost/total_original)*100:.1f}%")
    print(f"Success rate: {len(ok)}/{len(results_df)} ({len(ok)/len(results_df)*100:.1f}%)")

results = clip_all(summary)
calculate_statistics(results)
display(results[['subject','har_activity_num','eeg_activity','status','original_rows','clipped_rows']])

Original rows: 615,943
Clipped rows: 472,377
Row loss: 23.3%
Success rate: 44/54 (81.5%)


Unnamed: 0,subject,har_activity_num,eeg_activity,status,original_rows,clipped_rows
0,1,1,Seated Leg Extensions,ok,15515,1668
1,1,2,Marching In Place,ok,10247,292
2,1,3,Wall Push Ups,no_output,14676,0
3,1,4,Seated Boxing Hooks,no_output,11626,0
4,1,5,Standing Heel To Toe Walk,no_output,19675,0
5,1,6,Side Stepping,no_output,11120,0
6,1,7,Seated Side Bends,ok,21072,21072
7,1,8,Seated Medicine Ball Twists,ok,20831,20413
8,1,9,Chair Squats,ok,11403,11391
9,1,10,Light Stationary Cycling,ok,11436,11408


In [7]:
display(results[results['status'] != 'ok'])

Unnamed: 0,subject,har_activity_num,eeg_activity,output,status,reason,har_start,har_end,eeg_start_all,eeg_end_all,eeg_start_in,eeg_end_in,original_rows,clipped_rows
2,1,3,Wall Push Ups,,no_output,different_day,2025-02-12 04:08:56.711,2025-02-12 04:13:30.529,2025-02-08 14:49:17.000000000,2025-02-08 14:53:48.000000000,NaT,NaT,14676,0
3,1,4,Seated Boxing Hooks,,no_output,no_overlap,2025-02-08 14:56:00.622,2025-02-08 15:01:58.852,2025-02-08 20:18:45.000000000,2025-02-08 20:23:27.000000000,NaT,NaT,11626,0
4,1,5,Standing Heel To Toe Walk,,no_output,different_day,2025-02-12 04:19:40.417,2025-02-12 04:24:16.706,2025-02-08 14:56:03.000000000,2025-02-08 15:01:57.000000000,NaT,NaT,19675,0
5,1,6,Side Stepping,,no_output,different_day,2025-02-12 04:30:59.445,2025-02-12 04:35:33.819,2025-02-08 15:04:15.000000000,2025-02-08 15:08:42.000000000,NaT,NaT,11120,0
14,2,5,Standing Heel To Toe Walk,,no_output,different_day,2025-02-12 04:54:29.000,2025-02-12 04:59:04.565,2025-02-08 16:29:01.000000000,2025-02-08 16:33:37.000000000,NaT,NaT,11300,0
15,2,6,Side Stepping,,no_output,different_day,2025-02-08 16:36:23.840,2025-02-08 16:41:06.633,1970-01-01 00:00:01.739032584,1970-01-01 00:00:01.739032935,NaT,NaT,11869,0
31,4,2,Marching In Place,,no_output,different_day,2025-02-12 04:39:19.209,2025-02-12 04:43:54.553,2025-02-08 20:08:16.000000000,2025-02-08 20:11:44.000000000,NaT,NaT,7731,0
41,5,3,Wall Push Ups,,no_output,no_overlap,2025-02-08 21:20:44.226,2025-02-08 21:22:46.549,2025-02-08 21:14:50.000000000,2025-02-08 21:19:26.000000000,NaT,NaT,11478,0
49,6,6,Side Stepping,,no_output,different_day,2025-02-12 05:07:25.207,2025-02-12 05:12:01.430,2025-02-08 22:11:04.000000000,2025-02-08 22:14:40.000000000,NaT,NaT,7618,0
51,6,8,Seated Medicine Ball Twists,,no_output,different_day,2025-02-12 04:46:43.634,2025-02-12 04:51:20.766,2025-02-08 22:21:06.000000000,2025-02-08 22:25:44.000000000,NaT,NaT,11559,0
