In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# ---- configuration ----
# Earlier input file, kept for reference:
# FEATURES_CSV = r"C:\Users\lilin\OneDrive\Desktop\Project\EMG\Features\P3_EMG_features.csv"
FEATURES_CSV = r"C:\Users\lilin\OneDrive\Desktop\Project\EMG\Features\P3P4_EMG_butter_low_high.csv"

# ---- read the feature table ----
df = pd.read_csv(FEATURES_CSV)

# Predictors: every column whose name starts with "EMG_MVC_CH" (the earlier
# "EMG_CH" prefix is no longer used), followed by whichever of the three
# condition columns are present in the table.
emg_cols = list(filter(lambda name: name.startswith("EMG_MVC_CH"), df.columns))
cond_cols = [
    c
    for c in ("Lifting Height", "Load Type", "Lifting Depth")
    if c in df.columns
]

# Non-numeric predictors are one-hot encoded; the first level of each is
# dropped (drop_first=True).
X_base = df[emg_cols + cond_cols]
X = pd.get_dummies(X_base, drop_first=True)

# Regression target and the grouping key used for the split.
y = df["Box Weight"]
groups = df["Timeline"]

# ---- single grouped hold-out split ----
# Rows sharing a Timeline value always land on the same side of the split;
# test_size=0.2 is the proportion of groups sent to the test side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

# ---- fit a shallow random forest ----
rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=4,
    min_samples_split=10,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42,
)
rf.fit(X_train, y_train)

# ---- score both partitions ----
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

train_r2, test_r2 = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)
train_mae, test_mae = (
    mean_absolute_error(y_train, y_train_pred),
    mean_absolute_error(y_test, y_test_pred),
)
# RMSE is the square root of the mean squared error.
train_rmse, test_rmse = (
    np.sqrt(mean_squared_error(y_train, y_train_pred)),
    np.sqrt(mean_squared_error(y_test, y_test_pred)),
)

# One line per metric, train before test, in the same order as before.
print(
    "\n".join(
        [
            f"Train R^2  : {train_r2:.3f}",
            f"Test  R^2  : {test_r2:.3f}",
            f"Train MAE  : {train_mae:.3f}",
            f"Test  MAE  : {test_mae:.3f}",
            f"Train RMSE : {train_rmse:.3f}",
            f"Test  RMSE : {test_rmse:.3f}",
        ]
    )
)


Train R^2  : 0.174
Test  R^2  : -0.271
Train MAE  : 2.255
Test  MAE  : 1.926
Train RMSE : 2.739
Test  RMSE : 2.555


In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Per-load-type evaluation: for each load type listed below, fit a random
# forest that predicts "Box Weight" from the EMG_MVC_CH* columns (plus
# "Lifting Height" / "Lifting Depth" when those columns exist) and print
# train/test R^2, MAE and RMSE for a hold-out split grouped by "Timeline".

# full feature table
df = pd.read_csv(r"C:\Users\lilin\OneDrive\Desktop\Project\EMG\Features\P3P4_EMG_butter_low_high.csv")
load_types = ["Horizontal box", "Sand bag", "Vertical box"]

for load in load_types:
    print(f"\n=== Load type: {load} ===")
    # Work on an independent copy of the rows for this load type.
    df_sub = df[df["Load Type"] == load].copy()
    if len(df_sub) < 50:
        print("  Not enough samples, skipping.")
        continue

    # features: all EMG_MVC_* features + optional height/depth
    # ("Load Type" is constant inside the subset, so it is not a feature here)
    emg_cols  = [c for c in df_sub.columns if c.startswith("EMG_MVC_CH")]
    cond_cols = [c for c in ["Lifting Height", "Lifting Depth"] if c in df_sub.columns]

    X_base = df_sub[emg_cols + cond_cols]
    # One-hot encode any non-numeric predictors, dropping the first level.
    X = pd.get_dummies(X_base, drop_first=True)

    y = df_sub["Box Weight"]
    groups = df_sub["Timeline"]

    # GroupShuffleSplit moves whole groups to one side. With a single
    # Timeline, an 80/20 split would leave the training side empty and raise
    # ValueError, which would abort the remaining load types as well.
    if groups.nunique() < 2:
        print("  Fewer than 2 timelines, cannot make a grouped split; skipping.")
        continue

    # grouped split: rows sharing a Timeline value stay on the same side
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(gss.split(X, y, groups))

    # train_idx / test_idx are positional, hence .iloc
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    rf = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=1,
        bootstrap=True,
        random_state=42,
    )
    rf.fit(X_train, y_train)

    y_tr_pred = rf.predict(X_train)
    y_te_pred = rf.predict(X_test)

    train_r2  = r2_score(y_train, y_tr_pred)
    test_r2   = r2_score(y_test, y_te_pred)
    train_mae = mean_absolute_error(y_train, y_tr_pred)
    test_mae  = mean_absolute_error(y_test, y_te_pred)
    # RMSE = sqrt(MSE)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_tr_pred))
    test_rmse  = np.sqrt(mean_squared_error(y_test, y_te_pred))

    print(f"  N = {len(df_sub)}")
    print(f"  Train R^2  : {train_r2:.3f}")
    print(f"  Test  R^2  : {test_r2:.3f}")
    print(f"  Train MAE  : {train_mae:.3f}")
    print(f"  Test  MAE  : {test_mae:.3f}")
    print(f"  Train RMSE : {train_rmse:.3f}")
    print(f"  Test  RMSE : {test_rmse:.3f}")



=== Load type: Horizontal box ===
  N = 14086
  Train R^2  : 0.490
  Test  R^2  : -0.218
  Train MAE  : 1.683
  Test  MAE  : 1.890
  Train RMSE : 2.166
  Test  RMSE : 2.472

=== Load type: Sand bag ===
  N = 10758
  Train R^2  : 0.703
  Test  R^2  : -0.593
  Train MAE  : 1.256
  Test  MAE  : 2.194
  Train RMSE : 1.635
  Test  RMSE : 2.893

=== Load type: Vertical box ===
  N = 14072
  Train R^2  : 0.564
  Test  R^2  : -0.238
  Train MAE  : 1.526
  Test  MAE  : 1.951
  Train RMSE : 1.984
  Test  RMSE : 2.529


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Per-load-type evaluation with a per-weight grouped split: inside each load
# type, every rounded "Box Weight" level is split 80/20 by "Timeline" on its
# own, so each level that can be split contributes rows to both train and
# test. A random forest is then fitted on the combined training rows and
# scored on the combined test rows.

# full feature table
df = pd.read_csv(r"C:\Users\lilin\OneDrive\Desktop\Project\EMG\Features\P3P4_EMG_butter_low_high.csv")

load_types = ["Horizontal box", "Sand bag", "Vertical box"]

for load in load_types:
    print(f"\n=== Load type: {load} ===")
    # Work on an independent copy of the rows for this load type.
    df_sub = df[df["Load Type"] == load].copy()
    if len(df_sub) < 50:
        print("  Not enough samples, skipping.")
        continue

    # features: all EMG_MVC_* features + optional height/depth
    emg_cols  = [c for c in df_sub.columns if c.startswith("EMG_MVC_CH")]
    cond_cols = [c for c in ["Lifting Height", "Lifting Depth"] if c in df_sub.columns]

    X_base = df_sub[emg_cols + cond_cols]
    # One-hot encode any non-numeric predictors, dropping the first level.
    X = pd.get_dummies(X_base, drop_first=True)

    y = df_sub["Box Weight"]
    groups = df_sub["Timeline"]

    # -------- grouped + per-weight train/test split --------
    # Round once; the result is reused for the level list and for every mask.
    weight_level = df_sub["Box Weight"].round(1)
    weights = sorted(weight_level.unique())

    train_idx_all = []
    test_idx_all  = []
    skipped_weights = []  # levels that could not be split (reported below)

    for w in weights:
        mask = weight_level == w
        X_w      = X[mask]
        y_w      = y[mask]
        groups_w = groups[mask]

        # need at least 2 timelines to split this weight
        if groups_w.nunique() < 2:
            skipped_weights.append(w)
            continue

        gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        tr_idx_w, te_idx_w = next(gss.split(X_w, y_w, groups_w))

        # The split returns positions inside the masked subset; map them back
        # to positions inside df_sub so that .iloc can be used further down.
        global_idx = np.where(mask.values)[0]
        train_idx_all.extend(global_idx[tr_idx_w])
        test_idx_all.extend(global_idx[te_idx_w])

    # Rows of skipped levels end up in neither partition; say so explicitly
    # because the "N =" line below still reports the full subset size.
    if skipped_weights:
        print(f"  Weights left out of train and test (fewer than 2 timelines): {skipped_weights}")

    # Explicit integer dtype keeps the arrays usable with .iloc even when
    # they are empty (a bare np.array([]) would be float64).
    train_idx_all = np.array(train_idx_all, dtype=np.intp)
    test_idx_all  = np.array(test_idx_all, dtype=np.intp)

    # Every successful per-weight split yields both a train and a test part,
    # so an empty training index means that no level could be split at all.
    if train_idx_all.size == 0:
        print("  No weight level could be split, skipping.")
        continue

    # Each weight level is split independently, so a Timeline that contains
    # rows of several levels could be sent to train for one level and to test
    # for another. Detect that, because it breaks the grouped hold-out.
    shared_timelines = set(groups.iloc[train_idx_all]) & set(groups.iloc[test_idx_all])
    if shared_timelines:
        print(f"  WARNING: {len(shared_timelines)} timeline(s) appear in both train and test.")

    X_train, X_test = X.iloc[train_idx_all], X.iloc[test_idx_all]
    y_train, y_test = y.iloc[train_idx_all], y.iloc[test_idx_all]

    # -------- random forest regressor --------
    rf = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=1,
        bootstrap=True,
        random_state=42,
    )
    rf.fit(X_train, y_train)

    y_tr_pred = rf.predict(X_train)
    y_te_pred = rf.predict(X_test)

    train_r2  = r2_score(y_train, y_tr_pred)
    test_r2   = r2_score(y_test, y_te_pred)
    train_mae = mean_absolute_error(y_train, y_tr_pred)
    test_mae  = mean_absolute_error(y_test, y_te_pred)
    # RMSE = sqrt(MSE)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_tr_pred))
    test_rmse  = np.sqrt(mean_squared_error(y_test, y_te_pred))

    print(f"  N = {len(df_sub)}")
    print(f"  Train R^2  : {train_r2:.3f}")
    print(f"  Test  R^2  : {test_r2:.3f}")
    print(f"  Train MAE  : {train_mae:.3f}")
    print(f"  Test  MAE  : {test_mae:.3f}")
    print(f"  Train RMSE : {train_rmse:.3f}")
    print(f"  Test  RMSE : {test_rmse:.3f}")

    # optional: check distribution of weights in train/test
    print("  Train weight counts:\n", y_train.round(1).value_counts())
    print("  Test  weight counts:\n", y_test.round(1).value_counts())



=== Load type: Horizontal box ===
  N = 14086
  Train R^2  : 0.454
  Test  R^2  : -0.257
  Train MAE  : 1.615
  Test  MAE  : 2.662
  Train RMSE : 2.123
  Test  RMSE : 3.254
  Train weight counts:
 Box Weight
6.8     3562
9.1     2002
11.3    1980
4.5     1503
2.3     1438
Name: count, dtype: int64
  Test  weight counts:
 Box Weight
6.8     1168
2.3      692
4.5      654
9.1      593
11.3     494
Name: count, dtype: int64

=== Load type: Sand bag ===
  N = 10758
  Train R^2  : 0.670
  Test  R^2  : -0.350
  Train MAE  : 1.265
  Test  MAE  : 2.703
  Train RMSE : 1.642
  Test  RMSE : 3.408
  Train weight counts:
 Box Weight
6.8     2804
9.1     1446
11.3    1425
4.5     1242
2.3     1140
Name: count, dtype: int64
  Test  weight counts:
 Box Weight
6.8     808
4.5     513
9.1     493
2.3     474
11.3    413
Name: count, dtype: int64

=== Load type: Vertical box ===
  N = 14072
  Train R^2  : 0.492
  Test  R^2  : -0.107
  Train MAE  : 1.523
  Test  MAE  : 2.501
  Train RMSE : 2.024
  Test  