In [None]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import os
import numpy as np
from pathlib import Path


BASE_DIR = Path("DataPaper")


In [4]:
def concatenate(no):
    user = f"user_{no}"    
    user_1_actigraph = pd.read_csv(os.path.join(BASE_DIR, user, "Actigraph.csv"))
    user_1_activity = pd.read_csv(os.path.join(BASE_DIR, user, "Activity.csv"))
    user_1_actigraph.head()
    # print(user_1_actigraph.columns)
    # print(user_1_activity.columns)
    # print(user_1_activity.head(10))
    base=pd.Timestamp("2020-01-01")
    us1=user_1_actigraph
    us1["ts"] = base + pd.to_timedelta(us1["day"] - 1, unit="D") + pd.to_timedelta(us1["time"]) #way to make timestamp unit D is days and default is just time
    #creates a 10s window
    us1["window"]=us1["ts"].dt.floor("10s")

    # print(us1[["day","time","ts","window"]].head(15))
    # print("Unique Windows:",us1["window"].nunique)
    # print(us1.groupby("window").size().describe())
    # Ensure it's sorted by time
    us1 = us1.sort_values("ts").reset_index(drop=True) #sorts and drops the old index to make it non sequential
    us1_1=user_1_activity
    # Build start/end timestamps for intervals
    us1_1["start_ts"] = base + pd.to_timedelta(us1_1["Day"] - 1, unit="D") + pd.to_timedelta(us1_1["Start"] + ":00")
    us1_1["end_ts"]   = base + pd.to_timedelta(us1_1["Day"] - 1, unit="D") + pd.to_timedelta(us1_1["End"] + ":00")

    act1 = us1_1.sort_values("start_ts").reset_index(drop=True)
    # Attach the most recent interval start <= ts, then drop if ts is after end_ts
    us1_labeled = pd.merge_asof(
        us1,
        act1[["start_ts", "end_ts", "Activity"]],
        left_on="ts",
        right_on="start_ts",
        direction="backward"
    )
    us1_labeled["Activity"] = np.where(
        us1_labeled["end_ts"].notna() & (us1_labeled["ts"] <= us1_labeled["end_ts"]),  #if condition met keep Activity if not becomes NaN
        us1_labeled["Activity"],
        np.nan
    )
    MIN_DOM=0.8 #A 10-second window is kept only if at least 80% of its accelerometer rows correspond to the same activity.
    # --- 1) ensure numeric accelerometer columns ---
    feat_cols = ["Axis1", "Axis2", "Axis3", "Vector Magnitude"]
    for c in feat_cols:
        us1_labeled[c]=pd.to_numeric(us1_labeled[c],errors="coerce")
    # --- 2) compute window-level features: mean + std ---
    X_win = us1_labeled.groupby("window")[feat_cols].agg(["mean", "std"])
    X_win.columns = [f"{a}_{b}" for a, b in X_win.columns]
    X_win = X_win.reset_index()


    def dominant_label(s): #s is a column with activity
        s=s.dropna()
        if s.empty:
            return (np.nan,0.0) #return empty list with a dominance of 0
        vc = s.value_counts(normalize=True) #proportions if there a different activity durations
        return (vc.index[0],float(vc.iloc[0]))

    y_win = us1_labeled.groupby("window")["Activity"].apply(dominant_label).reset_index()  #apply the dominant labels
    # print(y_win)
    y_win[["Activity_win", "dominance"]]=pd.DataFrame(y_win["Activity"].to_list(),index=y_win.index)
    y_win = y_win.drop(columns=["Activity"])
    dataset_user1=X_win.merge(y_win,on="window",how="left") #y_win = y_win.drop(columns=["Activity"])

    dataset_user1 = dataset_user1.dropna(subset=["Activity_win"])  
    dataset_user1 = dataset_user1[dataset_user1["dominance"] >= MIN_DOM].reset_index(drop=True) #drops the ones that do not have dominance
    dataset_user1["user"] = user
    return dataset_user1

all_users = []
for i in range(1, 23):
    try:
        all_users.append(concatenate(i))
    except Exception as e:
        print(f"User {i} failed: {e}")

dataset_all = pd.concat(all_users, ignore_index=True)

In [5]:
print("\nFinal dataset:", dataset_all.shape)
print("Users included:", dataset_all["user"].nunique())
print("Activities:", sorted(dataset_all["Activity_win"].unique()))


Final dataset: (111410, 12)
Users included: 22
Activities: [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0)]


In [6]:
from sklearn.model_selection import train_test_split
feature_cols = [c for c in dataset_all.columns 
                if c not in ["window", "Activity_win", "dominance", "user"]]  #X is what the model sees all the numbers and y is what the model needs to output

X = dataset_all[feature_cols]
y = dataset_all["Activity_win"]
users = dataset_all["user"]
unique_users = users.unique()

train_users, test_users = train_test_split(
    unique_users,
    test_size=0.25,      # e.g. ~5–6 users for testing
    random_state=42
)

train_mask = users.isin(train_users)  #checks if belongs to train or test
test_mask  = users.isin(test_users)

X_train, X_test = X[train_mask], X[test_mask]
y_train, y_test = y[train_mask], y[test_mask]


In [7]:
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

groups=users[train_mask]
rf=RandomForestClassifier(n_estimators=100,random_state=42,n_jobs=1)

cv = GroupKFold(n_splits=5)
param_dist = {
    "max_depth": [None, 10, 20, 30],
    "min_samples_leaf": randint(1, 15),
    "max_features": ["sqrt", 0.5, 0.7]
}

search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=12,
    cv=cv,
    scoring="f1_macro",
    n_jobs=-1,
    random_state=42
)
search.fit(X_train, y_train, groups=groups)

In [8]:
from sklearn.metrics import classification_report
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.58      0.94      0.72      8672
         1.0       0.00      0.00      0.00       556
         2.0       0.19      0.13      0.16      3719
         3.0       0.28      0.81      0.41      4059
         4.0       0.00      0.00      0.00       158
         5.0       0.53      0.15      0.24       509
         6.0       0.13      0.01      0.02      2685
         7.0       0.38      0.07      0.12      6101
         8.0       0.38      0.00      0.00      2746
         9.0       0.00      0.00      0.00       255
        10.0       0.00      0.00      0.00       302
        12.0       0.00      0.00      0.00       330

    accuracy                           0.42     30092
   macro avg       0.21      0.18      0.14     30092
weighted avg       0.36      0.42      0.31     30092



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:

print(dataset_all["Activity_win"].value_counts())

#Drop ultra-rare classes 
dataset_all2 = dataset_all[~dataset_all["Activity_win"].isin([4.0, 10.0])].copy()

#Rebuild X, y, users after dropping classes
feature_cols = [c for c in dataset_all2.columns
                if c not in ["window", "Activity_win", "dominance", "user"]]

X = dataset_all2[feature_cols]
y = dataset_all2["Activity_win"]
users = dataset_all2["user"]

#User-independent split (train/test by user)
unique_users = users.unique()
train_users, test_users = train_test_split(
    unique_users, test_size=0.25, random_state=42
)

train_mask = users.isin(train_users)
test_mask  = users.isin(test_users)

X_train, X_test = X[train_mask], X[test_mask]
y_train, y_test = y[train_mask], y[test_mask]
groups = users[train_mask]   # groups aligned to X_train rows

# Base RF for tuning (keep trees smaller during search)
rf = RandomForestClassifier(
    n_estimators=150,          # tune faster; increase later for final fit if you want
    class_weight="balanced",
    random_state=42,
    n_jobs=1                   # avoid nested parallelism
)

# 6) Group-aware CV and randomized search space
cv = GroupKFold(n_splits=5)

param_dist = {
    "max_depth": [None, 10, 20, 30],
    "min_samples_leaf": randint(1, 15),
    "max_features": ["sqrt", 0.5, 0.7]
}

search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=12,
    cv=cv,
    scoring="f1_macro",
    n_jobs=-1,                 # parallelize over CV/params
    random_state=42,
    verbose=1
)

#Fit search using groups 
search.fit(X_train, y_train, groups=groups)

best_model = search.best_estimator_
print("Best params:", search.best_params_)
print("Best CV macro-F1:", search.best_score_)

# Final test evaluation
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

Activity_win
0.0     29860
3.0     24771
2.0     16439
7.0     15499
6.0      9930
8.0      5603
5.0      2749
1.0      1781
9.0      1695
12.0     1235
11.0      937
4.0       518
10.0      393
Name: count, dtype: int64
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best params: {'max_depth': 20, 'max_features': 0.5, 'min_samples_leaf': 3}
Best CV macro-F1: 0.15618003330879882
              precision    recall  f1-score   support

         0.0       0.58      0.94      0.72      8672
         1.0       0.01      0.00      0.00       556
         2.0       0.22      0.13      0.16      3719
         3.0       0.35      0.65      0.46      4059
         5.0       0.18      0.19      0.19       509
         6.0       0.13      0.07      0.09      2685
         7.0       0.35      0.12      0.18      6101
         8.0       0.17      0.05      0.07      2746
         9.0       0.01      0.02      0.01       255
        11.0       0.00      0.00      0.00         0
        12

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
pd.Series(y_pred).value_counts().sort_index()

0.0     14039
1.0       145
2.0      2208
3.0      7445
5.0       539
6.0      1542
7.0      2105
8.0       748
9.0       421
11.0      284
12.0      156
Name: count, dtype: int64

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GroupKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from scipy.stats import randint

# 1) Copy dataset and define "rare" classes to merge
dataset_all2 = dataset_all.copy()

rare = [1.0, 5.0, 9.0, 11.0, 12.0]   # merge these into Other
OTHER_LABEL = 99.0

dataset_all2["Activity_bin"] = dataset_all2["Activity_win"].where(
    ~dataset_all2["Activity_win"].isin(rare),
    other=OTHER_LABEL
)

print("New label distribution:")
print(dataset_all2["Activity_bin"].value_counts())

# 2) Build X, y, users
feature_cols = [c for c in dataset_all2.columns
                if c not in ["window", "Activity_win", "Activity_bin", "dominance", "user"]]

X = dataset_all2[feature_cols]
y = dataset_all2["Activity_bin"]
users = dataset_all2["user"]

# 3) User-independent split by user
unique_users = users.unique()
train_users, test_users = train_test_split(unique_users, test_size=0.25, random_state=42)

train_mask = users.isin(train_users)
test_mask  = users.isin(test_users)

X_train, X_test = X[train_mask], X[test_mask]
y_train, y_test = y[train_mask], y[test_mask]
groups = users[train_mask]  # groups aligned with X_train

# 4) RF + group-aware CV + randomized tuning
rf = RandomForestClassifier(
    n_estimators=150,               # keep small during tuning
    class_weight="balanced_subsample",
    random_state=42,
    n_jobs=1
)

cv = GroupKFold(n_splits=5)

param_dist = {
    "max_depth": [None, 10, 20, 30],
    "min_samples_leaf": randint(1, 15),
    "max_features": ["sqrt", 0.5, 0.7]
}

search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=12,
    cv=cv,
    scoring="f1_macro",
    n_jobs=-1,
    random_state=42,
    verbose=1
)

search.fit(X_train, y_train, groups=groups)

best_model = search.best_estimator_
print("Best params:", search.best_params_)
print("Best CV macro-F1:", search.best_score_)

# 5) Test evaluation
y_pred = best_model.predict(X_test)

print("\nPredicted distribution:")
print(pd.Series(y_pred).value_counts().sort_index())

print("\nClassification report:")
print(classification_report(y_test, y_pred, zero_division=0))


New label distribution:
Activity_bin
0.0     29860
3.0     24771
2.0     16439
7.0     15499
6.0      9930
99.0     8397
8.0      5603
4.0       518
10.0      393
Name: count, dtype: int64
Fitting 5 folds for each of 12 candidates, totalling 60 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best params: {'max_depth': 20, 'max_features': 0.7, 'min_samples_leaf': 11}
Best CV macro-F1: 0.1861761660039662

Predicted distribution:
0.0     14136
2.0      2037
3.0      6453
4.0       225
6.0      2038
7.0      2454
8.0      1246
10.0       71
99.0     1432
Name: count, dtype: int64

Classification report:
              precision    recall  f1-score   support

         0.0       0.58      0.94      0.72      8672
         2.0       0.21      0.12      0.15      3719
         3.0       0.35      0.56      0.43      4059
         4.0       0.03      0.04      0.03       158
         6.0       0.14      0.10      0.12      2685
         7.0       0.36      0.14      0.21      6101
         8.0       0.17      0.08      0.11      2746
        10.0       0.00      0.00      0.00       302
        99.0       0.13      0.11      0.12      1650

    accuracy                           0.41     30092
   macro avg       0.22      0.23      0.21     30092
weighted avg       0.35      0.41   

Macro F1 score perhaps the most important recognizes ,“On average, how well does the model recognise each type of activity, regardless of how frequent it is?”. Had to merge as lower activities were not getting predicted well and hurting the score, merged into another called "99" here.