In [2]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import os
import numpy as np
from pathlib import Path


# Root data folder; concatenate() reads <BASE_DIR>/user_<n>/Actigraph.csv and
# <BASE_DIR>/user_<n>/Activity.csv relative to it.
BASE_DIR = Path("DataPaper")


In [None]:
def concatenate(no, min_dominance=0.8, window_size="10s"):
    """Build the windowed, labelled feature table for one user.

    Reads ``Actigraph.csv`` and ``Activity.csv`` from ``BASE_DIR/user_<no>``,
    places every accelerometer row on a synthetic timeline, attaches the
    activity interval that contains it, and aggregates the rows into
    fixed-size time windows.

    Parameters
    ----------
    no : int
        User number; data is read from the ``user_<no>`` sub-folder of
        ``BASE_DIR``.
    min_dominance : float, default 0.8
        Minimum share of *all* rows in a window (unlabelled rows included)
        that must carry the window's most frequent activity for the window
        to be kept.
    window_size : str, default "10s"
        Frequency string handed to ``Series.dt.floor`` to define the windows.

    Returns
    -------
    pandas.DataFrame
        One row per retained window with the columns ``window``,
        ``<feature>_mean`` / ``<feature>_std`` for ``Axis1``, ``Axis2``,
        ``Axis3`` and ``Vector Magnitude``, ``Activity_win``, ``dominance``
        and ``user``.
    """
    user = f"user_{no}"
    user_dir = Path(BASE_DIR) / user
    actigraph = pd.read_csv(user_dir / "Actigraph.csv")
    activity = pd.read_csv(user_dir / "Activity.csv")

    # The files only carry a day counter and a clock time, so both tables are
    # anchored on the same arbitrary date to obtain comparable timestamps.
    base = pd.Timestamp("2020-01-01")
    actigraph["ts"] = (
        base
        + pd.to_timedelta(actigraph["day"] - 1, unit="D")
        + pd.to_timedelta(actigraph["time"])
    )
    # merge_asof needs sorted, non-null keys; rows without a timestamp cannot
    # be placed in any window anyway.
    actigraph = actigraph.dropna(subset=["ts"]).sort_values("ts").reset_index(drop=True)
    actigraph["window"] = actigraph["ts"].dt.floor(window_size)

    # Activity intervals: "Start"/"End" are "HH:MM" strings, hence the ":00".
    day_offset = pd.to_timedelta(activity["Day"] - 1, unit="D")
    activity["start_ts"] = base + day_offset + pd.to_timedelta(activity["Start"] + ":00")
    activity["end_ts"] = base + day_offset + pd.to_timedelta(activity["End"] + ":00")
    # NOTE(review): an interval whose End is earlier than its Start (e.g. one
    # that crosses midnight) never matches any row below — confirm whether
    # such rows exist in the data and how they should be handled.
    intervals = (
        activity.dropna(subset=["start_ts"])[["start_ts", "end_ts", "Activity"]]
        .sort_values("start_ts")
        .reset_index(drop=True)
    )

    # Attach the most recent interval that started at or before each row, then
    # blank the label when the row falls after that interval's end.
    labeled = pd.merge_asof(
        actigraph,
        intervals,
        left_on="ts",
        right_on="start_ts",
        direction="backward",
    )
    inside_interval = labeled["end_ts"].notna() & (labeled["ts"] <= labeled["end_ts"])
    labeled["Activity"] = labeled["Activity"].where(inside_interval)

    # Window-level features: mean and standard deviation of each signal.
    # A window holding a single row gets NaN for its *_std columns, because
    # the sample standard deviation is undefined for one observation.
    feat_cols = ["Axis1", "Axis2", "Axis3", "Vector Magnitude"]
    for col in feat_cols:
        labeled[col] = pd.to_numeric(labeled[col], errors="coerce")
    X_win = labeled.groupby("window")[feat_cols].agg(["mean", "std"])
    X_win.columns = [f"{col}_{stat}" for col, stat in X_win.columns]
    X_win = X_win.reset_index()

    def dominant_label(s):
        """Return (most frequent label, its share of ALL rows in the window)."""
        counts = s.value_counts()  # NaN (unlabelled) rows are not counted
        if counts.empty:
            return (np.nan, 0.0)
        # Divide by len(s), not by the number of labelled rows, so unlabelled
        # rows lower the dominance instead of being ignored.
        return (counts.index[0], float(counts.iloc[0]) / len(s))

    y_win = labeled.groupby("window")["Activity"].apply(dominant_label).reset_index()
    label_parts = pd.DataFrame(
        y_win["Activity"].to_list(),
        index=y_win.index,
        columns=["Activity_win", "dominance"],
    )
    y_win = pd.concat([y_win[["window"]], label_parts], axis=1)

    dataset = X_win.merge(y_win, on="window", how="left")
    dataset = dataset.dropna(subset=["Activity_win"])
    dataset = dataset[dataset["dominance"] >= min_dominance].reset_index(drop=True)
    dataset["user"] = user
    return dataset

# Build the per-user tables one at a time; a user whose files cannot be
# processed is reported and skipped so the remaining users still load.
all_users = []
for i in range(1, 23):
    try:
        user_table = concatenate(i)
    except Exception as e:
        print(f"User {i} failed: {e}")
    else:
        all_users.append(user_table)

# Stack every successfully loaded user into one frame with a fresh index.
dataset_all = pd.concat(all_users, ignore_index=True)

In [None]:
# Quick sanity check of the combined table: size, number of users that
# loaded, and the activity labels that survived the window filtering.
summary_lines = [
    ("\nFinal dataset:", dataset_all.shape),
    ("Users included:", dataset_all["user"].nunique()),
    ("Activities:", sorted(dataset_all["Activity_win"].unique())),
]
for caption, value in summary_lines:
    print(caption, value)

In [None]:
from sklearn.model_selection import train_test_split
# X is what the model sees (the numeric window features); y is what it has to
# output. Bookkeeping columns are excluded from the features.
non_feature_cols = ("window", "Activity_win", "dominance", "user")
feature_cols = [c for c in dataset_all.columns if c not in non_feature_cols]

X, y, users = dataset_all[feature_cols], dataset_all["Activity_win"], dataset_all["user"]

# Split on user ids rather than on rows, so no user contributes windows to
# both the training and the test set.
unique_users = users.unique()
train_users, test_users = train_test_split(
    unique_users,
    test_size=0.25,      # e.g. ~5–6 users for testing
    random_state=42,
)

# Row masks telling whether each window belongs to a train or a test user.
train_mask = users.isin(train_users)
test_mask = users.isin(test_users)

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]


    day      time                  ts              window
0     1  10:10:22 2020-01-01 10:10:22 2020-01-01 10:10:20
1     1  10:10:23 2020-01-01 10:10:23 2020-01-01 10:10:20
2     1  10:10:24 2020-01-01 10:10:24 2020-01-01 10:10:20
3     1  10:10:25 2020-01-01 10:10:25 2020-01-01 10:10:20
4     1  10:10:26 2020-01-01 10:10:26 2020-01-01 10:10:20
5     1  10:10:27 2020-01-01 10:10:27 2020-01-01 10:10:20
6     1  10:10:28 2020-01-01 10:10:28 2020-01-01 10:10:20
7     1  10:10:29 2020-01-01 10:10:29 2020-01-01 10:10:20
8     1  10:10:30 2020-01-01 10:10:30 2020-01-01 10:10:30
9     1  10:10:31 2020-01-01 10:10:31 2020-01-01 10:10:30
10    1  10:10:32 2020-01-01 10:10:32 2020-01-01 10:10:30
11    1  10:11:01 2020-01-01 10:11:01 2020-01-01 10:11:00
12    1  10:11:02 2020-01-01 10:11:02 2020-01-01 10:11:00
13    1  10:11:03 2020-01-01 10:11:03 2020-01-01 10:11:00
14    1  10:11:04 2020-01-01 10:11:04 2020-01-01 10:11:00
Unique Windows: <bound method IndexOpsMixin.nunique of 0       2020-01-0

In [None]:
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# User id of every training row; GroupKFold uses it to keep each user's
# windows inside a single fold.
groups = users[train_mask]
cv = GroupKFold(n_splits=5)

# The forest itself runs single-threaded; parallelism comes from the search.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=1,
)

# Search space sampled by the randomized search.
param_dist = {
    "max_depth": [None, 10, 20, 30],
    "min_samples_leaf": randint(1, 15),
    "max_features": ["sqrt", 0.5, 0.7],
}

search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=12,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    random_state=42,
)
search.fit(X_train, y_train, groups=groups)

                    ts              window  Activity
0  2020-01-01 10:10:22 2020-01-01 10:10:20       2.0
1  2020-01-01 10:10:23 2020-01-01 10:10:20       2.0
2  2020-01-01 10:10:24 2020-01-01 10:10:20       2.0
3  2020-01-01 10:10:25 2020-01-01 10:10:20       2.0
4  2020-01-01 10:10:26 2020-01-01 10:10:20       2.0
5  2020-01-01 10:10:27 2020-01-01 10:10:20       2.0
6  2020-01-01 10:10:28 2020-01-01 10:10:20       2.0
7  2020-01-01 10:10:29 2020-01-01 10:10:20       2.0
8  2020-01-01 10:10:30 2020-01-01 10:10:30       2.0
9  2020-01-01 10:10:31 2020-01-01 10:10:30       2.0
10 2020-01-01 10:10:32 2020-01-01 10:10:30       2.0
11 2020-01-01 10:11:01 2020-01-01 10:11:00       2.0
12 2020-01-01 10:11:02 2020-01-01 10:11:00       2.0
13 2020-01-01 10:11:03 2020-01-01 10:11:00       2.0
14 2020-01-01 10:11:04 2020-01-01 10:11:00       2.0
15 2020-01-01 10:11:05 2020-01-01 10:11:00       2.0
16 2020-01-01 10:11:51 2020-01-01 10:11:50       2.0
17 2020-01-01 10:11:52 2020-01-01 10:11:50    

In [None]:
from sklearn.metrics import classification_report
# Score the best configuration found by the search on the held-out users.
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
test_report = classification_report(y_test, y_pred)
print(test_report)

               window  Axis1_mean  Axis1_std  Axis2_mean  Axis2_std  \
0 2020-01-01 10:10:20   20.375000  34.735480   12.750000  11.548036   
1 2020-01-01 10:10:30   35.666667  25.696952   53.333333  17.243356   
2 2020-01-01 10:11:00   43.400000  33.627370   45.000000  29.197603   
3 2020-01-01 10:11:50   25.555556  25.239409   19.333333  24.402869   
4 2020-01-01 10:12:00   47.800000  33.469057   98.400000  81.124322   

   Axis3_mean   Axis3_std  Vector Magnitude_mean  Vector Magnitude_std  \
0   33.250000   31.349413              45.683750             43.143837   
1   68.333333   76.787586              99.130000             72.752297   
2  120.800000   59.440727             140.454000             63.122547   
3   57.000000   54.703748              71.177778             57.754984   
4  164.500000  125.772502             206.190000            140.164040   

   Activity_win  dominance    user  
0           2.0        1.0  user_1  
1           2.0        1.0  user_1  
2           2.0  

In [None]:

# Inspect the class balance before deciding which labels to discard.
print(dataset_all["Activity_win"].value_counts())

# 1) Drop the ultra-rare classes (kept in one named list so the choice is
#    easy to find and change).
ULTRA_RARE_CLASSES = [4.0, 10.0]
dataset_all2 = dataset_all[~dataset_all["Activity_win"].isin(ULTRA_RARE_CLASSES)].copy()

# 2) Rebuild X, y, users after dropping classes
feature_cols = [c for c in dataset_all2.columns
                if c not in ["window", "Activity_win", "dominance", "user"]]

X = dataset_all2[feature_cols]
y = dataset_all2["Activity_win"]
users = dataset_all2["user"]

# 3) User-independent split (train/test by user)
unique_users = users.unique()
train_users, test_users = train_test_split(
    unique_users, test_size=0.25, random_state=42
)

train_mask = users.isin(train_users)
test_mask  = users.isin(test_users)

X_train, X_test = X[train_mask], X[test_mask]
y_train, y_test = y[train_mask], y[test_mask]
groups = users[train_mask]   # groups aligned to X_train rows

# 4) Base RF for tuning (keep the forest smaller during search)
rf = RandomForestClassifier(
    n_estimators=150,          # tune faster; increase later for final fit if you want
    class_weight="balanced",
    random_state=42,
    n_jobs=1                   # avoid nested parallelism
)

# 5) Group-aware CV and randomized search space
cv = GroupKFold(n_splits=5)

param_dist = {
    "max_depth": [None, 10, 20, 30],
    "min_samples_leaf": randint(1, 15),
    "max_features": ["sqrt", 0.5, 0.7]
}

search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=12,
    cv=cv,
    scoring="f1_macro",
    n_jobs=-1,                 # parallelize over CV/params
    random_state=42,
    verbose=1
)

# 6) Fit search using groups
search.fit(X_train, y_train, groups=groups)

best_model = search.best_estimator_
print("Best params:", search.best_params_)
print("Best CV macro-F1:", search.best_score_)

# 7) Final test evaluation. zero_division=0 reports 0 for classes that are
#    never predicted without emitting a warning per class, matching the
#    evaluation in the merged-classes experiment.
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# How many test windows were assigned to each predicted class, ordered by
# class label.
predicted_counts = pd.Series(y_pred).value_counts()
predicted_counts.sort_index()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GroupKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from scipy.stats import randint

# 1) Work on a copy and fold the rare classes into a single "Other" label.
# NOTE(review): classes 4.0 and 10.0, which the earlier experiment dropped as
# ultra-rare, are neither dropped nor merged here — confirm that is intended.
dataset_all2 = dataset_all.copy()

rare = [1.0, 5.0, 9.0, 11.0, 12.0]   # merge these into Other
OTHER_LABEL = 99.0

is_rare = dataset_all2["Activity_win"].isin(rare)
dataset_all2["Activity_bin"] = dataset_all2["Activity_win"].mask(is_rare, OTHER_LABEL)

print("New label distribution:")
print(dataset_all2["Activity_bin"].value_counts())

# 2) Features are every column that is not a label or bookkeeping column.
excluded_cols = ("window", "Activity_win", "Activity_bin", "dominance", "user")
feature_cols = [c for c in dataset_all2.columns if c not in excluded_cols]

X, y, users = dataset_all2[feature_cols], dataset_all2["Activity_bin"], dataset_all2["user"]

# 3) Split on user ids so train and test never share a user.
unique_users = users.unique()
train_users, test_users = train_test_split(
    unique_users,
    test_size=0.25,
    random_state=42,
)

train_mask = users.isin(train_users)
test_mask = users.isin(test_users)

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]
groups = users[train_mask]  # one user id per training row

# 4) Random forest tuned with a group-aware randomized search.
cv = GroupKFold(n_splits=5)

rf = RandomForestClassifier(
    n_estimators=150,               # keep small during tuning
    class_weight="balanced_subsample",
    random_state=42,
    n_jobs=1,
)

param_dist = {
    "max_depth": [None, 10, 20, 30],
    "min_samples_leaf": randint(1, 15),
    "max_features": ["sqrt", 0.5, 0.7],
}

search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=12,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
search.fit(X_train, y_train, groups=groups)

best_model = search.best_estimator_
print("Best params:", search.best_params_)
print("Best CV macro-F1:", search.best_score_)

# 5) Evaluate on the held-out users.
y_pred = best_model.predict(X_test)

print("\nPredicted distribution:")
predicted_counts = pd.Series(y_pred).value_counts()
print(predicted_counts.sort_index())

print("\nClassification report:")
print(classification_report(y_test, y_pred, zero_division=0))


Macro F1 is perhaps the most important score here: it answers the question, “On average, how well does the model recognise each type of activity, regardless of how frequent it is?” The rarer activities were not being predicted well and were hurting the score, so they were merged into a single "Other" class, labelled "99" here.