In [4]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, kruskal


In [15]:
#Read Data

# Physiological data
physiological_df = pd.read_csv(
    "combined_output.csv",
    low_memory=False
)

# Activity labels 
activity_df = pd.read_excel("updated_activity_labels_2.xlsx")


In [16]:
#Quick sanity check

print(physiological_df.head())
print(activity_df.head())

print(physiological_df.columns)
print(activity_df.columns)


                   timestamp user    ECG Accelerometer_X Accelerometer_Y  \
0  2023-08-09 09:16:42+00:00    1  0.018        -295.104         -11.561   
1  2023-08-09 09:16:43+00:00    1  0.029        -289.318         -12.666   
2  2023-08-09 09:16:44+00:00    1  0.028        -289.527          -8.734   
3  2023-08-09 09:16:45+00:00    1  0.024        -291.045          -8.093   
4  2023-08-09 09:16:46+00:00    1   0.02        -270.918           6.211   

  Accelerometer_Z Marker      HRV  
0        -905.376    0.0  -16.059  
1         -909.46    0.0  350.336  
2        -909.541    0.0  227.084  
3        -908.504    0.0   211.71  
4        -917.384    0.0  122.782  
   sampling_id       date  activity_number         description  intensity  \
0            1 2023-08-09                1     At the computer        0.0   
1            1 2023-08-09                2            Scooting        1.0   
2            1 2023-08-09                3  Walking and stairs        1.0   
3            1 2023

In [20]:
activity_df = activity_df.copy()

# Ensure date is a datetime date
activity_df["date"] = pd.to_datetime(activity_df["date"]).dt.date

# Convert time_of_day to a consistent HH:MM string
# 1) coerce to string
tod = activity_df["time_of_day"].astype(str).str.strip()

# 2) if it's like '11:30:00', keep only HH:MM - this is a cleaned version of the time_of_day column
tod = tod.str.slice(0, 5)

activity_df["start_time"] = pd.to_datetime(
    activity_df["date"].astype(str) + " " + tod,
    format="%Y-%m-%d %H:%M",
    errors="coerce"
)

# Create end_time using minutes column
activity_df["end_time"] = activity_df["start_time"] + pd.to_timedelta(activity_df["minutes"], unit="m") #Creates end_time column by 
#calculating the duration of each activity based on the start_time and the minutes column.

# 1) Force to string and remove obvious junk header rows
physiological_df["timestamp"] = physiological_df["timestamp"].astype(str).str.strip()

# Drop rows where timestamp is literally the header or blank
physiological_df = physiological_df[~physiological_df["timestamp"].isin(["timestamp", "nan", "NaN", ""])]


# 2) Parse with robust settings
physiological_df["timestamp"] = pd.to_datetime(
    physiological_df["timestamp"],
    utc=True,
    errors="coerce",
    format="mixed"   # handles "2023-08-09 09:16:42+00:00" and other ISO variations
).dt.tz_convert(None)

# 3) Drop any rows that still failed parsing
physiological_df = physiological_df.dropna(subset=["timestamp"])


physiological_df["timestamp"] = pd.to_datetime(
    physiological_df["timestamp"],
    utc=True
).dt.tz_convert(None)

print("Physio:", physiological_df["timestamp"].min(), "→", physiological_df["timestamp"].max())
print("Activity:", activity_df["start_time"].min(), "→", activity_df["end_time"].max())


# Sanity check - check that all the correct columns are there
print("Physiological DF columns:")
for c in physiological_df.columns:
    print("  -", c)

print("\nActivity DF columns:")
for c in activity_df.columns:
    print("  -", c)


Physio: 2023-08-09 09:16:42 → 2025-02-05 15:38:01
Activity: 2023-08-09 08:57:00 → 2024-11-12 23:00:00
Physiological DF columns:
  - timestamp
  - user
  - ECG
  - Accelerometer_X
  - Accelerometer_Y
  - Accelerometer_Z
  - Marker
  - HRV

Activity DF columns:
  - sampling_id
  - date
  - activity_number
  - description
  - intensity
  - minutes
  - time_of_day
  - activity_label
  - start_time
  - end_time


In [21]:
set(physiological_df["user"].unique()) & set(activity_df["sampling_id"].unique()) #Check for common users between physiological and activity data


set()

In [None]:
def assign_activity(ts, user_id, activity_df):
    match = activity_df[
        (activity_df["sampling_id"] == user_id) &
        (activity_df["start_time"] <= ts) &
        (activity_df["end_time"] > ts)
    ]
    if len(match) > 0:
        return match.iloc[0]["activity_label"]
    else:
        return np.nan


physiological_df["activity_label"] = physiological_df.apply(
        lambda row: assign_activity(
        row["timestamp"],
        row["user"],
        activity_df
    ),
    axis=1
)

physiological_df["activity_label"].value_counts(dropna=False)
