## Setup

In [None]:
import pandas as pd
import numpy as np
import warnings

In [None]:
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this presumably also hides pandas chained-assignment and deprecation
# warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

## Reading Files

Files needed:

- `log` - do_log_final.csv
- `act24_event` - ACT24_directobservationdata_event.csv

In [None]:
# Session log: one row per participant (`id`) and direct-observation session (`do`),
# holding the session's start date, start time and stop time.
log = pd.read_csv(filepath_or_buffer="do_log_final.csv")
log

Unnamed: 0,id,do,start_month,start_day,start_year,start_time,stop_time,duration_hrs,duation_min
0,102,1,7,24,2019,8:20:19 AM,11:21:13 AM,3:00:54,180:54:00
1,102,2,7,25,2019,12:42:08 PM,3:42:00 PM,2:59:52,179:52:00
2,116,1,8,20,2019,8:08:45 AM,11:09:00 AM,3:00:15,180:15:00
3,116,2,8,21,2019,11:09:47 AM,1:28:00 PM,2:18:13,138:13:00
4,117,1,8,20,2019,3:17:46 PM,6:18:00 PM,3:00:14,180:14:00
5,117,2,8,21,2019,7:11:45 AM,10:13:00 AM,3:01:15,181:15:00
6,122,1,8,28,2019,10:06:41 AM,1:07:00 PM,3:00:19,180:19:00
7,122,2,8,29,2019,3:19:51 PM,6:23:00 PM,3:03:09,183:09:00
8,124,1,8,28,2019,2:59:09 PM,5:59:05 PM,2:59:56,179:56:00
9,124,2,8,29,2019,9:31:00 AM,12:31:00 PM,3:00:00,180:00:00


In [None]:
# Direct-observation event export: one row per coded event (state start / stop / point),
# identified by the `Observation` label and timed relative to the start of the observation.
act24_event = pd.read_csv(filepath_or_buffer="ACT24_directobservationdata_event.csv")
act24_event

Unnamed: 0,Date_Time_Absolute_dmy_hmsf,Date_dmy,Time_Absolute_hms,Time_Absolute_f,Time_Relative_hmsf,Time_Relative_hms,Time_Relative_f,Time_Relative_sf,Duration_sf,Observation,Event_Log,Behavior,Modifier_1,Modifier_2,Modifier_3,Event_Type,Comment,Unnamed: 17,Unnamed: 18
0,24-03-2020 11:17:41.259,24-03-2020,11:17:41,259,##############################################...,0:00:00,1,0.00137,42.59830,ID_102_01_C,Event log,LA- stand,,,,State start,,,
1,24-03-2020 11:17:41.259,24-03-2020,11:17:41,259,##############################################...,0:00:00,1,0.00137,0.00000,ID_102_01_C,Event log,,,,,State point,,,
2,24-03-2020 11:17:41.259,24-03-2020,11:17:41,259,##############################################...,0:00:00,1,0.00137,0.00000,ID_102_01_C,Event log,,,,,State point,,,
3,24-03-2020 11:17:41.259,24-03-2020,11:17:41,259,##############################################...,0:00:00,1,0.00137,53.71450,ID_102_01_C,Event log,WRK- general,,,SP- Education and Health Services,State start,,,
4,24-03-2020 11:18:23.857,24-03-2020,11:18:23,857,00:42.6,0:00:42,597,42.59690,0.00000,ID_102_01_C,Event log,LA- stand,,,,State stop,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16125,20-05-2020 21:24:36.484,20-05-2020,21:24:36,484,02:09.6,0:02:09,594,129.59400,5.42198,ID_154_02_R,Event log,LA- stand and move,,light,,State start,,,
16126,20-05-2020 21:24:41.906,20-05-2020,21:24:41,906,02:15.0,0:02:15,16,135.01600,0.00000,ID_154_02_R,Event log,LA- stand and move,,light,,State stop,,,
16127,20-05-2020 21:24:41.906,20-05-2020,21:24:41,906,02:15.0,0:02:15,16,135.01600,10065.00000,ID_154_02_R,Event log,SB-sitting,,,,State start,,,
16128,21-05-2020 00:12:26.890,21-05-2020,0:12:26,890,50:00.0,2:50:00,0,10200.00000,0.00000,ID_154_02_R,Event log,"EDU- taking class, research, homework",,,,State stop,,,


## Data Wrangling

**GOAL:** Get a second-by-second ground truth file that is ready for mapping.

**Step 1:** Get the `id` and `do` columns by parsing `Observation`. Filter to only "State start" rows. The assumption here is that once a state has started, it does not change until the next state starts. Therefore, "State stop" and "State point" rows are ignored.

In [None]:
# Keep only the rows that open a state and drop the first four columns
# (the absolute date/time fields), which are not used downstream.
# .copy() yields an independent frame so the assignments below do not write to a slice.
is_state_start = act24_event["Event_Type"] == "State start"
act24 = act24_event.loc[is_state_start].iloc[:, 4:].copy()
# Observation is formatted like "ID_<id>_<do>_<suffix>": pull out the id and session number
observation_parts = act24["Observation"].str.split("_")
act24["id"] = observation_parts.str[1].astype(int)
act24["do"] = observation_parts.str[2].astype(int)
# Display sorted for review. Sort on the numeric relative time in seconds:
# Time_Relative_hmsf is text (minutes:seconds only, sometimes "####"), so sorting
# on it is lexicographic and ignores the hour.
act24.sort_values(["id", "do", "Time_Relative_sf"])

Unnamed: 0,Time_Relative_hmsf,Time_Relative_hms,Time_Relative_f,Time_Relative_sf,Duration_sf,Observation,Event_Log,Behavior,Modifier_1,Modifier_2,Modifier_3,Event_Type,Comment,Unnamed: 17,Unnamed: 18,id,do
0,##############################################...,0:00:00,1,0.00137,42.59830,ID_102_01_C,Event log,LA- stand,,,,State start,,,,102,1
3,##############################################...,0:00:00,1,0.00137,53.71450,ID_102_01_C,Event log,WRK- general,,,SP- Education and Health Services,State start,,,,102,1
5,00:42.6,0:00:42,597,42.59690,17.31600,ID_102_01_C,Event log,WA- walk,,moderate,,State start,,,,102,1
169,00:52.4,3:00:52,448,10852.40000,1.66215,ID_102_01_C,Event log,OTHER- non codable,,,,State start,*did not capture the final time change; 13 sec...,,,102,1
171,00:52.4,3:00:52,449,10852.40000,1.66137,ID_102_01_C,Event log,private/not coded,,,,State start,,,,102,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16119,00:49.3,0:00:49,298,49.29830,9.20902,ID_154_02_R,Event log,WA-walk with load,,moderate,,State start,,,,154,2
16121,00:58.5,0:00:58,507,58.50730,37.72030,ID_154_02_R,Event log,WA- ascend stairs,,vigorous,,State start,,,,154,2
16123,01:36.2,0:01:36,228,96.22750,33.36600,ID_154_02_R,Event log,WA-walk with load,,moderate,,State start,,,,154,2
16125,02:09.6,0:02:09,594,129.59400,5.42198,ID_154_02_R,Event log,LA- stand and move,,light,,State start,,,,154,2


**Step 2:** Merge date info from `log` with time and ground truth coding from `act24`. Use this to calculate `date` and `date_time` columns.

**NOTE: Need information from `Time_Relative_hmsf` to deal with duplicate postures**

* Used the sub-second offset of duplicate postures to determine which posture to keep for that second
* Changed the `date_time` column back to hms format so we are not getting a millisecond-by-millisecond file

In [None]:
def fix_milli_errs(row):
    """Replace an unreadable ``Time_Relative_hmsf`` value with a zero offset.

    Some cells of the export contain a run of ``#`` characters instead of a
    time. Those are mapped to ``'00:00.0'``; every other string is returned
    unchanged.

    Parameters
    ----------
    row : str or missing value
        A single ``Time_Relative_hmsf`` cell.

    Returns
    -------
    str or missing value
        ``'00:00.0'`` when the text contains ``#``; otherwise the input itself.
        Non-string input (NaN, None) is passed through untouched instead of
        raising ``TypeError`` on the ``in`` test.
    """
    if not isinstance(row, str):
        return row
    if "#" in row:
        return '00:00.0'
    return row

In [None]:
# New frame identical to act24 except that "#"-filled relative times become '00:00.0'
cleaned_hmsf = act24["Time_Relative_hmsf"].apply(fix_milli_errs)
act24_2 = act24.assign(Time_Relative_hmsf=cleaned_hmsf)

In [None]:
# Join log and act24 data on id, do (left join: every log session is kept)
event_cols = ["id", "do", "Time_Relative_hms", "Time_Relative_hmsf",
              "Behavior", "Modifier_1", "Modifier_2", "Modifier_3"]
sbs_1 = pd.merge(left=log, right=act24_2[event_cols], how="left", on=["id", "do"])

# Build a "month/day/year time" string from the log fields and parse it; unparseable values become NaT
start_text = (sbs_1["start_month"].astype(str) + "/" + sbs_1["start_day"].astype(str) + "/"
              + sbs_1["start_year"].astype(str) + " " + sbs_1["start_time"])
sbs_1["start"] = pd.to_datetime(start_text, errors='coerce')

# Convert relative time to timedelta, which measures change in time from the starting point.
# Missing or unparseable values are treated as a relative time of 00:00:00.
no_offset = pd.to_timedelta("00:00:00")
# Time_Relative_hmsf carries minutes:seconds.tenths only, hence the "00:" hour prefix
sbs_1["time_rel_ms"] = pd.to_timedelta("00:" + sbs_1["Time_Relative_hmsf"], errors='coerce').fillna(no_offset)
sbs_1["time_rel"] = pd.to_timedelta(sbs_1["Time_Relative_hms"], errors='coerce').fillna(no_offset)

# Calculate final date time column by adding relative time to starting datetime
sbs_1["date_time"] = sbs_1["start"] + sbs_1["time_rel"]

# Sub-second offset of the event, in tenths of a second expressed as a fraction (0.0 - 0.9).
# Used later to decide between postures that start within the same second.
# Integer division keeps the scale right for any microsecond value (taking the first
# character of the number would turn 50000 us into 0.5) and tolerates NaT starts.
ms = sbs_1["start"] + sbs_1["time_rel_ms"]
sbs_1["milliseconds"] = (ms.dt.microsecond // 100_000) * 0.1

# Get date from date time
sbs_1["date"] = sbs_1["date_time"].dt.date

# Filter columns
sbs_1 = sbs_1[["id", "do", "date", "date_time", "milliseconds", "Behavior", "Modifier_1", "Modifier_2", "Modifier_3"]]

# Display for review
sbs_1.sort_values("date_time")

Unnamed: 0,id,do,date,date_time,milliseconds,Behavior,Modifier_1,Modifier_2,Modifier_3
0,102,1,2019-07-24,2019-07-24 08:20:19,0.0,LA- stand,,,
1,102,1,2019-07-24,2019-07-24 08:20:19,0.0,WRK- general,,,SP- Education and Health Services
2,102,1,2019-07-24,2019-07-24 08:21:01,0.6,WA- walk,,moderate,
3,102,1,2019-07-24,2019-07-24 08:21:12,0.7,TRAV- walking,,,
4,102,1,2019-07-24,2019-07-24 08:21:18,0.9,WA- descend stairs,,moderate,
...,...,...,...,...,...,...,...,...,...
7883,154,2,2020-02-23,2020-02-23 13:04:58,0.3,WA-walk with load,,moderate,
7884,154,2,2020-02-23,2020-02-23 13:05:07,0.5,WA- ascend stairs,,vigorous,
7885,154,2,2020-02-23,2020-02-23 13:05:45,0.2,WA-walk with load,,moderate,
7886,154,2,2020-02-23,2020-02-23 13:06:18,0.6,LA- stand and move,,light,


**Step 3:** Separate posture and activity coding into 2 separate tables. Ignore codes that are to be dropped.

In [None]:
def get_coding(s):
    """Classify a behavior label as an activity or a posture by its prefix.

    The prefix is the text before the first ``-``, with surrounding
    whitespace removed.

    Parameters
    ----------
    s : str
        Behavior label such as ``"LA- stand"`` or ``"WRK- general"``.

    Returns
    -------
    str or float
        ``"activity_type"``, ``"posture_type"``, or ``np.nan`` when the
        prefix belongs to neither group.
    """
    activity_prefixes = {"SL", "PC", "HA", "CA", "WRK", "EDU",
                         "ORG", "PUR", "EAT", "LES", "EX", "TRAV"}
    posture_prefixes = {"SB", "LA", "WA", "SP"}

    prefix = s.partition("-")[0].strip()
    if prefix in posture_prefixes:
        return "posture_type"
    if prefix in activity_prefixes:
        return "activity_type"
    return np.nan

In [None]:
# Discard events that have no Behavior value
has_behavior = sbs_1["Behavior"].notna()
sbs_2 = sbs_1.loc[has_behavior].copy()
# Tag every remaining row as activity_type / posture_type (NaN for codes in neither group)
sbs_2["coding"] = sbs_2["Behavior"].apply(get_coding)
# Display for review
sbs_2

Unnamed: 0,id,do,date,date_time,milliseconds,Behavior,Modifier_1,Modifier_2,Modifier_3,coding
0,102,1,2019-07-24,2019-07-24 08:20:19,0.0,LA- stand,,,,posture_type
1,102,1,2019-07-24,2019-07-24 08:20:19,0.0,WRK- general,,,SP- Education and Health Services,activity_type
2,102,1,2019-07-24,2019-07-24 08:21:01,0.6,WA- walk,,moderate,,posture_type
3,102,1,2019-07-24,2019-07-24 08:21:12,0.7,TRAV- walking,,,,activity_type
4,102,1,2019-07-24,2019-07-24 08:21:18,0.9,WA- descend stairs,,moderate,,posture_type
...,...,...,...,...,...,...,...,...,...,...
7883,154,2,2020-02-23,2020-02-23 13:04:58,0.3,WA-walk with load,,moderate,,posture_type
7884,154,2,2020-02-23,2020-02-23 13:05:07,0.5,WA- ascend stairs,,vigorous,,posture_type
7885,154,2,2020-02-23,2020-02-23 13:05:45,0.2,WA-walk with load,,moderate,,posture_type
7886,154,2,2020-02-23,2020-02-23 13:06:18,0.6,LA- stand and move,,light,,posture_type


In [None]:
# Get table of postures only.
# .copy() makes this an independent frame, so the new columns below are not written to a slice of sbs_2.
sbs_posture = sbs_2[sbs_2["coding"] == "posture_type"].copy()
sbs_posture["posture_type"] = sbs_posture["Behavior"]
# Modifier_2 carries the intensity attached to a posture; give it a descriptive name
sbs_posture["activity_intensity"] = sbs_posture["Modifier_2"]
# Subset columns to relevant ones, drop duplicates if they exist
posture_event_cols = ["id", "do", "date", "date_time", "milliseconds",
                      "posture_type", "activity_intensity"]
sbs_posture = sbs_posture[posture_event_cols].drop_duplicates()
# Display and review
sbs_posture

Unnamed: 0,id,do,date,date_time,milliseconds,posture_type,activity_intensity
0,102,1,2019-07-24,2019-07-24 08:20:19,0.0,LA- stand,
2,102,1,2019-07-24,2019-07-24 08:21:01,0.6,WA- walk,moderate
4,102,1,2019-07-24,2019-07-24 08:21:18,0.9,WA- descend stairs,moderate
5,102,1,2019-07-24,2019-07-24 08:21:32,0.9,WA- walk,moderate
6,102,1,2019-07-24,2019-07-24 08:21:39,0.9,WA- descend stairs,moderate
...,...,...,...,...,...,...,...
7883,154,2,2020-02-23,2020-02-23 13:04:58,0.3,WA-walk with load,moderate
7884,154,2,2020-02-23,2020-02-23 13:05:07,0.5,WA- ascend stairs,vigorous
7885,154,2,2020-02-23,2020-02-23 13:05:45,0.2,WA-walk with load,moderate
7886,154,2,2020-02-23,2020-02-23 13:06:18,0.6,LA- stand and move,light


In [None]:
# Get table of activities only.
# .copy() makes this an independent frame, so the new columns below are not written to a slice of sbs_2.
sbs_activity = sbs_2[sbs_2["coding"] == "activity_type"].copy()
sbs_activity["activity_type"] = sbs_activity["Behavior"]
# Rename relevant modifier columns for clarity
sbs_activity["exercise_modifier"] = sbs_activity["Modifier_1"]
sbs_activity["work_type"] = sbs_activity["Modifier_3"]
# Subset columns to relevant ones, drop duplicates if they exist
activity_event_cols = ["id", "do", "date", "date_time", "activity_type",
                       "exercise_modifier", "work_type"]
sbs_activity = sbs_activity[activity_event_cols].drop_duplicates()
# Display and review
sbs_activity

Unnamed: 0,id,do,date,date_time,activity_type,exercise_modifier,work_type
1,102,1,2019-07-24,2019-07-24 08:20:19,WRK- general,,SP- Education and Health Services
3,102,1,2019-07-24,2019-07-24 08:21:12,TRAV- walking,,
12,102,1,2019-07-24,2019-07-24 08:30:08,PUR- purchasing goods and services,,
21,102,1,2019-07-24,2019-07-24 08:33:55,TRAV- walking,,
28,102,1,2019-07-24,2019-07-24 08:42:43,WRK- general,,"SP- Office (business, professional services, f..."
...,...,...,...,...,...,...,...
7727,154,1,2020-02-22,2020-02-22 14:11:08,"LES- screen based leisure time (TV, video game...",,
7728,154,1,2020-02-22,2020-02-22 14:13:53,"EX- participating in sport, exercise or recrea...",weight training,
7737,154,1,2020-02-22,2020-02-22 15:11:44,"EX- participating in sport, exercise or recrea...",weight training,
7879,154,1,2020-02-22,2020-02-22 16:04:53,TRAV- walking,,


**Step 4:** Aggregate states so that there is only one row per `id`-`do`-`date_time`. Generally, there should only be at most one `posture_type` per row and one `activity_type` per row. However, there exist 3 rows where there are 2 `posture_type` states in the same second.

Dealing with Duplicate Postures in a Given Second
- "If a particular second has two postures assign the most prevalent (>50% of the second)."

In [None]:
def time_elapse_map(ms, minGrp, maxGrp):
    """Return the share of a second attributed to one of the states starting in it.

    Parameters
    ----------
    ms : float
        Sub-second offset at which this row's state starts.
    minGrp, maxGrp : float
        Smallest and largest offset among the rows sharing the same second.

    Returns
    -------
    int or float
        ``0`` when the group has a single offset (``minGrp == maxGrp``);
        ``maxGrp - minGrp`` for the row sitting at the smallest offset;
        ``1 - maxGrp`` for any other row.
    """
    if minGrp == maxGrp:
        return 0
    return (maxGrp - minGrp) if ms == minGrp else (1 - maxGrp)

In [None]:
sbs_posture_1 = sbs_posture.copy()
second_keys = ["id", "do", "date_time"]
# Latest and earliest sub-second offset among the postures that start within the same second
sbs_posture_1["maxGroupValue"] = sbs_posture_1.groupby(second_keys)["milliseconds"].transform('max')
sbs_posture_1["minGroupValue"] = sbs_posture_1.groupby(second_keys)["milliseconds"].transform('min')

# Share of the second attributed to each posture (0 when the second holds a single offset)
sbs_posture_1["timeOfPosture"] = sbs_posture_1.apply(lambda x: time_elapse_map(x["milliseconds"], x["minGroupValue"], x["maxGroupValue"]), axis = 1)
# Index label of the row with the largest share within each second (idxmax returns the first on ties)
sbs_posture_1["maxIdx"] = sbs_posture_1.groupby(second_keys)["timeOfPosture"].transform('idxmax')

# Keep only the most prevalent posture per second and drop the helper columns in the same
# expression, rather than dropping in place on a filtered frame. maxIdx is left in the result.
helper_cols = ["milliseconds", "maxGroupValue", "minGroupValue", "timeOfPosture"]
sbs_posture_1 = sbs_posture_1[sbs_posture_1["maxIdx"] == sbs_posture_1.index].drop(columns=helper_cols)

In [None]:
# Check that each (id, do, date_time) second now holds exactly one posture row.
# Rows are counted per second, not per posture label: counting per label would give
# two different postures in the same second a count of 1 each and never flag them.
postures_per_second = (
    sbs_posture_1.groupby(["id", "do", "date_time"])["posture_type"]
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)  # any offending second shows up first
)
assert (postures_per_second["count"] == 1).all(), "some seconds still hold more than one posture"
postures_per_second

Unnamed: 0,id,do,date_time,posture_type,count
0,102,1,2019-07-24 08:20:19,LA- stand,1
4606,138,2,2019-11-02 10:22:40,LA- stand and move,1
4605,138,2,2019-11-02 10:20:54,SB-sitting,1
4604,138,2,2019-11-02 10:20:26,WA- walk,1
4603,138,2,2019-11-02 10:20:05,WA- running,1
...,...,...,...,...,...
2294,131,1,2019-09-12 11:27:40,LA- stand,1
2293,131,1,2019-09-12 11:27:14,WA- walk,1
2292,131,1,2019-09-12 11:24:47,LA- stand,1
2290,131,1,2019-09-12 11:24:22,LA- stand and move,1


**The code above resolves the issue of more than one posture in a given second**

In [None]:
def unique_list(x):
    """Collapse a group of values to its distinct non-missing content.

    Used as an aggregator so that each unique datetime ends up as one row
    even when several states fall in the same second.

    Parameters
    ----------
    x : iterable
        Values of one column within one group.

    Returns
    -------
    list, scalar or float
        ``np.nan`` if nothing is left after dropping missing values, the
        single value if exactly one distinct value remains, otherwise the
        list of distinct values in order of first appearance.
    """
    distinct = pd.Series(list(x)).dropna().unique().tolist()
    if not distinct:
        return np.nan
    return distinct[0] if len(distinct) == 1 else distinct

In [None]:
state_keys = ["id", "do", "date", "date_time"]
# Outer join: keep seconds that have only a posture start or only an activity start
states_merged = pd.merge(sbs_posture_1, sbs_activity, how="outer", on=state_keys)
# Collapse to one row per session second; each column keeps its distinct non-missing values
sbs_3 = states_merged.groupby(state_keys).agg(unique_list).reset_index()
# Display for review
sbs_3.sort_values(["id", "do", "date_time"])

Unnamed: 0,id,do,date,date_time,posture_type,activity_intensity,maxIdx,activity_type,exercise_modifier,work_type
0,102,1,2019-07-24,2019-07-24 08:20:19,LA- stand,,0.0,WRK- general,,SP- Education and Health Services
1,102,1,2019-07-24,2019-07-24 08:21:01,WA- walk,moderate,2.0,,,
2,102,1,2019-07-24,2019-07-24 08:21:12,,,,TRAV- walking,,
3,102,1,2019-07-24,2019-07-24 08:21:18,WA- descend stairs,moderate,4.0,,,
4,102,1,2019-07-24,2019-07-24 08:21:32,WA- walk,moderate,5.0,,,
...,...,...,...,...,...,...,...,...,...,...
7183,154,2,2020-02-23,2020-02-23 13:04:58,WA-walk with load,moderate,7883.0,,,
7184,154,2,2020-02-23,2020-02-23 13:05:07,WA- ascend stairs,vigorous,7884.0,,,
7185,154,2,2020-02-23,2020-02-23 13:05:45,WA-walk with load,moderate,7885.0,,,
7186,154,2,2020-02-23,2020-02-23 13:06:18,LA- stand and move,light,7886.0,,,


In [None]:
# Rows whose posture_type was aggregated into a list, i.e. several postures in one second
# (expected to be empty after the de-duplication above)
has_multiple_postures = sbs_3["posture_type"].map(type).eq(list)
sbs_3.loc[has_multiple_postures]

Unnamed: 0,id,do,date,date_time,posture_type,activity_intensity,maxIdx,activity_type,exercise_modifier,work_type


In [None]:
# Rows whose activity_type was aggregated into a list, i.e. several activities in one second
has_multiple_activities = sbs_3["activity_type"].map(type).eq(list)
sbs_3.loc[has_multiple_activities]

Unnamed: 0,id,do,date,date_time,posture_type,activity_intensity,maxIdx,activity_type,exercise_modifier,work_type


**Step 5:** Get min and max datetime timestamps for each session using the `log` file

In [None]:
# Get relevant info from log (.copy() so the new columns are not written to a slice of log)
sbs_time = log[["id", "do", "start_month", "start_day", "start_year", "start_time", "stop_time"]].copy()
# "month/day/year" text shared by the start and stop timestamps.
# NOTE(review): the stop time is combined with the START date, so a session that runs past
# midnight would get a stop before its start — confirm no such sessions exist.
start_date_text = (sbs_time["start_month"].astype(str) + "/" + sbs_time["start_day"].astype(str) + "/" +
                   sbs_time["start_year"].astype(str))
# Session start
sbs_time["date_time_min"] = pd.to_datetime(start_date_text + " " + sbs_time["start_time"])
# Session end, pushed one second later so the stop second itself is covered by the
# half-open [min, max) ranges built in the following steps
sbs_time["date_time_max"] = pd.to_datetime(start_date_text + " " + sbs_time["stop_time"]) + pd.to_timedelta(1, unit="seconds")
# Recreate date column for joining purposes
sbs_time["date"] = sbs_time["date_time_min"].dt.date
# Subset columns
sbs_time = sbs_time[["id", "do", "date", "date_time_min", "date_time_max"]]
# Display for review
sbs_time

Unnamed: 0,id,do,date,date_time_min,date_time_max
0,102,1,2019-07-24,2019-07-24 08:20:19,2019-07-24 11:21:14
1,102,2,2019-07-25,2019-07-25 12:42:08,2019-07-25 15:42:01
2,116,1,2019-08-20,2019-08-20 08:08:45,2019-08-20 11:09:01
3,116,2,2019-08-21,2019-08-21 11:09:47,2019-08-21 13:28:01
4,117,1,2019-08-20,2019-08-20 15:17:46,2019-08-20 18:18:01
5,117,2,2019-08-21,2019-08-21 07:11:45,2019-08-21 10:13:01
6,122,1,2019-08-28,2019-08-28 10:06:41,2019-08-28 13:07:01
7,122,2,2019-08-29,2019-08-29 15:19:51,2019-08-29 18:23:01
8,124,1,2019-08-28,2019-08-28 14:59:09,2019-08-28 17:59:06
9,124,2,2019-08-29,2019-08-29 09:31:00,2019-08-29 12:31:01


**Step 6:** Get timestamps for posture states. Fill the time stamps in between so there is one row for every second between start and end of the session. This will result in a second by second table for the posture states.

In [None]:
# Posture starts only: rows of the aggregated table that carry a posture_type
posture_state_cols = ['id', 'do', 'date', 'date_time', 'posture_type', 'activity_intensity']
sbs_posture_2 = sbs_3.loc[sbs_3["posture_type"].notna(), posture_state_cols]
sbs_posture_2

Unnamed: 0,id,do,date,date_time,posture_type,activity_intensity
0,102,1,2019-07-24,2019-07-24 08:20:19,LA- stand,
1,102,1,2019-07-24,2019-07-24 08:21:01,WA- walk,moderate
3,102,1,2019-07-24,2019-07-24 08:21:18,WA- descend stairs,moderate
4,102,1,2019-07-24,2019-07-24 08:21:32,WA- walk,moderate
5,102,1,2019-07-24,2019-07-24 08:21:39,WA- descend stairs,moderate
...,...,...,...,...,...,...
7183,154,2,2020-02-23,2020-02-23 13:04:58,WA-walk with load,moderate
7184,154,2,2020-02-23,2020-02-23 13:05:07,WA- ascend stairs,vigorous
7185,154,2,2020-02-23,2020-02-23 13:05:45,WA-walk with load,moderate
7186,154,2,2020-02-23,2020-02-23 13:06:18,LA- stand and move,light


In [None]:
# Merge min max datetime data with posture data
sbs_posture_3 = sbs_time.merge(sbs_posture_2[["id", "do", "date", "date_time"]], on=["id", "do", "date"])
# Get end times of states - it is just the start time of the next state in the same session
# (NaT for the last state of a session). groupby(...).shift keeps the original row labels,
# so the result lines up with sbs_posture_3 row by row whatever the order of the sessions;
# the previous apply(...).reset_index(drop=True) lined up by position only.
sbs_posture_3["date_time_shift"] = sbs_posture_3.groupby(['id', 'do', 'date'])['date_time'].shift(-1)
# Calculate start of the states and store in date time min. It is the max of the session start or the state start.
sbs_posture_3["date_time_min"] = sbs_posture_3[["date_time_min", "date_time"]].max(axis=1, skipna=True)
# Calculate end of the states and store in date time max. It is the minimum of the session end or the next state start.
sbs_posture_3["date_time_max"] = sbs_posture_3[["date_time_max", "date_time_shift"]].min(axis=1, skipna=True)
# Subset columns (.copy() so the columns added below are not written to a slice)
sbs_posture_3 = sbs_posture_3[["id", "do", "date", "date_time_min", "date_time_max"]].copy()
# Calculate time in between states
sbs_posture_3['time_delta'] = sbs_posture_3['date_time_max'] - sbs_posture_3['date_time_min']
# Get a sequence list of all seconds since the start of the state until the end of the state. It will look like [0, 1, 2,...]
sbs_posture_3['seconds_sequence'] = sbs_posture_3['time_delta'].apply(lambda x: list(range(int(x.total_seconds()))))
# Explode the sequence of seconds. We add each second in the list to the start time
# This should result in one row per second for every second in between the session start and end - a second by second table
sbs_posture_3 = sbs_posture_3.drop(['time_delta', 'date_time_max'], axis=1).explode('seconds_sequence')
sbs_posture_3['date_time'] = sbs_posture_3['date_time_min'] + pd.to_timedelta(sbs_posture_3['seconds_sequence'], unit="seconds")
# Subset columns
sbs_posture_3 = sbs_posture_3.drop(['seconds_sequence'], axis=1)
# Display for review
sbs_posture_3

Unnamed: 0,id,do,date,date_time_min,date_time
0,102,1,2019-07-24,2019-07-24 08:20:19,2019-07-24 08:20:19
0,102,1,2019-07-24,2019-07-24 08:20:19,2019-07-24 08:20:20
0,102,1,2019-07-24,2019-07-24 08:20:19,2019-07-24 08:20:21
0,102,1,2019-07-24,2019-07-24 08:20:19,2019-07-24 08:20:22
0,102,1,2019-07-24,2019-07-24 08:20:19,2019-07-24 08:20:23
...,...,...,...,...,...
6900,154,2,2020-02-23,2020-02-23 13:06:24,2020-02-23 15:55:56
6900,154,2,2020-02-23,2020-02-23 13:06:24,2020-02-23 15:55:57
6900,154,2,2020-02-23,2020-02-23 13:06:24,2020-02-23 15:55:58
6900,154,2,2020-02-23,2020-02-23 13:06:24,2020-02-23 15:55:59


In [None]:
# Attach the posture coding to every second. In the second by second table, date_time_min is
# the start of the state covering that second, which is the event table's date_time; rename
# the event column so both sides share the key and no _x/_y suffixes are produced.
posture_states = sbs_posture_2.rename(columns={"date_time": "date_time_min"})
sbs_posture_4 = (
    sbs_posture_3.merge(posture_states, on=["id", "do", "date", "date_time_min"])
    .drop(columns="date_time_min")
    .rename(columns={"do": "observation"})
)
# Display for review
sbs_posture_4

Unnamed: 0,id,observation,date,date_time,posture_type,activity_intensity
0,102,1,2019-07-24,2019-07-24 08:20:19,LA- stand,
1,102,1,2019-07-24,2019-07-24 08:20:20,LA- stand,
2,102,1,2019-07-24,2019-07-24 08:20:21,LA- stand,
3,102,1,2019-07-24,2019-07-24 08:20:22,LA- stand,
4,102,1,2019-07-24,2019-07-24 08:20:23,LA- stand,
...,...,...,...,...,...,...
509944,154,2,2020-02-23,2020-02-23 15:55:56,SB-sitting,
509945,154,2,2020-02-23,2020-02-23 15:55:57,SB-sitting,
509946,154,2,2020-02-23,2020-02-23 15:55:58,SB-sitting,
509947,154,2,2020-02-23,2020-02-23 15:55:59,SB-sitting,


**Step 7:** Get timestamps for activity states. Fill the time stamps in between so there is one row for every second between start and end of the session. This will result in a second by second table for the activity states.

In [None]:
# Repeat for activities
# Activity starts only: rows of the aggregated table that carry an activity_type
activity_state_cols = ['id', 'do', 'date', 'date_time', 'activity_type', 'exercise_modifier', 'work_type']
sbs_activity_2 = sbs_3.loc[sbs_3["activity_type"].notna(), activity_state_cols]
sbs_activity_2

Unnamed: 0,id,do,date,date_time,activity_type,exercise_modifier,work_type
0,102,1,2019-07-24,2019-07-24 08:20:19,WRK- general,,SP- Education and Health Services
2,102,1,2019-07-24,2019-07-24 08:21:12,TRAV- walking,,
11,102,1,2019-07-24,2019-07-24 08:30:08,PUR- purchasing goods and services,,
20,102,1,2019-07-24,2019-07-24 08:33:55,TRAV- walking,,
27,102,1,2019-07-24,2019-07-24 08:42:43,WRK- general,,"SP- Office (business, professional services, f..."
...,...,...,...,...,...,...,...
7029,154,1,2020-02-22,2020-02-22 14:11:08,"LES- screen based leisure time (TV, video game...",,
7030,154,1,2020-02-22,2020-02-22 14:13:53,"EX- participating in sport, exercise or recrea...",weight training,
7038,154,1,2020-02-22,2020-02-22 15:11:44,"EX- participating in sport, exercise or recrea...",weight training,
7180,154,1,2020-02-22,2020-02-22 16:04:53,TRAV- walking,,


In [None]:
# Merge min max datetime data with activities data
sbs_activity_3 = sbs_time.merge(sbs_activity_2[["id", "do", "date", "date_time"]], on=["id", "do", "date"])
# Get end times of states - it is just the start time of the next state in the same session
# (NaT for the last state of a session). groupby(...).shift keeps the original row labels,
# so the result lines up with sbs_activity_3 row by row whatever the order of the sessions;
# the previous apply(...).reset_index(drop=True) lined up by position only.
sbs_activity_3["date_time_shift"] = sbs_activity_3.groupby(['id', 'do', 'date'])['date_time'].shift(-1)
# Calculate start of the states and store in date time min. It is the max of the session start or the state start.
sbs_activity_3["date_time_min"] = sbs_activity_3[["date_time_min", "date_time"]].max(axis=1, skipna=True)
# Calculate end of the states and store in date time max. It is the minimum of the session end or the next state start.
sbs_activity_3["date_time_max"] = sbs_activity_3[["date_time_max", "date_time_shift"]].min(axis=1, skipna=True)
# Subset columns (.copy() so the columns added below are not written to a slice)
sbs_activity_3 = sbs_activity_3[["id", "do", "date", "date_time_min", "date_time_max"]].copy()
# Calculate time in between states
sbs_activity_3['time_delta'] = sbs_activity_3['date_time_max'] - sbs_activity_3['date_time_min']
# Get a sequence list of all seconds since the start of the state until the end of the state. It will look like [0, 1, 2,...]
sbs_activity_3['seconds_sequence'] = sbs_activity_3['time_delta'].apply(lambda x: list(range(int(x.total_seconds()))))
# Explode the sequence of seconds. We add each second in the list to the start time
# This should result in one row per second for every second in between the session start and end - a second by second table
sbs_activity_3 = sbs_activity_3.drop(['time_delta', 'date_time_max'], axis=1).explode('seconds_sequence')
sbs_activity_3['date_time'] = sbs_activity_3['date_time_min'] + pd.to_timedelta(sbs_activity_3['seconds_sequence'], unit="seconds")
# Subset columns
sbs_activity_3 = sbs_activity_3.drop(['seconds_sequence'], axis=1)
# Display for review
sbs_activity_3

Unnamed: 0,id,do,date,date_time_min,date_time
0,102,1,2019-07-24,2019-07-24 08:20:19,2019-07-24 08:20:19
0,102,1,2019-07-24,2019-07-24 08:20:19,2019-07-24 08:20:20
0,102,1,2019-07-24,2019-07-24 08:20:19,2019-07-24 08:20:21
0,102,1,2019-07-24,2019-07-24 08:20:19,2019-07-24 08:20:22
0,102,1,2019-07-24,2019-07-24 08:20:19,2019-07-24 08:20:23
...,...,...,...,...,...
742,154,2,2020-02-23,2020-02-23 13:04:09,2020-02-23 15:55:56
742,154,2,2020-02-23,2020-02-23 13:04:09,2020-02-23 15:55:57
742,154,2,2020-02-23,2020-02-23 13:04:09,2020-02-23 15:55:58
742,154,2,2020-02-23,2020-02-23 13:04:09,2020-02-23 15:55:59


In [None]:
# Attach the activity coding to every second. In the second by second table, date_time_min is
# the start of the state covering that second, which is the event table's date_time; rename
# the event column so both sides share the key and no _x/_y suffixes are produced.
activity_states = sbs_activity_2.rename(columns={"date_time": "date_time_min"})
sbs_activity_4 = (
    sbs_activity_3.merge(activity_states, on=["id", "do", "date", "date_time_min"])
    .drop(columns="date_time_min")
    .rename(columns={"do": "observation"})
)
# Display for review
sbs_activity_4

Unnamed: 0,id,observation,date,date_time,activity_type,exercise_modifier,work_type
0,102,1,2019-07-24,2019-07-24 08:20:19,WRK- general,,SP- Education and Health Services
1,102,1,2019-07-24,2019-07-24 08:20:20,WRK- general,,SP- Education and Health Services
2,102,1,2019-07-24,2019-07-24 08:20:21,WRK- general,,SP- Education and Health Services
3,102,1,2019-07-24,2019-07-24 08:20:22,WRK- general,,SP- Education and Health Services
4,102,1,2019-07-24,2019-07-24 08:20:23,WRK- general,,SP- Education and Health Services
...,...,...,...,...,...,...,...
511814,154,2,2020-02-23,2020-02-23 15:55:56,"EDU- taking class, research, homework",,
511815,154,2,2020-02-23,2020-02-23 15:55:57,"EDU- taking class, research, homework",,
511816,154,2,2020-02-23,2020-02-23 15:55:58,"EDU- taking class, research, homework",,
511817,154,2,2020-02-23,2020-02-23 15:55:59,"EDU- taking class, research, homework",,


**Step 8:** Merge the second by second posture states with the second by second activity states to get a final second by second file. All that is left to do is to implement the mappings.

In [None]:
# Inner join: keep the seconds that have both a posture and an activity coding
sbs_4 = sbs_posture_4.merge(
    sbs_activity_4,
    how="inner",
    on=['id', 'observation', 'date', 'date_time'],
)
sbs_4

Unnamed: 0,id,observation,date,date_time,posture_type,activity_intensity,activity_type,exercise_modifier,work_type
0,102,1,2019-07-24,2019-07-24 08:20:19,LA- stand,,WRK- general,,SP- Education and Health Services
1,102,1,2019-07-24,2019-07-24 08:20:20,LA- stand,,WRK- general,,SP- Education and Health Services
2,102,1,2019-07-24,2019-07-24 08:20:21,LA- stand,,WRK- general,,SP- Education and Health Services
3,102,1,2019-07-24,2019-07-24 08:20:22,LA- stand,,WRK- general,,SP- Education and Health Services
4,102,1,2019-07-24,2019-07-24 08:20:23,LA- stand,,WRK- general,,SP- Education and Health Services
...,...,...,...,...,...,...,...,...,...
509944,154,2,2020-02-23,2020-02-23 15:55:56,SB-sitting,,"EDU- taking class, research, homework",,
509945,154,2,2020-02-23,2020-02-23 15:55:57,SB-sitting,,"EDU- taking class, research, homework",,
509946,154,2,2020-02-23,2020-02-23 15:55:58,SB-sitting,,"EDU- taking class, research, homework",,
509947,154,2,2020-02-23,2020-02-23 15:55:59,SB-sitting,,"EDU- taking class, research, homework",,


# Mappings

## **Broad Activity Type Mapping**

In [None]:
def broad_mapping(row):
    """Map a detailed activity label to its broad activity category.

    Parameters
    ----------
    row : object
        A single ``activity_type`` value (normally a string).

    Returns
    -------
    str or float
        One of ``household_personal``, ``lawn_garden``, ``work_education``,
        ``purchasing``, ``leisure``, ``exercise``, ``transportation``,
        ``active_transportation``; ``np.nan`` when the label is in none of
        the groups.
    """
    # (category, labels) pairs, checked in this order
    categories = (
        ("household_personal", (
            "SL- sleep",
            "PC- groom, health-related",
            "PC- other personal care",
            "HA- housework",
            "HA- food prep and cleanup",
            "HA- interior maintenance, repair, & decoration",
            "HA- animals and pets",
            "HA- household management/other household activities",
            "CA- caring for and helping children",
            "CA- caring for and helping adults",
        )),
        ("lawn_garden", (
            "HA- exterior maintenance, repair, & decoration",
            "HA- lawn, garden and houseplants",
        )),
        ("work_education", (
            "WRK- general",
            "WRK- desk/screen based",
            "EDU- taking class, research, homework",
            "EDU- extracurricular",
            "ORG- church, spiritual",
            "ORG- volunteer work",
            "ORG- volunteer",
            "WRK- screen based",
        )),
        ("purchasing", (
            "PUR- purchasing goods and services",
        )),
        ("leisure", (
            "EAT- eating and drinking, waiting",
            "LES- socializing, communicating, non-screen based",
            "LES- screen-based(TV, video game, computer, phone)",
            "EX- Attending sport, exercise recreation event, or performance",
            "LES- screen based leisure time (TV, video game, computer)",
            "LES- socializing, communicating, leisure time not screen",
            "EX- attending sport, recreational event, or performance",
        )),
        ("exercise", (
            "EX- participating in sport, exercise or recreation",
        )),
        ("transportation", (
            "TRAV- passenger (car/truck/motorcycle)",
            "TRAV- driver (car/truck/motorcycle)",
            "TRAV- passenger (bus, train, tram, plane, boat, ship)",
            "TRAV- general",
        )),
        ("active_transportation", (
            "TRAV- biking",
            "TRAV- walking",
        )),
    )

    for category, labels in categories:
        if row in labels:
            return category
    return np.nan

## **Change Exercise Modifiers to be More Specific**

In [None]:
def ex_mapping(behavior, modifier1):
    """Make the generic exercise label specific by using its modifier.

    Parameters
    ----------
    behavior : object
        Activity label.
    modifier1 : object
        Value of modifier_1 (``exercise_modifier``), e.g. ``"weight training"``.

    Returns
    -------
    object
        ``"EX- <modifier1>"`` when ``behavior`` is the generic
        "participating in sport, exercise or recreation" label and a modifier
        is present. ``behavior`` is returned unchanged for every other label,
        and also when the modifier is missing (so no ``"EX- nan"`` label is
        produced).
    """
    if behavior != "EX- participating in sport, exercise or recreation":
        return behavior
    # Scalar check first: pd.isna on a list would return an array, not a bool
    if pd.api.types.is_scalar(modifier1) and pd.isna(modifier1):
        return behavior
    return f"EX- {modifier1}"

## **Sedentary vs Non-sedentary Mapping**

In [None]:
# Sedentary/non-sedentary postures
def sed_map(row):
    """Classify a posture label as sedentary or not sedentary.

    Parameters
    ----------
    row
        A single posture label (one cell of the posture column).

    Returns
    -------
    ``"sedentary"`` or ``"not_sedentary"`` for a recognised label,
    ``np.nan`` for anything else.
    """
    sedentary_postures = ("SB-sitting", "SB- lying", "LA- kneeling/ squatting")
    active_postures = (
        "LA- stretching",
        "LA- stand",
        "LA- stand and move",
        "WA- walk",
        "WA-walk with load",
        "WA- ascend stairs",
        "WA- descend stairs",
        "WA- running",
        "SP- bike",
        "SP- other sport movement",
        "SP- muscle strengthening",
    )

    if row in sedentary_postures:
        return "sedentary"
    if row in active_postures:
        return "not_sedentary"
    # Unrecognised labels are left missing.
    return np.nan

## **Walking vs Not Walking Mapping**

In [None]:
def walk_mapping(row):
    """Classify a posture label as walking or not walking.

    Parameters
    ----------
    row
        A single posture label (one cell of the posture column); may be
        missing (NaN / None).

    Returns
    -------
    ``"walking"`` for any label starting with ``"WA"``, ``"not_walking"``
    for the listed non-walking postures, and ``np.nan`` for missing or
    unrecognised labels.
    """
    not_walk = (
        "SB-sitting",
        "SB- lying",
        "LA- kneeling/ squatting",
        "LA- stretching",
        "LA- stand",
        "LA- stand and move",
        "SP- bike",
        "SP- other sport movement",
        "SP- muscle strengthening",
    )

    # Missing values must be caught before calling str.startswith. Use a plain
    # boolean test here: bitwise `~` on a Python bool yields -1 or -2, which
    # are both truthy, so it cannot serve as a guard.
    if pd.isna(row):
        return np.nan
    if row.startswith("WA"):
        return "walking"
    if row in not_walk:
        return "not_walking"
    return np.nan


## **Intensity Mapping**

In [None]:
def intensity_mapping(behavior, mod2):
    """Derive an intensity label from a posture and its recorded modifier.

    Parameters
    ----------
    behavior
        Posture label for the row.
    mod2
        Intensity value already recorded for the row; may be missing.

    Returns
    -------
    ``"sedentary"`` or ``"light"`` when the posture alone determines the
    intensity; otherwise ``mod2`` itself, or ``np.nan`` when ``mod2`` is
    missing.
    """
    sedentary_postures = ("SB-sitting", "SB- lying", "LA- kneeling/ squatting")
    light_postures = ("LA- stand", "LA- stretching")

    if behavior in sedentary_postures:
        return "sedentary"
    if behavior in light_postures:
        return "light"
    # Every other posture defers to the recorded modifier; any missing
    # marker (NaN, None, ...) is normalised to np.nan.
    return np.nan if pd.isna(mod2) else mod2

## **Apply the Mappings**

In [None]:
def posture_cleanup(s):
    """Return the text between the first and second hyphen of ``s``.

    Surrounding whitespace is stripped, e.g. ``"LA- stand"`` -> ``"stand"``.
    A string without any hyphen raises ``IndexError``.
    """
    # Only the segment right after the first hyphen is needed, so at most
    # two splits are required.
    segment = s.split('-', 2)[1]
    return segment.strip()

In [None]:
# Work on a copy so sbs_4 is left untouched by the column assignments below.
sbs_5 = sbs_4.copy()

# Broad activity: computed from the original activity labels, i.e. before the
# generic exercise label is rewritten in the next step.
sbs_5["broad_activity_type"] = sbs_5["activity_type"].apply(broad_mapping)

# Specific exercise mapping: pair each activity with its exercise modifier.
sbs_5["activity_type"] = [
    ex_mapping(activity, modifier)
    for activity, modifier in zip(sbs_5["activity_type"], sbs_5["exercise_modifier"])
]

# Sedentary vs not
sbs_5["sedentary_not"] = sbs_5["posture_type"].apply(sed_map)

# Walking vs not walking
sbs_5["walking_not"] = sbs_5["posture_type"].apply(walk_mapping)

# Intensity mapping: overwrite the recorded intensity with the value implied
# by the posture, falling back to the recorded value.
sbs_5["activity_intensity"] = [
    intensity_mapping(posture, intensity)
    for posture, intensity in zip(sbs_5["posture_type"], sbs_5["activity_intensity"])
]

# Short posture name (text after the label's prefix)
sbs_5['posture'] = sbs_5['posture_type'].apply(posture_cleanup)

In [None]:
# Final second-by-second table: identifiers, timestamps and the derived labels.
sbs = sbs_5.loc[
    :,
    [
        'id',
        'observation',
        'date',
        'date_time',
        'activity_type',
        'broad_activity_type',
        'work_type',
        'posture',
        'sedentary_not',
        'walking_not',
        'activity_intensity',
    ],
]
sbs

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity
0,102,1,2019-07-24,2019-07-24 08:20:19,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
1,102,1,2019-07-24,2019-07-24 08:20:20,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
2,102,1,2019-07-24,2019-07-24 08:20:21,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
3,102,1,2019-07-24,2019-07-24 08:20:22,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
4,102,1,2019-07-24,2019-07-24 08:20:23,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
...,...,...,...,...,...,...,...,...,...,...,...
509944,154,2,2020-02-23,2020-02-23 15:55:56,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary
509945,154,2,2020-02-23,2020-02-23 15:55:57,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary
509946,154,2,2020-02-23,2020-02-23 15:55:58,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary
509947,154,2,2020-02-23,2020-02-23 15:55:59,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary


In [None]:
# Check if there are any NaNs where there shouldn't be:
# show every row that has a missing value in at least one derived column.
sbs.loc[
    sbs[
        [
            "broad_activity_type",
            "activity_type",
            "sedentary_not",
            "walking_not",
            "activity_intensity",
            "posture",
        ]
    ]
    .isna()
    .any(axis=1)
]

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity


In [None]:
# Check to make sure every time is incremented by 1 second.
# Within each (id, observation) group, "shift" holds the next row's timestamp;
# the last row of a group has no successor and gets a missing value.
sbs_check_time = sbs.assign(
    shift=sbs.groupby(["id", "observation"])["date_time"].shift(-1)
)
# Tally the gaps between consecutive rows (value_counts drops missing gaps).
(sbs_check_time["shift"] - sbs_check_time["date_time"]).value_counts()

0 days 00:00:01    509901
dtype: int64

In [None]:
# Write the second-by-second table to disk without the row index.
sbs.to_csv(path_or_buf="ACT24_groundtruth_secbysec.csv", index=False)

In [None]:
# Number of rows per id, ordered by id.
(
    sbs["id"]
    .value_counts()
    .sort_index()
)

102    21648
116    19069
117    21691
122    21810
124    21598
126    21611
127    21602
128    21361
129    21616
130    21601
131    21026
132    20506
133    21552
134    20694
135    21611
136    21611
138    15608
139    21604
140    21581
141    21670
143    21602
144    21661
150    24563
154    21053
Name: id, dtype: int64