# Space

In [None]:
import sys
import os 
import logging
import pandas as pd
from pprint import pprint 
from IPython.display import display, HTML

# Workspace root = everything in the current directory path that precedes KEY.
# (If KEY does not occur in the path, split() returns the path unchanged.)
KEY = '1-WORKSPACE'
WORKSPACE_PATH = os.getcwd().split(KEY)[0]
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s',
)

# Folder layout. All paths are relative, so they resolve against the directory
# selected by the os.chdir() call above.
SPACE = dict(
    DATA_RAW='./_Data/0-Data_Raw',
    DATA_RFT='./_Data/1-Data_RFT',
    DATA_CASE='./_Data/2-Data_CASE',
    DATA_AIDATA='./_Data/3-Data_AIDATA',
    DATA_EXTERNAL='./code/external',
    CODE_FN='./code/pipeline',
)
assert os.path.exists(SPACE['CODE_FN']), f'{SPACE["CODE_FN"]} not found'

# Make the pipeline code importable for the cells below.
print(SPACE['CODE_FN'])
sys.path.append(SPACE['CODE_FN'])

# Step 1: Cohort and Human

In [None]:
from config.config_record.Cohort import CohortName_to_OneCohortArgs

# Choose the cohort to work on: keep exactly one `CohortName = ...` line active.
# The name is then used as a key into CohortName_to_OneCohortArgs.
# %%%%%%%%%%%%%%%%%%%%% user
# CohortName = 'WellDoc2023CVSDeRx'
# CohortName = 'WellDoc2025LLY'
CohortName = 'WellDoc2025ALS'
# CohortName = 'WellDoc2025CVS'
OneCohort_Args = CohortName_to_OneCohortArgs[CohortName]
# %%%%%%%%%%%%%%%%%%%%%


In [None]:
[i for i in CohortName_to_OneCohortArgs]

In [None]:
# %%%%%%%%%%%%%%%%%%%%% user
HumanName = 'P'
# %%%%%%%%%%%%%%%%%%%%% user


In [None]:
from recfldtkn.record_base.cohort import CohortFn, Cohort
# Build the cohort object: look up the source-to-cohort name in the cohort args,
# create the matching CohortFn, attach it, then initialize with load_data=True.
Source2CohortName = OneCohort_Args['Source2CohortName']
cohort_fn = CohortFn(Source2CohortName, SPACE)
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)
cohort.setup_fn(cohort_fn)
cohort.initialize_cohort(load_data = True)

In [None]:
cohort.pypath

In [None]:
cohort.RawName_to_dfRaw


In [None]:
from recfldtkn.record_base.human import HumanFn, Human   

# Build the human object on top of the cohort created above.
# Note load_data=False here, whereas the Step 15 test cell uses load_data=True.
human_fn = HumanFn(HumanName, SPACE)
human = Human(HumanName, cohort, human_fn)
human.setup_fn(human_fn)
human.initialize_human(load_data = False)

In [None]:
human.pypath

In [None]:
human.df_Human

# Step 2: OneRecord_Args

In [None]:
# %%%%%%%%%%%%%%%%%%%%% user
RecordName = 'Exercise5Min'
# %%%%%%%%%%%%%%%%%%%%% user

## %%%%%%%%%%%%%%%%%%%%% user
# Record definition. The record name and its ID column are derived from
# RecordName so the three spellings cannot drift apart.
OneRecord_Args = dict(
    RecordName=RecordName,
    RecID=f'{RecordName}ID',
    RecIDChain=['PID'],
    RawHumanID='PatientID',
    ParentRecName='P',
    RecDT='DT_s',
    RawNameList=['Exercise'],
    human_group_size=100,
    rec_chunk_size=100000,
)
## %%%%%%%%%%%%%%%%%%%%% user

In [None]:
RawName_to_dfRaw = cohort.RawName_to_dfRaw
[RawName for RawName in RawName_to_dfRaw]

In [None]:
from recfldtkn.record_base.record import RecordFn, Record

# Prototype record: constructed with record_fn=None; its helper methods are
# called step by step in the cells below.
record_fn = None 
record = Record(RecordName, human, record_fn)
record 

In [None]:
OneCohort_Args = cohort.OneCohort_Args
RawName_to_dfRaw = cohort.RawName_to_dfRaw
df_Human = human.df_Human

# Step 3: RecordPrtInfo

In [None]:
OneHuman_Args = human.OneHuman_Args

OneCohort_Args

In [None]:
# Prepare the parent record, when the record definition names one.
if OneRecord_Args['ParentRecName'] is None:
    record_prt = None
else:
    # Full version: setup_fn -> setup_prt -> initialize_record.
    # (A shorter "neat" variant noted by the author omits the setup_prt() call.)
    record_prt = Record(OneRecord_Args['ParentRecName'], human)
    record_prt.setup_fn()
    record_prt.setup_prt()
    record_prt.initialize_record()

print('record_prt:', record_prt)

In [None]:
# Derive the parent-record info, then re-read df_Human from the human object
# (it is fetched again after the call, as in the original cell).
RecordPrtInfo = record.get_RecordPrtInfo(OneRecord_Args, human, record_prt)
df_Human = human.df_Human
RecordPrtInfo

In [None]:
df_Human.columns

# Step 4: Display Record's RawColumns

In [None]:
OneRecord_Args['RawNameList']

In [None]:
RawNameList = OneRecord_Args['RawNameList']
RawName_to_dfRaw = cohort.RawName_to_dfRaw
RawName_to_Sample = record.display_Record_RawNameCols(RawNameList, RawName_to_dfRaw)

In [None]:
for RawName, df_sample in RawName_to_Sample.items():
    print(f'\n===== {RawName} =====')
    display(df_sample)

# Step 5: Select Useful Raw Columns


In [None]:
OneRecord_Args['RawNameList']

In [None]:
## %%%%%%%%%%%%%%%%%%%%% user
RawName = 'Exercise'

# Raw columns to keep from the 'Exercise' table.
raw_columns = [
    'PatientID',
    'ObservationDateTime',
    'ObservationEntryDateTime',
    'TimezoneOffset',
    'Timezone',
    'ExerciseType',
    'ExerciseIntensity',
    'TimeSinceExercise',
    'ActivityTypeID',
    'ExerciseDuration',
    'CaloriesBurned',
    'DistanceInMeters',
    'ActivityType',
]

# Keys the author left disabled: 'raw_base_columns', 'raw_datetime_column'.
RawConfig = dict(raw_columns=raw_columns, rec_chunk_size=100000)
RawName_to_RawConfig = {RawName: RawConfig}
## %%%%%%%%%%%%%%%%%%%%% user

RawName_to_RawConfig

# Step 6: Get df_HumanSelected

In [None]:
[RawName for RawName in OneRecord_Args['RawNameList']]

In [None]:
RawName_to_dfRaw

In [None]:
# Refresh the argument objects from their owners before selecting humans.
OneCohort_Args, OneHuman_Args = cohort.OneCohort_Args, human.OneHuman_Args
df_Human = human.df_Human

df_HumanSelected = record.get_dfHumanSelected(
    OneCohort_Args,
    OneHuman_Args,
    OneRecord_Args,
    df_Human,
)
df_HumanSelected

# Step 7: Get df_HumanGroup

In [None]:
# Take only the first human group; the following steps are prototyped on it.
# Fail loudly when there is no group at all: the previous `for ... break` form
# silently left df_HumanGroup undefined (or stale from an earlier run) in that case.
try:
    human_group, df_HumanGroup = next(iter(df_HumanSelected.groupby('human_group')))
except StopIteration:
    raise ValueError('df_HumanSelected contains no human_group to process') from None

logger.info(f'\n====={human_group}=====')
display(df_HumanGroup)

# Step 8: Get df_RawRec_for_HumanGroup

In [None]:
df_RawRec_for_HumanGroup = record.get_dfRawRec_from_dfHumanGroup(OneHuman_Args,
                                                                df_HumanGroup,
                                                                RawName_to_RawConfig, 
                                                                RawName_to_dfRaw)
df_RawRec_for_HumanGroup

# Step 9: Get df_RawRecProc_for_HumanGroup

In [None]:
import inspect 

## %%%%%%%%%%%%%%%%%%%%% user
def get_RawRecProc_for_HumanGroup(df_RawRec_for_HumanGroup, OneRecord_Args, df_Human):
    """Turn one human group's raw Exercise rows into 5-minute exercise records.

    NOTE: the source of this function is captured with ``inspect.getsource`` and
    exported together with ``import pandas as pd`` / ``import numpy as np``
    (presumably verbatim -- confirm in ``Base``). Keep it self-contained: helpers
    must stay nested inside this body and only ``pd`` / ``np`` may be assumed.

    Processing steps:
      1. drop rows with an implausible ``TimezoneOffset`` (|offset| >= 1000);
      2. parse the raw datetime columns;
      3. build local-time columns ``DT_r`` (entry time), ``DT_s`` (start time,
         falling back to ``DT_r``) and ``DT_e`` (start + duration);
      4. snap start times that follow an anchor entry within 31 minutes onto
         that anchor (per human);
      5. floor all DT columns to 5 minutes and aggregate per (human, ``DT_s``);
      6. keep records whose total ``ExerciseDuration`` lies in [5, 120].

    Args:
        df_RawRec_for_HumanGroup: raw rows; must contain the human-ID column,
            ``ObservationDateTime``, ``ObservationEntryDateTime``,
            ``TimezoneOffset`` and the exercise columns aggregated below.
            The input frame is not modified.
        OneRecord_Args: record definition; only ``'RawHumanID'`` is read.
        df_Human: one row per human with the human-ID column and ``user_tz``
            (the fallback offset used when ``TimezoneOffset`` is 0).

    Returns:
        DataFrame with one row per (human, ``DT_s``) and the columns
        human ID, ``DT_s``, ``DT_r``, ``DT_tz``, ``ExerciseType``,
        ``ExerciseIntensity``, ``ExerciseDuration``, ``CaloriesBurned``,
        ``DistanceInMeters``, ``ActivityType``, ``time_to_last_entry``
        (gap to the human's previous record, in units of 5 minutes).

    Raises:
        AssertionError: if the merge with ``df_Human`` changes the row count or
            any of ``DT_r`` / ``DT_s`` / ``DT_e`` cannot be computed.
    """
    RawHumanID = OneRecord_Args['RawHumanID']
    RecDT = 'DT_s'

    densify_window = pd.Timedelta(minutes=31)      # entries this close to an anchor are merged into it
    earliest_valid_DT = pd.Timestamp('2010-01-01')  # observation times at/before this are treated as missing
    min_duration, max_duration = 5, 120             # accepted total ExerciseDuration per record

    # 1. filter out the records we don't need.
    #    Rows with a missing offset fail the comparison and are dropped as well.
    df = df_RawRec_for_HumanGroup
    df = df[df['TimezoneOffset'].abs() < 1000].reset_index(drop=True)

    # 2. parse the raw datetime columns.
    for DTCol in ['ObservationDateTime', 'ObservationEntryDateTime']:
        df[DTCol] = pd.to_datetime(df[DTCol], format='mixed')

    # 3a. attach each human's default offset; a left merge must never add rows.
    n_before = len(df)
    df = pd.merge(df, df_Human[[RawHumanID, 'user_tz']], on=RawHumanID, how='left')
    assert len(df) == n_before, (
        f'merge with df_Human changed the row count ({n_before} -> {len(df)}); '
        f'{RawHumanID} is probably duplicated in df_Human'
    )

    # A zero offset is treated as "unknown" and replaced by the human's user_tz.
    offset = df['TimezoneOffset']
    df['DT_tz'] = offset.where(offset != 0).fillna(df['user_tz'])
    tz_shift = pd.to_timedelta(df['DT_tz'], 'm')

    # 3b. DT_r: local entry time.
    df['DT_r'] = df['ObservationEntryDateTime'] + tz_shift
    assert df['DT_r'].notna().all(), 'DT_r has missing values'

    # 3c. DT_s: local start time; implausibly old observation times fall back to DT_r.
    observed = df['ObservationDateTime']
    df['DT_s'] = (observed.where(observed > earliest_valid_DT) + tz_shift).fillna(df['DT_r'])
    assert df['DT_s'].notna().all(), 'DT_s has missing values'

    # 3d. DT_e: end time. It is not part of the aggregated output, but it still
    #     takes part in the duplicate detection below.
    df['DT_e'] = df['DT_s'] + pd.to_timedelta(df['ExerciseDuration'], 'm')
    assert df['DT_e'].notna().all(), 'DT_e has missing values'

    df = df.drop_duplicates()
    df['DT_tz'] = df['DT_tz'].fillna(0).astype(int)

    def densify_timestamps(frame):
        """Snap each start time onto the preceding anchor of the same human.

        Rows are sorted by (human, DT_s). The first row of a human is an anchor;
        every following row within `densify_window` of the anchor gets the
        anchor's DT_s, and the first row beyond the window becomes the new anchor.
        The human-ID column stays a regular column so that the later
        drop_duplicates() can never merge rows of different humans.
        """
        frame = frame.sort_values(by=[RawHumanID, RecDT]).copy()
        anchored = []
        current_id, anchor = None, None
        for human_id, ts in zip(frame[RawHumanID].tolist(), frame[RecDT].tolist()):
            if anchor is None or human_id != current_id or ts > anchor + densify_window:
                current_id, anchor = human_id, ts
            anchored.append(anchor)
        if anchored:
            frame[RecDT] = anchored
        return frame

    # 4. merge bursts of entries onto their anchor start time.
    df = densify_timestamps(df)

    # 5a. floor every DT column to the 5-minute grid.
    for DTCol in ['DT_s', 'DT_r', 'DT_e']:
        df[DTCol] = df[DTCol].dt.floor('5min')

    df = df.drop_duplicates()
    df['ExerciseDuration'] = df['ExerciseDuration'].astype(float)

    # 5b. one record per (human, RecDT).
    df = df.groupby([RawHumanID, RecDT]).agg(
        {
            'DT_r': 'first',
            'DT_tz': 'first',

            # Missing exercise types are skipped instead of crashing str.join.
            "ExerciseType": lambda x: "; ".join(x.dropna().astype(str)),
            "ExerciseIntensity": "first",
            'ExerciseDuration': 'sum',  # NOTE(review): the author questioned whether sum is right here
            "CaloriesBurned": "sum",
            'DistanceInMeters': 'sum',
            "ActivityType": "first",
        }
    ).reset_index()

    # Gap to the same human's previous record, in 5-minute units.
    # NOTE(review): computed before the duration filter below, so the "previous"
    # record may itself be filtered out -- confirm this is intended.
    df['time_to_last_entry'] = (
        df.groupby(RawHumanID, group_keys=False)[RecDT].diff().dt.total_seconds() / 60 / 5
    )

    # 6. keep only records whose total duration is within [min_duration, max_duration].
    df = df[df['ExerciseDuration'].between(min_duration, max_duration)].reset_index(drop=True)

    df_RawRecProc = df
    return df_RawRecProc
## %%%%%%%%%%%%%%%%%%%%% user

get_RawRecProc_for_HumanGroup.fn_string = inspect.getsource(get_RawRecProc_for_HumanGroup)

In [None]:
print('df_RawRec_for_HumanGroup:', df_RawRec_for_HumanGroup.shape)
df_RawRecProc_for_HumanGroup = get_RawRecProc_for_HumanGroup(df_RawRec_for_HumanGroup, OneRecord_Args, df_Human)
print('df_RawRecProc_for_HumanGroup:', df_RawRecProc_for_HumanGroup.shape)
df_RawRecProc_for_HumanGroup 

# Step 10: Get dfRecAttr from dfRawRec

In [None]:
df_RecAttr_for_HumanGroup = record.get_dfRecAttr_from_dfRawRec(df_RawRecProc_for_HumanGroup, OneRecord_Args, RecordPrtInfo)
df_RecAttr_for_HumanGroup

# Step 11: Attr_Cols

In [None]:
## %%%%%%%%%%%%%%%%%%%%% user
# Columns kept in the record attribute table, in output order.
# Left out on purpose by the author: 'DT_e', 'EntryType', 'ActivityTypeID'.
attr_cols = [
    # ----- identifiers
    'PID',
    'PatientID',
    'Exercise5MinID',
    # ----- datetimes
    'DT_tz',
    'DT_r',
    'DT_s',
    # ----- Value
    'ExerciseDuration',
    'ExerciseIntensity',
    'CaloriesBurned',
    'DistanceInMeters',
    'ExerciseType',
    'time_to_last_entry',
]

## %%%%%%%%%%%%%%%%%%%%% user

In [None]:
df_RecAttr_for_HumanGroup = df_RecAttr_for_HumanGroup[attr_cols].reset_index(drop=True)
df_RecAttr_for_HumanGroup

# Step 12: ds_RecAttr

In [None]:
# Build the record attribute dataset from the pieces prototyped above: the record
# definition, the raw-column config, the selected attribute columns and the
# user-defined processing function.
ds_RecAttr = record.get_dsRecAttr(OneRecord_Args = OneRecord_Args, 
                                  human = human, 
                                  RawName_to_RawConfig = RawName_to_RawConfig,
                                  attr_cols = attr_cols,
                                  get_RawRecProc_for_HumanGroup = get_RawRecProc_for_HumanGroup,
                                  record_prt = record_prt, 
                                  RecordPrtInfo = RecordPrtInfo)
ds_RecAttr

In [None]:
df = ds_RecAttr.to_pandas()
df['ExerciseDuration'].describe()

In [None]:
df

# Step 13: ds_RecIndex

In [None]:
ds_RecIndex = record.get_dsRecIndex(OneHuman_Args, OneRecord_Args, ds_RecAttr)
ds_RecIndex

In [None]:
ds_RecIndex.to_pandas()

# Step 14: Save Fn

In [None]:
# Export the record definition (args, raw config, attribute columns and the
# processing function) as a Python module at record.pypath.
pypath = record.pypath
print(pypath)

from recfldtkn.base import Base 
prefix = ['import pandas as pd', 'import numpy as np']
iterative_variables = [OneRecord_Args, RawName_to_RawConfig, attr_cols]
fn_variables = [get_RawRecProc_for_HumanGroup]
pycode = Base.convert_variables_to_pystirng(fn_variables = fn_variables, 
                                            iterative_variables = iterative_variables,
                                            prefix = prefix)

# exist_ok avoids the check-then-create race; an empty dirname (bare file name)
# means "current directory" and needs no creation.
pydir = os.path.dirname(pypath)
if pydir:
    os.makedirs(pydir, exist_ok=True)

# The output is Python source, which is read as UTF-8 by default, so write it
# as UTF-8 instead of the platform's locale encoding.
with open(pypath, 'w', encoding='utf-8') as file:
    file.write(pycode)

# Step 15: Test

In [None]:
from recfldtkn.record_base.cohort import CohortFn, Cohort
from config.config_record.Cohort import CohortName_to_OneCohortArgs

# Rebuild the cohort from scratch for the end-to-end test.
# CohortName is not reassigned in this cell (the line below is commented out),
# so the value selected in Step 1 is reused.
# %%%%%%%%%%%%%%%%%%%%% user
# CohortName = 'WellDoc2023CVSDeRx'
OneCohort_Args = CohortName_to_OneCohortArgs[CohortName]
# %%%%%%%%%%%%%%%%%%%%%

Source2CohortName = OneCohort_Args['Source2CohortName']
cohort_fn = CohortFn(Source2CohortName, SPACE)
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)
cohort.setup_fn(cohort_fn)
cohort.initialize_cohort(load_data = True)

In [None]:
from recfldtkn.record_base.human import HumanFn, Human   

# %%%%%%%%%%%%%%%%%%%%% user
HumanName = 'P'
# %%%%%%%%%%%%%%%%%%%%% user

# Rebuild the human object on the fresh cohort; unlike Step 1 this cell
# initializes with load_data=True.
human_fn = HumanFn(HumanName, SPACE)
human = Human(HumanName, cohort, human_fn)
human.setup_fn(human_fn)
human.initialize_human(load_data = True)

In [None]:
RecordName

In [None]:
from recfldtkn.record_base.record import RecordFn, Record
# RecordName is not reassigned here (both candidates below are commented out),
# so the value set in Step 2 is reused.
# %%%%%%%%%%%%%%%%%%%%%
# RecordName = 'Exercise' # Pat
# RecordName = 'Exercise5Min'
# %%%%%%%%%%%%%%%%%%%%%

# Build the record without passing a record_fn and with no arguments to
# setup_fn() -- presumably this picks up the module written in Step 14;
# NOTE(review): confirm against Record.setup_fn.
record = Record(RecordName, human)
record.setup_fn()
record.setup_prt() # <--- you can also put a given record_prt here.
record.initialize_record()    

In [None]:
record.ds_RecAttr

In [None]:
record.ds_RecIndex