# Space

In [None]:
import sys
import os 
import logging
import pandas as pd
from pprint import pprint 
from IPython.display import display, HTML

# Workspace root = the part of the current working directory that precedes the
# first occurrence of KEY. If KEY does not occur in the path, str.split returns
# the whole path, so the current directory is used unchanged.
KEY = '1-WORKSPACE'
WORKSPACE_PATH = os.getcwd().split(KEY)[0]
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)  # the relative paths in SPACE below are resolved from here

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

# Folder layout, relative to WORKSPACE_PATH. Plain string literals: none of
# these values interpolate anything, so no f-string prefix is needed.
SPACE = {
    'DATA_RAW': '_Data/0-Data_Raw',
    'DATA_RFT': '_Data/1-Data_RFT',
    'DATA_CASE': '_Data/2-Data_CASE',
    'DATA_AIDATA': '_Data/3-Data_AIDATA',
    'DATA_EXTERNAL': 'code/external',
    'CODE_FN': 'code/pipeline',
}
assert os.path.exists(SPACE['CODE_FN']), f'{SPACE["CODE_FN"]} not found'

print(SPACE['CODE_FN'])
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])

# Step 0: CaseFn to Focus

In [None]:
import itertools 
from recfldtkn.case_base.casefnutils.ro import parse_ROName
from recfldtkn.case_base.caseset import get_HumanRecordRecfeatArgs_from_RONameToRONameInfo
from recfldtkn.case_base.case_base import OneCohortTrigger_CaseBase
# from recfldtkn.case_base.case_base import CaseSetManager

############################################# prepare input arguments: ROName_list
CaseFnName = 'DMEInfoAf2h'

# Observation-window settings, keyed by the checkpoint-period name used in the
# RO names below ('Af2H').
Ckpd_to_CkpdObsConfig = {
    'Af2H': dict(
        DistStartToPredDT=1,
        DistEndToPredDT=121,
        TimeUnit='min',
        StartIdx5Min=1,
        EndIdx5Min=24,
    ),
}


# (P, CGM5Min)
# (hP, rCGM5Min)
# cBf24H
# Short label -> full RO name.
# NOTE(review): the name looks like h<Human>.r<Record>.c<Ckpd> - confirm against parse_ROName.
RO_to_ROName = dict(
    Med='hP.rMed5Min.cAf2H',
    Exercise='hP.rExercise5Min.cAf2H',
    Diet='hP.rDiet5Min.cAf2H',
)

ROName_list = list(RO_to_ROName.values())
print(ROName_list, '<--ROName_list\n')

# Parse every RO name once; the parsed info drives the record/feature arguments.
ROName_to_RONameInfo = {
    name: parse_ROName(name)
    for name in ROName_list
}
HumanRecordRecfeat_Args = get_HumanRecordRecfeatArgs_from_RONameToRONameInfo(ROName_to_RONameInfo)
pprint(HumanRecordRecfeat_Args, sort_dicts=False)
############### then you need to write something about your CF. 

In [None]:
# you might need to think about how to get your token in advance.
#############################################

# BGvalue: between 20 and 400 
# threshold: 70, 180

# Feature names produced for every RO; each is prefixed with the RO label below.
base_idx2tkn = [
    # 'NoObs',
    'RecNum',      # how many records in the RO
    'FirstToNow',  # ObsDT (case_example), ROData (df):
                   # minutes between the first record and the ObsDT of case_example.
    'LastToNow',   # ObsDT (case_example), ROData (df):
                   # minutes between the last record and the ObsDT of case_example.
]

# One group of tokens per RO label, in label order: MedRecNum, MedFirstToNow, ...
item = ['Med', 'Exercise', 'Diet']
idx2tkn = [f'{i}{tkn}' for i in item for tkn in base_idx2tkn]

# Token -> position in idx2tkn.
tkn2tid = dict(zip(idx2tkn, range(len(idx2tkn))))
COVocab = dict(idx2tkn=idx2tkn, tkn2tid=tkn2tid)

pprint(COVocab, sort_dicts=False, compact=True)
#############################################

# Step 1: OneCohort Record Base and Case Base

In [None]:
from config.config_record.Cohort import CohortName_to_OneCohortArgs
from recfldtkn.record_base import OneCohort_Record_Base
import copy

#############################
CohortName = 'WellDoc2023CVSDeRx'
TriggerName = 'CGM5MinEntry'
# Keep only the selected cohort's arguments (empty dict if the name is unknown).
CohortName_to_OneCohortArgs = {
    name: args
    for name, args in CohortName_to_OneCohortArgs.items()
    if name == CohortName
}
#############################

#### update the HumanRecordRecfeat_Args based on the TriggerCaseBase_Args
# Deep copy so the original HumanRecordRecfeat_Args stays untouched, then add
# the CGM5Min record under human 'P' with an empty list.
Final_HumanRecordRecfeat_Args = copy.deepcopy(HumanRecordRecfeat_Args)
Final_HumanRecordRecfeat_Args['P']['CGM5Min'] = []
print(Final_HumanRecordRecfeat_Args)


onecohort_record_base = OneCohort_Record_Base(
    CohortName,
    Final_HumanRecordRecfeat_Args,
    CohortName_to_OneCohortArgs,
    SPACE,
)
onecohort_record_base.setup_NameToFn()
onecohort_record_base.initialize_NameToObject()

In [None]:
# Print the record base's Name_to_HRF attribute for inspection.
# NOTE(review): the trailing note looks like an example drill-down path - confirm.
print(onecohort_record_base.Name_to_HRF) # ['P']['CGM5Min'].OneRecord_Args

In [None]:
# very high level config
# not related to any specific CF

from config.config_case.Flt import FltName_to_FltArgs
from config.config_case.TagRec import TagRec_to_TagRecArgs


# Settings imported from the config package and handed to the case base as-is.
Case_Args_Settings = dict(
    FltName_to_FltArgs=FltName_to_FltArgs,
    TagRec_to_TagRecArgs=TagRec_to_TagRecArgs,
)

# Processing options for this development run.
Case_Proc_Config = dict(
    max_trigger_case_num=10000,
    use_task_cache=False,
    caseset_chunk_size=50000,
    load_casecollection=False,
    save_data=False,
    load_data=False,
    via_method='ds',
    n_cpus=1,
    batch_size=1000,
)

#### Eventually, we need to build the TriggerCaseBase_Args 
TriggerCaseBase_Args = {'Trigger': {'TriggerName': TriggerName}}

onecohort_trigger_casebase = OneCohortTrigger_CaseBase(
    onecohort_record_base=onecohort_record_base,
    TriggerCaseBase_Args=TriggerCaseBase_Args,
    Case_Proc_Config=Case_Proc_Config,
    Case_Args_Settings=Case_Args_Settings,
    SPACE=SPACE,
)

onecohort_trigger_casebase.init_OneCohortTrigger()
CaseSetName_to_caseset = onecohort_trigger_casebase.CaseSetName_to_caseset
# Print every case set that was created.
for caseset in CaseSetName_to_caseset.values():
    print(caseset)

In [None]:
# Take the first (name, caseset) pair in insertion order.
CaseSetName, caseset = list(CaseSetName_to_caseset.items())[0]
print(caseset)

# The case table of the selected case set; shown as the cell output.
df_case = caseset.df_case
df_case

In [None]:
# Draw one case reproducibly (fixed random_state) as a column -> list-of-values dict.
case_examples = caseset.df_case.sample(n=1, random_state=1).to_dict(orient='list')
# Make sure every ObsDT value is a pandas datetime.
case_examples['ObsDT'] = list(map(pd.to_datetime, case_examples['ObsDT']))
pprint(case_examples, sort_dicts=False)

# Step 2: HRFDirectory and ROName_to_ROData

In [None]:
from recfldtkn.case_base.casefnutils.hrf import get_HumanDirectoryArgs_ForBatch, get_HRFDirectory_from_HumanDirectory

# One-row batch built from the sampled case.
df_case_batch = pd.DataFrame(case_examples)
HumanDirectory_Args = get_HumanDirectoryArgs_ForBatch(df_case_batch, HumanRecordRecfeat_Args)
# HumanDirectory_Args

HRFDirectory = get_HRFDirectory_from_HumanDirectory(
    onecohort_record_base,
    HumanDirectory_Args,
    HumanRecordRecfeat_Args,
)
pprint(HRFDirectory, sort_dicts=False)
print(list(HRFDirectory))

# The single case, as a plain dict, used by the cells that follow.
case_example = df_case_batch.iloc[0].to_dict()
pprint(case_example, sort_dicts=False)

In [None]:
from datetime import datetime 
from recfldtkn.case_base.casefnutils.ro import get_RONameToROInfo, get_RONameToROData_for_OneCaseExample

# Start from empty caches for this run.
RO_to_Cache, RCKPD_to_Cache = {}, {}


# Time the RO info construction (the print falls inside the timed span).
s = datetime.now()
ROName_to_ROInfo = get_RONameToROInfo(ROName_list, onecohort_record_base, Ckpd_to_CkpdObsConfig)
print(ROName_to_ROInfo)
e = datetime.now()
print(e - s)

# Time the retrieval of the RO data for the single case.
s = datetime.now()
ROName_to_ROData = get_RONameToROData_for_OneCaseExample(
    case_example,
    ROName_to_ROInfo,
    HRFDirectory,
    RO_to_Cache,
    RCKPD_to_Cache,
    caseset,
)
e = datetime.now()
print(e-s)

pprint(case_example)
# Show each RO's data under its name.
for ROName in ROName_to_ROData:
    ROData = ROName_to_ROData[ROName]
    print(ROName)
    display(ROData)


# Step 3: Develop CaseFn

In [None]:
# Look at the data of the first RO; only the last expression is rendered.
ROName = ROName_list[0]
ROData = ROName_to_ROData[ROName]
ROData

In [None]:
# Dump the inputs a case function receives, for reference while developing.
pprint(ROName_to_ROData)
pprint(onecohort_record_base.Name_to_HRF)
pprint(ROName_list)
pprint(case_example, sort_dicts=False)

# developing the code from here.
ROName = ROName_list[0]
#############################################
ROData = ROName_to_ROData[ROName]
# `df` is just another name for ROData here (the .to_pandas() conversion is commented out).
df = ROData# .to_pandas() 
df

In [None]:
import inspect

def fn_CaseFn(case_example,     # <--- case to process
               ROName_list,      # <--- from COName
               ROName_to_ROData, # <--- in scope of case_example
               ROName_to_ROInfo, # <--- in scope of CaseFleshingTask
               COVocab,          # <--- in scope of CaseFleshingTask, from ROName_to_ROInfo
               caseset,          # <--- in scope of CaseFleshingTask,
               ):
    """Summarise each RO of one case as a record count and two time gaps.

    For every RO name, three values are written to the result under the keys
    ``<label>RecNum``, ``<label>FirstToNow`` and ``<label>LastToNow``:

    * ``RecNum``     - number of rows in the RO data (0 when the data is None).
    * ``FirstToNow`` - observation datetime minus the datetime of the first row,
      in minutes, rounded to 2 decimals.
    * ``LastToNow``  - observation datetime minus the datetime of the last row,
      in minutes, rounded to 2 decimals.

    Both gaps are 0.0 when the RO has no rows. "First" and "last" are the
    positional first and last rows (``iloc[0]`` / ``iloc[-1]``); rows are not
    sorted here.

    Args:
        case_example: mapping indexed by ``caseset.ObsDTName`` to obtain the
            observation datetime (only read when an RO has rows).
        ROName_list: the RO names to process; must contain exactly three.
        ROName_to_ROData: RO name -> record table, or None. A table must
            support ``len()`` and ``.iloc[i][column]``.
        ROName_to_ROInfo: RO name -> dict whose ``'record'`` entry exposes
            ``RecDT``, the name of the record datetime column.
        COVocab: not read by this function; kept for the common signature.
        caseset: object exposing ``ObsDTName``.

    Returns:
        dict: flat mapping of ``<label><feature>`` -> value. Per RO, the keys
        are emitted in the order RecNum, FirstToNow, LastToNow, which matches
        the token order used when the vocabulary is built.

    Raises:
        AssertionError: if ``ROName_list`` does not hold exactly three names.
    """
    assert len(ROName_list) == 3

    # Helpers are nested on purpose: the source of this function alone is what
    # gets exported, so it must be self-contained.
    def map_ROName(ROName):
        """Return the short label of an RO name, or the name itself if unrecognised."""
        if 'Med5Min' in ROName:
            return 'Med'
        elif 'Exercise5Min' in ROName:
            return 'Exercise'
        elif 'Diet5Min' in ROName:
            return 'Diet'
        else:
            return ROName

    def minutes_to_obs(ObsDTValue, RecDTValue):
        """Minutes from a record datetime to the observation datetime, 2 decimals."""
        return round((ObsDTValue - RecDTValue).total_seconds() / 60, 2)

    #############################################
    d_total = {}

    for ROName in ROName_list:
        ROData = ROName_to_ROData[ROName]

        # A missing table (None) and an empty table are treated the same way.
        RecNum = 0 if ROData is None else len(ROData)
        if RecNum == 0:
            FirstToNow = 0.0
            LastToNow = 0.0
        else:
            ObsDTValue = case_example[caseset.ObsDTName]
            RecDT = ROName_to_ROInfo[ROName]['record'].RecDT
            FirstToNow = minutes_to_obs(ObsDTValue, ROData.iloc[0][RecDT])
            LastToNow = minutes_to_obs(ObsDTValue, ROData.iloc[-1][RecDT])

        # Fixed key order per RO, consistent across all branches.
        item = map_ROName(ROName)
        d_total[f'{item}RecNum'] = RecNum
        d_total[f'{item}FirstToNow'] = FirstToNow
        d_total[f'{item}LastToNow'] = LastToNow

    #############################################
    return d_total

# Attach the function's own source text (from inspect.getsource) as a
# `fn_string` attribute on the function object.
fn_CaseFn.fn_string = inspect.getsource(fn_CaseFn)

In [None]:
print(ROName_list)

# Run the case function once on the sampled case and show its output.
COData = fn_CaseFn(
    case_example=case_example,          # <--- case to process
    ROName_list=ROName_list,            # <--- from COName
    ROName_to_ROData=ROName_to_ROData,  # <--- in scope of case_example
    ROName_to_ROInfo=ROName_to_ROInfo,  # <--- in scope of CaseFleshingTask
    COVocab=COVocab,                    # <--- in scope of CaseFleshingTask, from ROName_to_ROInfo
    caseset=caseset,                    # <--- in scope of CaseFleshingTask,
)
pprint(COData, sort_dicts=False)

In [None]:
# Show the token -> id mapping of the vocabulary.
COVocab['tkn2tid']

# Step 4: Save Files

In [None]:
from recfldtkn.base import Base 
from recfldtkn.case_base.casefnutils.casefn import CASESET_CASEFN_PATH
# Lines passed as `prefix` when generating the module text.
prefix = ['import pandas as pd', 'import numpy as np']

# Everything handed to the generator: the case-function name, the
# configuration objects defined in this notebook, and the function itself.
string_variables = [CaseFnName]
iterative_variables = [Ckpd_to_CkpdObsConfig, 
                       RO_to_ROName, 
                       ROName_to_RONameInfo, 
                       HumanRecordRecfeat_Args, 
                       COVocab]

fn_variables = [fn_CaseFn]
pycode = Base.convert_variables_to_pystirng(
    string_variables = string_variables,
    iterative_variables = iterative_variables, 
    fn_variables = fn_variables, 
    prefix = prefix)
pypath = os.path.join(SPACE['CODE_FN'], CASESET_CASEFN_PATH, f'{CaseFnName}.py')

# exist_ok=True replaces the separate exists() check (no check-then-create gap).
os.makedirs(os.path.dirname(pypath), exist_ok=True)
# Python reads source files as UTF-8 by default, so write the module as UTF-8
# rather than with the platform's locale encoding.
with open(pypath, 'w', encoding='utf-8') as file:
    file.write(pycode)

# Create a HTML link and display it
full_path = os.path.join(WORKSPACE_PATH, pypath)
display(HTML(f'{pypath} <a href="{full_path}" target="_blank">Open File</a>'))

# Step 5: Test

In [None]:
from recfldtkn.case_base.caseset import get_CaseFnTaskArgs_from_CaseFnNameList
# Build the task arguments for the case function named above.
CF_list = [CaseFnName]
CaseFnTaskArgs = get_CaseFnTaskArgs_from_CaseFnNameList(CF_list, onecohort_record_base, SPACE)

# From here on, HumanRecordRecfeat_Args is the version returned in the task arguments.
HumanRecordRecfeat_Args = CaseFnTaskArgs['HumanRecordRecfeat_Args']
CaseFnName_to_CaseFnInfo = CaseFnTaskArgs['CaseFnName_to_CaseFnInfo']
pprint(CaseFnName_to_CaseFnInfo)
print(list(CaseFnTaskArgs))

## Test 1

In [None]:
# Test 1: call fn_CaseFn directly on the first TEST_NUM cases and record, per
# case, the seconds spent fetching RO data and the seconds spent in the function.
log = []
results = []
s_total = datetime.now()

TEST_NUM = min(1000, len(caseset.df_case))
df_case_batch = caseset.df_case.iloc[: TEST_NUM]

s = datetime.now()
HumanDirectory_Args = get_HumanDirectoryArgs_ForBatch(df_case_batch, HumanRecordRecfeat_Args)
HRFDirectory = get_HRFDirectory_from_HumanDirectory(onecohort_record_base, HumanDirectory_Args, HumanRecordRecfeat_Args)
e = datetime.now()
du = e - s
print(f'Get HRFDirectory for the batch of size {TEST_NUM}', du)

# Empty caches so the timings below start cold.
RO_to_Cache = {}
RCKPD_to_Cache = {}

for i in range(TEST_NUM):
    log_i = {}
    # Same row as caseset.df_case.iloc[i]; taken from the batch for clarity.
    case_example = df_case_batch.iloc[i]
    # print(case_example.to_dict())
    s = datetime.now()
    ROName_to_ROData = get_RONameToROData_for_OneCaseExample(case_example, 
                                                            ROName_to_ROInfo, 
                                                            HRFDirectory, 
                                                            RO_to_Cache, 
                                                            RCKPD_to_Cache,
                                                            caseset)
    e = datetime.now()
    du = e - s
    log_i['ROData'] = du.total_seconds()


    ##########################################
    s = datetime.now()
    CaseData = fn_CaseFn(case_example,     # <--- case to process
                        ROName_list,      # <--- from COName
                        ROName_to_ROData, # <--- in scope of case_example
                        ROName_to_ROInfo, # <--- in scope of CaseFleshingTask
                        COVocab,          # <--- in scope of CaseFleshingTask, from ROName_to_ROInfo
                        caseset,          # <--- in scope of CaseFleshingTask,
                        )
    
    results.append(CaseData)
    e = datetime.now()
    ##########################################
    
    du = e - s
    log_i['CaseData'] = du.total_seconds()
    log.append(log_i)

e_total = datetime.now()
print('Total Time:', e_total - s_total)

# Per-case timings: mean of each step, then a plot over the cases.
df = pd.DataFrame(log)
print(df.mean())
df.plot()

df_results = pd.DataFrame(results)
# print(df_results['num'].value_counts())
# A bare expression in the middle of a cell renders nothing; display it explicitly.
display(df_results.head())

# df_results has a fresh 0..N-1 index, and concat(axis=1) aligns rows by index
# label, so the batch index is reset to pair rows by position.
df_case_batch = pd.concat([df_case_batch.reset_index(drop=True), df_results], axis = 1)
df_case_batch.head()

## Test 2

In [None]:
from recfldtkn.case_base.casefnutils.casefn import Case_Fn, get_CaseFnNameToCaseFnData_for_OneCaseExample


# Test 2: same loop as Test 1, but the case function is run through
# get_CaseFnNameToCaseFnData_for_OneCaseExample using CaseFnName_to_CaseFnInfo.
results = []
log = []
s_total = datetime.now()

TEST_NUM = min(1000, len(caseset.df_case))
df_case_batch = caseset.df_case.iloc[: TEST_NUM]

s = datetime.now()
HumanDirectory_Args = get_HumanDirectoryArgs_ForBatch(df_case_batch, HumanRecordRecfeat_Args)
HRFDirectory = get_HRFDirectory_from_HumanDirectory(onecohort_record_base, HumanDirectory_Args, HumanRecordRecfeat_Args)
e = datetime.now()
du = e - s
print(f'Get HRFDirectory for the batch of size {TEST_NUM}', du)

# Empty caches so the timings below start cold.
RO_to_Cache = {}
RCKPD_to_Cache = {}

for i in range(TEST_NUM):
    log_i = {}
    # Same row as caseset.df_case.iloc[i]; taken from the batch for clarity.
    case_example = df_case_batch.iloc[i]
    # print(case_example.to_dict())
    s = datetime.now()
    ROName_to_ROData = get_RONameToROData_for_OneCaseExample(case_example, 
                                                            ROName_to_ROInfo, 
                                                            HRFDirectory, 
                                                            RO_to_Cache, 
                                                            RCKPD_to_Cache,
                                                            caseset)
    e = datetime.now()
    du = e - s
    log_i['ROData'] = du.total_seconds()


    ##########################################
    s = datetime.now()
    CaseFnName_to_CaseFnData = get_CaseFnNameToCaseFnData_for_OneCaseExample(
                                                  case_example, 
                                                  CaseFnName_to_CaseFnInfo,
                                                  ROName_to_ROInfo,
                                                  ROName_to_ROData,
                                                  caseset, # <--- this is weird, what do you want here? the caseset information?
                                                           # <--- maybe remove it in the future. 
                                                  )
    results.append(CaseFnName_to_CaseFnData)
    e = datetime.now()
    ##########################################
    
    du = e - s
    log_i['CaseData'] = du.total_seconds()
    log.append(log_i)

e_total = datetime.now()
print('Total Time:', e_total - s_total)

# Per-case timings: mean of each step, then a plot over the cases.
df = pd.DataFrame(log)
print(df.mean())
df.plot()

df_results = pd.DataFrame(results)
# print(df_results['NumRxBf1M-num'].value_counts())
# A bare expression in the middle of a cell renders nothing; display it explicitly.
display(df_results.head())
# df_results has a fresh 0..N-1 index, and concat(axis=1) aligns rows by index
# label, so the batch index is reset to pair rows by position.
df_case_batch = pd.concat([df_case_batch.reset_index(drop=True), df_results], axis = 1)
df_case_batch.tail()