# Space

In [1]:
import os
import sys 
import logging
import random
import pandas as pd 
from pprint import pprint 
from IPython.display import display, HTML

# WorkSpace
# The workspace root is everything in the current directory path that precedes
# the first occurrence of KEY, followed by KEY itself. It becomes both the
# working directory and an import root.
KEY = 'WorkSpace'
WORKSPACE_PATH = os.getcwd().partition(KEY)[0] + KEY
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)
sys.path.append(WORKSPACE_PATH)

# Pipeline Space
from proj_space import SPACE
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
sys.path.append(SPACE['CODE_FN'])
# pprint(SPACE)

# Available Packages
import argparse
import datasets
import pandas as pd
from datetime import datetime 


from recfldtkn.configfn import load_cohort_args
from recfldtkn.loadtools import load_module_variables, update_args_to_list
from recfldtkn.observer import get_RecObsName_to_RecObsInfo, CaseObserverTransformer
from config_observer.CKPD import ckpd_to_CkpdObsConfig

logger = logging.getLogger(__name__)
recfldtkn_config_path = os.path.join(SPACE['CODE_FN'], 'config_recfldtkn/')


g:\Shared drives\CDHAI-WellDoc\2024-WellDocTest-SPACE\_WellDoc-AI-CGMGPT-WorkSpace


# [Part 1] Get a Caseset: Case Examples


## [Step 1] Trigger Event

In [2]:
from recfldtkn.loadtools import fetch_TriggerEvent_tools

####################
# TriggerCaseMethod = 'CGM5MinEntry'
TriggerCaseMethod = 'FoodEntryEOD'
####################

Trigger_tools = fetch_TriggerEvent_tools(TriggerCaseMethod, SPACE)
Trigger_tools 

{'TriggerRecName': 'FoodRec',
 'case_id_columns': ['PID', 'ObsDT'],
 'special_columns': ['PID', 'DT_s'],
 'convert_TriggerEvent_to_Caseset': <function FoodEntryEOD.convert_TriggerEvent_to_Caseset(ds_rec, case_id_columns, special_columns, base_config)>}

In [3]:
##################################
CaseSetName = TriggerCaseMethod
case_id_columns = Trigger_tools['case_id_columns']
special_columns = Trigger_tools['special_columns']
##################################

TriggerCasePath = os.path.join(SPACE['DATA_CaseSet'], f'{CaseSetName}.p')
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)

cohort_args['ckpd_to_CkpdObsConfig'] = ckpd_to_CkpdObsConfig
cohort_args['case_id_columns'] = case_id_columns
cohort_args['ObsDTName'] = 'ObsDT'
cohort_args['PID_ObsDT_columns'] = [cohort_args['RootID'], cohort_args['ObsDTName']]

print(cohort_args)
print(TriggerCasePath)

{'CohortInfo': {'RawData2022_CGM': {'cohort_label': 1, 'cohort_name': 'RawData2022_CGM', 'FolderPath': '../_Data/0-Data_Raw/RawData2022_CGM/'}, 'RawData2023_CVSTDCAug': {'cohort_label': 2, 'cohort_name': 'RawData2023_CVSTDCAug', 'FolderPath': '../_Data/0-Data_Raw/RawData2023_CVSTDCAug/'}, 'RawData2023_CVSDeRxAug': {'cohort_label': 3, 'cohort_name': 'RawData2023_CVSDeRxAug', 'FolderPath': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/'}}, 'RawRootID': 'PatientID', 'RootID': 'PID', 'RecName': 'PRawRecNum', 'recattr_pyfolder': '../pipeline/fn_recattr/', 'fldtkn_pyfolder': '../pipeline/fn_fldtkn/', 'humanrec_pyfolder': '../pipeline/fn_humanrec/', 'inference_pyfolder': '../pipeline/fn_inference/', 'trigger_pyfolder': '../pipeline/fn_trigger/', 'RecName_to_RFT_GROUP_SIZE': {'CGM5Min': 100, 'Default': 5000}, 'RecName_to_RFT_idx_group_size': {'CGM5Min': 100, 'BGManual': 100, 'Default': 10000}, 'RecName_to_RFT_usebucket': {'CGM5Min': True, 'Default': False}, 'RootIDLength': 6, 'recfldtkn_config_p

In [4]:
# Load the 'CGM5MinEntry' caseset from its pickle under SPACE['DATA_CaseSet'].
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
CaseSetName = 'CGM5MinEntry'
TriggerCasePath = os.path.join(SPACE['DATA_CaseSet'], f'{CaseSetName}.p')
df_case1 = pd.read_pickle(TriggerCasePath) # (1000)

# Load the 'FoodEntryEOD' caseset the same way. CaseSetName and TriggerCasePath
# are left pointing at this second caseset once the cell has run.
CaseSetName = 'FoodEntryEOD'
TriggerCasePath = os.path.join(SPACE['DATA_CaseSet'], f'{CaseSetName}.p')
df_case2 = pd.read_pickle(TriggerCasePath) # (1000)

# Inner merge on the case id columns: keep only cases present in both casesets.
df_case = pd.merge(df_case1, df_case2, on=case_id_columns, how='inner')
# Alternative kept for reference: a fixed-seed sample of the first caseset.
# df_case = df_case1.sample(1000, random_state=0).reset_index(drop = True)

df_case

Unnamed: 0,PID,ObsDT
0,1000037,2021-01-07
1,1000037,2021-01-08
2,1000048,2021-01-11
3,1000048,2021-01-13
4,1000048,2021-01-14
...,...,...
2701,3000039,2023-08-06
2702,3000039,2023-08-07
2703,3000039,2023-08-08
2704,3000039,2023-08-11


In [5]:
TriggerCasePath = os.path.join(SPACE['DATA_CaseSet'], f'TestCaseSet.p')

df_case.to_pickle(TriggerCasePath)
df_case

Unnamed: 0,PID,ObsDT
0,1000037,2021-01-07
1,1000037,2021-01-08
2,1000048,2021-01-11
3,1000048,2021-01-13
4,1000048,2021-01-14
...,...,...
2701,3000039,2023-08-06
2702,3000039,2023-08-07
2703,3000039,2023-08-08
2704,3000039,2023-08-11


## [Step 2] Generate Arguments

In [6]:
#################################
# case_observations = [
#     't1.f1.CGM:ro.CGM5Min-Bf24H-N2Cin1_ct.nsTknIn5MinTS',
#     't2.f1.CGM:ro.CGM5Min-Af2H-N2Cin1_ct.nsTknIn5MinTS',
# ]
case_observations = [
    't1.f1.Food:ro.FoodRec-Bf24H-NutriN2C_ct.nsSparseTknIn5MinTS',
    't1.f2.Food:ro.FoodRec-Bf24H-SysCate_ct.nsSparseTknIn5MinTS',
    # 't1.f3.CGM:ro.CGM5Min-Bf24H-N2Cin1_ct.nsSparseTknIn5MinTS',


    't2.f1.Food:ro.FoodRec-Af2H-NutriN2C_ct.nsSparseTknIn5MinTS',
    't2.f2.Food:ro.FoodRec-Af2H-SysCate_ct.nsSparseTknIn5MinTS',
    # 't2.f3.CGM:ro.CGM5Min-Af2H-N2Cin1_ct.nsSparseTknIn5MinTS',
]

name_CaseGamma = 'CatCrossSparseTknCrossTS'
#################################

In [7]:
from pprint import pprint 
from recfldtkn.obsname import convert_case_observations_to_co_to_observation
from recfldtkn.obsname import get_RecNameList_and_FldTknList

co_to_COName, co_to_CONameInfo = convert_case_observations_to_co_to_observation(case_observations)
co_to_COName

{'t1.f1.Food': 'ro.FoodRec-Bf24H-NutriN2C_ct.nsSparseTknIn5MinTS',
 't1.f2.Food': 'ro.FoodRec-Bf24H-SysCate_ct.nsSparseTknIn5MinTS',
 't2.f1.Food': 'ro.FoodRec-Af2H-NutriN2C_ct.nsSparseTknIn5MinTS',
 't2.f2.Food': 'ro.FoodRec-Af2H-SysCate_ct.nsSparseTknIn5MinTS'}

In [8]:
# check whether this information is ready. 
PipelineInfo = get_RecNameList_and_FldTknList(co_to_CONameInfo, ckpd_to_CkpdObsConfig)
pprint(PipelineInfo, sort_dicts=False)

{'RecNameList': ['FoodRec'],
 'CkpdNameList': ['Af2H', 'Bf24H'],
 'FldTknList': ['FoodRec-NutriN2C', 'FoodRec-SysCate'],
 'CasePhiList': ['nsSparseTknIn5MinTS']}


In [9]:
COName_List = [CaseName for co, CaseName in co_to_COName.items()]

In [10]:
from recfldtkn.obsname import convert_CONameList_to_CFName

CaseFeatName = convert_CONameList_to_CFName(COName_List, name_CaseGamma)
print(CaseFeatName)

cf.CatCrossSparseTknCrossTS_co.etcWjnnMrD


In [11]:
# part of fetch_caseobs_Phi_tools
CF_Folder = os.path.join(SPACE['DATA_CaseFeat'], CaseFeatName)
if not os.path.exists(CF_Folder): os.makedirs(CF_Folder)
print(CF_Folder)

../_Data/3-Data_CaseFeat\cf.CatCrossSparseTknCrossTS_co.etcWjnnMrD


In [12]:
ds_case = datasets.Dataset.from_pandas(df_case)
ds_case

Dataset({
    features: ['PID', 'ObsDT'],
    num_rows: 2706
})

# [Part 2] Tools for ds_case_obs

## [Step 1] Prepare examples

In [13]:
case_examples = ds_case[:5]
print(case_examples)

{'PID': [1000037, 1000037, 1000048, 1000048, 1000048], 'ObsDT': [Timestamp('2021-01-07 00:00:00'), Timestamp('2021-01-08 00:00:00'), Timestamp('2021-01-11 00:00:00'), Timestamp('2021-01-13 00:00:00'), Timestamp('2021-01-14 00:00:00')]}


In [14]:
# Pivot the column-oriented batch (column -> list of values) into one dict per
# row, then key those row dicts by their position in the batch.
length = len(case_examples[list(case_examples)[0]])
case_examples_list = [
    {column: column_values[row] for column, column_values in case_examples.items()}
    for row in range(length)
]
idx_to_examples = dict(enumerate(case_examples_list))
pprint(idx_to_examples, sort_dicts=False)

{0: {'PID': 1000037, 'ObsDT': Timestamp('2021-01-07 00:00:00')},
 1: {'PID': 1000037, 'ObsDT': Timestamp('2021-01-08 00:00:00')},
 2: {'PID': 1000048, 'ObsDT': Timestamp('2021-01-11 00:00:00')},
 3: {'PID': 1000048, 'ObsDT': Timestamp('2021-01-13 00:00:00')},
 4: {'PID': 1000048, 'ObsDT': Timestamp('2021-01-14 00:00:00')}}


In [15]:
case_example = idx_to_examples[0]
case_example

{'PID': 1000037, 'ObsDT': Timestamp('2021-01-07 00:00:00')}

## [Step 2] get_CF_id

In [16]:
# check whether this information is ready. 
PipelineInfo = get_RecNameList_and_FldTknList(co_to_CONameInfo, ckpd_to_CkpdObsConfig)
pprint(PipelineInfo, sort_dicts=False)

{'RecNameList': ['FoodRec'],
 'CkpdNameList': ['Af2H', 'Bf24H'],
 'FldTknList': ['FoodRec-NutriN2C', 'FoodRec-SysCate'],
 'CasePhiList': ['nsSparseTknIn5MinTS']}


In [17]:
import inspect
#################################################
def get_CF_id(case_example, case_id_columns, cohort_args):
    """Build the id string of a case from its id columns.

    Each id column contributes '<column>:<value>' (value passed through
    ``str``); the pieces are joined with '_'.

    Args:
        case_example: mapping from column name to this case's value.
        case_id_columns: kept in the signature for callers, but not used:
            the columns listed in ``cohort_args['case_id_columns']`` always
            replace whatever is passed here.
        cohort_args: mapping that must contain 'case_id_columns', the ordered
            list of id column names (e.g. ['PID', 'ObsDT', 'PInvID', 'RxID']).

    Returns:
        The id string, e.g. 'PID:1000037_ObsDT:2021-01-07 00:00:00'.

    Raises:
        KeyError: if 'case_id_columns' is missing from ``cohort_args`` or an
            id column is missing from ``case_example``.
    """
    # TODO: make this more general (the configured columns are the only
    # source of truth for now).
    id_columns = cohort_args['case_id_columns']

    pieces = []
    for column in id_columns:
        pieces.append(column + ':' + str(case_example[column]))
    return '_'.join(pieces)

get_CF_id.fn_string = inspect.getsource(get_CF_id)
#################################################

In [18]:
get_CF_id(case_example, case_id_columns, cohort_args)

'PID:1000037_ObsDT:2021-01-07 00:00:00'

## [Step 3] get_CaseObsName_to_CaseObsInfo

In [19]:
from recfldtkn.observer import get_CaseObsName_to_CaseObsInfo

record_to_ds_rec = {}        # set this to empty dictionary, then we will load data from disk
record_to_ds_rec_info = {}   # set this to empty dictionary, then we will load data from disk
COName_to_COInfo = get_CaseObsName_to_CaseObsInfo(COName_List,
                                                  SPACE, 
                                                  cohort_args, 
                                                  record_to_ds_rec, 
                                                  record_to_ds_rec_info)

[INFO:2024-04-21 09:11:03,893:(configfn.py@116 recfldtkn.configfn)]: file_path in load_fldtkn_args: ../pipeline\config_recfldtkn/Record\FoodRec.yaml
[INFO:2024-04-21 09:11:04,166:(configfn.py@116 recfldtkn.configfn)]: file_path in load_fldtkn_args: ../pipeline\config_recfldtkn/Record\FoodRec.yaml
[INFO:2024-04-21 09:11:04,381:(configfn.py@116 recfldtkn.configfn)]: file_path in load_fldtkn_args: ../pipeline\config_recfldtkn/Record\FoodRec.yaml
[INFO:2024-04-21 09:11:04,577:(configfn.py@116 recfldtkn.configfn)]: file_path in load_fldtkn_args: ../pipeline\config_recfldtkn/Record\FoodRec.yaml


In [20]:
for CaseObsName, CaseObsInfo in COName_to_COInfo.items():
    print(CaseObsName)
    print([i for i in CaseObsInfo])

ro.FoodRec-Bf24H-NutriN2C_ct.nsSparseTknIn5MinTS
['RecObsName_List', 'name_CasePhi', 'get_selected_columns', 'CaseObsName', 'ROName_to_ROInfo', 'fn_CasePhi', 'get_CO_id', 'CO_Folder', 'CO_vocab']
ro.FoodRec-Bf24H-SysCate_ct.nsSparseTknIn5MinTS
['RecObsName_List', 'name_CasePhi', 'get_selected_columns', 'CaseObsName', 'ROName_to_ROInfo', 'fn_CasePhi', 'get_CO_id', 'CO_Folder', 'CO_vocab']
ro.FoodRec-Af2H-NutriN2C_ct.nsSparseTknIn5MinTS
['RecObsName_List', 'name_CasePhi', 'get_selected_columns', 'CaseObsName', 'ROName_to_ROInfo', 'fn_CasePhi', 'get_CO_id', 'CO_Folder', 'CO_vocab']
ro.FoodRec-Af2H-SysCate_ct.nsSparseTknIn5MinTS
['RecObsName_List', 'name_CasePhi', 'get_selected_columns', 'CaseObsName', 'ROName_to_ROInfo', 'fn_CasePhi', 'get_CO_id', 'CO_Folder', 'CO_vocab']


In [21]:
COName_to_co = {v: k for k, v in co_to_COName.items()}
COName_to_co

{'ro.FoodRec-Bf24H-NutriN2C_ct.nsSparseTknIn5MinTS': 't1.f1.Food',
 'ro.FoodRec-Bf24H-SysCate_ct.nsSparseTknIn5MinTS': 't1.f2.Food',
 'ro.FoodRec-Af2H-NutriN2C_ct.nsSparseTknIn5MinTS': 't2.f1.Food',
 'ro.FoodRec-Af2H-SysCate_ct.nsSparseTknIn5MinTS': 't2.f2.Food'}

In [22]:
co_to_COvocab = {COName_to_co[COName]: CaseObsInfo['CO_vocab'] for COName, CaseObsInfo in COName_to_COInfo.items()}

for co, vocab in co_to_COvocab.items():
    print(co)
    print(vocab)

t1.f1.Food
{'tid': {'tid2tkn': {0: 'unk', 1: 'Calories:0~100', 2: 'Calories:0~100Level', 3: 'Carbs:0~10', 4: 'Carbs:0~10Level', 5: 'Cholesterol:0~20', 6: 'Cholesterol:0~20Level', 7: 'Fat:0~5', 8: 'Fat:0~5Level', 9: 'Fiber:0~1', 10: 'Fiber:0~1Level', 11: 'MonoUnSaturatedFat:0~100', 12: 'MonoUnSaturatedFat:0~100Level', 13: 'PolyUnSaturatedFat:0~1', 14: 'PolyUnSaturatedFat:0~1Level', 15: 'Potassium:0~100', 16: 'Potassium:0~100Level', 17: 'Protein:0~10', 18: 'Protein:0~10Level', 19: 'SaturatedFat:0~2', 20: 'SaturatedFat:0~2Level', 21: 'ServingSize:0~1', 22: 'ServingSize:0~1Level', 23: 'ServingsConsumed:0~1', 24: 'ServingsConsumed:0~1Level', 25: 'Sodium:0~100', 26: 'Sodium:0~100Level', 27: 'Sugar:0~5', 28: 'Sugar:0~5Level', 29: 'TransFat:0~0', 30: 'TransFat:0~0Level', 31: 'Fiber:1~2', 32: 'Fiber:1~2Level', 33: 'PolyUnSaturatedFat:1~2', 34: 'PolyUnSaturatedFat:1~2Level', 35: 'ServingSize:1~2', 36: 'ServingSize:1~2Level', 37: 'ServingsConsumed:1~2', 38: 'ServingsConsumed:1~2Level', 39: 'Trans

## [Step 3b] get_CF_vocab

In [23]:
##################################
def get_CF_vocab(co_to_COvocab):
    """Assemble the case-feature vocabulary from per-observation vocabularies.

    Keys of ``co_to_COvocab`` are dot-separated names such as 't1.f1.Food'.
    Each must contain exactly one piece starting with 'f' (the field tag) and
    exactly one piece starting with 't' (the time-period tag).

    Args:
        co_to_COvocab: mapping from observation key to its vocabulary. Each
            vocabulary must provide ``['tid']['tid2tkn']`` and
            ``['timestep']['tid2tkn']`` mappings of id -> token.

    Returns:
        dict with three entries, in this order:
          * 'input_ids': the special tokens '[PAD]', '[UNK]', '[CLS]',
            '[SEP]', '[MASK]', '[BOS]', '[EOS]', followed by
            '<field tag>:<token>' for every field tag.
          * 'input_wgts': an empty dict.
          * 'timestep_ids': '<period tag>:<token>' for every period tag.
        The 'input_ids' and 'timestep_ids' entries each hold 'tid2tkn' and
        'tkn2tid' mappings.

    Raises:
        AssertionError: if a key does not carry exactly one 'f' piece or
            exactly one 't' piece.
    """
    # Helpers are nested so the function remains self-contained when its
    # source is captured with inspect.getsource.
    def _tag_to_co(marker):
        tag_to_co = {}
        for co_name in co_to_COvocab:
            tags = [piece for piece in co_name.split('.') if piece[0] == marker]
            assert len(tags) == 1
            # When several keys share a tag, the last key seen supplies the
            # vocabulary while the tag keeps its first-seen position.
            tag_to_co[tags[0]] = co_name
        return tag_to_co

    def _two_way(tokens):
        return {
            'tid2tkn': dict(enumerate(tokens)),
            'tkn2tid': {tkn: idx for idx, tkn in enumerate(tokens)},
        }

    CF_vocab = {}

    # ---- tokens: special tokens first, then every field's tokens ----
    input_tokens = ['[PAD]', '[UNK]',  '[CLS]', '[SEP]', '[MASK]', '[BOS]', '[EOS]']
    for field_tag, co_name in _tag_to_co('f').items():
        field_tid2tkn = co_to_COvocab[co_name]['tid']['tid2tkn']
        input_tokens.extend(field_tag + ':' + tkn for tkn in field_tid2tkn.values())
    CF_vocab['input_ids'] = _two_way(input_tokens)

    # ---- weights carry no vocabulary ----
    CF_vocab['input_wgts'] = {}

    # ---- timesteps: no special tokens, one run of tokens per period ----
    timestep_tokens = []
    for period_tag, co_name in _tag_to_co('t').items():
        period_tid2tkn = co_to_COvocab[co_name]['timestep']['tid2tkn']
        timestep_tokens.extend(period_tag + ':' + tkn for tkn in period_tid2tkn.values())
    CF_vocab['timestep_ids'] = _two_way(timestep_tokens)

    return CF_vocab
##################################

get_CF_vocab.fn_string = inspect.getsource(get_CF_vocab)

In [24]:
CF_vocab = get_CF_vocab(co_to_COvocab)
for SeqType, vocab in CF_vocab.items():
    print(SeqType, ':', vocab)
    print('\n')

pprint(CF_vocab, sort_dicts=False, compact=True)

input_ids : {'tid2tkn': {0: '[PAD]', 1: '[UNK]', 2: '[CLS]', 3: '[SEP]', 4: '[MASK]', 5: '[BOS]', 6: '[EOS]', 7: 'f1:unk', 8: 'f1:Calories:0~100', 9: 'f1:Calories:0~100Level', 10: 'f1:Carbs:0~10', 11: 'f1:Carbs:0~10Level', 12: 'f1:Cholesterol:0~20', 13: 'f1:Cholesterol:0~20Level', 14: 'f1:Fat:0~5', 15: 'f1:Fat:0~5Level', 16: 'f1:Fiber:0~1', 17: 'f1:Fiber:0~1Level', 18: 'f1:MonoUnSaturatedFat:0~100', 19: 'f1:MonoUnSaturatedFat:0~100Level', 20: 'f1:PolyUnSaturatedFat:0~1', 21: 'f1:PolyUnSaturatedFat:0~1Level', 22: 'f1:Potassium:0~100', 23: 'f1:Potassium:0~100Level', 24: 'f1:Protein:0~10', 25: 'f1:Protein:0~10Level', 26: 'f1:SaturatedFat:0~2', 27: 'f1:SaturatedFat:0~2Level', 28: 'f1:ServingSize:0~1', 29: 'f1:ServingSize:0~1Level', 30: 'f1:ServingsConsumed:0~1', 31: 'f1:ServingsConsumed:0~1Level', 32: 'f1:Sodium:0~100', 33: 'f1:Sodium:0~100Level', 34: 'f1:Sugar:0~5', 35: 'f1:Sugar:0~5Level', 36: 'f1:TransFat:0~0', 37: 'f1:TransFat:0~0Level', 38: 'f1:Fiber:1~2', 39: 'f1:Fiber:1~2Level', 40:

## [Step 4] Fetch $o_{ij}$ List

In [25]:
pprint(idx_to_examples, sort_dicts=False)

{0: {'PID': 1000037, 'ObsDT': Timestamp('2021-01-07 00:00:00')},
 1: {'PID': 1000037, 'ObsDT': Timestamp('2021-01-08 00:00:00')},
 2: {'PID': 1000048, 'ObsDT': Timestamp('2021-01-11 00:00:00')},
 3: {'PID': 1000048, 'ObsDT': Timestamp('2021-01-13 00:00:00')},
 4: {'PID': 1000048, 'ObsDT': Timestamp('2021-01-14 00:00:00')}}


In [26]:
# Build one CaseObserverTransformer per case-observation name and index them by
# that name.
# NOTE(review): use_CO_from_disk=False presumably means observations are
# recomputed rather than read from a cache under CO_Folder — confirm against
# CaseObserverTransformer.
use_CO_from_disk = False
COName_to_FnCaseObsPhi = {}
for COName, COInfo in COName_to_COInfo.items():
    # No ro -> ROName mapping is supplied; None is passed as the first argument.
    ro_to_ROName = None 
    # Arguments are positional, so their order must match the constructor.
    FnCaseObsPhi = CaseObserverTransformer(ro_to_ROName,
                                            COInfo['ROName_to_ROInfo'], 
                                            COInfo['name_CasePhi'], 
                                            COInfo['fn_CasePhi'], 
                                            COInfo['CO_vocab'], 
                                            COInfo['get_CO_id'],
                                            cohort_args,
                                            COInfo['CO_Folder'], 
                                            df_case,
                                            use_CO_from_disk)
    COName_to_FnCaseObsPhi[COName] = FnCaseObsPhi

In [27]:
from recfldtkn.observer import CaseFeatureTransformer

fetch_examples_with_complete_COs = CaseFeatureTransformer.fetch_examples_with_complete_COs

use_CO_from_disk = False 
results = fetch_examples_with_complete_COs(idx_to_examples, 
                                            COName_to_co, 
                                            COName_to_COInfo, 
                                            COName_to_FnCaseObsPhi)

idx_to_examples, COName_to_FnCaseObsPhi = results

In [28]:
case_observations

['t1.f1.Food:ro.FoodRec-Bf24H-NutriN2C_ct.nsSparseTknIn5MinTS',
 't1.f2.Food:ro.FoodRec-Bf24H-SysCate_ct.nsSparseTknIn5MinTS',
 't2.f1.Food:ro.FoodRec-Af2H-NutriN2C_ct.nsSparseTknIn5MinTS',
 't2.f2.Food:ro.FoodRec-Af2H-SysCate_ct.nsSparseTknIn5MinTS']

In [29]:
for i, example in idx_to_examples.items():
    print(i)
    print(example)

0
{'PID': 1000037, 'ObsDT': Timestamp('2021-01-07 00:00:00'), 't1.f1.Food_tid': [[47, 59, 3, 4, 9, 146, 147, 278, 279, 126, 127, 320, 321, 65, 66, 33, 34, 11, 12, 29, 350, 286, 287, 27, 28], [35, 37, 236, 237, 9, 7, 1, 17, 25, 19, 13, 11, 29, 5, 15, 27]], 't1.f1.Food_wgt': [[1.0, 1.0, 1.0, 0.2, 1.0, 1.0, 0.04, 1.0, 0.99, 1.0, 0.3, 1.0, 0.11, 1.0, 0.89, 1.0, 0.85, 1.0, 0.06, 1.0, 1.0, 1.0, 0.38, 1.0, 0.29], [1.0, 1.0, 1.0, 0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]], 't1.f1.Food_timestep': [-86, -78], 't1.f1.Food_timeinfo': ['ObsDTValue', 'TimeStepType', 'TimeStepSize', 'TimeUnit', 'StartIdx-To-EndIdx'], 't1.f1.Food_timevalues': ['2021-01-07T00:00:00', '5Min', '5', 'minutes', '-288:To:0'], 't1.f2.Food_tid': [[3, 10, 13, 21], [4, 10, 15, 19]], 't1.f2.Food_wgt': [[1, 1, 1, 1], [1, 1, 1, 1]], 't1.f2.Food_timestep': [-86, -78], 't1.f2.Food_timeinfo': ['ObsDTValue', 'TimeStepType', 'TimeStepSize', 'TimeUnit', 'StartIdx-To-EndIdx'], 't1.f2.Food_timevalues': ['2021-01-07T

In [30]:
[i for i in example]

['PID',
 'ObsDT',
 't1.f1.Food_tid',
 't1.f1.Food_wgt',
 't1.f1.Food_timestep',
 't1.f1.Food_timeinfo',
 't1.f1.Food_timevalues',
 't1.f2.Food_tid',
 't1.f2.Food_wgt',
 't1.f2.Food_timestep',
 't1.f2.Food_timeinfo',
 't1.f2.Food_timevalues',
 't2.f1.Food_tid',
 't2.f1.Food_wgt',
 't2.f1.Food_timestep',
 't2.f1.Food_timeinfo',
 't2.f1.Food_timevalues',
 't2.f2.Food_tid',
 't2.f2.Food_wgt',
 't2.f2.Food_timestep',
 't2.f2.Food_timeinfo',
 't2.f2.Food_timevalues']

## [Step 3]* Develop $\Gamma$. (To Dev)

In [31]:
idx = 1
case_example = idx_to_examples[idx]
# print('case_example:', case_example)
pprint(case_example, sort_dicts=False, compact = True)

{'PID': 1000037,
 'ObsDT': Timestamp('2021-01-08 00:00:00'),
 't1.f1.Food_tid': [[35, 37, 119, 120, 9, 7, 1, 17, 25, 19, 13, 11, 29, 5, 15,
                     27, 35, 37, 208, 9, 7, 1, 17, 25, 19, 13, 11, 29, 5, 15,
                     27]],
 't1.f1.Food_wgt': [[1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                     1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                     1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]],
 't1.f1.Food_timestep': [-90],
 't1.f1.Food_timeinfo': ['ObsDTValue', 'TimeStepType', 'TimeStepSize',
                         'TimeUnit', 'StartIdx-To-EndIdx'],
 't1.f1.Food_timevalues': ['2021-01-08T00:00:00', '5Min', '5', 'minutes',
                           '-288:To:0'],
 't1.f2.Food_tid': [[4, 10, 14, 20, 4, 10, 14, 19]],
 't1.f2.Food_wgt': [[1, 1, 1, 1, 1, 1, 1, 1]],
 't1.f2.Food_timestep': [-90],
 't1.f2.Food_timeinfo': ['ObsDTValue', 'TimeStepType', 'TimeStepSize',
                         'TimeUnit', 'StartIdx-To-EndIdx

In [32]:
result_case = {}
co_list = [i for i in co_to_COvocab]
co_list

['t1.f1.Food', 't1.f2.Food', 't2.f1.Food', 't2.f2.Food']

In [33]:
SeqType_list = [i for i in CF_vocab]
SeqType_list

['input_ids', 'input_wgts', 'timestep_ids']

In [34]:
co_list = list(co_to_COvocab)

# Group the observation keys by field tag: the single dot-separated piece of
# each key that starts with 'f' (e.g. 'f1' in 't1.f1.Food').
f_to_coname = {}
for co in co_list:
    f_list = [piece for piece in co.split('.') if piece[0] == 'f']
    assert len(f_list) == 1
    f = f_list[0]
    f_to_coname.setdefault(f, []).append(co)
f_to_coname

{'f1': ['t1.f1.Food', 't2.f1.Food'], 'f2': ['t1.f2.Food', 't2.f2.Food']}

In [35]:
# Group the observation keys by time-period tag: the single dot-separated piece
# of each key that starts with 't' (e.g. 't1' in 't1.f1.Food').
t_to_coname = {}
for co in co_list:
    t_list = [piece for piece in co.split('.') if piece[0] == 't']
    assert len(t_list) == 1
    t = t_list[0]
    t_to_coname.setdefault(t, []).append(co)
t_to_coname

{'t1': ['t1.f1.Food', 't1.f2.Food'], 't2': ['t2.f1.Food', 't2.f2.Food']}

In [36]:
time_period = 't1'

co_list = t_to_coname[time_period]
co_list

['t1.f1.Food', 't1.f2.Food']

In [37]:
[i for i in case_example]

['PID',
 'ObsDT',
 't1.f1.Food_tid',
 't1.f1.Food_wgt',
 't1.f1.Food_timestep',
 't1.f1.Food_timeinfo',
 't1.f1.Food_timevalues',
 't1.f2.Food_tid',
 't1.f2.Food_wgt',
 't1.f2.Food_timestep',
 't1.f2.Food_timeinfo',
 't1.f2.Food_timevalues',
 't2.f1.Food_tid',
 't2.f1.Food_wgt',
 't2.f1.Food_timestep',
 't2.f1.Food_timeinfo',
 't2.f1.Food_timevalues',
 't2.f2.Food_tid',
 't2.f2.Food_wgt',
 't2.f2.Food_timestep',
 't2.f2.Food_timeinfo',
 't2.f2.Food_timevalues']

In [38]:
case_example['t1.f1.Food_tid']

[[35,
  37,
  119,
  120,
  9,
  7,
  1,
  17,
  25,
  19,
  13,
  11,
  29,
  5,
  15,
  27,
  35,
  37,
  208,
  9,
  7,
  1,
  17,
  25,
  19,
  13,
  11,
  29,
  5,
  15,
  27]]

In [39]:
f_to_coname

{'f1': ['t1.f1.Food', 't2.f1.Food'], 'f2': ['t1.f2.Food', 't2.f2.Food']}

In [40]:
t_to_coname

{'t1': ['t1.f1.Food', 't1.f2.Food'], 't2': ['t2.f1.Food', 't2.f2.Food']}

In [41]:
from functools import reduce
import itertools
import inspect 

############ deal with input_ids ############

# TimePeriod = 't1'
# timestepName = 'timestep'

def concatenate_field_from_same_timeperiod(case_example,
                                           TimePeriod, 
                                           t_to_coname, 
                                           f_to_coname,
                                           co_to_COvocab, 
                                           CF_vocab):
    """Concatenate the token ids and weights of every field in one time period.

    For each observation key ``co`` listed under ``TimePeriod``, the per-timestep
    token ids in ``case_example[f'{co}_tid']`` are translated from the
    observation vocabulary to the case-feature vocabulary (token looked up as
    '<field tag>:<token>'). Ids, and in parallel the weights from
    ``case_example[f'{co}_wgt']``, of all fields are then joined per timestep,
    in the order the keys are listed.

    An observation whose ids are the empty placeholder (a single timestep
    holding a single zero) contributes no rows at all, neither ids nor
    weights, so the two sequences stay aligned. An observation whose weights
    alone are the placeholder contributes ids but no weights.

    Args:
        case_example: mapping with '<co>_tid', '<co>_wgt' and '<co>_timestep'
            entries for every key of the period. It is read, never modified.
        TimePeriod: period tag used to index ``t_to_coname`` (e.g. 't1').
        t_to_coname: mapping period tag -> list of observation keys.
        f_to_coname: mapping field tag -> collection of observation keys.
        co_to_COvocab: mapping observation key -> vocabulary providing
            ``['tid']['tid2tkn']``.
        CF_vocab: case-feature vocabulary providing ``['input_ids']['tkn2tid']``.

    Returns:
        pandas.DataFrame with columns 'timestep' (cast to int), 'input_ids'
        and 'input_wgts', one row per timestep (outer-merged across fields).

    Raises:
        KeyError: if an expected entry or token is missing.
        IndexError: if an observation key belongs to no field in ``f_to_coname``.
    """
    timestepName = 'timestep'
    cf_tkn2tid = CF_vocab['input_ids']['tkn2tid']

    # Helpers are nested so the function remains self-contained when its
    # source is captured with inspect.getsource.
    def _is_placeholder(values):
        # Compare as float: int() would truncate a lone fractional weight such
        # as 0.5 to 0 and wrongly classify real data as the empty placeholder.
        return len(values) == 1 and len(values[0]) == 1 and float(values[0][0]) == 0

    def _collapse(frames, out_col):
        # Outer-merge the per-field frames on the timestep, then chain the
        # list-valued cells of each row (missing cells are NaN and skipped).
        merged = reduce(lambda left, right: pd.merge(left, right, on=timestepName, how = 'outer'), frames)
        merged = merged.set_index(timestepName).reset_index()
        merged[out_col] = merged.apply(
            lambda row: list(itertools.chain(*[cell for cell in row.values if type(cell) == list])),
            axis = 1)
        return merged[[timestepName, out_col]].reset_index(drop = True)

    tid_frames = []
    wgt_frames = []
    for co in t_to_coname[TimePeriod]:
        tids = case_example[f'{co}_tid']
        wgts = case_example[f'{co}_wgt']
        timesteps = case_example[f'{co}_{timestepName}']

        # Work on local values only; the caller's case_example is left intact.
        no_tokens = _is_placeholder(tids)
        no_weights = no_tokens or _is_placeholder(wgts)

        f = [k for k, v in f_to_coname.items() if co in v][0]
        co_tid2tkn = co_to_COvocab[co]['tid']['tid2tkn']

        if no_tokens:
            tid_rows, tid_steps = [], []
        else:
            tid_rows = [[cf_tkn2tid[f + ':' + co_tid2tkn[t]] for t in step] for step in tids]
            tid_steps = list(timesteps)
        tid_frames.append(pd.DataFrame({f'{co}_tid': tid_rows, timestepName: tid_steps}))

        if no_weights:
            wgt_rows, wgt_steps = [], []
        else:
            wgt_rows = list(wgts)
            wgt_steps = list(timesteps)
        wgt_frames.append(pd.DataFrame({f'{co}_wgt': wgt_rows, timestepName: wgt_steps}))

    df_ids = _collapse(tid_frames, 'input_ids')
    df_wgts = _collapse(wgt_frames, 'input_wgts')

    # ---- final data ---
    df_TimePeriod = pd.merge(df_ids, df_wgts, on=timestepName, how = 'outer')
    df_TimePeriod[timestepName] = df_TimePeriod[timestepName].astype(int)
    return df_TimePeriod


concatenate_field_from_same_timeperiod.fn_string = inspect.getsource(concatenate_field_from_same_timeperiod)

In [42]:
t_to_coname

{'t1': ['t1.f1.Food', 't1.f2.Food'], 't2': ['t2.f1.Food', 't2.f2.Food']}

In [43]:
# Scratch run of the per-period concatenation for the current case_example:
# one frame per time period, stacked row-wise afterwards.
# TimePeriod = 't2'
df_list = []
timeperiod_list = [t for t in t_to_coname]
for TimePeriod in timeperiod_list:
    df_TimePeriod = concatenate_field_from_same_timeperiod(case_example,
                                                            TimePeriod, 
                                                            t_to_coname, 
                                                            f_to_coname,
                                                            co_to_COvocab, 
                                                            CF_vocab)
    # df_TimePeriod
    df_list.append(df_TimePeriod)
    
# Row-wise stack; each frame keeps its own index (indices are not reset).
df_tids = pd.concat(df_list, axis = 0)
df_tids

Unnamed: 0,timestep,input_ids,input_wgts
0,-90,"[42, 44, 126, 127, 16, 14, 8, 24, 32, 26, 20, ...","[1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [44]:
# Collect the time metadata of the case into one flat dict, taking it from the
# first observation key of every time period.
final_d = {}
for TimePeriod in t_to_coname:


    co_list = t_to_coname[TimePeriod]
    print(co_list)
    # assert len(co_list) == 1

    # Only the first key of the period is consulted.
    co = co_list[0]
    values = {}
    values[f'{co}_timeinfo']   = [timestep for timestep in case_example[f'{co}_timeinfo']]
    values[f'{co}_timevalues'] = [timestep for timestep in case_example[f'{co}_timevalues']]
    # df is not used below; building it does require both lists to have the
    # same length.
    df = pd.DataFrame(values) # .rename(columns={f'{co}_{timestepName}': timestepName}) 
    # df   
    # Pair each info name with its value.
    d = dict(zip(values[f'{co}_timeinfo'], values[f'{co}_timevalues']))
    # The index range is period specific, so its key gets the period as suffix;
    # the remaining keys are shared and overwritten by later periods.
    d['StartIdx-To-EndIdx' + '_' + TimePeriod] = d.pop('StartIdx-To-EndIdx')
    for k, v in d.items(): final_d[k] = v
    # start = d['StartIdx']
    # end = d['EndIdx']

final_d


['t1.f1.Food', 't1.f2.Food']
['t2.f1.Food', 't2.f2.Food']


{'ObsDTValue': '2021-01-08T00:00:00',
 'TimeStepType': '5Min',
 'TimeStepSize': '5',
 'TimeUnit': 'minutes',
 'StartIdx-To-EndIdx_t1': '-288:To:0',
 'StartIdx-To-EndIdx_t2': '1:To:24'}

In [45]:

# Turn the stacked frame into a column -> list dict; columns with no rows
# become None in this scratch version.
CF = df_tids.to_dict(orient='list')
for k, v in CF.items(): 
    if len(v) == 0:
        CF[k] = None 
# Attach the time metadata as two parallel lists (names, values), in the
# insertion order of final_d.
CF['timeinfo'] = [i for i in final_d]
CF['timevalues'] = [final_d[i] for i in final_d]

# CF['TimeStep'] = '5'
# CF['TimeUnit'] = 'minutes'
# CF['TimePeriod'] = 't1'

pprint(CF, compact=True)

{'input_ids': [[42, 44, 126, 127, 16, 14, 8, 24, 32, 26, 20, 18, 36, 12, 22, 34,
                42, 44, 215, 16, 14, 8, 24, 32, 26, 20, 18, 36, 12, 22, 34, 536,
                542, 546, 552, 536, 542, 546, 551]],
 'input_wgts': [[1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1, 1, 1, 1, 1, 1, 1, 1]],
 'timeinfo': ['ObsDTValue', 'TimeStepType', 'TimeStepSize', 'TimeUnit',
              'StartIdx-To-EndIdx_t1', 'StartIdx-To-EndIdx_t2'],
 'timestep': [-90],
 'timevalues': ['2021-01-08T00:00:00', '5Min', '5', 'minutes', '-288:To:0',
                '1:To:24']}


In [46]:
CF_vocab['input_ids']['tkn2tid']

{'[PAD]': 0,
 '[UNK]': 1,
 '[CLS]': 2,
 '[SEP]': 3,
 '[MASK]': 4,
 '[BOS]': 5,
 '[EOS]': 6,
 'f1:unk': 7,
 'f1:Calories:0~100': 8,
 'f1:Calories:0~100Level': 9,
 'f1:Carbs:0~10': 10,
 'f1:Carbs:0~10Level': 11,
 'f1:Cholesterol:0~20': 12,
 'f1:Cholesterol:0~20Level': 13,
 'f1:Fat:0~5': 14,
 'f1:Fat:0~5Level': 15,
 'f1:Fiber:0~1': 16,
 'f1:Fiber:0~1Level': 17,
 'f1:MonoUnSaturatedFat:0~100': 18,
 'f1:MonoUnSaturatedFat:0~100Level': 19,
 'f1:PolyUnSaturatedFat:0~1': 20,
 'f1:PolyUnSaturatedFat:0~1Level': 21,
 'f1:Potassium:0~100': 22,
 'f1:Potassium:0~100Level': 23,
 'f1:Protein:0~10': 24,
 'f1:Protein:0~10Level': 25,
 'f1:SaturatedFat:0~2': 26,
 'f1:SaturatedFat:0~2Level': 27,
 'f1:ServingSize:0~1': 28,
 'f1:ServingSize:0~1Level': 29,
 'f1:ServingsConsumed:0~1': 30,
 'f1:ServingsConsumed:0~1Level': 31,
 'f1:Sodium:0~100': 32,
 'f1:Sodium:0~100Level': 33,
 'f1:Sugar:0~5': 34,
 'f1:Sugar:0~5Level': 35,
 'f1:TransFat:0~0': 36,
 'f1:TransFat:0~0Level': 37,
 'f1:Fiber:1~2': 38,
 'f1:Fiber:1~2

In [47]:
import inspect 

##################################
def fn_CaseGamma(case_example, co_to_COvocab, CF_vocab, cohort_args):
    """Combine all case observations of one case into a single case feature.

    Observation keys (the keys of ``co_to_COvocab``) are grouped by field tag
    (the piece starting with 'f') and by time-period tag (the piece starting
    with 't'). Every period is concatenated with
    ``concatenate_field_from_same_timeperiod`` and the resulting frames are
    stacked row-wise.

    Args:
        case_example: mapping holding, for every observation key, the
            '<co>_tid', '<co>_wgt', '<co>_timestep', '<co>_timeinfo' and
            '<co>_timevalues' entries.
        co_to_COvocab: mapping observation key -> observation vocabulary.
        CF_vocab: case-feature vocabulary.
        cohort_args: not used by this implementation.

    Returns:
        dict of lists holding the columns of the stacked frame ('timestep',
        'input_ids', 'input_wgts') plus 'timeinfo' and 'timevalues'. A column
        without rows is replaced by a placeholder: [[0.]] when its name
        contains '_wgts', [0] for 'timestep', [[0]] otherwise.

    Raises:
        AssertionError: if a key lacks exactly one 'f' or exactly one 't' piece.
        KeyError: if an expected entry, including 'StartIdx-To-EndIdx' in the
            time info, is missing.
    """
    # Nested so the function remains self-contained when its source is
    # captured with inspect.getsource.
    def _group_by_tag(marker):
        groups = {}
        for co_name in co_to_COvocab:
            tags = [piece for piece in co_name.split('.') if piece[0] == marker]
            assert len(tags) == 1
            groups.setdefault(tags[0], []).append(co_name)
        return groups

    f_to_coname = _group_by_tag('f')
    t_to_coname = _group_by_tag('t')

    ############### token ids / weights per timestep ###############
    period_frames = []
    for TimePeriod in list(t_to_coname):
        period_frames.append(
            concatenate_field_from_same_timeperiod(case_example,
                                                   TimePeriod,
                                                   t_to_coname,
                                                   f_to_coname,
                                                   co_to_COvocab,
                                                   CF_vocab))
    CF = pd.concat(period_frames, axis = 0).to_dict(orient='list')

    # Columns without any row get a one-element placeholder.
    for column in list(CF):
        if len(CF[column]) != 0:
            continue
        if '_wgts' in column:
            CF[column] = [[0.]]
        elif column == 'timestep':
            CF[column] = [0]
        else:
            CF[column] = [[0]]

    ############### time metadata ###############
    time_meta = {}
    for TimePeriod, period_cos in t_to_coname.items():
        # Only the first observation of the period is consulted.
        first_co = period_cos[0]
        info_names = list(case_example[f'{first_co}_timeinfo'])
        info_values = list(case_example[f'{first_co}_timevalues'])
        # Result unused; the construction requires both lists to have the
        # same length.
        pd.DataFrame({f'{first_co}_timeinfo': info_names,
                      f'{first_co}_timevalues': info_values})
        period_meta = dict(zip(info_names, info_values))
        # The index range is period specific; the other keys are shared and
        # overwritten by later periods.
        period_meta['StartIdx-To-EndIdx' + '_' + TimePeriod] = period_meta.pop('StartIdx-To-EndIdx')
        time_meta.update(period_meta)

    CF['timeinfo'] = list(time_meta)
    CF['timevalues'] = list(time_meta.values())
    return CF  
##################################

fn_CaseGamma.fn_string = inspect.getsource(fn_CaseGamma)

In [48]:
CF = fn_CaseGamma(case_example, co_to_COvocab, CF_vocab, cohort_args)
pprint(CF, compact = True)

{'input_ids': [[42, 44, 126, 127, 16, 14, 8, 24, 32, 26, 20, 18, 36, 12, 22, 34,
                42, 44, 215, 16, 14, 8, 24, 32, 26, 20, 18, 36, 12, 22, 34, 536,
                542, 546, 552, 536, 542, 546, 551]],
 'input_wgts': [[1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1, 1, 1, 1, 1, 1, 1, 1]],
 'timeinfo': ['ObsDTValue', 'TimeStepType', 'TimeStepSize', 'TimeUnit',
              'StartIdx-To-EndIdx_t1', 'StartIdx-To-EndIdx_t2'],
 'timestep': [-90],
 'timevalues': ['2021-01-08T00:00:00', '5Min', '5', 'minutes', '-288:To:0',
                '1:To:24']}


## Save to Files

In [49]:
from recfldtkn.loadtools import convert_variables_to_pystirng, load_module_variables

# Import lines placed at the top of the generated module.
prefix = [
    'import pandas as pd',
    'import numpy as np',
    'import itertools',
    'from functools import reduce',
]

# Functions whose source is serialised into the generated module, in order.
fn_variables = [
    get_CF_id,
    get_CF_vocab,
    concatenate_field_from_same_timeperiod,
    fn_CaseGamma,
]

pycode = convert_variables_to_pystirng(fn_variables = fn_variables, prefix = prefix)
pypath = os.path.join(SPACE['CODE_FN'], 'fn_casegamma', f'gamma_{name_CaseGamma}.py')

# Make sure the target folder exists so the write below cannot fail on a
# fresh checkout.
os.makedirs(os.path.dirname(pypath), exist_ok=True)

# Write as UTF-8 explicitly: Python source files are read as UTF-8 by default,
# whereas open() without `encoding` falls back to the platform locale
# (e.g. a Windows code page) and may mangle or reject non-ASCII characters.
with open(pypath, 'w', encoding='utf-8') as file:
    file.write(pycode)

# Create a HTML link and display it
full_path = os.path.join(WORKSPACE_PATH, pypath)
display(HTML(f'{pypath} <a href="{full_path}" target="_blank">Open File</a>'))

# Application

In [50]:
# Display the case-feature name (also passed to ds_case.map as new_fingerprint further down).
CaseFeatName

'cf.CatCrossSparseTknCrossTS_co.etcWjnnMrD'

In [51]:
from recfldtkn.observer import get_CaseFeatInfo_for_a_CaseFeatName


# Two independent, initially empty dicts handed to the helper below.
record_to_ds_rec, record_to_ds_rec_info = {}, {}

# Assemble everything known about this case feature into one dict.
CaseFeatInfo = get_CaseFeatInfo_for_a_CaseFeatName(
    name_CaseGamma,
    case_observations,
    SPACE,
    cohort_args,
    record_to_ds_rec,
    record_to_ds_rec_info,
)

# Show which entries are available.
list(CaseFeatInfo)

[INFO:2024-04-21 09:11:05,501:(configfn.py@116 recfldtkn.configfn)]: file_path in load_fldtkn_args: ../pipeline\config_recfldtkn/Record\FoodRec.yaml
[INFO:2024-04-21 09:11:05,683:(configfn.py@116 recfldtkn.configfn)]: file_path in load_fldtkn_args: ../pipeline\config_recfldtkn/Record\FoodRec.yaml
[INFO:2024-04-21 09:11:05,898:(configfn.py@116 recfldtkn.configfn)]: file_path in load_fldtkn_args: ../pipeline\config_recfldtkn/Record\FoodRec.yaml
[INFO:2024-04-21 09:11:06,102:(configfn.py@116 recfldtkn.configfn)]: file_path in load_fldtkn_args: ../pipeline\config_recfldtkn/Record\FoodRec.yaml


['CaseFeatName',
 'name_CaseGamma',
 'case_observations',
 'co_to_COName',
 'COName_to_COInfo',
 'COName_List',
 'PipelineInfo',
 'fn_CaseGamma',
 'get_CF_id',
 'get_CF_vocab',
 'CF_vocab',
 'CF_Folder']

In [52]:
import random 
from pprint import pprint
from datetime import datetime 
from recfldtkn.observer import CaseFeatureTransformer

In [53]:

# Pull the transformer ingredients out of CaseFeatInfo in one pass.
(co_to_COName,
 COName_to_COInfo,
 name_CaseGamma,
 fn_CaseGamma,
 CF_vocab,
 get_CF_id,
 CF_Folder) = [
    CaseFeatInfo[key]
    for key in ('co_to_COName', 'COName_to_COInfo', 'name_CaseGamma',
                'fn_CaseGamma', 'CF_vocab', 'get_CF_id', 'CF_Folder')
]

# Case table as a DataFrame, plus the two flags (both off here) that control
# reading CF / CO results from disk.
df_case = ds_case.to_pandas()
use_CF_from_disk = use_CO_from_disk = False
batch_size = CaseFeatInfo.get('batch_size', 1000)

# Build the callable that is later handed to ds_case.map.
FnCaseFeatGamma = CaseFeatureTransformer(
    co_to_COName, COName_to_COInfo,
    name_CaseGamma, fn_CaseGamma,
    CF_vocab, get_CF_id,
    cohort_args, CF_Folder, df_case,
    use_CF_from_disk, use_CO_from_disk,
)

# Check Cache Functions

In [54]:
# Inspect the transformer's `new_CFs` attribute before any mapping has run
# (the recorded output is an empty dict).
FnCaseFeatGamma.new_CFs

{}

In [55]:
# First timed pass over the whole case set (recorded run: ~43 s for 2706 rows).
# load_from_cache_file=False keeps `datasets` from reusing a cached map result.
batch_size = 100
start = datetime.now()
ds_casetkn = ds_case.map(
    FnCaseFeatGamma,
    batched=True,
    batch_size=batch_size,
    load_from_cache_file=False,
    new_fingerprint=CaseFeatName,
)
end = datetime.now()

print('Elipse Time: ', end - start)
print(ds_casetkn)

Map:   0%|          | 0/2706 [00:00<?, ? examples/s]

Elipse Time:  0:00:42.841695
Dataset({
    features: ['PID', 'ObsDT', 'timestep', 'input_ids', 'input_wgts', 'timeinfo', 'timevalues'],
    num_rows: 2706
})


In [56]:
# Second timed pass with identical arguments (recorded run: ~1.9 s).
# NOTE(review): the speed-up presumably comes from the transformer reusing
# features it computed in the previous pass -- confirm in CaseFeatureTransformer.
batch_size = 100
start = datetime.now()
ds_casetkn = ds_case.map(
    FnCaseFeatGamma,
    batched=True,
    batch_size=batch_size,
    load_from_cache_file=False,
    new_fingerprint=CaseFeatName,
)
end = datetime.now()

print('Elipse Time: ', end - start)
print(ds_casetkn)

Map:   0%|          | 0/2706 [00:00<?, ? examples/s]

Elipse Time:  0:00:01.854305
Dataset({
    features: ['PID', 'ObsDT', 'timestep', 'input_ids', 'input_wgts', 'timeinfo', 'timevalues'],
    num_rows: 2706
})


In [57]:
# Spot-check one randomly chosen row of the mapped dataset.
print(CaseObsName)

# random.randint(a, b) includes b, so randint(0, len(ds)) could return
# len(ds) and index past the last row; randrange excludes the stop value.
random_int = random.randrange(len(ds_casetkn))
print(random_int)
pprint(ds_casetkn[random_int])

# Print the vocabulary alongside so the token ids above can be looked up.
print(CF_vocab)

ro.FoodRec-Af2H-SysCate_ct.nsSparseTknIn5MinTS
1431
{'ObsDT': Timestamp('2021-11-07 00:00:00'),
 'PID': 1001938,
 'input_ids': [[42,
                44,
                215,
                48,
                217,
                218,
                355,
                356,
                179,
                394,
                395,
                145,
                20,
                18,
                36,
                37,
                333,
                334,
                22,
                34,
                35,
                42,
                44,
                171,
                172,
                60,
                153,
                154,
                319,
                320,
                133,
                134,
                386,
                94,
                20,
                18,
                36,
                315,
                316,
                22,
                34,
                35,
                535,
    

In [58]:
# Inspect one fixed row (index 72); in the recorded output this case has two timesteps.
ds_casetkn[72]

{'PID': 1000672,
 'ObsDT': Timestamp('2021-08-15 00:00:00'),
 'timestep': [-195, -101],
 'input_ids': [[42,
   56,
   10,
   11,
   16,
   14,
   15,
   8,
   9,
   24,
   25,
   32,
   33,
   52,
   20,
   18,
   36,
   12,
   22,
   34,
   42,
   66,
   10,
   11,
   16,
   14,
   8,
   9,
   24,
   32,
   26,
   20,
   18,
   36,
   12,
   22,
   34,
   42,
   44,
   229,
   16,
   14,
   8,
   24,
   32,
   26,
   20,
   18,
   36,
   12,
   22,
   34,
   42,
   44,
   10,
   16,
   14,
   8,
   24,
   32,
   26,
   20,
   18,
   36,
   12,
   22,
   34,
   535,
   542,
   545,
   552,
   535,
   542,
   545,
   552,
   536,
   542,
   545,
   552,
   535,
   542,
   545,
   551],
  [42,
   44,
   229,
   230,
   48,
   153,
   154,
   355,
   356,
   133,
   134,
   394,
   395,
   135,
   20,
   18,
   36,
   173,
   174,
   343,
   344,
   139,
   140,
   535,
   542,
   547,
   551]],
 'input_wgts': [[1.0,
   1.0,
   1.0,
   0.1,
   1.0,
   1.0,
   0.7,
   1.0,
   0.4,
   1.0,


# Save

In [59]:
# Show the transformer's CF index table before save_new_CFs_to_disk is called.
FnCaseFeatGamma.df_CF_info

Unnamed: 0_level_0,casefeat_idx_in_data
casefeat_id,Unnamed: 1_level_1


In [60]:
# Folder path the transformer exposes for case-feature data; displayed for reference.
CF_Folder_data = FnCaseFeatGamma.CF_Folder_data
CF_Folder_data

'../_Data/3-Data_CaseFeat\\cf.CatCrossSparseTknCrossTS_co.etcWjnnMrD\\data'

In [61]:
# Ask the transformer to write its new case features under CF_Folder_data.
FnCaseFeatGamma.save_new_CFs_to_disk(CF_Folder_data)

In [62]:
# Show the index table again after saving (the recorded output is still an empty table).
FnCaseFeatGamma.df_CF_info

Unnamed: 0_level_0,casefeat_idx_in_data
casefeat_id,Unnamed: 1_level_1


In [63]:
# Inspect the transformer's `ds_CF_data` attribute (no output was recorded for this cell).
FnCaseFeatGamma.ds_CF_data

In [64]:
# Call load_CFs_from_disk on the same data folder for the transformer's CFids;
# the result is unpacked into a data object and an info table.
CFids = FnCaseFeatGamma.CFids
ds_CF_data, df_CF_info = FnCaseFeatGamma.load_CFs_from_disk(CF_Folder_data, CFids)

In [65]:
# Display the data object returned by load_CFs_from_disk (no output was recorded).
ds_CF_data

In [66]:
# Display the info table returned by load_CFs_from_disk.
df_CF_info

Unnamed: 0_level_0,casefeat_idx_in_data
casefeat_id,Unnamed: 1_level_1


# Save Vocab

In [67]:
# Take the vocabulary held by the transformer (rebinds the notebook-level CF_vocab) and display it.
CF_vocab = FnCaseFeatGamma.CF_vocab
CF_vocab

{'input_ids': {'tid2tkn': {0: '[PAD]',
   1: '[UNK]',
   2: '[CLS]',
   3: '[SEP]',
   4: '[MASK]',
   5: '[BOS]',
   6: '[EOS]',
   7: 'f1:unk',
   8: 'f1:Calories:0~100',
   9: 'f1:Calories:0~100Level',
   10: 'f1:Carbs:0~10',
   11: 'f1:Carbs:0~10Level',
   12: 'f1:Cholesterol:0~20',
   13: 'f1:Cholesterol:0~20Level',
   14: 'f1:Fat:0~5',
   15: 'f1:Fat:0~5Level',
   16: 'f1:Fiber:0~1',
   17: 'f1:Fiber:0~1Level',
   18: 'f1:MonoUnSaturatedFat:0~100',
   19: 'f1:MonoUnSaturatedFat:0~100Level',
   20: 'f1:PolyUnSaturatedFat:0~1',
   21: 'f1:PolyUnSaturatedFat:0~1Level',
   22: 'f1:Potassium:0~100',
   23: 'f1:Potassium:0~100Level',
   24: 'f1:Protein:0~10',
   25: 'f1:Protein:0~10Level',
   26: 'f1:SaturatedFat:0~2',
   27: 'f1:SaturatedFat:0~2Level',
   28: 'f1:ServingSize:0~1',
   29: 'f1:ServingSize:0~1Level',
   30: 'f1:ServingsConsumed:0~1',
   31: 'f1:ServingsConsumed:0~1Level',
   32: 'f1:Sodium:0~100',
   33: 'f1:Sodium:0~100Level',
   34: 'f1:Sugar:0~5',
   35: 'f1:Sugar:0~5

In [68]:
# Path the transformer exposes for the vocabulary pickle; displayed for reference.
CF_Folder_vocab = FnCaseFeatGamma.CF_Folder_vocab
CF_Folder_vocab

'../_Data/3-Data_CaseFeat\\cf.CatCrossSparseTknCrossTS_co.etcWjnnMrD\\vocab.p'

In [69]:
# Wrap the vocabulary in a one-column DataFrame (one row per vocab key) and pickle it.
# NOTE(review): the column is keyed by CaseObsName although this file sits in the
# case-feature folder -- confirm whether the loader expects CaseFeatName instead.
df_Vocab = pd.DataFrame({CaseObsName: CF_vocab})
df_Vocab.to_pickle(CF_Folder_vocab)
df_Vocab

Unnamed: 0,ro.FoodRec-Af2H-SysCate_ct.nsSparseTknIn5MinTS
input_ids,"{'tid2tkn': {0: '[PAD]', 1: '[UNK]', 2: '[CLS]..."
input_wgts,{}
timestep_ids,"{'tid2tkn': {0: 't1:5Min_-288', 1: 't1:5Min_-2..."
