# Space

In [1]:
import os
import sys 
import logging
import random
import pandas as pd 
from pprint import pprint 
from IPython.display import display, HTML

# WorkSpace
# Workspace root = the current working directory truncated right after the
# first occurrence of KEY (the notebook is expected to live below that folder).
KEY = 'WorkSpace'
WORKSPACE_PATH = os.getcwd().partition(KEY)[0] + KEY
print(WORKSPACE_PATH)

# Run everything relative to the workspace root and make it importable.
os.chdir(WORKSPACE_PATH)
sys.path.append(WORKSPACE_PATH)

# Pipeline Space
from proj_space import SPACE
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
sys.path.append(SPACE['CODE_FN'])
# pprint(SPACE)

# Available Packages
import argparse
import datasets
import pandas as pd
from datetime import datetime 


from recfldtkn.configfn import load_cohort_args
from recfldtkn.loadtools import load_module_variables, update_args_to_list
from recfldtkn.observer import get_RecObsName_to_RecObsInfo, CaseObserverTransformer
from config_observer.CKPD import ckpd_to_CkpdObsConfig

logger = logging.getLogger(__name__)
recfldtkn_config_path = os.path.join(SPACE['CODE_FN'], 'config_recfldtkn/')


g:\Shared drives\CDHAI-WellDoc\2024-WellDocTest-SPACE\_WellDoc-AI-CGMGPT-WorkSpace


# [Part 1] Get a Caseset: Case Examples

## [Step 1] Trigger Event

In [2]:
from recfldtkn.loadtools import fetch_TriggerEvent_tools

####################
TriggerCaseMethod = 'CGM5MinEntry'
####################

Trigger_tools = fetch_TriggerEvent_tools(TriggerCaseMethod, SPACE)
Trigger_tools 

{'TriggerRecName': 'CGM5Min',
 'case_id_columns': ['PID', 'ObsDT'],
 'special_columns': ['PID', 'DT_s'],
 'convert_TriggerEvent_to_Caseset': <function CGM5MinEntry.convert_TriggerEvent_to_Caseset(ds_rec, case_id_columns, special_columns, base_config)>}

In [3]:
##################################
CaseSetName = TriggerCaseMethod
case_id_columns = Trigger_tools['case_id_columns']
special_columns = Trigger_tools['special_columns']
##################################

# Pickle file holding the trigger case set named after CaseSetName.
TriggerCasePath = os.path.join(SPACE['DATA_CaseSet'], f'{CaseSetName}.p')
# Cohort configuration loaded by the project helper from the recfldtkn config folder.
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)

# Extend the loaded configuration with the settings this notebook relies on:
# the checkpoint-period observer config, the case id columns of the trigger,
# and the name of the observation-datetime column.
cohort_args['ckpd_to_CkpdObsConfig'] = ckpd_to_CkpdObsConfig
cohort_args['case_id_columns'] = case_id_columns
cohort_args['ObsDTName'] = 'ObsDT'
# [root id column, observation datetime column], derived from the two entries above.
cohort_args['PID_ObsDT_columns'] = [cohort_args['RootID'], cohort_args['ObsDTName']]

# NOTE(review): printing the whole cohort_args dict produces a very long output line.
print(cohort_args)
print(TriggerCasePath)

# NOTE(review): pd.read_pickle can execute arbitrary code while loading; only
# point TriggerCasePath at case-set files produced by this pipeline.
df_case = pd.read_pickle(TriggerCasePath)

# Work on a reproducible subsample of at most 1000 cases. Capping the sample
# size avoids the ValueError that DataFrame.sample raises when asked for more
# rows than the frame contains (replace=False).
df_case = df_case.sample(min(1000, len(df_case)), random_state=0).reset_index(drop=True)
ds_case = datasets.Dataset.from_pandas(df_case)
ds_case

{'CohortInfo': {'RawData2022_CGM': {'cohort_label': 1, 'cohort_name': 'RawData2022_CGM', 'FolderPath': '../_Data/0-Data_Raw/RawData2022_CGM/'}, 'RawData2023_CVSTDCAug': {'cohort_label': 2, 'cohort_name': 'RawData2023_CVSTDCAug', 'FolderPath': '../_Data/0-Data_Raw/RawData2023_CVSTDCAug/'}, 'RawData2023_CVSDeRxAug': {'cohort_label': 3, 'cohort_name': 'RawData2023_CVSDeRxAug', 'FolderPath': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/'}}, 'RawRootID': 'PatientID', 'RootID': 'PID', 'RecName': 'PRawRecNum', 'recattr_pyfolder': '../pipeline/fn_recattr/', 'fldtkn_pyfolder': '../pipeline/fn_fldtkn/', 'humanrec_pyfolder': '../pipeline/fn_humanrec/', 'inference_pyfolder': '../pipeline/fn_inference/', 'trigger_pyfolder': '../pipeline/fn_trigger/', 'RecName_to_RFT_GROUP_SIZE': {'CGM5Min': 100, 'Default': 5000}, 'RecName_to_RFT_idx_group_size': {'CGM5Min': 100, 'BGManual': 100, 'Default': 10000}, 'RecName_to_RFT_usebucket': {'CGM5Min': True, 'Default': False}, 'RootIDLength': 6, 'recfldtkn_config_p

Dataset({
    features: ['PID', 'ObsDT'],
    num_rows: 1000
})

## [Step 2] Generate Arguments

In [4]:
#################################
case_observations = [
    'BfCGM:ro.CGM5Min-Bf24H_ct.RecNum',
    'AfCGM:ro.CGM5Min-Af2H_ct.RecNum',
]
name_CaseGamma = 'CatUnseqTknsOneTS'
#################################

In [5]:
from pprint import pprint 
from recfldtkn.obsname import convert_case_observations_to_co_to_observation
from recfldtkn.obsname import get_RecNameList_and_FldTknList

co_to_COName, co_to_CONameInfo = convert_case_observations_to_co_to_observation(case_observations)
co_to_COName

{'BfCGM': 'ro.CGM5Min-Bf24H_ct.RecNum', 'AfCGM': 'ro.CGM5Min-Af2H_ct.RecNum'}

In [6]:
# check whether this information is ready. 
PipelineInfo = get_RecNameList_and_FldTknList(co_to_CONameInfo, ckpd_to_CkpdObsConfig)
pprint(PipelineInfo, sort_dicts=False)

{'RecNameList': ['CGM5Min'],
 'CkpdNameList': ['Af2H', 'Bf24H'],
 'FldTknList': [],
 'CasePhiList': ['RecNum']}


In [7]:
COName_List = [CaseName for co, CaseName in co_to_COName.items()]

In [8]:
from recfldtkn.obsname import convert_CONameList_to_CFName

CaseFeatName = convert_CONameList_to_CFName(COName_List, name_CaseGamma)
print(CaseFeatName)

cf.CatUnseqTknsOneTS_co.5oviTuds5X


In [9]:
# part of fetch_caseobs_Phi_tools
# Folder that holds everything belonging to this case feature.
CF_Folder = os.path.join(SPACE['DATA_CaseFeat'], CaseFeatName)
# exist_ok=True creates the folder when it is missing and is a no-op when it is
# already there, without the race of a separate exists() check.
os.makedirs(CF_Folder, exist_ok=True)
print(CF_Folder)

../_Data/3-Data_CaseFeat\cf.CatUnseqTknsOneTS_co.5oviTuds5X


# [Part 2] Tools for ds_case_obs

## [Step 1] Prepare examples

In [10]:
case_examples = ds_case[:5]
print(case_examples)

{'PID': [1002840, 1002580, 1004670, 1003564, 1003011], 'ObsDT': [Timestamp('2021-12-07 13:50:00'), Timestamp('2021-11-10 12:15:00'), Timestamp('2021-11-13 06:40:00'), Timestamp('2021-09-15 13:35:00'), Timestamp('2021-10-17 17:15:00')]}


In [11]:
# `case_examples` is column-oriented (column name -> list of values).
# Convert it to one dict per case, then key those dicts by row position.
first_column = list(case_examples)[0]
length = len(case_examples[first_column])

case_examples_list = []
for row in range(length):
    case_examples_list.append({column: values[row] for column, values in case_examples.items()})

idx_to_examples = dict(enumerate(case_examples_list))
pprint(idx_to_examples, sort_dicts=False)

{0: {'PID': 1002840, 'ObsDT': Timestamp('2021-12-07 13:50:00')},
 1: {'PID': 1002580, 'ObsDT': Timestamp('2021-11-10 12:15:00')},
 2: {'PID': 1004670, 'ObsDT': Timestamp('2021-11-13 06:40:00')},
 3: {'PID': 1003564, 'ObsDT': Timestamp('2021-09-15 13:35:00')},
 4: {'PID': 1003011, 'ObsDT': Timestamp('2021-10-17 17:15:00')}}


In [12]:
case_example = idx_to_examples[0]
case_example

{'PID': 1002840, 'ObsDT': Timestamp('2021-12-07 13:50:00')}

## [Step 2] get_CF_id

In [13]:
# check whether this information is ready. 
PipelineInfo = get_RecNameList_and_FldTknList(co_to_CONameInfo, ckpd_to_CkpdObsConfig)
pprint(PipelineInfo, sort_dicts=False)

{'RecNameList': ['CGM5Min'],
 'CkpdNameList': ['Af2H', 'Bf24H'],
 'FldTknList': [],
 'CasePhiList': ['RecNum']}


In [14]:
CaseFeatName

'cf.CatUnseqTknsOneTS_co.5oviTuds5X'

In [15]:
case_id_columns

['PID', 'ObsDT']

In [16]:
case_id_columns

['PID', 'ObsDT']

In [17]:
import inspect
#################################################
def get_CF_id(case_example, case_id_columns, cohort_args):
    """Build the case-feature id string of a single case.

    Every id column contributes a ``<column>:<value>`` piece and the pieces
    are joined with ``_``, e.g. ``'PID:1002840_ObsDT:2021-12-07 13:50:00'``.

    Args:
        case_example: mapping from column name to the value of this case.
        case_id_columns: id columns to use when ``cohort_args`` does not
            provide a ``'case_id_columns'`` entry.
        cohort_args: cohort configuration mapping. Its ``'case_id_columns'``
            entry, when present, takes precedence over the argument (the
            original note mentions DrFirst-style cases with columns such as
            ``['PID', 'ObsDT', 'PInvID', 'RxID']``).

    Returns:
        The id string.

    Raises:
        KeyError: if one of the id columns is missing from ``case_example``.
    """
    # TODO: generalize this function.
    # The cohort configuration wins; the explicit argument is the fallback so
    # the function also works when cohort_args has no 'case_id_columns'.
    configured_columns = cohort_args.get('case_id_columns') if cohort_args else None
    if configured_columns is not None:
        case_id_columns = configured_columns

    li = [col + ':' + str(case_example[col]) for col in case_id_columns]
    return '_'.join(li)
get_CF_id.fn_string = inspect.getsource(get_CF_id)
#################################################

In [18]:
get_CF_id(case_example, case_id_columns, cohort_args)

'PID:1002840_ObsDT:2021-12-07 13:50:00'

## [Step 3] get_CaseObsName_to_CaseObsInfo

In [19]:
from recfldtkn.observer import get_CaseObsName_to_CaseObsInfo

record_to_ds_rec = {}        # set this to empty dictionary, then we will load data from disk
record_to_ds_rec_info = {}   # set this to empty dictionary, then we will load data from disk
COName_to_COInfo = get_CaseObsName_to_CaseObsInfo(COName_List,
                                                  SPACE, 
                                                  cohort_args, 
                                                  record_to_ds_rec, 
                                                  record_to_ds_rec_info)

In [20]:
for CaseObsName, CaseObsInfo in COName_to_COInfo.items():
    print(CaseObsName)
    print([i for i in CaseObsInfo])

ro.CGM5Min-Bf24H_ct.RecNum
['RecObsName_List', 'name_CasePhi', 'get_selected_columns', 'CaseObsName', 'ROName_to_ROInfo', 'fn_CasePhi', 'get_CO_id', 'CO_Folder', 'CO_vocab']
ro.CGM5Min-Af2H_ct.RecNum
['RecObsName_List', 'name_CasePhi', 'get_selected_columns', 'CaseObsName', 'ROName_to_ROInfo', 'fn_CasePhi', 'get_CO_id', 'CO_Folder', 'CO_vocab']


In [21]:
COName_to_co = {v: k for k, v in co_to_COName.items()}
COName_to_co

{'ro.CGM5Min-Bf24H_ct.RecNum': 'BfCGM', 'ro.CGM5Min-Af2H_ct.RecNum': 'AfCGM'}

In [22]:
co_to_COvocab = {COName_to_co[COName]: CaseObsInfo['CO_vocab'] for COName, CaseObsInfo in COName_to_COInfo.items()}

for co, vocab in co_to_COvocab.items():
    print(co)
    print(vocab)

BfCGM
{'tid': {'tid2tkn': {0: '[UNK]', 1: 'recnum', 2: 'recspan', 3: 'recnum_0', 4: 'recspan_0'}, 'tkn2tid': {'[UNK]': 0, 'recnum': 1, 'recspan': 2, 'recnum_0': 3, 'recspan_0': 4}}, 'wgt': {}}
AfCGM
{'tid': {'tid2tkn': {0: '[UNK]', 1: 'recnum', 2: 'recspan', 3: 'recnum_0', 4: 'recspan_0'}, 'tkn2tid': {'[UNK]': 0, 'recnum': 1, 'recspan': 2, 'recnum_0': 3, 'recspan_0': 4}}, 'wgt': {}}


## [Step 3b] get_CF_vocab

In [23]:
##################################
def get_CF_vocab(co_to_COvocab):
    """Merge the per-observation vocabularies into one case-feature vocabulary.

    The merged token list starts with seven special tokens and is followed by
    every token of each case observation, prefixed with ``'<co>:'``. Tokens
    are taken in the iteration order of ``co_to_COvocab`` and of each
    ``tid2tkn`` mapping; the position in the merged list is the new token id.

    Args:
        co_to_COvocab: mapping from co name to its vocabulary dict; only
            ``vocab['tid']['tid2tkn']`` is read.

    Returns:
        ``{'input_ids': {'tid2tkn': ..., 'tkn2tid': ...}, 'input_wgts': {}}``.
    """
    merged_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '[BOS]', '[EOS]']
    for co_name, co_vocab in co_to_COvocab.items():
        merged_tokens.extend(co_name + ':' + token
                             for token in co_vocab['tid']['tid2tkn'].values())

    tid2tkn = dict(enumerate(merged_tokens))
    tkn2tid = {token: tid for tid, token in tid2tkn.items()}

    return {
        'input_ids': {'tid2tkn': tid2tkn, 'tkn2tid': tkn2tid},
        'input_wgts': {},
    }
##################################

get_CF_vocab.fn_string = inspect.getsource(get_CF_vocab)

In [24]:
CF_vocab = get_CF_vocab(co_to_COvocab)
for SeqType, vocab in CF_vocab.items():
    print(SeqType, ':', vocab)
    print('\n')

input_ids : {'tid2tkn': {0: '[PAD]', 1: '[UNK]', 2: '[CLS]', 3: '[SEP]', 4: '[MASK]', 5: '[BOS]', 6: '[EOS]', 7: 'BfCGM:[UNK]', 8: 'BfCGM:recnum', 9: 'BfCGM:recspan', 10: 'BfCGM:recnum_0', 11: 'BfCGM:recspan_0', 12: 'AfCGM:[UNK]', 13: 'AfCGM:recnum', 14: 'AfCGM:recspan', 15: 'AfCGM:recnum_0', 16: 'AfCGM:recspan_0'}, 'tkn2tid': {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4, '[BOS]': 5, '[EOS]': 6, 'BfCGM:[UNK]': 7, 'BfCGM:recnum': 8, 'BfCGM:recspan': 9, 'BfCGM:recnum_0': 10, 'BfCGM:recspan_0': 11, 'AfCGM:[UNK]': 12, 'AfCGM:recnum': 13, 'AfCGM:recspan': 14, 'AfCGM:recnum_0': 15, 'AfCGM:recspan_0': 16}}


input_wgts : {}




## [Step 4] Fetch $o_{ij}$ List

In [25]:
pprint(idx_to_examples, sort_dicts=False)

{0: {'PID': 1002840, 'ObsDT': Timestamp('2021-12-07 13:50:00')},
 1: {'PID': 1002580, 'ObsDT': Timestamp('2021-11-10 12:15:00')},
 2: {'PID': 1004670, 'ObsDT': Timestamp('2021-11-13 06:40:00')},
 3: {'PID': 1003564, 'ObsDT': Timestamp('2021-09-15 13:35:00')},
 4: {'PID': 1003011, 'ObsDT': Timestamp('2021-10-17 17:15:00')}}


In [26]:
# Flag handed to every transformer below.
# NOTE(review): presumably False means observations are recomputed instead of
# being read from a disk cache — confirm against CaseObserverTransformer.
use_CO_from_disk = False
# Build one CaseObserverTransformer per case observation, keyed by its COName.
COName_to_FnCaseObsPhi = {}
for COName, COInfo in COName_to_COInfo.items():
    # No ro -> ROName mapping is supplied here; the same None is passed for every observation.
    ro_to_ROName = None 
    FnCaseObsPhi = CaseObserverTransformer(ro_to_ROName,
                                            COInfo['ROName_to_ROInfo'], 
                                            COInfo['name_CasePhi'], 
                                            COInfo['fn_CasePhi'], 
                                            COInfo['CO_vocab'], 
                                            COInfo['get_CO_id'],
                                            cohort_args,
                                            COInfo['CO_Folder'], 
                                            df_case,
                                            use_CO_from_disk)
    COName_to_FnCaseObsPhi[COName] = FnCaseObsPhi

In [27]:
from recfldtkn.observer import CaseFeatureTransformer

fetch_examples_with_complete_COs = CaseFeatureTransformer.fetch_examples_with_complete_COs

use_CO_from_disk = False 
results = fetch_examples_with_complete_COs(idx_to_examples, 
                                            COName_to_co, 
                                            COName_to_COInfo, 
                                            COName_to_FnCaseObsPhi)

idx_to_examples, COName_to_FnCaseObsPhi = results

In [28]:
for i, example in idx_to_examples.items():
    print(i)
    print(example)

0
{'PID': 1002840, 'ObsDT': Timestamp('2021-12-07 13:50:00'), 'BfCGM_tid': [1, 2], 'BfCGM_wgt': [289.0, 1440.0], 'AfCGM_tid': [1, 2], 'AfCGM_wgt': [24.0, 115.0]}
1
{'PID': 1002580, 'ObsDT': Timestamp('2021-11-10 12:15:00'), 'BfCGM_tid': [1, 2], 'BfCGM_wgt': [289.0, 1440.0], 'AfCGM_tid': [1, 2], 'AfCGM_wgt': [24.0, 115.0]}
2
{'PID': 1004670, 'ObsDT': Timestamp('2021-11-13 06:40:00'), 'BfCGM_tid': [1, 2], 'BfCGM_wgt': [252.0, 1440.0], 'AfCGM_tid': [1, 2], 'AfCGM_wgt': [24.0, 115.0]}
3
{'PID': 1003564, 'ObsDT': Timestamp('2021-09-15 13:35:00'), 'BfCGM_tid': [1, 2], 'BfCGM_wgt': [289.0, 1440.0], 'AfCGM_tid': [1, 2], 'AfCGM_wgt': [24.0, 115.0]}
4
{'PID': 1003011, 'ObsDT': Timestamp('2021-10-17 17:15:00'), 'BfCGM_tid': [1, 2], 'BfCGM_wgt': [289.0, 1440.0], 'AfCGM_tid': [1, 2], 'AfCGM_wgt': [24.0, 115.0]}


## [Step 5]* Develop $\Gamma$ (To Dev)

In [29]:
idx = 1
case_example = idx_to_examples[idx]
# print('case_example:', case_example)
pprint(case_example, sort_dicts=False)

{'PID': 1002580,
 'ObsDT': Timestamp('2021-11-10 12:15:00'),
 'BfCGM_tid': [1, 2],
 'BfCGM_wgt': [289.0, 1440.0],
 'AfCGM_tid': [1, 2],
 'AfCGM_wgt': [24.0, 115.0]}


In [30]:
##################################
def fn_CaseGamma(case_example, co_to_COvocab, CF_vocab, cohort_args):
    """Combine the per-observation tokens of one case into a single sequence.

    For every case observation ``co`` the local token ids in
    ``case_example[co + '_tid']`` are translated to case-feature token ids
    (via the token string ``'<co>:<token>'``) and paired with the weights in
    ``case_example[co + '_wgt']``. Only pairs whose weight is not ``None`` and
    strictly positive are kept.

    Args:
        case_example: mapping holding ``'<co>_tid'`` and ``'<co>_wgt'``
            sequences for every ``co`` in ``co_to_COvocab``.
        co_to_COvocab: mapping co name -> vocabulary; only
            ``vocab['tid']['tid2tkn']`` is read.
        CF_vocab: case-feature vocabulary; only
            ``CF_vocab['input_ids']['tkn2tid']`` is read.
        cohort_args: unused here; kept so all gamma functions share one
            signature.

    Returns:
        ``{'input_ids': [...], 'input_wgts': [...]}`` with aligned lists; the
        weights are floats.

    Raises:
        AssertionError: if no token with a positive weight remains.
        KeyError: if a token or column is missing from the inputs.
    """
    tkn2tid_CF = CF_vocab['input_ids']['tkn2tid']

    X_tid_total = []
    X_wgt_total = []
    for co, CO_vocab in co_to_COvocab.items():
        tid2tkn_co = CO_vocab['tid']['tid2tkn']
        X_tid_co = case_example[co + '_tid']
        X_wgt_co = case_example[co + '_wgt']

        # local token id -> '<co>:<token>' -> case-feature token id
        X_tid_total.extend(tkn2tid_CF[co + ':' + tid2tkn_co[tid]] for tid in X_tid_co)
        # extend() accepts any sequence, not only a list
        X_wgt_total.extend(X_wgt_co)

    # One weight per token id: if an id occurs more than once, the last weight wins.
    d = dict(zip(X_tid_total, X_wgt_total))
    # The None check must come first; comparing None with 0 raises TypeError.
    d = {k: float(v) for k, v in d.items() if v is not None and v > 0}
    assert len(d) > 0

    # version 1: for DL and ML
    result_case = {
        'input_ids': list(d),
        'input_wgts': list(d.values()),
    }
    return result_case
##################################

fn_CaseGamma.fn_string = inspect.getsource(fn_CaseGamma)

In [31]:
result = fn_CaseGamma(case_example, co_to_COvocab, CF_vocab, cohort_args)
for SeqType, SeqValue in result.items():
    print(SeqType, ':', len(SeqValue), SeqValue)
    print('\n')

input_ids : 4 [8, 9, 13, 14]


input_wgts : 4 [289.0, 1440.0, 24.0, 115.0]




In [32]:
CF_vocab

{'input_ids': {'tid2tkn': {0: '[PAD]',
   1: '[UNK]',
   2: '[CLS]',
   3: '[SEP]',
   4: '[MASK]',
   5: '[BOS]',
   6: '[EOS]',
   7: 'BfCGM:[UNK]',
   8: 'BfCGM:recnum',
   9: 'BfCGM:recspan',
   10: 'BfCGM:recnum_0',
   11: 'BfCGM:recspan_0',
   12: 'AfCGM:[UNK]',
   13: 'AfCGM:recnum',
   14: 'AfCGM:recspan',
   15: 'AfCGM:recnum_0',
   16: 'AfCGM:recspan_0'},
  'tkn2tid': {'[PAD]': 0,
   '[UNK]': 1,
   '[CLS]': 2,
   '[SEP]': 3,
   '[MASK]': 4,
   '[BOS]': 5,
   '[EOS]': 6,
   'BfCGM:[UNK]': 7,
   'BfCGM:recnum': 8,
   'BfCGM:recspan': 9,
   'BfCGM:recnum_0': 10,
   'BfCGM:recspan_0': 11,
   'AfCGM:[UNK]': 12,
   'AfCGM:recnum': 13,
   'AfCGM:recspan': 14,
   'AfCGM:recnum_0': 15,
   'AfCGM:recspan_0': 16}},
 'input_wgts': {}}

## Save to Files

In [33]:
from recfldtkn.loadtools import convert_variables_to_pystirng, load_module_variables

# Export the three functions developed above into one generated module,
# preceded by the import lines in `prefix`.
prefix = ['import pandas as pd', 'import numpy as np']
fn_variables = [get_CF_id, get_CF_vocab, fn_CaseGamma]
pycode = convert_variables_to_pystirng(fn_variables = fn_variables, prefix = prefix)
pypath = os.path.join(SPACE['CODE_FN'], 'fn_casegamma', f'gamma_{name_CaseGamma}.py')

# Make sure the target folder exists; otherwise open() raises
# FileNotFoundError the first time a gamma function is exported.
os.makedirs(os.path.dirname(pypath), exist_ok=True)
with open(pypath, 'w') as file:
    file.write(pycode)

# Create a HTML link and display it
full_path = os.path.join(WORKSPACE_PATH, pypath)
display(HTML(f'{pypath} <a href="{full_path}" target="_blank">Open File</a>'))

# Application

In [34]:
CaseFeatName

'cf.CatUnseqTknsOneTS_co.5oviTuds5X'

In [35]:
from recfldtkn.observer import get_CaseFeatInfo_for_a_CaseFeatName


record_to_ds_rec = {}
record_to_ds_rec_info = {}

CaseFeatInfo = get_CaseFeatInfo_for_a_CaseFeatName(name_CaseGamma,
                                                   case_observations,
                                                   SPACE, 
                                                   cohort_args, 
                                                   record_to_ds_rec, 
                                                   record_to_ds_rec_info)
[i for i in CaseFeatInfo]

['CaseFeatName',
 'name_CaseGamma',
 'case_observations',
 'co_to_COName',
 'COName_to_COInfo',
 'COName_List',
 'PipelineInfo',
 'fn_CaseGamma',
 'get_CF_id',
 'get_CF_vocab',
 'CF_vocab',
 'CF_Folder']

In [36]:
import random 
from pprint import pprint
from datetime import datetime 
from recfldtkn.observer import CaseFeatureTransformer

In [37]:

co_to_COName = CaseFeatInfo['co_to_COName']
COName_to_COInfo = CaseFeatInfo['COName_to_COInfo']
name_CaseGamma = CaseFeatInfo['name_CaseGamma']
fn_CaseGamma = CaseFeatInfo['fn_CaseGamma']
CF_vocab = CaseFeatInfo['CF_vocab']
get_CF_id = CaseFeatInfo['get_CF_id']
CF_Folder = CaseFeatInfo['CF_Folder']
df_case = ds_case.to_pandas()
use_CF_from_disk = False
use_CO_from_disk = False
batch_size = CaseFeatInfo.get('batch_size', 1000)

FnCaseFeatGamma = CaseFeatureTransformer(co_to_COName,
                                          COName_to_COInfo, 
                                          name_CaseGamma, 
                                          fn_CaseGamma, 
                                          CF_vocab, 
                                          get_CF_id,
                                          cohort_args,
                                          CF_Folder, 
                                          df_case,
                                          use_CF_from_disk,
                                          use_CO_from_disk)

# Check Cache Functions

In [38]:
FnCaseFeatGamma.new_CFs

{}

In [39]:
start = datetime.now()
batch_size = 100
ds_casetkn = ds_case.map(FnCaseFeatGamma, 
                         batched = True, 
                         batch_size = batch_size, 
                         load_from_cache_file = False, 
                         new_fingerprint = CaseFeatName)
end = datetime.now()
print('Elipse Time: ', end - start)
print(ds_casetkn)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Elipse Time:  0:00:04.846033
Dataset({
    features: ['PID', 'ObsDT', 'input_ids', 'input_wgts'],
    num_rows: 1000
})


In [40]:
# Second, identical run of the mapping over ds_case.
# NOTE(review): this looks like the cache check announced by the
# "Check Cache Functions" heading — the recorded output shows about 0.02 s here
# versus about 4.8 s for the first run. Confirm that the speed-up comes from
# FnCaseFeatGamma's own cache, since load_from_cache_file is False.
start = datetime.now()
batch_size = 100
ds_casetkn = ds_case.map(FnCaseFeatGamma, 
                         batched = True, 
                         batch_size = batch_size, 
                         load_from_cache_file = False, 
                         new_fingerprint = CaseFeatName)
end = datetime.now()
print('Elipse Time: ', end - start)
print(ds_casetkn)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Elipse Time:  0:00:00.024509
Dataset({
    features: ['PID', 'ObsDT', 'input_ids', 'input_wgts'],
    num_rows: 1000
})


In [41]:
# NOTE(review): CaseObsName is the loop variable left over from the earlier
# `for CaseObsName, CaseObsInfo in COName_to_COInfo.items()` cell, so this
# prints the last case-observation name.
print(CaseObsName)
# random.randint includes both end points, so randint(0, len(ds_casetkn)) can
# return len(ds_casetkn) and make the lookup below raise IndexError.
# randrange excludes the upper bound.
random_int = random.randrange(len(ds_casetkn))
print(random_int)
pprint(ds_casetkn[random_int])
print(CF_vocab)

ro.CGM5Min-Af2H_ct.RecNum
386
{'ObsDT': Timestamp('2021-08-23 08:10:00'),
 'PID': 1000027,
 'input_ids': [8, 9, 15, 16],
 'input_wgts': [289.0, 1440.0, 1.0, 1.0]}
{'input_ids': {'tid2tkn': {0: '[PAD]', 1: '[UNK]', 2: '[CLS]', 3: '[SEP]', 4: '[MASK]', 5: '[BOS]', 6: '[EOS]', 7: 'BfCGM:[UNK]', 8: 'BfCGM:recnum', 9: 'BfCGM:recspan', 10: 'BfCGM:recnum_0', 11: 'BfCGM:recspan_0', 12: 'AfCGM:[UNK]', 13: 'AfCGM:recnum', 14: 'AfCGM:recspan', 15: 'AfCGM:recnum_0', 16: 'AfCGM:recspan_0'}, 'tkn2tid': {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4, '[BOS]': 5, '[EOS]': 6, 'BfCGM:[UNK]': 7, 'BfCGM:recnum': 8, 'BfCGM:recspan': 9, 'BfCGM:recnum_0': 10, 'BfCGM:recspan_0': 11, 'AfCGM:[UNK]': 12, 'AfCGM:recnum': 13, 'AfCGM:recspan': 14, 'AfCGM:recnum_0': 15, 'AfCGM:recspan_0': 16}}, 'input_wgts': {}}


In [42]:
ds_casetkn[8]

{'PID': 1000261,
 'ObsDT': Timestamp('2021-08-31 07:45:00'),
 'input_ids': [8, 9, 13, 14],
 'input_wgts': [289.0, 1440.0, 24.0, 115.0]}

# Save

In [43]:
FnCaseFeatGamma.df_CF_info

Unnamed: 0_level_0,casefeat_idx_in_data
casefeat_id,Unnamed: 1_level_1


In [44]:
CF_Folder_data = FnCaseFeatGamma.CF_Folder_data
CF_Folder_data

'../_Data/3-Data_CaseFeat\\cf.CatUnseqTknsOneTS_co.5oviTuds5X\\data'

In [45]:
FnCaseFeatGamma.save_new_CFs_to_disk(CF_Folder_data)

In [46]:
FnCaseFeatGamma.df_CF_info

Unnamed: 0_level_0,casefeat_idx_in_data
casefeat_id,Unnamed: 1_level_1


In [47]:
FnCaseFeatGamma.ds_CF_data

In [48]:
CFids = FnCaseFeatGamma.CFids
ds_CF_data, df_CF_info = FnCaseFeatGamma.load_CFs_from_disk(CF_Folder_data, CFids)

In [49]:
ds_CF_data

In [50]:
df_CF_info

Unnamed: 0_level_0,casefeat_idx_in_data
casefeat_id,Unnamed: 1_level_1


# Save Vocab

In [51]:
CF_vocab = FnCaseFeatGamma.CF_vocab
CF_vocab

{'input_ids': {'tid2tkn': {0: '[PAD]',
   1: '[UNK]',
   2: '[CLS]',
   3: '[SEP]',
   4: '[MASK]',
   5: '[BOS]',
   6: '[EOS]',
   7: 'BfCGM:[UNK]',
   8: 'BfCGM:recnum',
   9: 'BfCGM:recspan',
   10: 'BfCGM:recnum_0',
   11: 'BfCGM:recspan_0',
   12: 'AfCGM:[UNK]',
   13: 'AfCGM:recnum',
   14: 'AfCGM:recspan',
   15: 'AfCGM:recnum_0',
   16: 'AfCGM:recspan_0'},
  'tkn2tid': {'[PAD]': 0,
   '[UNK]': 1,
   '[CLS]': 2,
   '[SEP]': 3,
   '[MASK]': 4,
   '[BOS]': 5,
   '[EOS]': 6,
   'BfCGM:[UNK]': 7,
   'BfCGM:recnum': 8,
   'BfCGM:recspan': 9,
   'BfCGM:recnum_0': 10,
   'BfCGM:recspan_0': 11,
   'AfCGM:[UNK]': 12,
   'AfCGM:recnum': 13,
   'AfCGM:recspan': 14,
   'AfCGM:recnum_0': 15,
   'AfCGM:recspan_0': 16}},
 'input_wgts': {}}

In [52]:
CF_Folder_vocab = FnCaseFeatGamma.CF_Folder_vocab
CF_Folder_vocab

'../_Data/3-Data_CaseFeat\\cf.CatUnseqTknsOneTS_co.5oviTuds5X\\vocab.p'

In [53]:
# Store the case-feature vocabulary as a one-column DataFrame and pickle it.
# NOTE(review): CaseObsName is the loop variable left over from the earlier
# `for CaseObsName, CaseObsInfo in COName_to_COInfo.items()` cell, so the
# column is named after the last case observation (see the column header in
# the displayed frame). Confirm whether CaseFeatName was intended before
# anything relies on this column name.
df_Vocab = pd.DataFrame({CaseObsName: CF_vocab})
df_Vocab.to_pickle(CF_Folder_vocab)
df_Vocab

Unnamed: 0,ro.CGM5Min-Af2H_ct.RecNum
input_ids,"{'tid2tkn': {0: '[PAD]', 1: '[UNK]', 2: '[CLS]..."
input_wgts,{}
