In [None]:
import sys
import os
import logging
import pandas as pd
import datasets
from pprint import pprint
# Marker used to locate the workspace root: everything in the current working
# directory path that precedes the first occurrence of KEY is kept. If KEY does
# not occur in the path, the current directory itself is used unchanged.
KEY = '2-NOTEBOOK'
WORKSPACE_PATH = os.getcwd().split(KEY)[0]
print(WORKSPACE_PATH)
# Every relative path in SPACE below is resolved from this directory.
os.chdir(WORKSPACE_PATH)

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s',
)

# Relative locations (from WORKSPACE_PATH) handed to the pipeline builders.
SPACE = {
    'DATA_RAW': '_Data/0-Data_Raw',
    'DATA_RFT': '_Data/1-Data_RFT',
    'DATA_CASE': '_Data/2-Data_CASE',
    'DATA_AIDATA': '_Data/3-Data_AIDATA',
    'DATA_EXTERNAL': 'code/external',
    'CODE_FN': 'code/pipeline',
    'MODEL_ROOT': './_Model',
}
assert os.path.exists(SPACE['CODE_FN']), f'{SPACE["CODE_FN"]} not found'
print(SPACE['CODE_FN'])
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])

In [None]:
from recfldtkn.aidata_base.entry import EntryAIData_Builder
from recfldtkn.case_base.casefnutils.casefn import Case_Fn
from datasets import DatasetInfo

# Name of the AI-data product. It is not referenced by any other cell visible
# in this notebook.
OneAIDataName = 'DietEventBench'

# Case-feature dataset identifier shared by all cohorts.
CF_DataName = 'DietEvent-CGM5MinEntry-1ea9d787eef20fb7'
CohortName_list = ['WellDoc2022CGM', 'WellDoc2025ALS', 'WellDoc2025CVS', 'WellDoc2025LLY']

# One '<cohort>/<CF_DataName>' entry per cohort, in cohort order.
CF_DataName_list = ['/'.join((cohort, CF_DataName)) for cohort in CohortName_list]
CF_DataName_list


In [None]:
# Builder created with only the SPACE path mapping; a later cell re-creates
# `entry` with OneEntryArgs attached.
entry = EntryAIData_Builder(SPACE = SPACE)
entry

In [None]:
# Load the per-cohort case-feature datasets named in CF_DataName_list.
# NOTE(review): presumably this returns one combined dataset across the four
# cohorts — confirm against merge_one_cf_dataset.
dataset = entry.merge_one_cf_dataset(CF_DataName_list)
dataset

In [None]:
# dataset.info.config_name

# AI Data

In [None]:
# Thresholds applied to 'MEDInfoBf24h-DietLastToNow'. The label comments map
# 120-180 to "2 to 3 hours" and 180-420 to "3 to 7 hours", so the unit is
# presumably minutes since the last diet record.
DIET_GAP_MIN = 120
DIET_GAP_LABEL_CUT = 180
DIET_GAP_MAX = 420

# Row filters that every split applies in addition to its own timebin filter.
# NOTE(review): 'ObsDT_Minute == 0' presumably keeps on-the-hour observations
# only — confirm against the case-feature definition.
SHARED_SELECTION_RULES = (
    ('MEDInfoBf24h-DietRecNum', '>', 0),
    ('MEDInfoBf24h-DietLastToNow', '>=', DIET_GAP_MIN),
    ('MEDInfoBf24h-DietLastToNow', '<=', DIET_GAP_MAX),
    ('ObsDT_Minute', '==', 0),
)


def build_split_selection(timebins):
    """Return the selection spec for one split.

    Args:
        timebins: iterable of `split_timebin` values that belong to the split.

    Returns:
        A dict with a 'Rules' list (the timebin membership rule followed by a
        fresh copy of every shared rule) and 'Op' set to 'and'.
    """
    rules = [['split_timebin', 'in', tuple(timebins)]]
    # Copy each shared rule so no two splits hold the same list object.
    rules.extend(list(rule) for rule in SHARED_SELECTION_RULES)
    return {'Rules': rules, 'Op': 'and'}


OneEntryArgs = {
    # ----------------- Split Part -----------------
    'Split_Part': {
        # Original note: "you need to design this function" — the split method
        # named here has to be provided by the pipeline.
        'SplitMethod': 'SplitFromColumns',
        'Split_to_Selection': {
            'train': build_split_selection(('train-early', 'valid-early')),
            'valid': build_split_selection(('train-middle', 'valid-middle')),
            'test-id': build_split_selection(('train-late', 'valid-late')),
            'test-od': build_split_selection(('test-early', 'test-middle', 'test-late')),
        },
    },

    # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': '1TknInStep',
        'CF_list': [
            'CGMValueBf24h',
            # 'CGMValueAf2h',
        ],
        'BeforePeriods': ['Bf24h'],
        # 'AfterPeriods': ['Af2h'],
        'InferenceMode': False,
        'TargetField': 'CGMValue',
    },

    # ----------------- Output Part -----------------
    'Output_Part': {
        'EntryOutputMethod': 'UniLabelRules',
        'CF_list': ['MEDInfoBf24h'],
        # No label covers 0-2 hours: the Split_Part rules already require
        # DietLastToNow >= DIET_GAP_MIN.
        # NOTE(review): DIET_GAP_LABEL_CUT appears in both ranges; confirm how
        # the label method resolves the shared boundary.
        'label_rule': {
            1: ('MEDInfoBf24h-DietLastToNow', 'in', [DIET_GAP_MIN, DIET_GAP_LABEL_CUT]),  # ate 2 to 3 hours before
            0: ('MEDInfoBf24h-DietLastToNow', 'in', [DIET_GAP_LABEL_CUT, DIET_GAP_MAX]),  # ate 3 to 7 hours before
            -100: 'others',
        },
        'assertion': [
            ('MEDInfoBf24h-DietLastToNow', 'in', [DIET_GAP_MIN, DIET_GAP_MAX]),
        ],
        'set_transform': False,
        'num_proc': 4,
    },
}

SplitMethod = OneEntryArgs['Split_Part']['SplitMethod']
SplitMethod

In [None]:
from recfldtkn.aidata_base.entry import EntryAIData_Builder

# Re-create the builder, this time with the entry configuration attached.
entry = EntryAIData_Builder(OneEntryArgs = OneEntryArgs, 
                            SPACE = SPACE)

# NOTE(review): `config` is not assigned in any cell visible in this notebook,
# so on a fresh kernel (Restart & Run All) this line raises NameError. Define
# it before this cell — TODO confirm what split_cf_dataset expects here.
split_to_dataset = entry.split_cf_dataset(dataset, config = config)
split_to_dataset

In [None]:
# Display the 'train' split produced by the split cell.
split_to_dataset['train']

In [None]:
# NOTE(review): if split_to_dataset is a Hugging Face `DatasetDict`, it exposes
# no `.info` attribute (only the individual splits do) and this cell would
# raise AttributeError — confirm the return type of split_cf_dataset.
split_to_dataset.info

In [None]:
# Register the output root in SPACE at this point (it is not part of the
# initial SPACE definition in the setup cell).
SPACE['DATA_HFDATA'] = '_Data/5-Data_HFDATA'

# The output folder is keyed by CF_DataName.
# NOTE(review): OneAIDataName is defined earlier but unused — confirm which
# name the saved dataset should carry.
path = os.path.join(SPACE['DATA_HFDATA'], CF_DataName)

print(path)
# Write every split under `path`.

split_to_dataset.save_to_disk(path)

In [None]:
# Bind the train split to its own name. Rebinding `dataset` here would replace
# the merged input, so re-running the split cell afterwards would silently
# split the train subset instead of the full data.
train_dataset = split_to_dataset['train']
train_dataset.info

In [None]:
# split_to_dataset.config_name 

In [None]:
# NOTE(review): `CF_to_CFvocab` is not assigned in any cell visible in this
# notebook, so on a fresh kernel this line raises NameError. TODO: load or
# build the vocabulary mapping before this cell.
Name_to_Data = entry.setup_EntryFn_to_NameToData(split_to_dataset, CF_to_CFvocab, OneEntryArgs)
Name_to_Data

In [None]:
# Pull the 'train' entry out of Name_to_Data and take its 'ds_tfm' item.
# NOTE(review): presumably the dataset with the input/output transforms
# applied — confirm against setup_EntryFn_to_NameToData.
Data = Name_to_Data['train']
ds_tfm = Data['ds_tfm']
ds_tfm

In [None]:
# Slice the first five items of ds_tfm as a sample batch.
batch = ds_tfm[:5]
# pprint(batch)

# Redundant import: pprint is already imported in the first cell.
from pprint import pprint
# Show only the 'input_ids' field of the sample batch.
pprint(batch['input_ids'], compact=True)

In [None]:
# Show the 'labels' field of the same sample batch. Per the label_rule in
# OneEntryArgs, the configured label values are 1, 0 and -100.
pprint(batch['labels'], compact=True)