# Space

In [None]:
import logging
import os
import sys
from pprint import pprint

import numpy as np
import pandas as pd
import torch
from IPython.display import display, HTML

# Workspace root = the part of the current working directory that precedes the
# first occurrence of KEY. When KEY is absent, str.split returns the whole
# path, so the working directory stays where it is.
KEY = '1-WORKSPACE'
WORKSPACE_PATH = os.getcwd().split(KEY)[0]
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

# Project folders, written relative to the working directory set above.
SPACE = {
    'DATA_RAW': '_Data/0-Data_Raw',
    'DATA_RFT': '_Data/1-Data_RFT',
    'DATA_CASE': '_Data/2-Data_CASE',
    'DATA_AIDATA': '_Data/3-Data_AIDATA',
    'DATA_EXTERNAL': 'code/external',
    'CODE_FN': 'code/pipeline',
}
assert os.path.exists(SPACE['CODE_FN']), f'{SPACE["CODE_FN"]} not found'

print(SPACE['CODE_FN'])
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])

# CF Data

In [None]:
import datasets 
from recfldtkn.case_base.casefnutils.casefn import Case_Fn #  import AIDATA_ENTRYINPUT_PATH

######################## base dataset name and the cohorts to load
CF_DataName = 'CGMwithDietBf8h-CaseBase-CGM5MinEntry-31ec84c0520b37c1'
CohortName_list = [
    # 'WellDoc2022CGM',
    'WellDoc2025ALS',
    # 'WellDoc2025CVS',
    # 'WellDoc2025LLY',
]
########################

######################## one dataset folder per cohort: <CF_DataName>/<CohortName>
CF_DataName_list = [
    f'{CF_DataName}/{cohort_name}' for cohort_name in CohortName_list
]
if not CF_DataName_list:
    # Fail here with a clear message instead of further down on an undefined
    # `config` or an empty concatenation.
    raise ValueError('CohortName_list is empty: there is no dataset to load')
########################

ds_list = []
config = None
# A dedicated loop variable keeps the base name in CF_DataName intact while loading.
for cohort_data_name in CF_DataName_list:
    path = os.path.join(SPACE['DATA_AIDATA'], cohort_data_name)
    ds = datasets.load_from_disk(path)
    print(cohort_data_name, ds)
    # Only the config of the last loaded cohort is kept.
    # NOTE(review): the code below indexes `config` like a dict, so the project
    # presumably stores a dict under `config_name` -- confirm.
    config = ds.config_name
    ds_list.append(ds)

dataset = datasets.concatenate_datasets(ds_list)

# Names in front of '--' for every column containing '--tid'. Sorted because a
# set of strings has no stable iteration order across kernel sessions.
CF_list = sorted({col.split('--')[0] for col in dataset.column_names if '--tid' in col})
CF_fn_list = [Case_Fn(CF, SPACE) for CF in CF_list]
CF_to_CFvocab = {CF: CF_fn.COVocab for CF, CF_fn in zip(CF_list, CF_fn_list)}

# From here on CF_DataName holds the trigger case-base name read from the config.
CF_DataName = config['TriggerCaseBaseName']
TriggerCaseBaseArgs = config['TriggerCaseBaseName_to_TriggerCaseBaseArgs'][CF_DataName]
TriggerName = TriggerCaseBaseArgs['Trigger']['TriggerName']

logger.info(f'set up TriggerName: {TriggerName}')
logger.info(f'set up CF_Config: {[i for i in config]}')
config['CF_to_CFvocab'] = CF_to_CFvocab

print('total', dataset)

In [None]:
# Container handed to entry_fn_AITaskData; the concatenated dataset sits under 'ds_case'.
Data = dict(ds_case=dataset)

# OUTPUT 1: UniLabel

In [None]:
# Bare expression: the notebook renders the concatenated dataset's repr as this cell's output.
dataset

## Args

In [None]:
OneEntryArgs = dict(
    # ----------------- Input Part -----------------
    Input_Part=dict(
        EntryInputMethod='1TknInStep',
        CF_list=[
            'CGMValueBf24h',
            # 'CGMValueAf2h',
        ],
        BeforePeriods=['Bf24h'],
        # AfterPeriods=['Af2h'],
        InferenceMode=False,
        TargetField='CGMValue',
    ),

    # ----------------- Output Part -----------------
    Output_Part=dict(
        EntryOutputMethod='EventPred',

        # ------------ head 1: time from the event to now ------------
        EventTimeToNow='co.Bf24H_Diet5MinInfo:MinToNow',
        label_to_id_head1={
            '0h': 0,
            '1h': 1,
            '2h': 2,
            '3h': 3,
            '4h': 4,
            '5h': 5,
        },
        dimensions_head1=['food_event_time'],

        # ------------ head 2: food content ------------
        # Currently disabled; transform_fn_output reads 'EventCF_Name' from this part.
        # EventCF_Name='cf.Diet5MinBaseLMH_Bf24H',
        label_to_id_head2={'low': 0, 'medium': 1, 'high': 2},
        dimensions_head2=['carbs', 'fiber', 'fat', 'protein', 'sugar'],

        # Read by entry_fn_AITaskData: on-the-fly transform vs. a materialising map().
        set_transform=True,
        num_proc=4,
    ),
)

# Method names pulled out for later cells (EntryOutputMethod names the exported file).
EntryInputMethod = OneEntryArgs['Input_Part']['EntryInputMethod']
EntryOutputMethod = OneEntryArgs['Output_Part']['EntryOutputMethod']

In [None]:
from recfldtkn.aidata_base.entry import EntryAIData_Builder


# Build the entry object from the configuration, then keep handles to its two
# input-side callables; later cells pass both to entry_fn_AITaskData.
entry = EntryAIData_Builder(OneEntryArgs=OneEntryArgs, SPACE=SPACE)
tfm_fn_AIInputData, entry_fn_AIInputData = (
    entry.tfm_fn_AIInputData,
    entry.entry_fn_AIInputData,
)

## Function

In [None]:
import inspect

def get_OUTPUT_CFs(OneEntryArgs):
    """Return the ``'CF_list'`` entry of ``OneEntryArgs['Output_Part']``.

    An empty list is returned when ``OneEntryArgs`` has no ``'Output_Part'``
    key, or when that part has no ``'CF_list'`` key.
    """
    if 'Output_Part' in OneEntryArgs:
        return OneEntryArgs['Output_Part'].get('CF_list', [])
    return []
# Store the function's own source text on the function object as `fn_string`.
# NOTE(review): presumably read by Base.convert_variables_to_pystirng when the module file is generated -- confirm.
get_OUTPUT_CFs.fn_string = inspect.getsource(get_OUTPUT_CFs)

In [None]:
def transform_fn_output(examples, tfm_fn_AIInputData, OneEntryArgs, CF_to_CFvocab):
    """Transform one batch of cases into model inputs plus event-prediction labels.

    The input-side features are whatever ``tfm_fn_AIInputData`` returns; this
    function adds one label tensor per output dimension to that dict:

    * ``food_event_time_labels``: the ``EventTimeToNow`` column converted from
      minutes to whole hours (``minutes // 60``) and capped at the largest id
      in ``label_to_id_head1`` (5 when that mapping is not configured).
    * ``<dimension>_labels`` for every dimension of head 2, decoded from the
      tokens of the last element of each example's
      ``<EventCF_Name>--input_ids`` entry. Only produced when
      ``Output_Part['EventCF_Name']`` is configured.
    * ``labels``: a copy of ``food_event_time_labels``.

    Args:
        examples: batch as a dict of column name -> list of per-case values.
        tfm_fn_AIInputData: callable ``(examples, OneEntryArgs, CF_to_CFvocab)``
            returning the dict of input features.
        OneEntryArgs: configuration; ``Output_Part`` must contain
            ``'EventTimeToNow'`` and may contain ``'EventCF_Name'``,
            ``'label_to_id_head1'``, ``'label_to_id_head2'`` and
            ``'dimensions_head2'``.
        CF_to_CFvocab: mapping from case-function name to its vocabulary; read
            as ``CF_to_CFvocab[EventCF_Name]['input_ids']['tid2tkn']``.

    Returns:
        The dict returned by ``tfm_fn_AIInputData``, updated in place with the
        label tensors (dtype ``torch.long``).

    Raises:
        KeyError: if a configured column, vocabulary or token id is missing.
    """
    # Value used whenever a dimension has no usable target.
    # NOTE(review): presumably the ignore index of the training loss -- confirm.
    IGNORE_INDEX = -100

    Output_Part = OneEntryArgs['Output_Part']
    examples_tfm = tfm_fn_AIInputData(examples, OneEntryArgs, CF_to_CFvocab)

    # Kept nested on purpose: the notebook stores this function's text with
    # inspect.getsource(transform_fn_output), which captures only this def.
    def convert_token(event_input_ids, tid2tkn, dimensions, label_to_id):
        """Decode '<level> <dimension>' tokens into one label tensor per dimension."""
        rows = []
        for one_datapoint_event_input_ids in event_input_ids:
            dim_to_level = {}
            for tid in one_datapoint_event_input_ids:
                # Vocabulary keys are strings; the payload follows the last ':'.
                token = tid2tkn[str(tid)].split(':')[-1]
                if ' ' not in token:
                    continue  # not a '<level> <dimension>' token
                parts = token.split(' ')
                dim_to_level[parts[1]] = parts[0]
            # A missing dimension or an unknown level both map to IGNORE_INDEX,
            # so the array below never contains None.
            rows.append([
                label_to_id.get(dim_to_level.get(dim), IGNORE_INDEX)
                for dim in dimensions
            ])

        # The explicit shape keeps column slicing valid for an empty batch.
        labels = np.array(rows, dtype=np.int64).reshape(len(rows), len(dimensions))
        return {
            dim: torch.tensor(labels[:, idx], dtype=torch.long)
            for idx, dim in enumerate(dimensions)
        }

    # ------------ head 1: hours between the event and now ------------
    min_to_now = examples[Output_Part['EventTimeToNow']]
    label_to_id_head1 = Output_Part.get('label_to_id_head1')
    max_hour = max(label_to_id_head1.values()) if label_to_id_head1 else 5
    hour_labels = [min(int(minutes // 60), max_hour) for minutes in min_to_now]

    dim_to_labels = {'food_event_time': torch.tensor(hour_labels, dtype=torch.long)}

    # ------------ head 2: food content (optional) ------------
    EventCF_Name = Output_Part.get('EventCF_Name')
    if EventCF_Name is not None:
        event_input_ids = [steps[-1] for steps in examples[EventCF_Name + '--input_ids']]
        tid2tkn = CF_to_CFvocab[EventCF_Name]['input_ids']['tid2tkn']
        # Fall back to the values this function used before they were configurable.
        label_to_id = Output_Part.get('label_to_id_head2', {'low': 0, 'medium': 1, 'high': 2})
        dimensions = Output_Part.get('dimensions_head2', ['carbs', 'fiber', 'fat', 'protein', 'sugar'])
        dim_to_labels.update(convert_token(event_input_ids, tid2tkn, dimensions, label_to_id))

    examples_tfm.update({k + "_labels": v for k, v in dim_to_labels.items()})
    examples_tfm['labels'] = examples_tfm['food_event_time_labels'].clone()
    return examples_tfm

# Store the function's own source text on the function object as `fn_string`.
transform_fn_output.fn_string = inspect.getsource(transform_fn_output)


In [None]:
# Smoke test: run the transform on the first 64 cases, then print the produced
# keys, the configuration and the transformed batch itself.
examples = dataset[:64]
examples_tfm = transform_fn_output(
    examples,
    tfm_fn_AIInputData,
    OneEntryArgs,
    CF_to_CFvocab,
)
print(list(examples_tfm))

pprint(OneEntryArgs, sort_dicts=True)
pprint(examples_tfm)

In [None]:

def entry_fn_AITaskData(Data, 
                        CF_to_CFvocab, 
                        OneEntryArgs,
                        tfm_fn_AIInputData = None,
                        entry_fn_AIInputData = None,
                        ):
    """Attach the output transform to ``Data['ds_case']`` and store it as ``Data['ds_tfm']``.

    Args:
        Data: dict holding the case dataset under ``'ds_case'``; a pandas
            DataFrame (or subclass) is converted with
            ``datasets.Dataset.from_pandas`` first.
        CF_to_CFvocab: vocabulary mapping forwarded to ``transform_fn_output``.
        OneEntryArgs: configuration; ``Output_Part`` may set ``'set_transform'``
            (default True) and ``'num_proc'`` (default 4).
        tfm_fn_AIInputData: input-side transform forwarded to ``transform_fn_output``.
        entry_fn_AIInputData: accepted but not used by this function.

    Returns:
        The same ``Data`` dict, with ``'ds_tfm'`` added. When ``set_transform``
        is truthy the transform is registered on the dataset itself via
        ``set_transform`` and ``ds_tfm`` is that same object; otherwise the
        transform is applied with a batched ``map`` and the original columns
        are removed from the result.
    """
    def transform_fn(examples):
        return transform_fn_output(examples, tfm_fn_AIInputData, OneEntryArgs, CF_to_CFvocab)

    ds_case = Data['ds_case']
    # isinstance (not an exact type comparison) so DataFrame subclasses are converted too.
    if isinstance(ds_case, pd.DataFrame):
        ds_case = datasets.Dataset.from_pandas(ds_case)

    Output_Part = OneEntryArgs['Output_Part']
    num_proc = Output_Part.get('num_proc', 4)
    set_transform = Output_Part.get('set_transform', True)

    if set_transform:
        # Registers the transform on ds_case in place.
        ds_case.set_transform(transform_fn)
        ds_tfm = ds_case
    else:
        # Apply the transform with map, then keep only the columns it produced.
        old_cols = ds_case.column_names
        ds_tfm = ds_case.map(transform_fn, batched = True, num_proc = num_proc)
        ds_tfm = ds_tfm.remove_columns(old_cols)

    Data['ds_tfm'] = ds_tfm
    return Data

# Store the function's own source text on the function object as `fn_string`.
# NOTE(review): presumably read by Base.convert_variables_to_pystirng when the module file is generated -- confirm.
entry_fn_AITaskData.fn_string = inspect.getsource(entry_fn_AITaskData)

In [None]:
# Run the entry function on the loaded data, then show the transformed dataset.
Data = entry_fn_AITaskData(
    Data=Data,
    CF_to_CFvocab=CF_to_CFvocab,
    OneEntryArgs=OneEntryArgs,
    tfm_fn_AIInputData=tfm_fn_AIInputData,
    entry_fn_AIInputData=entry_fn_AIInputData,
)

ds_tfm = Data['ds_tfm']
ds_tfm

In [None]:
# Take the first four transformed cases and print each field name with its shape.
batch = ds_tfm[:4]
for field_name in batch:
    print(field_name, batch[field_name].shape)

In [None]:
from recfldtkn.base import Base
from recfldtkn.aidata_base.entry import AIDATA_ENTRYOUTPUT_PATH

# Import lines for the top of the generated module. They cover the module-level
# names the exported functions use: torch and np (transform_fn_output), pd and
# datasets (entry_fn_AITaskData).
prefix = [
    'import torch',
    'import pandas as pd', 
    'import numpy as np', 
    'import datasets',
    ]
# transform_fn_output is exported as well: entry_fn_AITaskData calls it by name,
# so a module without it fails with NameError as soon as the transform runs.
# NOTE(review): assumes convert_variables_to_pystirng emits one definition per
# listed function -- confirm.
fn_variables = [
    get_OUTPUT_CFs,
    transform_fn_output,
    entry_fn_AITaskData,
]
pycode = Base.convert_variables_to_pystirng(fn_variables = fn_variables, prefix = prefix)
pypath = os.path.join(SPACE['CODE_FN'], AIDATA_ENTRYOUTPUT_PATH, f'{EntryOutputMethod}.py')
print(pypath)
# exist_ok avoids the separate existence check before creating the folder.
os.makedirs(os.path.dirname(pypath), exist_ok=True)
with open(pypath, 'w') as file:
    file.write(pycode)

# Speed Test

In [None]:

from torch.utils.data import DataLoader
import time
import numpy as np

# 1. DataLoader over the transformed dataset: 32 cases per batch, one worker
#    process, pinned memory, fixed order.
loader = DataLoader(
    ds_tfm,
    batch_size=32,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
)

# 2. Warm-up: one full untimed pass over the loader.
print("Warming up...")
for _ in loader:
    continue

# 3. Timed pass over every batch.
num_batches = len(loader)
print(f"Testing with {num_batches} batches...")

start_time = time.perf_counter()
for _ in loader:
    continue
total_time = time.perf_counter() - start_time

# 4. Derived rates.
throughput = num_batches / total_time
samples_per_sec = len(ds_tfm) / total_time

print("\n".join([
    f"\nResults:",
    f"- Batches/s: {throughput:.1f}",
    f"- Samples/s: {samples_per_sec:.1f}",
    f"- Batch time: {1000*total_time/num_batches:.1f}ms",
    f"- Total time: {total_time:.2f}s",
]))


# Recorded output from an earlier run of this cell (batch_size=32):
#
# Warming up...
# Testing with 1657 batches...

# Results:
# - Batches/s: 47.8
# - Samples/s: 1527.5
# - Batch time: 20.9ms
# - Total time: 34.69s

In [None]:

from torch.utils.data import DataLoader
import time
import numpy as np

# 1. Same measurement as the previous cell, with 64 cases per batch.
loader = DataLoader(
    ds_tfm,
    batch_size=64,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
)

# 2. Warm-up: one full untimed pass over the loader.
print("Warming up...")
for _ in loader:
    continue

# 3. Timed pass over every batch.
num_batches = len(loader)
print(f"Testing with {num_batches} batches...")

start_time = time.perf_counter()
for _ in loader:
    continue
total_time = time.perf_counter() - start_time

# 4. Derived rates.
throughput = num_batches / total_time
samples_per_sec = len(ds_tfm) / total_time

print("\n".join([
    f"\nResults:",
    f"- Batches/s: {throughput:.1f}",
    f"- Samples/s: {samples_per_sec:.1f}",
    f"- Batch time: {1000*total_time/num_batches:.1f}ms",
    f"- Total time: {total_time:.2f}s",
]))