# Space

In [None]:
import os
import logging
import pandas as pd 
from pprint import pprint 
from IPython.display import display, HTML
pd.set_option('display.max_columns', None)
KEY = 'WorkSpace'
WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY
# print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)
import sys
from proj_space import SPACE
sys.path.append(SPACE['CODE_FN'])
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

from datasets import disable_caching
disable_caching()

# Step 1: AIData

In [None]:
# Oneday: 288, 24pd. 1/12
from datasets import load_from_disk

AIDataName = 'EventFood2CGM_bf5min_WellDoc_v2' # v2 6 cohorts. 
path = os.path.join(SPACE['DATA_AIDATA'], AIDataName)
print(path)
dataset = load_from_disk(path)
dataset

In [None]:
config = dataset.info.__dict__['config_name']# .features['cf'].feature.vocab
print([i for i in config])
CF_to_CFvocab = config['CF_to_CFvocab']
pprint([i for i in CF_to_CFvocab], sort_dicts=True)

In [None]:
Data = {
    'ds_case': dataset,
}

# Step 2: EntryFn - Input_Part

## Args

In [None]:
OneEntryArgs = {
    # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': 'Mto1Period_MultiTknInStep',
        'CF_list': [
            'cf.TargetCGM_Bf24H',
            'cf.TargetCGM_Af2H',

            # 'cf.TimeSparse_Bf24H', 
            # 'cf.TimeSparse_Af2H',

            'cf.Diet5MinBaseLMH_Bf24H',
            'cf.Diet5MinBaseLMH_Af2H',
        ],
        'TargetField': 'TargetCGM',
        # 'TimeField':   'Time',
        'EventFields': [
            # 'Activity',
            'Diet5MinBaseLMH',
        ],
        'BeforePeriods': ['Bf24H'],
        'AfterPeriods': ['Af2H'],
        'InferenceMode': False, # 'WithFutureEvent' #  # 'NoFutureEvent', 'WithFutureEvent', 
    }, 
}

EntryInputMethod = OneEntryArgs['Input_Part']['EntryInputMethod']

## InputCFs

In [None]:
import torch 
import datasets
import inspect
import numpy as np
from scipy.sparse import csr_matrix, hstack
import itertools

## %%%%%%%%%%%%%%%%%%%%% user functions
def get_INPUT_CFs(OneEntryArgs):
    """Return the CF names used as model input for the given entry args.

    Reads ``OneEntryArgs['Input_Part']`` and starts from its ``CF_list``.
    The list is then filtered according to ``InferenceMode``:

    * ``'NoFutureEvent'``: keep only CFs whose name contains one of the
      ``BeforePeriods`` strings.
    * ``'WithFutureEvent'``: keep the CFs above plus every CF whose name
      does not contain ``TargetField``.
    * anything else (e.g. ``False``): ``CF_list`` is returned as is.
    """
    Input_Part = OneEntryArgs['Input_Part']
    CF_list = Input_Part['CF_list']
    assert type(CF_list) == list, f'InputCFs must be a list, but got {type(CF_list)}'

    # All three settings are read up front, whichever mode ends up being used.
    InferenceMode = Input_Part['InferenceMode']
    BeforePeriods = Input_Part['BeforePeriods']
    TargetField = Input_Part['TargetField']

    def in_before_period(cf_name):
        # Substring match of any before-period marker against the CF name.
        return any(period in cf_name for period in BeforePeriods)

    if InferenceMode == 'NoFutureEvent':
        return [cf for cf in CF_list if in_before_period(cf)]

    if InferenceMode == 'WithFutureEvent':
        return [cf for cf in CF_list if in_before_period(cf) or TargetField not in cf]

    # No inference-time filtering requested.
    return CF_list

get_INPUT_CFs.fn_string = inspect.getsource(get_INPUT_CFs)

In [None]:
# EntryInputMethod = OneEntryArgs['Input_Part']['EntryInputMethod']
InputCFs = get_INPUT_CFs(OneEntryArgs)
InputCFs

In [None]:
ds_case = Data['ds_case']
ds_case

## Field_to_CFs

In [None]:
# Input_Part = OneEntryArgs['Input_Part']
# TargetField = Input_Part['TargetField']
# TimeField = Input_Part['TimeField']
# EventFields = Input_Part['EventFields']

# FieldList = [TargetField, TimeField] + EventFields
# Field_to_CFs = {
#     field: [i for i in InputCFs if field in i] for field in FieldList
# }

# Field_to_CFs

In [None]:
# CF_to_CFvocab['cf.DietSparse_Bf24H']

## Examples

In [None]:
examples = ds_case.shuffle(seed=42)[:64] # .select(range(5))  
# examples = ds_case[:4] 
pprint(examples, sort_dicts=False, compact=True)

In [None]:
OneEntryArgs

In [None]:
Input_Part = OneEntryArgs['Input_Part']
Input_Part

In [None]:
InputCFs = Input_Part['CF_list']
InputCFs

## TargetCF

In [None]:
TargetField = Input_Part['TargetField']
TargetField

In [None]:
TargetCFs = [i for i in InputCFs if TargetField in i]
TargetCFs

In [None]:
from datetime import datetime 


s = datetime.now()
examples_tfm = {}

############################################################
# # 0:00:00.002059
## method 1:
# df = pd.DataFrame({cf: examples[cf + '--input_ids'] for cf in TargetCFs})
# df['input_ids'] = df.apply(lambda x: list(itertools.chain(*x.values)), axis=1)
# examples_tfm['input_ids'] = torch.LongTensor(np.array(df['input_ids'].to_list())) # ().copy()


############################################################
# # 0:00:00.000868
# method 2: 
# Step 1: Directly access columns as numpy arrays
target_arrays = [np.array(examples[f"{cf}--input_ids"]) for cf in TargetCFs]
# Step 2: Concatenate along columns (axis=1) to combine features
# Assuming each array has shape (batch_size, seq_len)
stacked_ids = np.concatenate(target_arrays, axis=1)
examples_tfm['input_ids'] = torch.LongTensor(stacked_ids)
# examples_tfm['input_ids'] = stacked_ids # torch.LongTensor()


e = datetime.now()
print(f'TargetCFs: {e-s}')
print(examples_tfm['input_ids'].shape)
examples_tfm


## Update the Emptiness 

In [None]:
def detect_empty_values(values):
    """Return True when ``values`` holds exactly one step with one element equal to 0.

    In other words ``values`` looks like ``[[0]]`` (the element is compared
    after ``int()`` conversion). Any other length or value returns False.
    """
    # Checked outermost-first so that indexing never runs on an empty sequence.
    if len(values) != 1:
        return False
    only_step = values[0]
    if len(only_step) != 1:
        return False
    return int(only_step[0]) == 0
detect_empty_values.fn_string = inspect.getsource(detect_empty_values)


def update_emptiness_of_examples(examples, CF):
    """Blank out the sequence columns of ``CF`` for data points flagged as empty.

    Emptiness is decided per data point by ``detect_empty_values`` on the
    ``'<CF>--input_ids'`` column. For each of the ``input_ids``,
    ``input_wgts`` and ``timestep`` columns that exists in ``examples``, the
    rows of empty data points are replaced by ``[]`` and all other rows are
    passed through unchanged.

    Make sure CF is an event CF, i.e. one that has steps.

    Returns a dict holding only the rewritten columns; ``examples`` itself is
    not modified here.
    """
    empty_flags = [detect_empty_values(row) for row in examples[CF + '--input_ids']]

    updated = {}
    for suffix in ('input_ids', 'input_wgts', 'timestep'):
        column = f'{CF}--{suffix}'
        if column not in examples:
            continue
        rows = examples[column]
        # One empty list per column, shared by all empty rows of that column.
        blank = []
        updated[column] = [
            blank if is_empty else rows[dp_idx]
            for dp_idx, is_empty in enumerate(empty_flags)
        ]
    return updated
update_emptiness_of_examples.fn_string = inspect.getsource(update_emptiness_of_examples)


In [None]:
# examples['cf.ActivitySparse_Af2H--input_ids']

In [None]:
# examples_tfm = {}
# examples_tfm['input_ids'] = torch.LongTensor(np.array(df['input_ids'].to_list())) # ().copy()
# examples_tfm

In [None]:
# TimeField = Input_Part['TimeField']
# TimeField

# TimeCFs = [i for i in InputCFs if TimeField in i]
# TimeCFs

In [None]:
# EventCFs = [i for i in InputCFs if TargetField not in i and TimeField not in i]
# EventCFs

In [None]:
EventFields = Input_Part['EventFields']
EventFields

In [None]:
# s = datetime.now()

# for EventCF in EventCFs:
#     examples_updated = update_emptiness_of_examples(examples, EventCF)
#     for k, v in examples_updated.items():
#         examples[k] = v

# e = datetime.now()
# print(f'update_emptiness_of_examples for EventCFs: {e-s}')


## Event to Tensor

In [None]:
OneEvent = EventFields[0]

OneEventCFs = [i for i in InputCFs if OneEvent in i]

print(OneEventCFs)

In [None]:
seqtype = 'input_ids'

# method 1
s = datetime.now()
df = pd.DataFrame({cf: examples[cf + f'--{seqtype}'] for cf in OneEventCFs})
df[seqtype] = df.apply(lambda x: list(itertools.chain(*x.values)), axis=1)
values = df[seqtype].to_list()
e = datetime.now()
print(f'{seqtype}: {e-s}')
df1 = df


# method 2
s = datetime.now()
columns_data = [examples[f"{cf}--{seqtype}"] for cf in OneEventCFs]
values = []
for sample_items in zip(*columns_data):
    combined = list(itertools.chain(*sample_items))
    values.append(combined)
e = datetime.now()
print(f'{seqtype}: {e-s}')
df = pd.DataFrame({cf: examples[cf + f'--{seqtype}'] for cf in OneEventCFs})
df[seqtype] = values
df2 = df

In [None]:
df
# 0, -239. 

In [None]:
# seqtype = 'input_wgts'

# # method 1
# s = datetime.now()
# df = pd.DataFrame({cf: examples[cf + f'--{seqtype}'] for cf in OneEventCFs})
# df[seqtype] = df.apply(lambda x: list(itertools.chain(*x.values)), axis=1)
# values = df[seqtype].to_list()
# e = datetime.now()
# print(f'{seqtype}: {e-s}')
# df1 = df


# # method 2
# s = datetime.now()
# columns_data = [examples[f"{cf}--{seqtype}"] for cf in OneEventCFs]
# values = []
# for sample_items in zip(*columns_data):
#     combined = list(itertools.chain(*sample_items))
#     values.append(combined)
# e = datetime.now()
# print(f'{seqtype}: {e-s}')
# df = pd.DataFrame({cf: examples[cf + f'--{seqtype}'] for cf in OneEventCFs})
# df[seqtype] = values
# df2 = df

# # df2

In [None]:
seqtype = 'timestep'

# method 1
s = datetime.now()
df = pd.DataFrame({cf: examples[cf + f'--{seqtype}'] for cf in OneEventCFs})
df[seqtype] = df.apply(lambda x: list(itertools.chain(*x.values)), axis=1)
values = df[seqtype].to_list()
e = datetime.now()
print(f'{seqtype}: {e-s}')
df1 = df


# method 2
s = datetime.now()
columns_data = [examples[f"{cf}--{seqtype}"] for cf in OneEventCFs]
values = []
for sample_items in zip(*columns_data):
    combined = list(itertools.chain(*sample_items))
    values.append(combined)
e = datetime.now()
print(f'{seqtype}: {e-s}')
df = pd.DataFrame({cf: examples[cf + f'--{seqtype}'] for cf in OneEventCFs})
df[seqtype] = values
df2 = df

In [None]:
df

In [None]:
# # [i for i in examples]
# df = pd.DataFrame({cf: examples[cf + '--input_wgts'] for cf in OneEventCFs})
# df['input_wgts'] = df.apply(lambda x: list(itertools.chain(*x.values)), axis=1)
# df

In [None]:
# Timing comparison of two ways to merge, per data point, the sequences of all
# CFs in OneEventCFs into one list per sequence type.
s = datetime.now()

example_event_info = {}

# method 1: one DataFrame per sequence type, CF columns chained row by row.
for seqtype in ['input_ids', 'input_wgts', 'timestep']:
    df = pd.DataFrame({cf: examples[cf + '--' + seqtype] for cf in OneEventCFs})
    df[seqtype] = df.apply(lambda x: list(itertools.chain(*x.values)), axis=1)
    example_event_info[seqtype] = df[seqtype].to_list()

# pprint(example_event_info, sort_dicts=False, compact=True)
e = datetime.now()
print(f'method 1: example_event_info: {e-s}')



# method 2: plain zip/chain over the raw columns. This overwrites the result of
# method 1 and additionally tracks batch_size and max_features.
s = datetime.now()
example_event_info = {}


max_features = 0  # longest single step (number of entries) seen in any input_ids sequence
batch_size = 0    # number of data points, taken from the first CF column
for seqtype in ['input_ids', 'input_wgts', 'timestep']:
    columns_data = [examples[f"{cf}--{seqtype}"] for cf in OneEventCFs]
    batch_size = len(columns_data[0])
    values = []
    # zip(*columns_data) yields, for each data point, one entry per CF.
    for sample_items in zip(*columns_data):
        combined = list(itertools.chain(*sample_items))

        # print(combined)
        if seqtype == 'input_ids': 
            # max() fails on an empty sequence, so the empty case is handled explicitly.
            if len(combined) == 0:  
                current_max_length = 0
            else:
                current_max_length = max(len(i) for i in combined)

            max_features = max(max_features, current_max_length)
        values.append(combined)
    example_event_info[seqtype] = values
# pprint(example_event_info, sort_dicts=False, compact=True)
e = datetime.now()
print(f'method 2: example_event_info: {e-s}')

print(batch_size, max_features)

# max_features stays 0 only when every merged input_ids sequence is empty or holds only empty steps.
if max_features == 0: 
    print('max_features is 0, which means in the batch, no data points contains this event. so we will skip it')

In [None]:
# 678
# tkn = [Group-670, Level-670~680]
# wgt = [1,         0.8]

In [None]:
df_batch = pd.DataFrame(example_event_info)
df_batch

## timestep info

In [None]:
OneEventCFs

In [None]:
CFvocab = CF_to_CFvocab[OneEventCFs[0]]
# CFvocab

In [None]:
def get_timestep_info(examples, OneEventCFs):
    """Build the timestep template spanned by the given CFs.

    For every CF the time metadata is read from the first row of the
    ``'<cf>--timeinfo'`` (keys) and ``'<cf>--timevalues'`` (values) columns.
    Each metadata entry whose key contains ``'StartIdx-To-EndIdx'`` holds a
    ``'<start>:To:<end>'`` range; every integer index in that inclusive range
    becomes one timestep.

    Returns a dict with:
        timesteps:            labels of the form ``'<TimeStepType>_<index>'``
        timestep_orig_ids:    the integer indices, in the same order
        index_map:            index -> position in ``timestep_orig_ids``
        total_timedelta_info: one ``(TimeStepSize, TimeUnit)`` pair per timestep
    """
    step_labels = []
    step_ids = []
    step_deltas = []

    for cf in OneEventCFs:
        # Only row 0 is consulted for the metadata.
        meta_keys = examples[f"{cf}--timeinfo"][0]
        meta_values = examples[f"{cf}--timevalues"][0]
        meta = dict(zip(meta_keys, meta_values))

        unit = meta['TimeUnit']
        size = meta['TimeStepSize']
        kind = meta['TimeStepType']

        for name, span in meta.items():
            if 'StartIdx-To-EndIdx' not in name:
                continue
            first, last = map(int, span.split(':To:'))
            for step in range(first, last + 1):
                step_labels.append(f"{kind}_{step}")
                step_ids.append(step)
                step_deltas.append((size, unit))

    # If an index occurs more than once, the last position wins.
    positions = {step: pos for pos, step in enumerate(step_ids)}

    return {
        'timesteps': step_labels,
        'timestep_orig_ids': step_ids,
        'index_map': positions,
        'total_timedelta_info': step_deltas,
    }

get_timestep_info.fn_string = inspect.getsource(get_timestep_info)

In [None]:
def get_timestep_info_old(examples, OneEventCFs): 
    """Earlier version of the timestep-template builder, kept for comparison.

    Reads the time metadata of each CF from row 0 of its ``--timeinfo`` /
    ``--timevalues`` columns, expands every ``'StartIdx-To-EndIdx'`` range
    into labels ``'<TimeStepType>_<index>'`` and recovers the integer indices
    by parsing those labels back. Unlike ``get_timestep_info`` the result has
    no ``'index_map'`` entry.
    """
    # do not delete this. 

    timesteps = []
    total_timedelta_info = []

    for cf in OneEventCFs:
        meta = dict(zip(examples[f'{cf}--timeinfo'][0], examples[f'{cf}--timevalues'][0]))
        unit = meta['TimeUnit']
        size = meta['TimeStepSize']
        kind = meta['TimeStepType']

        range_keys = [key for key in meta if 'StartIdx-To-EndIdx' in key]
        for key in range_keys:
            bounds = meta[key].split(':To:')
            first = int(bounds[0])
            last = int(bounds[1])
            labels = [f'{kind}_{step}' for step in range(first, last + 1)]
            timesteps.extend(labels)
            total_timedelta_info.extend([(size, unit)] * len(labels))

    # The index is whatever follows the last underscore of the label.
    timestep_orig_ids = [int(label.rsplit('_', 1)[-1]) for label in timesteps]

    return {
        'timesteps': timesteps,
        'timestep_orig_ids': timestep_orig_ids,
        'total_timedelta_info': total_timedelta_info,
    }

get_timestep_info_old.fn_string = inspect.getsource(get_timestep_info_old)

In [None]:
s = datetime.now()
timestep_info_old = get_timestep_info_old(examples, OneEventCFs)
e = datetime.now()
print(f'get_timestep_info_old: {e-s}')
# timestep_info


s = datetime.now()
timestep_info = get_timestep_info(examples, OneEventCFs)
e = datetime.now()
print(f'get_timestep_info_new: {e-s}')
# timestep_info

In [None]:
# pprint(timestep_info, sort_dicts=False, compact=True)

In [None]:
# pprint(timestep_info, sort_dicts=False, compact=True)

In [None]:
# [i for i in examples]
timestep_orig_ids = timestep_info['timestep_orig_ids']

## Batch Array Filling with Sparse Events Crowd Features

In [None]:
# pprint(timestep_info['timestep_orig_ids'], sort_dicts=False, compact=True)

max_timesteps = len(timestep_info['timestep_orig_ids'])
print(batch_size, max_timesteps, max_features)

In [None]:
names = [i for i in example_event_info]
for items_sample in zip(*example_event_info.values()):
    print(items_sample)
    single_data_point = dict(zip(names, items_sample))
    break


In [None]:
single_data_point['timestep']

In [None]:
# old method that takes too much time. 
# Fills the (template_len, max_features) arrays of ONE data point, printing every write.

timestep_orig_ids = timestep_info['timestep_orig_ids']
index_map = timestep_info['index_map']
timestep = single_data_point['timestep']
# Translate each timestep id of this data point into its row in the template.
ts_indices = [index_map[ts_id] for ts_id in timestep]
print(timestep)
print(ts_indices)

template_len = len(timestep_orig_ids)
UNK_ID = 1


input_ids  = np.zeros((template_len, max_features), dtype=np.int64)
input_wgts = np.zeros((template_len, max_features), dtype=np.float32)
# Every row starts with UNK_ID / weight 1.0 in column 0; rows that have an event are overwritten below.
input_ids[:, 0] = UNK_ID
input_wgts[:, 0] = 1.0

# Here `idx` is the list of token ids of one step, not a position.
for idx, ts_idx in zip(single_data_point['input_ids'], ts_indices):
    print(ts_idx, idx)
    input_ids[ts_idx, :len(idx)] = idx


for wgt, ts_idx in zip(single_data_point['input_wgts'], ts_indices):
    print(ts_idx, wgt)
    input_wgts[ts_idx, :len(wgt)] = wgt

In [None]:
# another method that takes too much time. 

s1 = datetime.now()
input_ids = np.zeros((batch_size, template_len, max_features), dtype=np.int64)
e1 = datetime.now()
print(f'create templates:                    {e1-s1}')

s2 = datetime.now()
input_ids[:, :, 0] = UNK_ID
e2 = datetime.now()
print(f'assign UNK_ID:                       {e2-s2}') # we do not need to assign UNK_ID here anymore

### Prepare Event's Tensor Outputs

In [None]:
s0 = datetime.now()

s1 = datetime.now()
input_ids_batch = np.zeros((batch_size, template_len, max_features), dtype=np.int64)
e1 = datetime.now()
print(f'create input_ids:                     {e1-s1}')
print(input_ids_batch.shape)

s1 = datetime.now()
input_wgts_batch = np.zeros((batch_size, template_len, max_features), dtype=np.float32)
e1 = datetime.now()
print(f'create input_wgts:                    {e1-s1}')
print(input_wgts_batch.shape)


s = datetime.now()
event_indicators_batch = np.zeros((batch_size, template_len), dtype=np.int64)
e = datetime.now()
print(f'create event_indicators:              {e-s}')
print(event_indicators_batch.shape)

s = datetime.now()
# timestep_orig_ids_batch = np.array([timestep_orig_ids] * batch_size)
timestep_orig_ids_batch = np.tile(timestep_orig_ids, (batch_size, 1))
e = datetime.now()
print(f'create timestep_orig_ids:             {e-s}')

e0 = datetime.now()
print(f'total time:                           {e0-s0}')

In [None]:
event_info_final = {
    'input_ids': input_ids_batch,
    'input_wgts': input_wgts_batch,
    'timestep_orig_ids': timestep_orig_ids_batch,
    'event_indicators': event_indicators_batch
}

In [None]:
# Fill the pre-allocated batch arrays in event_info_final from the merged
# per-data-point sequences in example_event_info, and time the whole pass.
s = datetime.now()


index_map = timestep_info['index_map']


names_orig = [i for i in example_event_info]
# zip(*values) walks the sequence types in lockstep, one data point at a time.
for dp_idx, items_sample in enumerate(zip(*example_event_info.values())):
    
    single_data_point = dict(zip(names_orig, items_sample))


    timestep = single_data_point['timestep']
    # Row in the template for each timestep id of this data point.
    ts_indices = [index_map[ts_id] for ts_id in timestep]


    # Integer indexing a numpy array returns a view, so the writes below land in the batch array.
    single_input_ids = event_info_final['input_ids'][dp_idx]
    for idx, ts_idx in zip(single_data_point['input_ids'], ts_indices):
        # print(ts_idx, idx)
        single_input_ids[ts_idx, :len(idx)] = idx


    single_input_wgts = event_info_final['input_wgts'][dp_idx]
    for wgt, ts_idx in zip(single_data_point['input_wgts'], ts_indices):
        # print(ts_idx, wgt)
        single_input_wgts[ts_idx, :len(wgt)] = wgt

    # Mark every template row that received an event.
    single_event_indicators = event_info_final['event_indicators'][dp_idx]
    single_event_indicators[ts_indices] = 1


e = datetime.now()
print('New method: ', e-s)

for k, v in event_info_final.items():
    print(k, v.shape)


# Old method:  0:00:00.014089
# New method:  0:00:00.011910

# Old method:  0:00:00.004705
# New method:  0:00:00.001887

# Old method:  0:00:00.004795
# New method:  0:00:00.001790

# Old method:  0:00:00.003432
# New method:  0:00:00.000898


# food-event, ---> tokenize it ---> [tkn1, tkn2, tkn3] 

### Test the accuracy of the event_indicators


In [None]:
examples_tfm['input_ids'][0]

In [None]:
event_indicators = event_info_final['event_indicators']
event_indicators[0]#.shape

In [None]:
event_indicators.sum(axis=1)

In [None]:
input_ids = event_info_final['input_ids']
input_wgts = event_info_final['input_wgts'] 
input_ids.shape

In [None]:
########################
idx = 10 # datapoint idx from batch.
########################

# Slice out the arrays of the selected data point.
one_input_ids = input_ids[idx]
one_input_wgts = input_wgts[idx]
one_event_indicators = event_indicators[idx] 
print(one_input_ids.shape)
print(one_event_indicators.shape)
print('total event number:', one_event_indicators.sum())

# Print the ids and weights of every step that is flagged as an event.
# The loop uses its own name (step_idx) so that `idx` keeps pointing at the
# selected data point after this cell has run.
for step_idx, (step_ids, event_indicator) in enumerate(zip(one_input_ids, one_event_indicators)):
    if event_indicator != 1: continue 
    print('At the step', step_idx)
    print('event_indicator', event_indicator)
    print('input_ids at one step', step_ids)
    step_weights = one_input_wgts[step_idx]
    print('input_wgts at one step', step_weights)
    print('-'*100)

In [None]:
matrix = event_info_final['event_indicators']
print(matrix.shape)
pprint(matrix.tolist(), sort_dicts=False, compact=True)

## Special Field: Time

In [None]:
# NOTE(review): `TimeCFs` is only assigned in a commented-out cell earlier in this
# notebook (it depends on Input_Part['TimeField'], which is commented out as well),
# so this cell raises NameError unless TimeCFs is defined first.
TimeCFs

In [None]:
timestep_info = get_timestep_info(examples, TimeCFs)
print([i for i in timestep_info])

In [None]:
FirstTimeCF = TimeCFs[0]
FirstTimeCF

In [None]:
# NOTE(review): `CF_to_CFArgs` is not defined in any earlier cell visible here --
# confirm where it is supposed to come from before running this cell.
CFArgs = CF_to_CFArgs[FirstTimeCF]
CFArgs

In [None]:
CFvocab = CF_to_CFvocab[TimeCFs[0]]
pprint(CFvocab, sort_dicts=False, compact=True)

In [None]:
timestep_orig_ids = np.array(timestep_info['timestep_orig_ids'])
timedelta_info = timestep_info['total_timedelta_info']
obs_dt = np.array(pd.to_datetime(examples['ObsDT']), dtype='datetime64[ns]')
# obs_dt

In [None]:
time_deltas = np.array([
    pd.Timedelta(orig_id * int(ts), unit=unit)# .total_seconds() * 1e9
    for orig_id, (ts, unit) in zip(timestep_orig_ids, timedelta_info)
], dtype='timedelta64[ns]')
# time_deltas

In [None]:
# Vectorized datetime calculation
datetimes = obs_dt[:, None] + time_deltas  # shape: (n_samples, n_timesteps)
# datetimes

In [None]:
import numpy as np

def get_timestepinfo_array(examples, timestep_info, CFvocab):
    """Encode the wall-clock hour and minute of every timestep as token ids.

    For each data point the datetime of a timestep is
    ``examples['ObsDT'] + timestep_orig_id * TimeStepSize`` (in the timestep's
    unit, taken from ``timestep_info['total_timedelta_info']``). The hour of
    day and the minute of the hour are then looked up in
    ``CFvocab['input_ids']['tkn2tid']`` under the tokens ``'f1:Hour<h>'`` and
    ``'f1:Min<m>'``; tokens missing from the vocabulary map to 0.

    Returns a dict with:
        input_ids:         int array of shape (n_samples, n_timesteps, 2),
                           last axis = [hour id, minute id]
        timestep_orig_ids: the timestep indices as a plain list
    """
    step_ids = np.array(timestep_info['timestep_orig_ids'])
    step_sizes = timestep_info['total_timedelta_info']
    anchors = np.array(pd.to_datetime(examples['ObsDT']), dtype='datetime64[ns]')

    # One offset per timestep, relative to the observation datetime.
    offsets = []
    for step_id, (size, unit) in zip(step_ids, step_sizes):
        offsets.append(pd.Timedelta(step_id * int(size), unit=unit))
    offsets = np.array(offsets, dtype='timedelta64[ns]')

    # Broadcast: (n_samples, 1) + (n_timesteps,) -> (n_samples, n_timesteps)
    stamps = anchors[:, None] + offsets

    # Truncate to day / hour / minute resolution; the differences are the
    # hour within the day and the minute within the hour.
    day_floor = stamps.astype('datetime64[D]')
    hour_floor = stamps.astype('datetime64[h]')
    minute_floor = stamps.astype('datetime64[m]')
    hour_of_day = (hour_floor - day_floor).astype(int)
    minute_of_hour = (minute_floor - hour_floor).astype(int)

    # Lookup tables indexed by hour (0-23) and by minute (0-59).
    tkn2tid = CFvocab['input_ids']['tkn2tid']
    hour_table = np.array([tkn2tid.get(f'f1:Hour{h}', 0) for h in range(24)])
    minute_table = np.array([tkn2tid.get(f'f1:Min{m}', 0) for m in range(60)])

    token_ids = np.stack(
        [hour_table[hour_of_day], minute_table[minute_of_hour]],
        axis=-1,
    )

    return {
        'input_ids': token_ids,
        'timestep_orig_ids': step_ids.tolist(),
    }

get_timestepinfo_array.fn_string = inspect.getsource(get_timestepinfo_array)

In [None]:
CFvocab = CF_to_CFvocab[TimeCFs[0]]

# Time the current (vectorized) get_timestepinfo_array; the label now says
# "_new" to match the variable the result is stored in.
s = datetime.now()
field_info_final_new = get_timestepinfo_array(examples, timestep_info, CFvocab)
e = datetime.now()
print(f'get_timestepinfo_array_new: {e-s}')
# pprint(field_info_final, sort_dicts=False, compact=True)

# get_timestepinfo_array_old: 0:00:00.005049

In [None]:
# field_info_final_old['input_ids']

In [None]:
# field_info_final_new['input_ids']

field_info_final_new['input_ids'].shape

x = field_info_final_new['input_ids'][0]
x.shape
# 313: -288, 24
x[0]

In [None]:
CFvocab['input_ids']['tid2tkn'][42]
# 10: 10

## tfm_fn_AIInputData

In [None]:
from datetime import datetime


def tfm_fn_AIInputData(examples, OneEntryArgs, CF_to_CFvocab):
    """Transform a raw batch of CF columns into the tensors used as model input.

    Args:
        examples: batch mapping column name -> list of per-data-point values.
            It is updated in place: event-CF columns of empty data points are
            replaced by ``[]`` (see ``update_emptiness_of_examples``).
        OneEntryArgs: entry arguments; only ``OneEntryArgs['Input_Part']`` is read.
        CF_to_CFvocab: mapping CF name -> vocabulary, used for the time field only.

    Returns:
        dict of tensors:
            ``'input_ids'``                      target-field ids of all target CFs,
                                                 concatenated along axis 1
            ``'<Event>--input_ids'``             (batch, n_timesteps, max_features), long
            ``'<Event>--input_wgts'``            same shape, float
            ``'<Event>--timestep_orig_ids'``     (batch, n_timesteps), long
            ``'<Event>--event_indicators'``      (batch, n_timesteps), 1 where a step has an event
            ``'<TimeField>--...'``               output of ``get_timestepinfo_array``,
                                                 only when ``TimeField`` is configured
        An event whose batch contains no tokens at all is skipped.
    """
    input_part = OneEntryArgs['Input_Part']
    input_cfs = get_INPUT_CFs(OneEntryArgs)
    tensors = {}

    # ------------------------------------------------------------ #
    # Target field: stack the id sequences of all target CFs side by side.
    # Assumes each array has shape (batch_size, seq_len).
    target_field = input_part['TargetField']
    target_blocks = [
        np.array(examples[f"{cf}--input_ids"])
        for cf in input_cfs
        if target_field in cf
    ]
    tensors['input_ids'] = torch.LongTensor(np.concatenate(target_blocks, axis=1))

    # ------------------------------------------------------------ #
    # Event CFs: every input CF that belongs neither to the target field nor
    # to the time field (already ordered).
    event_fields = input_part.get('EventFields', [])
    time_field = input_part.get('TimeField', None)
    if len(event_fields) == 0:
        event_cfs = []
    elif time_field is None:
        event_cfs = [cf for cf in input_cfs if target_field not in cf]
    else:
        event_cfs = [
            cf for cf in input_cfs
            if target_field not in cf and time_field not in cf
        ]

    # Replace the sequences of empty data points by [] before merging.
    for event_cf in event_cfs:
        for column, rows in update_emptiness_of_examples(examples, event_cf).items():
            examples[column] = rows

    # ------------------------------------------------------------ #
    # One set of dense arrays per event field.
    timestep_info = None
    for event in event_fields:
        cfs_of_event = [cf for cf in input_cfs if event in cf]

        # Merge the per-CF sequences of each data point into one list per sequence type.
        merged = {}
        max_features = 0
        batch_size = 0
        for seqtype in ('input_ids', 'input_wgts', 'timestep'):
            columns = [examples[f"{cf}--{seqtype}"] for cf in cfs_of_event]
            batch_size = len(columns[0])
            rows = [list(itertools.chain(*per_cf)) for per_cf in zip(*columns)]
            if seqtype == 'input_ids':
                # Widest single step in the batch decides the feature dimension.
                for row in rows:
                    for step in row:
                        max_features = max(max_features, len(step))
            merged[seqtype] = rows

        if max_features == 0:
            # skip this event.
            continue

        # The timestep template is computed once and reused afterwards.
        if timestep_info is None:
            timestep_info = get_timestep_info(examples, cfs_of_event)
            timestep_orig_ids = timestep_info['timestep_orig_ids']
            template_len = len(timestep_orig_ids)

        ids_batch = np.zeros((batch_size, template_len, max_features), dtype=np.int64)
        wgts_batch = np.zeros((batch_size, template_len, max_features), dtype=np.float32)
        indicators_batch = np.zeros((batch_size, template_len), dtype=np.int64)
        orig_ids_batch = np.tile(timestep_orig_ids, (batch_size, 1))

        index_map = timestep_info['index_map']
        per_sample = zip(merged['input_ids'], merged['input_wgts'], merged['timestep'])
        for dp_idx, (step_ids, step_wgts, step_times) in enumerate(per_sample):
            slots = [index_map[ts_id] for ts_id in step_times]

            # Row views: writes go straight into the batch arrays.
            ids_rows = ids_batch[dp_idx]
            for ids, slot in zip(step_ids, slots):
                ids_rows[slot, :len(ids)] = ids

            wgts_rows = wgts_batch[dp_idx]
            for wgts, slot in zip(step_wgts, slots):
                wgts_rows[slot, :len(wgts)] = wgts

            indicator_row = indicators_batch[dp_idx]
            indicator_row[slots] = 1

        # Weights become float tensors, everything else long tensors.
        tensors[event + '--' + 'input_ids'] = torch.LongTensor(ids_batch)
        tensors[event + '--' + 'input_wgts'] = torch.FloatTensor(wgts_batch)
        tensors[event + '--' + 'timestep_orig_ids'] = torch.LongTensor(orig_ids_batch)
        tensors[event + '--' + 'event_indicators'] = torch.LongTensor(indicators_batch)

    # ------------------------------------------------------------ #
    # Time field (optional): hour/minute token ids per timestep.
    if time_field is not None:
        time_cfs = [cf for cf in input_cfs if time_field in cf]
        time_vocab = CF_to_CFvocab[time_cfs[0]]
        if timestep_info is None:
            timestep_info = get_timestep_info(examples, time_cfs)

        time_arrays = get_timestepinfo_array(examples, timestep_info, time_vocab)
        for name, value in time_arrays.items():
            to_tensor = torch.FloatTensor if '_wgt' in name else torch.LongTensor
            tensors[time_field + '--' + name] = to_tensor(value)

    return tensors


tfm_fn_AIInputData.fn_string = inspect.getsource(tfm_fn_AIInputData)

In [None]:
# examples

In [None]:
# print('\n==============================================\n')
from datetime import datetime
s = datetime.now()

# preprocess_fn(examples)
examples_tfm = tfm_fn_AIInputData(examples, OneEntryArgs, CF_to_CFvocab)
e = datetime.now()
print(f'tfm_fn_AIInputData_new: {e-s}')


# old version
# get_INPUT_CFs: 0:00:00.000003
# TargetField: 0:00:00.001757
# EventFields: 0:00:00.000003
# update_emptiness_of_examples: 0:00:00.000056
# Multi EventCFs -- Diet: 0:00:00.470472
# TimeField: 0:00:00.205376
# tfm_fn_AIInputData: 0:00:00.678363


# Diet and Time
# tfm_fn_AIInputData_new: 0:00:00.047536 


# v0202
# TargetField: 0:00:00.001437
# example_event_info: 0:00:00.000135
# update_seqtype_base_on_timestep: 0:00:00.004341
# turn to tensor: 0:00:00.000072
# Multi EventCFs -- Diet: 0:00:00.004749
# TimeField: 0:00:00.005821
# tfm_fn_AIInputData_new: 0:00:00.012326
# 

In [None]:
examples_tfm

In [None]:
for k, v in examples_tfm.items():
    print(k, v.shape)

## entry_fn_AIInputData

In [None]:
def entry_fn_AIInputData(Data, 
                         CF_to_CFvocab, 
                         OneEntryArgs,
                         tfm_fn_AIInputData = None):
    """Attach the input transform to ``Data['ds_case']`` and publish it as ``Data['ds_tfm']``.

    Args:
        Data: dict holding the case dataset under ``'ds_case'``. A pandas
            ``DataFrame`` (or subclass) is first converted with
            ``datasets.Dataset.from_pandas``.
        CF_to_CFvocab: vocabulary mapping, passed through unchanged to
            ``tfm_fn_AIInputData``.
        OneEntryArgs: entry configuration. Besides being passed to the
            transform, its optional top-level keys ``'use_map'`` (default
            ``False``) and ``'num_proc'`` (default ``4``) choose between
            ``set_transform`` and a batched ``map``.
        tfm_fn_AIInputData: callable invoked as
            ``tfm_fn_AIInputData(examples, OneEntryArgs, CF_to_CFvocab)``.
            The ``None`` default is only a placeholder; a callable is needed
            by the time the transform actually runs.

    Returns:
        The same ``Data`` dict, with ``'ds_tfm'`` added.

    Note:
        Without ``use_map`` the transform is registered on the dataset object
        itself, so when ``Data['ds_case']`` already was a dataset,
        ``Data['ds_tfm']`` is that very object.
    """

    # Input features.
    # INPUT_CFs = get_INPUT_CFs(OneEntryArgs)
    # print(INPUT_CFs)
    def transform_fn(examples):
        return tfm_fn_AIInputData(examples, OneEntryArgs, CF_to_CFvocab)

    # ds_case
    ds_case = Data['ds_case']
    # isinstance (not an exact type comparison) so DataFrame subclasses are converted too.
    if isinstance(ds_case, pd.DataFrame):
        ds_case = datasets.Dataset.from_pandas(ds_case)

    use_map = OneEntryArgs.get('use_map', False)
    num_proc = OneEntryArgs.get('num_proc', 4)
    # Truthiness test: any falsy value (False, 0, None) selects set_transform.
    if use_map:
        ds_tfm = ds_case.map(transform_fn, batched = True, num_proc = num_proc)
    else:
        ds_case.set_transform(transform_fn)
        ds_tfm = ds_case

    Data['ds_tfm'] = ds_tfm
    return Data

# Store each function's source text on the function object.
# Presumably consumed by Base.convert_variables_to_pystirng in the save cell — confirm.
tfm_fn_AIInputData.fn_string = inspect.getsource(tfm_fn_AIInputData)
entry_fn_AIInputData.fn_string = inspect.getsource(entry_fn_AIInputData)

## Examine

In [None]:
# Run the input entry function; it stores the transformed dataset under Data['ds_tfm'].
Data = entry_fn_AIInputData(Data, 
                            CF_to_CFvocab, 
                            OneEntryArgs,
                            tfm_fn_AIInputData)

ds_tfm = Data['ds_tfm']
ds_tfm  # last expression of the cell, so the notebook displays it

In [None]:
# Slice the first four rows of the transformed dataset and print each field's shape.
batch = ds_tfm[:4]
# batch

for k, v in batch.items():
    print(k, v.shape)

##  Save Entry Fn

In [None]:
from recfldtkn.aidata_base.entry import AIDATA_ENTRYINPUT_PATH
from recfldtkn.base import Base

# Destination file: <CODE_FN>/<AIDATA_ENTRYINPUT_PATH>/<EntryInputMethod>.py
pypath = os.path.join(SPACE['CODE_FN'],  AIDATA_ENTRYINPUT_PATH, f'{EntryInputMethod}.py')
# print(pypath) 

# Import statements handed to the code generator as `prefix`
# (presumably emitted at the top of the generated module — confirm).
prefix = [
    'import copy',
    'import itertools',
    'import pandas as pd', 
    'import numpy as np', 
    'import datasets',
    'import torch',
    'from datetime import datetime',
    ]

# Functions exported into the generated module, in this order.
# Commented-out entries are helpers that are currently not exported.
fn_variables = [
    # vectorized_pad,
    detect_empty_values,
    get_timestep_info, 
    update_emptiness_of_examples,
    # update_seqtype_base_on_timestep, 
    # pad_with_numpy,
    get_INPUT_CFs,
    # extract_datetime_components_as_list,
    get_timestepinfo_array,
    tfm_fn_AIInputData,
    entry_fn_AIInputData,
]

pycode = Base.convert_variables_to_pystirng(fn_variables = fn_variables, prefix = prefix)

print(pypath)
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(os.path.dirname(pypath), exist_ok=True)
# Python source files are UTF-8 by default, so write UTF-8 regardless of the locale encoding.
with open(pypath, 'w', encoding='utf-8') as file:
    file.write(pycode)

# Step 3: EntryFn - Output_Part: NTP

## Args

In [None]:
# TaskType = 'MLUniLabel'
# Identifiers for this task configuration.
SeriesName  = 'Bf24.Af2H'
OneTaskName = 'cgm_lhm_bf24h_af2h_5min'
# Entry configuration for the NTP output method. It is passed unchanged to
# entry_fn_AITaskData and, through it, to tfm_fn_AIInputData.
OneEntryArgs = {
    # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': 'Mto1Period_MultiTknInStep',
        # NOTE(review): these CF names (ActivitySparse / TimeSparse / DietSparse) differ from the
        # Diet5MinBaseLMH CFs configured in Step 2 — confirm they exist in CF_to_CFvocab.
        'CF_list': [
            'cf.TargetCGM_Bf24H',
            'cf.TargetCGM_Af2H',

            'cf.ActivitySparse_Bf24H',
            'cf.ActivitySparse_Af2H',

            'cf.TimeSparse_Bf24H', 
            'cf.TimeSparse_Af2H',
            'cf.DietSparse_Bf24H',
            'cf.DietSparse_Af2H',
        ],
        'TargetField': 'TargetCGM',
        # When set, tfm_fn_AIInputData adds '<TimeField>--<key>' tensors to its output.
        'TimeField':   'Time',
        'EventFields': [
            'Activity',
            'Diet',
        ],
        'BeforePeriods': ['Bf24H'],
        'AfterPeriods': ['Af2H'],
        'InferenceMode': False, # 'WithFutureEvent' #  # 'NoFutureEvent', 'WithFutureEvent', 
    }, 

    # ----------------- Output Part -----------------
    'Output_Part': {
        # Also used as the file name of the generated output-entry module.
        'EntryOutputMethod': 'NTP',
    },

    # ----------------- Task Part -----------------
    'Task_Part': {
        'Tagging': [],
        'Filtering': [], 
    },
}

# Data = {'df_case': caseset.df_case, 'ds_case': caseset.ds_case}

EntryOutputMethod = OneEntryArgs['Output_Part']['EntryOutputMethod']
# NOTE(review): `case_base` and `TriggerCaseBaseName` are not defined in the part of the notebook
# shown here; Step 1 reads CF_to_CFvocab from the dataset's config instead — confirm this line still runs.
CF_to_CFvocab = case_base.TriggerCaseBaseName_to_CFtoCFvocab[TriggerCaseBaseName]
print([i for i in CF_to_CFvocab])

## Function

In [None]:
## %%%%%%%%%%%%%%%%%%%%%
# UniLabel
import inspect 
import numpy as np 
# from recfldtkn.loadtools import convert_variables_to_pystirng

def get_OUTPUT_CFs(OneEntryArgs):
    """Return the ``'CF_list'`` of ``OneEntryArgs['Output_Part']``.

    An empty list is returned when ``'Output_Part'`` is missing or when it
    declares no ``'CF_list'``.
    """
    output_part = OneEntryArgs.get('Output_Part', {})
    return output_part.get('CF_list', [])
get_OUTPUT_CFs.fn_string = inspect.getsource(get_OUTPUT_CFs)


def transform_fn_output(examples, tfm_fn_AIInputData, OneEntryArgs, CF_to_CFvocab):
    """Build next-token-prediction examples from a batch.

    Runs ``tfm_fn_AIInputData(examples, OneEntryArgs, CF_to_CFvocab)`` and adds
    a ``'labels'`` entry that is a separate copy of ``'input_ids'``.

    Returns:
        The dict returned by ``tfm_fn_AIInputData`` with ``'labels'`` added.
    """
    batch = tfm_fn_AIInputData(examples, OneEntryArgs, CF_to_CFvocab)
    # clone() gives the labels their own storage, independent of the inputs.
    input_ids = batch['input_ids']
    batch['labels'] = input_ids.clone()
    return batch

transform_fn_output.fn_string = inspect.getsource(transform_fn_output)


def entry_fn_AITaskData(Data, 
                        CF_to_CFvocab, 
                        OneEntryArgs,
                        tfm_fn_AIInputData = None,
                        entry_fn_AIInputData = None,
                        ):
    """Attach ``transform_fn_output`` to ``Data['ds_case']`` and publish it as ``Data['ds_tfm']``.

    Args:
        Data: dict holding the case dataset under ``'ds_case'``. A pandas
            ``DataFrame`` (or subclass) is first converted with
            ``datasets.Dataset.from_pandas``.
        CF_to_CFvocab: vocabulary mapping, passed through to ``transform_fn_output``.
        OneEntryArgs: entry configuration. Its optional top-level keys
            ``'use_map'`` (default ``False``) and ``'num_proc'`` (default ``4``)
            choose between ``set_transform`` and a batched ``map``.
        tfm_fn_AIInputData: input transform forwarded to ``transform_fn_output``.
        entry_fn_AIInputData: accepted but not used by this function.

    Returns:
        The same ``Data`` dict, with ``'ds_tfm'`` added.

    Note:
        Without ``use_map`` the transform is registered on the dataset object
        itself, so when ``Data['ds_case']`` already was a dataset,
        ``Data['ds_tfm']`` is that very object.
    """

    # InputCFs = OneEntryArgs['Input_FullArgs']['INPUT_CFs_Args']['InputCFs']
    def transform_fn(examples):
        return transform_fn_output(examples, tfm_fn_AIInputData, OneEntryArgs, CF_to_CFvocab)

    ds_case = Data['ds_case']
    # isinstance (not an exact type comparison) so DataFrame subclasses are converted too.
    if isinstance(ds_case, pd.DataFrame):
        ds_case = datasets.Dataset.from_pandas(ds_case)

    use_map = OneEntryArgs.get('use_map', False)
    num_proc = OneEntryArgs.get('num_proc', 4)
    # Truthiness test: any falsy value (False, 0, None) selects set_transform.
    if use_map:
        ds_tfm = ds_case.map(transform_fn, batched = True, num_proc = num_proc)
    else:
        ds_case.set_transform(transform_fn)
        ds_tfm = ds_case

    Data['ds_tfm'] = ds_tfm

    return Data

entry_fn_AITaskData.fn_string = inspect.getsource(entry_fn_AITaskData)

In [None]:
# Run the task entry function; it stores the transformed dataset under Data['ds_tfm'].
Data = entry_fn_AITaskData(Data, 
                           CF_to_CFvocab, 
                           OneEntryArgs,
                           tfm_fn_AIInputData,
                           entry_fn_AIInputData)

ds_tfm = Data['ds_tfm']
ds_tfm  # last expression of the cell, so the notebook displays it

In [None]:
# Slice the first four rows of the transformed dataset and display them.
batch = ds_tfm[:4]
batch

In [None]:
# Print the name and shape of every field in the batch.
for k, v in batch.items():
    print(k, v.shape)

In [None]:
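# NTP (next-token prediction) on the pretraining dataset.
#
# As in the GPT-2 setup, the labels are the same ids as the input tokens:
# GPT2: token ids: [1, 3324, 3453]
# GPT2:    labels: [1, 3324, 3453]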

In [None]:
from recfldtkn.base import Base
from recfldtkn.aidata_base.entry import AIDATA_ENTRYOUTPUT_PATH

# Import statements handed to the code generator as `prefix`
# (presumably emitted at the top of the generated module — confirm).
prefix = [
    'import torch',
    'import pandas as pd', 
    'import numpy as np', 
    'import datasets',
    ]
# Functions exported into the generated module, in this order.
fn_variables = [
    get_OUTPUT_CFs,
    transform_fn_output,
    entry_fn_AITaskData,
]
pycode = Base.convert_variables_to_pystirng(fn_variables = fn_variables, prefix = prefix)
# Destination file: <CODE_FN>/<AIDATA_ENTRYOUTPUT_PATH>/<EntryOutputMethod>.py
pypath = os.path.join(SPACE['CODE_FN'], AIDATA_ENTRYOUTPUT_PATH, f'{EntryOutputMethod}.py')
print(pypath)
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(os.path.dirname(pypath), exist_ok=True)
# Python source files are UTF-8 by default, so write UTF-8 regardless of the locale encoding.
with open(pypath, 'w', encoding='utf-8') as file:
    file.write(pycode)

# Step 4: EntryFn - Output_Part: MaskedLM

## Args

In [None]:
# TaskType = 'MLUniLabel'
# Identifiers for this task configuration.
SeriesName  = 'Bf24.Af2H'
OneTaskName = 'cgm_lhm_bf24h_af2h_5min'
# Entry configuration for the MaskedLM output method. It is passed unchanged to
# entry_fn_AITaskData and, through it, to transform_fn_output and tfm_fn_AIInputData.
OneEntryArgs = {
    # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': 'Mto1Period_MultiTknInStep',
        # Only the target and time CFs are active; the event CFs are commented out.
        'CF_list': [
            'cf.TargetCGM_Bf24H',
            'cf.TargetCGM_Af2H',

            # 'cf.ActivitySparse_Bf24H',
            # 'cf.ActivitySparse_Af2H',

            'cf.TimeSparse_Bf24H', 
            'cf.TimeSparse_Af2H',


            # 'cf.DietSparse_Bf24H',
            # 'cf.DietSparse_Af2H',
        ],
        # transform_fn_output takes the [MASK] id from the first CF whose name contains this value.
        'TargetField': 'TargetCGM',
        # When set, tfm_fn_AIInputData adds '<TimeField>--<key>' tensors to its output.
        'TimeField':   'Time',
        'EventFields': [
            # 'Activity',
            # 'Diet',
        ],
        'BeforePeriods': ['Bf24H'],
        'AfterPeriods': ['Af2H'],
        'InferenceMode': False, # 'WithFutureEvent' #  # 'NoFutureEvent', 'WithFutureEvent', 
    }, 

    # ----------------- Output Part -----------------
    'Output_Part': {
        # Also used as the file name of the generated output-entry module.
        'EntryOutputMethod': 'MaskedLM',
        # Per-position probability of being selected for masking in transform_fn_output.
        'MaskingRate': 0.15,
    },

    # ----------------- Task Part -----------------
    'Task_Part': {
        'Tagging': [],
        'Filtering': [], 
    },
}

# Data = {'df_case': caseset.df_case, 'ds_case': caseset.ds_case}

EntryOutputMethod = OneEntryArgs['Output_Part']['EntryOutputMethod']
# NOTE(review): `case_base` and `TriggerCaseBaseName` are not defined in the part of the notebook
# shown here; Step 1 reads CF_to_CFvocab from the dataset's config instead — confirm this line still runs.
CF_to_CFvocab = case_base.TriggerCaseBaseName_to_CFtoCFvocab[TriggerCaseBaseName]
print([i for i in CF_to_CFvocab])

## Function

In [None]:
## %%%%%%%%%%%%%%%%%%%%%
# UniLabel
import inspect 
import numpy as np 
# from recfldtkn.loadtools import convert_variables_to_pystirng

def get_OUTPUT_CFs(OneEntryArgs):
    """Return the ``'CF_list'`` of ``OneEntryArgs['Output_Part']``.

    An empty list is returned when ``'Output_Part'`` is missing or when it
    declares no ``'CF_list'``.
    """
    output_part = OneEntryArgs.get('Output_Part', {})
    return output_part.get('CF_list', [])
get_OUTPUT_CFs.fn_string = inspect.getsource(get_OUTPUT_CFs)


In [None]:
# List the CF names that have a vocabulary.
list(CF_to_CFvocab)

# CF_to_CFvocab['cf.TargetCGM_Af2H']

In [None]:
# Read the configured target field name and display it.
TargetField = OneEntryArgs['Input_Part']['TargetField']

TargetField

In [None]:
# Display the current entry configuration.
OneEntryArgs

In [None]:
# def transform_fn_output(examples, tfm_fn_AIInputData, OneEntryArgs, CF_to_CFvocab):
#     examples_tfm = tfm_fn_AIInputData(examples, OneEntryArgs, CF_to_CFvocab)
#     # examples_tfm['labels'] = torch.LongTensor([[i] for i in examples['Labeling']])
#     examples_tfm['labels'] = examples_tfm['input_ids'].clone() 

#     masking_rate = OneEntryArgs['Output_Part']['MaskingRate']

#     return examples_tfm

# transform_fn_output.fn_string = inspect.getsource(transform_fn_output)


def transform_fn_output(examples, tfm_fn_AIInputData, OneEntryArgs, CF_to_CFvocab):
    """Build masked-language-model examples from a batch.

    Runs ``tfm_fn_AIInputData(examples, OneEntryArgs, CF_to_CFvocab)``, selects
    each position of ``'input_ids'`` independently with probability
    ``OneEntryArgs['Output_Part']['MaskingRate']``, replaces every selected
    position with the ``'[MASK]'`` token id, and adds ``'labels'`` holding the
    original id at selected positions and ``-100`` everywhere else.

    The ``'[MASK]'`` id is read from ``tkn2tid`` of the first CF in
    ``CF_to_CFvocab`` whose name contains ``Input_Part['TargetField']``. When
    that vocabulary also defines ``'[PAD]'``, positions holding the pad id are
    never selected, so padding is neither masked nor used as a prediction target.

    Returns:
        The dict returned by ``tfm_fn_AIInputData`` with ``'input_ids'``
        replaced by the masked ids and ``'labels'`` added.

    Raises:
        KeyError: if ``'MaskingRate'``, ``'TargetField'`` or ``'[MASK]'`` is missing.
        IndexError: if no CF name contains the target field.
    """
    examples_tfm = tfm_fn_AIInputData(examples, OneEntryArgs, CF_to_CFvocab)

    masking_rate = OneEntryArgs['Output_Part']['MaskingRate']

    TargetField = OneEntryArgs['Input_Part']['TargetField']
    TargetField_CF = [i for i in CF_to_CFvocab if TargetField in i][0]
    tkn2tid = CF_to_CFvocab[TargetField_CF]['input_ids']['tkn2tid']
    mask_token_id = tkn2tid['[MASK]']
    # Optional: assumes the padding token, if any, is named '[PAD]' — TODO confirm.
    pad_token_id = tkn2tid.get('[PAD]')

    input_ids = examples_tfm['input_ids']
    device = input_ids.device

    # Boolean selection mask: True where the token is chosen for masking.
    mask = torch.rand(input_ids.shape, device=device) < masking_rate
    if pad_token_id is not None:
        mask &= (input_ids != pad_token_id)

    # Every selected position becomes [MASK]. The earlier second random draw compared
    # against `<= 1`, which is always true, so it never kept an original token.
    # NOTE: dropping that draw changes the RNG sequence of seeded runs.
    masked_input_ids = input_ids.clone()
    masked_input_ids[mask] = mask_token_id

    # Labels keep the original id only where a token was masked; -100 elsewhere.
    labels = input_ids.clone()
    labels[~mask] = -100

    examples_tfm['input_ids'] = masked_input_ids
    examples_tfm['labels'] = labels

    return examples_tfm

transform_fn_output.fn_string = inspect.getsource(transform_fn_output)

In [None]:
# Apply the MaskedLM transform once to the sample `examples` batch.
examples_tfm = transform_fn_output(examples, tfm_fn_AIInputData, OneEntryArgs, CF_to_CFvocab)

In [None]:
# Print the name and shape of every entry returned by the MaskedLM transform.
for k, v in examples_tfm.items():
    print(k, v.shape)

In [None]:
# Show the masked input ids of the first example. They are read from examples_tfm
# directly because `input_ids` is not bound until the following cell.
examples_tfm['input_ids'][0]

In [None]:
# Bind the masked inputs and their labels, then display the labels of the first example.
input_ids = examples_tfm['input_ids']
labels = examples_tfm['labels']
labels[0]

In [None]:


def entry_fn_AITaskData(Data, 
                        CF_to_CFvocab, 
                        OneEntryArgs,
                        tfm_fn_AIInputData = None,
                        entry_fn_AIInputData = None,
                        ):
    """Attach ``transform_fn_output`` to ``Data['ds_case']`` and publish it as ``Data['ds_tfm']``.

    Args:
        Data: dict holding the case dataset under ``'ds_case'``. A pandas
            ``DataFrame`` (or subclass) is first converted with
            ``datasets.Dataset.from_pandas``.
        CF_to_CFvocab: vocabulary mapping, passed through to ``transform_fn_output``.
        OneEntryArgs: entry configuration. Its optional top-level keys
            ``'use_map'`` (default ``False``) and ``'num_proc'`` (default ``4``)
            choose between ``set_transform`` and a batched ``map``.
        tfm_fn_AIInputData: input transform forwarded to ``transform_fn_output``.
        entry_fn_AIInputData: accepted but not used by this function.

    Returns:
        The same ``Data`` dict, with ``'ds_tfm'`` added.

    Note:
        Without ``use_map`` the transform is registered on the dataset object
        itself, so when ``Data['ds_case']`` already was a dataset,
        ``Data['ds_tfm']`` is that very object.
    """

    # InputCFs = OneEntryArgs['Input_FullArgs']['INPUT_CFs_Args']['InputCFs']
    def transform_fn(examples):
        return transform_fn_output(examples, tfm_fn_AIInputData, OneEntryArgs, CF_to_CFvocab)

    ds_case = Data['ds_case']
    # isinstance (not an exact type comparison) so DataFrame subclasses are converted too.
    if isinstance(ds_case, pd.DataFrame):
        ds_case = datasets.Dataset.from_pandas(ds_case)

    use_map = OneEntryArgs.get('use_map', False)
    num_proc = OneEntryArgs.get('num_proc', 4)
    # Truthiness test: any falsy value (False, 0, None) selects set_transform.
    if use_map:
        ds_tfm = ds_case.map(transform_fn, batched = True, num_proc = num_proc)
    else:
        ds_case.set_transform(transform_fn)
        ds_tfm = ds_case

    Data['ds_tfm'] = ds_tfm

    return Data

entry_fn_AITaskData.fn_string = inspect.getsource(entry_fn_AITaskData)

In [None]:
# Run the task entry function; it stores the transformed dataset under Data['ds_tfm'].
Data = entry_fn_AITaskData(Data, 
                           CF_to_CFvocab, 
                           OneEntryArgs,
                           tfm_fn_AIInputData,
                           entry_fn_AIInputData)

ds_tfm = Data['ds_tfm']
ds_tfm  # last expression of the cell, so the notebook displays it

In [None]:
# Slice the first four rows of the transformed dataset and print each field's shape.
batch = ds_tfm[:4]
for k, v in batch.items():
    print(k, v.shape)

In [None]:
from recfldtkn.base import Base
from recfldtkn.aidata_base.entry import AIDATA_ENTRYOUTPUT_PATH

# Import statements handed to the code generator as `prefix`
# (presumably emitted at the top of the generated module — confirm).
prefix = [
    'import torch',
    'import pandas as pd', 
    'import numpy as np', 
    'import datasets',
    ]
# Functions exported into the generated module, in this order.
fn_variables = [
    get_OUTPUT_CFs,
    transform_fn_output,
    entry_fn_AITaskData,
]
pycode = Base.convert_variables_to_pystirng(fn_variables = fn_variables, prefix = prefix)
# Destination file: <CODE_FN>/<AIDATA_ENTRYOUTPUT_PATH>/<EntryOutputMethod>.py
pypath = os.path.join(SPACE['CODE_FN'], AIDATA_ENTRYOUTPUT_PATH, f'{EntryOutputMethod}.py')
print(pypath)
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(os.path.dirname(pypath), exist_ok=True)
# Python source files are UTF-8 by default, so write UTF-8 regardless of the locale encoding.
with open(pypath, 'w', encoding='utf-8') as file:
    file.write(pycode)

# Step 5: EntryFn - Output_Part: EventPrediction

# Test

In [None]:

from torch.utils.data import DataLoader
import time
import numpy as np

# Throughput benchmark: iterate ds_tfm through a DataLoader (batch size 32) and
# report batches/s, samples/s, mean time per batch and total time.

# 1. Create DataLoader with your actual training parameters
loader = DataLoader(
    dataset=ds_tfm,  # Your dataset with set_transform
    batch_size=32,            # Use your real batch size
    num_workers=1,            # Match your training setup
    pin_memory=True,          # Same as training config
    shuffle=False             # Disable for consistent measurement
)

# 2. Warm-up run (initial batches are slower due to setup)
#    Note: this loop iterates the whole loader, i.e. one full pass over ds_tfm.
print("Warming up...")
for _ in loader: pass

# 3. Timed measurement
num_batches = len(loader)
print(f"Testing with {num_batches} batches...")

# Only iteration is timed; the batches themselves are discarded.
start_time = time.perf_counter()  # More precise timer
for _ in loader:
    pass
total_time = time.perf_counter() - start_time

# 4. Calculate metrics
throughput = num_batches / total_time           # batches per second
samples_per_sec = len(ds_tfm) / total_time      # dataset rows per second

print(f"\nResults:")
print(f"- Batches/s: {throughput:.1f}")
print(f"- Samples/s: {samples_per_sec:.1f}")
print(f"- Batch time: {1000*total_time/num_batches:.1f}ms")  # mean milliseconds per batch
print(f"- Total time: {total_time:.2f}s")


# Warming up...
# Testing with 1657 batches...

# Results:
# - Batches/s: 47.8
# - Samples/s: 1527.5
# - Batch time: 20.9ms
# - Total time: 34.69s

In [None]:
# one CPU

# Warming up...
# Testing with 1657 batches...

# Results:
# - Batches/s: 37.1
# - Samples/s: 1186.0
# - Batch time: 27.0ms
# - Total time: 44.68s

In [None]:

from torch.utils.data import DataLoader
import time
import numpy as np

# Throughput benchmark: same procedure as the previous benchmark cell, with batch size 64.

# 1. Create DataLoader with your actual training parameters
loader = DataLoader(
    dataset=ds_tfm,  # Your dataset with set_transform
    batch_size=64,            # Use your real batch size
    num_workers=1,            # Match your training setup
    pin_memory=True,          # Same as training config
    shuffle=False             # Disable for consistent measurement
)

# 2. Warm-up run (initial batches are slower due to setup)
#    Note: this loop iterates the whole loader, i.e. one full pass over ds_tfm.
print("Warming up...")
for _ in loader: pass

# 3. Timed measurement
num_batches = len(loader)
print(f"Testing with {num_batches} batches...")

# Only iteration is timed; the batches themselves are discarded.
start_time = time.perf_counter()  # More precise timer
for _ in loader:
    pass
total_time = time.perf_counter() - start_time

# 4. Calculate metrics
throughput = num_batches / total_time           # batches per second
samples_per_sec = len(ds_tfm) / total_time      # dataset rows per second

print(f"\nResults:")
print(f"- Batches/s: {throughput:.1f}")
print(f"- Samples/s: {samples_per_sec:.1f}")
print(f"- Batch time: {1000*total_time/num_batches:.1f}ms")  # mean milliseconds per batch
print(f"- Total time: {total_time:.2f}s")