# Space

In [None]:
import sys
import os 
import logging
import pandas as pd
from pprint import pprint 
from IPython.display import display, HTML

# Workspace root = the part of the current working directory that precedes the
# first occurrence of KEY. When KEY does not occur, str.split returns the whole
# path, so the current directory itself is used.
KEY = '1-WORKSPACE'
WORKSPACE_PATH = os.getcwd().split(KEY)[0]
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

# Relative paths; they resolve against WORKSPACE_PATH because of the chdir above.
SPACE = {
    'DATA_RAW': '_Data/0-Data_Raw',
    'DATA_RFT': '_Data/1-Data_RFT',
    'DATA_CASE': '_Data/2-Data_CASE',
    'DATA_AIDATA': '_Data/3-Data_AIDATA',
    'DATA_SPLIT': '_Data/4-Data_Split',
    'DATA_EXTERNAL': 'code/external',
    'CODE_FN': 'code/pipeline',
    'MODEL_ROOT': '_Model',
}
assert os.path.exists(SPACE['CODE_FN']), f'{SPACE["CODE_FN"]} not found'

print(SPACE['CODE_FN'])
# Guarded so that re-running this cell does not keep appending the same entry.
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])

# CF Data

In [None]:
import datasets 
from recfldtkn.case_base.casefnutils.casefn import Case_Fn #  import AIDATA_ENTRYINPUT_PATH

######################## base dataset name and the cohorts stored under it
CF_DataName = 'CGMwithDietBf8h-CaseBase-CGM5MinEntry-31ec84c0520b37c1'
CohortName_list = [
    'WellDoc2022CGM',
    'WellDoc2025ALS',
    'WellDoc2025CVS',
    'WellDoc2025LLY',
]
########################

######################## one '<base name>/<cohort>' entry per cohort
CF_DataName_list = [
    f'{CF_DataName}/{CohortName}' for CohortName in CohortName_list
]
########################

# Load every cohort dataset from disk. A dedicated loop variable is used so the
# base name held in CF_DataName is not overwritten while iterating.
ds_list = []
for cohort_data_name in CF_DataName_list:
    path = os.path.join(SPACE['DATA_AIDATA'], cohort_data_name)
    ds = datasets.load_from_disk(path)
    print(cohort_data_name, ds )
    # NOTE(review): `config` is indexed like a dict further down, so
    # `config_name` presumably carries the CF config in this project — confirm.
    config = ds.config_name
    column_names = ds.column_names
    ds_list.append(ds)

# From here on `config` and `column_names` hold the values of the LAST cohort loaded.
dataset = datasets.concatenate_datasets(ds_list)

# Case-feature names are the prefixes (text before the first '--') of every
# column whose name contains '--tid'. sorted() keeps the order reproducible
# across kernel restarts; iteration order of a set of strings is not.
CF_list = sorted({name.split('--')[0] for name in dataset.column_names if '--tid' in name})
CF_fn_list = [Case_Fn(CF, SPACE) for CF in CF_list]
CF_to_CFvocab = {CF: CF_fn.COVocab for CF, CF_fn in zip(CF_list, CF_fn_list)}

# CF_DataName is rebound here to the trigger case-base name stored in the config.
CF_DataName = config['TriggerCaseBaseName']
TriggerCaseBaseArgs = config['TriggerCaseBaseName_to_TriggerCaseBaseArgs'][CF_DataName]
TriggerName = TriggerCaseBaseArgs['Trigger']['TriggerName']

logger.info(f'set up TriggerName: {TriggerName}')
logger.info(f'set up CF_Config: {list(config)}')
config['CF_to_CFvocab'] = CF_to_CFvocab

print('total', dataset)

In [None]:
# TODO: a split step should be inserted here.
# For now the concatenated dataset is wrapped under the 'ds_case' key.
Data = dict(ds_case = dataset)

In [None]:
# Display the SPACE path mapping as this cell's output.
SPACE

# SPLIT: SplitFromTable

## Args

In [None]:
# Entry configuration with three parts: how to split, what goes in, what comes out.
OneEntryArgs = {
    # ----------------- Split Part -----------------
    'Split_Part': {
        'SplitMethod': 'SplitFromTable',  # <--- the split function that has to be designed
        'TablePath': f'{SPACE["DATA_SPLIT"]}/Split_All_WellDoc.parquet',
        'ObsDT_Minute': True,
        # The three splits use the same rule list; only the value required for
        # the `split` tag differs, so the entries are generated instead of
        # being written out three times.
        'Split_to_Selection': {
            split_name: {
                'Rules': [
                    ['split', '==', split_tag],
                    ['MEDInfoBf24h-DietRecNum', '>', 0],
                    ['MEDInfoBf24h-DietLastToNow', '>=', 120],
                    ['MEDInfoBf24h-DietLastToNow', '<=', 420],
                    ['ObsDT_Minute', '==', 0],
                ],
                'Op': 'and',
            }
            for split_name, split_tag in (
                ('train', 'train-early'),
                ('valid', 'val-early'),
                ('test', 'test-early'),
            )
        },
    },

    # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': '1TknInStep',
        'CF_list': [
            'CGMValueBf24h',
        ],
        'BeforePeriods': ['Bf24h'],
        'InferenceMode': False,
        'TargetField': 'CGMValue',
    },

    # ----------------- Output Part -----------------
    'Output_Part': {
        'EntryOutputMethod': 'UniLabelRules',
        'CF_list': ['MEDInfoBf24h'],
        'label_rule': {
            # NOTE(review): values are presumably minutes since the last diet
            # record (120-180 ~ 2-3 h, 180-420 ~ 3-7 h) — confirm units and
            # how the shared bound 180 is resolved.
            1: ('MEDInfoBf24h-DietLastToNow', 'in', [120, 180]),
            0: ('MEDInfoBf24h-DietLastToNow', 'in', [180, 420]),
            -100: 'others'
        },
        'assertion': [
            ('MEDInfoBf24h-DietLastToNow', 'in', [120, 420]),
        ],
        'set_transform': False,
        'num_proc': 4,
    },
}

# Keep the chosen split method handy and show it as the cell output.
SplitMethod = OneEntryArgs['Split_Part']['SplitMethod']
SplitMethod

In [None]:
from recfldtkn.aidata_base.entry import EntryAIData_Builder

# Build the entry helper from the argument dict and the workspace paths.
entry = EntryAIData_Builder(
    OneEntryArgs = OneEntryArgs,
    SPACE = SPACE,
)

# tfm_fn_AIInputData = entry.tfm_fn_AIInputData
# entry_fn_AIInputData = entry.entry_fn_AIInputData

## Function

In [None]:
# Tag columns are the columns whose name contains no '--';
# only those are pulled into a pandas frame.
tag_columns = [
    column_name
    for column_name in dataset.column_names
    if '--' not in column_name
]
tag_dataset = dataset.select_columns(tag_columns)
df_tag = tag_dataset.to_pandas()
# df_tag = dataset_split_tagging_fn(df_tag, OneEntryArgs)

In [None]:
# Step-by-step version of the split-table preparation, kept for inspection.
Split_Part = OneEntryArgs['Split_Part']
TablePath = Split_Part['TablePath']
columns = [
    'PID', 'ObsDay', 'has_event', 'has_diet',
    'has_med', 'has_exercise', 'age', 'age_group', 'time_bin',
    'days_to_split', 'date_idx', 'split',
]

# Read the split table; its ObsDT column is used as the day-level key ObsDay.
df_pre_split = pd.read_parquet(TablePath).rename(columns = {'ObsDT': 'ObsDay'})

# Same day-level key on the tag frame (added in place).
df_tag['ObsDay'] = pd.to_datetime(df_tag['ObsDT']).dt.date

# Reduce ObsDay to a calendar date and keep only the listed columns.
df_pre_split = (
    df_pre_split
    .assign(ObsDay = lambda table: pd.to_datetime(table['ObsDay']).dt.date)
    .loc[:, columns]
    .reset_index(drop = True)
)
print(df_pre_split.shape)

# Keep only the rows whose PID occurs in the tag frame.
pid_in_tag = df_pre_split['PID'].isin(df_tag['PID'])
df_pre_split = df_pre_split[pid_in_tag].reset_index(drop = True)
print(df_pre_split.shape)
print(df_tag.shape)

In [None]:
# Display the filtered split table as this cell's output.
df_pre_split

In [None]:
# Left-join the split table onto the tag frame by patient and observation day.
df_tag = df_tag.merge(
    df_pre_split,
    how = 'left',
    on = ['PID', 'ObsDay'],
)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import inspect 


########################################################
def dataset_split_tagging_fn(df_tag, OneEntryArgs):
    """Attach the pre-computed split table to the tag frame.

    The split table is read from ``OneEntryArgs['Split_Part']['TablePath']``
    and left-joined onto ``df_tag`` by ``PID`` and ``ObsDay`` (the calendar
    date of ``ObsDT``). The result has exactly one row per row of ``df_tag``,
    in the same order, so positional indices computed on it can be used to
    select rows of the dataset the frame was taken from.

    The function is deliberately self-contained and only relies on ``pd``:
    its source text is captured with ``inspect.getsource`` and written to a
    stand-alone module.

    Args:
        df_tag: DataFrame with at least the columns ``PID`` and ``ObsDT``.
            ``ObsDT`` may be datetime-typed or anything ``pd.to_datetime``
            can parse. The frame is not modified.
        OneEntryArgs: dict whose ``'Split_Part'`` entry provides ``'TablePath'``
            (parquet file of the split table) and, optionally,
            ``'ObsDT_Minute'`` (truthy -> add an ``ObsDT_Minute`` column).

    Returns:
        A new DataFrame: ``df_tag`` plus ``ObsDay``, the split-table columns
        and, if requested, ``ObsDT_Minute``. Rows without a match in the split
        table carry missing values in the split-table columns.

    Raises:
        KeyError: if a required key or column is missing.
    """
    Split_Part = OneEntryArgs['Split_Part']
    TablePath = Split_Part['TablePath']
    merge_keys = ['PID', 'ObsDay']

    # Work on a copy so the caller's frame does not silently gain a column.
    df_tag = df_tag.copy()
    df_tag['ObsDay'] = pd.to_datetime(df_tag['ObsDT']).dt.date

    df_pre_split = pd.read_parquet(TablePath).rename(columns = {'ObsDT': 'ObsDay'})
    columns = ['PID', 'ObsDay', 'has_event', 'has_diet',
                'has_med', 'has_exercise', 'age', 'age_group', 'time_bin',
                'days_to_split', 'date_idx', 'split']
    df_pre_split['ObsDay'] = pd.to_datetime(df_pre_split['ObsDay']).dt.date
    df_pre_split = df_pre_split[columns]
    df_pre_split = df_pre_split[df_pre_split['PID'].isin(df_tag['PID'])]
    # One row per (PID, ObsDay): duplicated keys would multiply rows in the
    # left merge and break the positional alignment with the source dataset.
    df_pre_split = df_pre_split.drop_duplicates(subset = merge_keys).reset_index(drop = True)

    print(df_pre_split.shape)
    print(df_tag.shape)
    df_tag = pd.merge(df_tag, df_pre_split, how = 'left', on = merge_keys)
    print(df_tag.shape)

    # The flag is optional; ObsDT is parsed again so that non-datetime columns
    # work here exactly as they do for ObsDay above.
    if Split_Part.get('ObsDT_Minute', False):
        df_tag['ObsDT_Minute'] = pd.to_datetime(df_tag['ObsDT']).dt.minute

    return df_tag

# Store the function's own source text on it as the attribute `fn_string`.
# NOTE(review): presumably read when the function is exported to a .py file
# via Base.convert_variables_to_pystirng — confirm.
dataset_split_tagging_fn.fn_string = inspect.getsource(dataset_split_tagging_fn)
########################################################

In [None]:
# Rebuild the tag frame from the dataset (columns without '--' in their name)
# and run it through the tagging function defined above.
tag_columns = [
    column_name
    for column_name in dataset.column_names
    if '--' not in column_name
]
df_tag = dataset_split_tagging_fn(
    dataset.select_columns(tag_columns).to_pandas(),
    OneEntryArgs,
)
df_tag

In [None]:
from recfldtkn.base import apply_multiple_conditions
import numpy as np 


Split_to_Selection = OneEntryArgs['Split_Part']['Split_to_Selection']

# For every split: evaluate its rules on the tag frame, turn the hits into
# row positions and select those rows from the dataset.
selected_by_split = {}
for split_name, Selection in Split_to_Selection.items():
    hit_mask = apply_multiple_conditions(df_tag, Selection['Rules'], Selection['Op'])
    hit_positions = np.where(hit_mask == 1)[0]
    selected_by_split[split_name] = dataset.select(hit_positions)

split_to_dataset = datasets.DatasetDict(selected_by_split)
split_to_dataset

In [None]:
from recfldtkn.base import Base
from recfldtkn.aidata_base.entry import AIDATA_SPLIT_PATH

# Import lines handed to the code generator as `prefix`.
prefix = [
    'import torch',
    'import pandas as pd', 
    'import numpy as np', 
    'import datasets',
    'from sklearn.model_selection import train_test_split'
    ]
# Functions whose source is exported.
fn_variables = [
    dataset_split_tagging_fn,
]
pycode = Base.convert_variables_to_pystirng(fn_variables = fn_variables, prefix = prefix)

# Target file: <CODE_FN>/<AIDATA_SPLIT_PATH>/<SplitMethod>.py
pypath = os.path.join(SPACE['CODE_FN'], AIDATA_SPLIT_PATH, f'{SplitMethod}.py')
print(pypath)

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(os.path.dirname(pypath), exist_ok = True)
# Explicit encoding so the generated module does not depend on the platform default.
with open(pypath, 'w', encoding = 'utf-8') as file:
    file.write(pycode)

# Test

In [None]:
# Pretty-print the full entry configuration before running the test below.
pprint(OneEntryArgs)

In [None]:
from recfldtkn.aidata_base.entry import EntryAIData_Builder

# Build the entry helper from the argument dict and the workspace paths.
entry = EntryAIData_Builder(
    OneEntryArgs = OneEntryArgs,
    SPACE = SPACE,
)

In [None]:
# Display the dataset that is passed to the split step.
dataset

In [None]:
# Split the dataset through the builder and display the result.
# Note: this rebinds `split_to_dataset`, which an earlier cell built by hand.
# NOTE(review): presumably this runs the exported split function — confirm.
split_to_dataset = entry.split_cf_dataset(dataset)
split_to_dataset

In [None]:
# Build the name -> data mapping from the splits, the CF vocabularies and the
# entry arguments; the next cell indexes it with 'train'.
Name_to_Data = entry.setup_EntryFn_to_NameToData(split_to_dataset, CF_to_CFvocab, OneEntryArgs)

In [None]:
# Take the 'train' entry and display its 'ds_tfm' item.
# Note: this rebinds `Data` and `ds`, both of which earlier cells used for
# other objects.
Data = Name_to_Data['train']
ds = Data['ds_tfm']
ds

In [None]:
# Frequency of each distinct value found in ds['labels'].
pd.DataFrame(ds['labels']).value_counts()