# Space

In [None]:
import os
import logging
import pandas as pd 
from pprint import pprint 
from IPython.display import display, HTML
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# The workspace root is the part of the current working directory up to and
# including the first occurrence of KEY.
KEY = 'WorkSpace'
_cwd = os.getcwd()
if KEY not in _cwd:
    # Without this guard the path below would silently become `<cwd>WorkSpace`
    # and os.chdir would fail with an unrelated-looking "no such directory" error.
    raise FileNotFoundError(
        f'Cannot locate the workspace root: {KEY!r} does not appear in the current directory {_cwd!r}'
    )
WORKSPACE_PATH = _cwd.split(KEY)[0] + KEY
# print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)
import sys
from proj_space import SPACE
# Make the project's code directory (SPACE['CODE_FN']) importable.
sys.path.append(SPACE['CODE_FN'])
# Record the resolved workspace root in the shared SPACE mapping.
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
logger = logging.getLogger(__name__)
# Log at INFO level; each record shows level, time, file@line and logger name.
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

from datasets import disable_caching
# Turn off the `datasets` library cache for this session.
disable_caching()

# NOTE(review): 'vTest' looks like a test endpoint tag — confirm before reusing this notebook elsewhere.
SPACE['MODEL_ENDPOINT'] = 'vTest'

# Part 1: AIData

In [None]:
# Oneday: 288, 24pd. 1/12
# NOTE(review): presumably 288 five-minute readings per day — confirm.
from datasets import load_from_disk

AIDataName = 'CGM2EventFood_bf6h_WellDoc_v2_v0323' # v2 6 cohorts.
path = os.path.join(SPACE['DATA_AIDATA'], AIDataName)
print(path)
dataset = load_from_disk(path)
# A bare expression in the middle of a cell is discarded, so display explicitly.
display(dataset)

# The AIData configuration is read from the dataset's `config_name` info field.
config = dataset.info.config_name
print(list(config))
CF_to_CFvocab = config['CF_to_CFvocab']
print(list(CF_to_CFvocab))

CF_to_CFArgs = config['CaseSettingInfo']['Case_Args_Settings']['CF_to_CFArgs']
print(list(CF_to_CFArgs))

# Resolve the trigger name from the trigger-case-base arguments.
TriggerCaseBaseName = config['TriggerCaseBaseName']
TriggerCaseBaseArgs = config['TriggerCaseBaseName_to_TriggerCaseBaseArgs'][TriggerCaseBaseName]
TriggerName = TriggerCaseBaseArgs['Trigger']['TriggerName']
TriggerName
# print(TriggerCaseBaseArgs)


## Split

In [None]:
# ds = dataset
# Show the loaded dataset's repr.
dataset 

In [None]:
# df_tag.columns

In [None]:
from recfldtkn.base import assign_caseSplitTag_to_dsCase
from recfldtkn.base import apply_multiple_conditions
import numpy as np 


# Tag columns are the dataset columns whose name contains no '--';
# only those are pulled into pandas.
columns = dataset.column_names
columns_tag = [name for name in columns if '--' not in name]
df_tag = dataset.select_columns(columns_tag).to_pandas()



def map_age_to_agegroup(age):
    """Map an age in years to a coarse age-group label.

    Args:
        age: Age in years (int or float). May be missing (NaN / None / pd.NA).

    Returns:
        One of '0-17', '18-39', '40-64', '65+', or 'Unknown' when the age
        is missing.
    """
    # A missing age fails every comparison below and would otherwise be
    # silently labelled '65+'.
    if pd.isna(age):
        return 'Unknown'
    if age < 18:
        return '0-17'
    if age < 40:
        return '18-39'
    if age < 65:
        return '40-64'
    return '65+'
    
###### additional tagging columns 
df_tag['Year'] = df_tag['ObsDT'].dt.year
# Cohort is the first character of the PID rendered as a string.
df_tag['Cohort'] = df_tag['PID'].astype(str).str[0]
df_tag['Age'] = df_tag['Year'] - df_tag['YearOfBirth']  # .dt.year
df_tag['AgeGroup'] = df_tag['Age'].apply(map_age_to_agegroup)
##########################


# Mirror the derived columns onto the dataset. A column left over from a
# previous run of this cell is dropped first, because adding a column whose
# name already exists fails; this keeps the cell safe to re-run.
for tag_column in ['Age', 'Cohort', 'Year', 'AgeGroup']:
    if tag_column in dataset.column_names:
        dataset = dataset.remove_columns(tag_column)
    dataset = dataset.add_column(tag_column, df_tag[tag_column].values)

In [None]:
# Another candidate column name, kept for reference (the bare string has no effect).
'cf.Diet5MinBaseN2C_Bf24H--input_ids'
# dataset['cf.Diet5MinBaseN2C_Bf24H--input_ids']

# Column inspected in the following cells.
col = 'cf.Diet5MinBaseLMH_Bf24H--input_ids'

# Copy the raw column into df_tag (rows are in dataset order) and show it.
df_tag['Check'] = dataset[col]
df_tag['Check']

In [None]:
# Keep only the last element of each row's value in `col`.
# The raw column is reloaded first so that re-running this cell does not index
# into an already-reduced value (the original form changed its own input).
df_tag['Check'] = dataset[col]
df_tag['Check'] = df_tag['Check'].apply(lambda x: x[-1])
df_tag['Check']

In [None]:
# Length of each retained Check entry, with its distribution.
df_tag['CheckLength'] = df_tag['Check'].apply(len)
# A bare expression in the middle of a cell is discarded, so display explicitly.
display(df_tag['CheckLength'].value_counts().sort_index().reset_index())


# Flag (1/0) rows whose Check entry contains the id 0.
# NOTE(review): presumably id 0 means "no food event" — confirm against CF_to_CFvocab.
df_tag['CheckIsEmpty'] = df_tag['Check'].apply(lambda x: 0 in x).astype(int)

df_tag['CheckIsEmpty'].value_counts().sort_index().reset_index()

In [None]:
# df_tag[df_tag['CheckEmpty'] == 1]['Check']

In [None]:
# s = df_tag[df_tag['CheckLength'] == 1]['Check']
# s = s.apply(lambda x: '-'.join([str(i) for i in x]))
# s
# s.value_counts().sort_index()

# s['CheckEmpty']


In [None]:
# CF_to_CFvocab['cf.Diet5MinBaseLMH_Bf24H']

In [None]:
# df_tag['Age'].value_counts().sort_index().reset_index()

In [None]:
def _split_rules(obsdt_rules):
    """Return a fresh rule list: the filters shared by all splits plus the split's ObsDT window."""
    return [
        ['Age', '>=', 40],
        ['Cohort', 'in', ['1', '2', '3']],          # uses the derived Cohort column
        ['Year', 'in', [2020, 2021, 2022, 2023]],   # uses the derived Year column
        *[list(rule) for rule in obsdt_rules],
        ['GenderGroup', 'in', ['Gender.1', 'Gender.2']],
        ['CheckIsEmpty', '==', 0],
    ]


# The splits differ only in their observation-date window.
_SPLIT_TO_OBSDT_RULES = {
    'Train': [['ObsDT', '<', '2022-07-01'], ['ObsDT', '>=', '2021-01-01']],
    'Val':   [['ObsDT', '<', '2023-01-01'], ['ObsDT', '>=', '2022-07-01']],
    'Test':  [['ObsDT', '>=', '2023-01-01'], ['ObsDT', '<', '2024-01-01']],
}

# Every split combines its rules with a logical 'and'.
Split_to_Selection = {
    split: {'Rules': _split_rules(obsdt_rules), 'Op': 'and'}
    for split, obsdt_rules in _SPLIT_TO_OBSDT_RULES.items()
}

In [None]:
# Build one dataset per split by evaluating the split's rules on df_tag.
# df_tag was produced from `dataset`, so row positions are passed straight to
# dataset.select.
split_to_dataset = {}
for split_name, Selection in Split_to_Selection.items():
    selected_mask = apply_multiple_conditions(df_tag, Selection['Rules'], Selection['Op'])
    # Positions of the rows whose mask value equals 1.
    selected_rows = np.where(selected_mask == 1)[0]
    split_to_dataset[split_name] = dataset.select(selected_rows)


split_to_dataset

In [None]:
# Value counts of the 'co.Bf24H_Diet5MinInfo:MinToNow' column, ordered by value.
min_to_now = df_tag['co.Bf24H_Diet5MinInfo:MinToNow']
min_to_now.value_counts().sort_index().reset_index()

## Data Description

In [None]:
# Columns whose joint distribution is reported for every split.
# Split-local names are used throughout this cell so that the notebook-level
# `columns`, `columns_tag` and `df_tag` (still needed by the split-selection
# cell) are not overwritten.
desc_columns = [
    'DiseaseTypeGroup', 'GenderGroup', # 'AgeGroup'
]

for SplitName in Split_to_Selection.keys():
    print(f'\n========== {SplitName} ==========' )

    ds = split_to_dataset[SplitName]
    split_tag_columns = [i for i in ds.column_names if '--' not in i]
    df_split_tag = ds.select_columns(split_tag_columns).to_pandas()

    # Counts over all rows of the split.
    row_counts = df_split_tag[desc_columns].value_counts().sort_index().reset_index()
    display(row_counts)

    # Counts after de-duplicating on PID plus the described columns.
    pid_counts = df_split_tag[['PID'] + desc_columns].drop_duplicates()[desc_columns].value_counts().sort_index().reset_index()
    display(pid_counts)

In [None]:
# Wrap each split's dataset under the 'ds_case' key.
# The loop variable is deliberately not called `dataset`: that would rebind the
# notebook-level `dataset` to the last split and break re-running earlier cells.
Name_to_Data = {}
for split, ds_split in split_to_dataset.items():
    Name_to_Data[split] = {'ds_case': ds_split}

Name_to_Data

## Update EntryArgs

In [None]:
# Entry configuration passed to EntryAIData_Builder: 'Input_Part' selects the
# input method and case features, 'Output_Part' describes the two prediction
# heads (event time and food content).
OneEntryArgs = {
     # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': 'Mto1Period_MultiTknInStepNoWgt',
        # Only the before-period (Bf24H) features are active; the Af2H and
        # Diet variants below are disabled.
        'CF_list': [
            'cf.TargetCGM_Bf24H',
            # 'cf.TargetCGM_Af2H',

            'cf.TimeSparse_Bf24H', 
            # 'cf.TimeSparse_Af2H',

            # 'cf.Diet5MinBaseLMH_Bf24H',
            # 'cf.Diet5MinBaseLMH_Af2H',
        ],
        'TargetField': 'TargetCGM',
        'TimeField':   'Time',
        # 'EventFields': [
        #     # 'Activity',
        #     'Diet5MinBaseLMH',
        # ],
        'BeforePeriods': ['Bf24H'],
        # 'AfterPeriods': ['Af2H'],
        'InferenceMode': False, # 'WithFutureEvent' #  # 'NoFutureEvent', 'WithFutureEvent', 
    }, 

    # ----------------- Output Part -----------------
    'Output_Part': {
        'EntryOutputMethod': 'EventPred',
        
        # ------------ one head for time to now ------------
        'EventTimeToNow': 'co.Bf24H_Diet5MinInfo:MinToNow',
        'label_to_id_head1': {'0h': 0, '1h': 1, '2h': 2, 
                              '3h': 3, '4h': 4, '5h': 5},
        'dimensions_head1': ['food_event_time'],
        # ------------ one head for food content ------------
        'EventCF_Name': 'cf.Diet5MinBaseLMH_Bf24H',
        'label_to_id_head2': {'low': 0, 'medium': 1, 'high': 2},
        'dimensions_head2': ['carbs', 'fiber','fat', 'protein', 'sugar'],

        'set_transform': False,
        # NOTE(review): presumably the worker count for dataset mapping — confirm.
        'num_proc': 4, 
    },
}


from recfldtkn.aidata_base.entry import EntryAIData_Builder

# Entry builder bound to the dataset's trigger name, the entry configuration
# and the shared SPACE mapping.
entry = EntryAIData_Builder(
    TriggerName=TriggerName,
    OneEntryArgs=OneEntryArgs,
    SPACE=SPACE,
)

In [None]:
# Run the entry builder over every split in Name_to_Data.
# NOTE(review): presumably this is what adds the 'ds_tfm' entry read in the cells below — confirm.
# NOTE(review): the input dict is rebound to the result, so re-running this
# cell feeds already-processed data back in — confirm that is safe.
Name_to_Data = entry.setup_EntryFn_to_NameToData(Name_to_Data, CF_to_CFvocab, OneEntryArgs)
# Name_to_Data

## View A Batch

In [None]:
# Work with the training split for the batch inspection below.
Data = Name_to_Data['Train']
# Data

In [None]:
# Take the 'ds_tfm' entry of the training split and show it.
ds_tfm = Data['ds_tfm']
ds_tfm

In [None]:
# NOTE(review): repeats the display of the previous cell.
ds_tfm

In [None]:
# Label distribution of the 'food_event_time_labels' column in the training split.
pd.Series(ds_tfm['food_event_time_labels']).value_counts()

In [None]:
# Slice the first `batch_size` rows of ds_tfm as a sample batch.
batch_size = 26
batch = ds_tfm[:batch_size]
batch

In [None]:
import torch

# Replace every field of the batch with a LongTensor (in place) and report its shape.
for field in list(batch):
    tensor = torch.LongTensor(batch[field])
    batch[field] = tensor
    print(field, tensor.shape)

In [None]:
# Show the batch after the tensor conversion.
batch

In [None]:
# Labels of the 'carbs' dimension for the sample batch.
batch['carbs_labels'] 

# 0h, 1h, 2h, 3h, 4h, 5h
# NOTE(review): the '0h'..'5h' classes above are the label_to_id_head1 keys
# (dimension 'food_event_time'); 'carbs' is listed under dimensions_head2,
# whose label_to_id_head2 keys are low / medium / high.

In [None]:
# input_ids of the first row in the batch.
batch['input_ids'][0]

In [None]:
# NOTE(review): same output as the previous cell; append .shape to see the size instead.
batch['input_ids'][0] # .shape # 

In [None]:
# 'Time--input_ids' of the first row in the batch.
batch['Time--input_ids'][0]

In [None]:
# 'Time--timestep_orig_ids' of the first row in the batch.
batch['Time--timestep_orig_ids'][0]

In [None]:
# batch['Diet--event_indicators'][0]# .sum(axis = 1)

In [None]:
# Keep a handle on the batch's input_ids tensor and show its shape.
input_ids = batch['input_ids']
input_ids.shape

In [None]:
# Show the full input_ids tensor of the batch.
input_ids

In [None]:
# Third row of input_ids.
# NOTE(review): the length breakdown below includes a 2h after-period, but
# 'AfterPeriods' is commented out in OneEntryArgs — confirm the expected length.
input_ids[2, :] # 313 = 288 (24h) +  1 (obspoint) + 24 (2h)

In [None]:
# labels = batch['xxx']
# labels.shape

In [None]:
# NOTE(review): repeats the earlier display of the converted batch.
batch