# Space

In [None]:
import os
import logging
import pandas as pd 
from pprint import pprint 
pd.set_option('display.max_columns', None)

# Resolve the workspace root as the part of the current working directory up to
# and including the first occurrence of KEY, then make it the working directory.
KEY = 'WorkSpace'
_cwd = os.getcwd()
if KEY not in _cwd:
    # Without this guard, split(KEY)[0] + KEY would produce '<cwd>WorkSpace',
    # and os.chdir would fail with a confusing "no such directory" error.
    raise RuntimeError(
        f"Cannot locate the workspace root: {KEY!r} does not appear in the "
        f"current working directory {_cwd!r}. Start the notebook from inside the workspace."
    )
WORKSPACE_PATH = _cwd.split(KEY)[0] + KEY
# print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)
import sys
from proj_space import SPACE
# Guard against duplicate sys.path entries when this cell is re-run.
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

from datasets import disable_caching
disable_caching()

SPACE['MODEL_ENDPOINT'] = 'vTest'

# Part 1: AIData

In [None]:
from recfldtkn.aidata_base.aidata import AIData

# Load a previously built AIData by name, then re-derive its per-split data
# with the entry configuration below (task / input / output parts).
DATA_AIDATA = SPACE['DATA_AIDATA']
OneAIDataName = 'CgmLhm_Bf24Af2Af2t8H_5Min_3Cohort_EventFlt_ds0p50_foodnumGE1_SubTD_bf24af2'

OneEntryArgs = {
    # ----------------- Task Part -----------------
    'Task_Part': {

        'Tagging': {
            # 'TagName_to_TaggingMethod': {
            #     # TagName: TaggingMethod {Rules: [(x,x,x)], Op: and or}
            # },
            # 'ColumnsAddToDsCase': [],
            'TagFilter': True, # <--- the Filter tag is still needed because of the RandomDownSample rule below.
            'TagSplit': False, # <--- no Split tag needed; the loaded AIData already has one.
        },

        'Filtering': {
            # 'FilterTagging': None,
            # All rules are combined with 'Op' (here: every rule must hold).
            'FilterTagging': {
                "Rules": [
                    ['RandDownSample', '<=', 0.2],
                    ['co.Bf24H_Food_recnum:recnum', '>=', 1], 
                    # ['co.Af2H_Food_recnum:recnum', '>=', 1], 
                    ], 
                'Op': 'and',
            }
        }, 
        
        'Splitting': {
            # 'SplitTagging': { # <----- for the Tagging part.
            #     'RANDOM_SEED': 32,
            #     'out_ratio': 0.1,
            #     'test_ratio': 'tail0.1',
            #     'valid_ratio': 0.1
            # },
            'TrainEvals': {
                'TrainSetName_InTrain': 'In-Train', 
                'EvalSetNames_InTrain': [
                    'In-Test_T1D', 
                    'In-Test_T2D', 
                    'In-Valid_T1D', 
                    'In-Valid_T2D', 
                    'Out_T1D', 
                    'Out_T2D',
                ],

                'TrainSetName': 'In-Train', 
                'EvalSetNames': [
                    'In-Test', 
                    'In-Valid', 
                    'Out',
                ],
                # Each eval set is divided into T1D / T2D sub-groups by the
                # DiseaseTypeGroup column (presumably producing the *_T1D / *_T2D
                # names listed above -- TODO confirm against AIData).
                'DivideEvalConfig': {
                    'Columns': ['DiseaseTypeGroup'],
                    'SubGroupNames': {
                        'T2D': ['DiseaseType.2.0'],
                        'T1D': ['DiseaseType.1.0'],
                    },
                },
                # Left empty: the training set is not divided into sub-groups.
                'DivideTrainConfig': {
                    # 'Columns': ['DiseaseTypeGroup'],
                    # 'SubGroupNames': {
                    #     'T2D': ['DiseaseType.2.0'],
                    #     'T1D': ['DiseaseType.1.0'],
                    # },

                    # 'Columns': ['DiseaseTypeGroup', 'GenderGroup'],
                    # 'SubGroupNames': {
                    #     'T2D.Male': ['DiseaseType.2.0', 'Gender.1'],
                    #     'T1D.Male': ['DiseaseType.1.0', 'Gender.1'],
                    #     'T2D.Female': ['DiseaseType.2.0', 'Gender.2'],
                    #     'T1D.Female': ['DiseaseType.1.0', 'Gender.2'],
                    # },
                },
            },
        }
    },

    # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': 'Mto1Period_MultiTknInStep',
        'CF_list': [
            'cf.TargetCGM_Bf24H',
            'cf.TargetCGM_Af2H',
            # 'cf.TimeSparse_Bf24H', 
            # 'cf.TimeSparse_Af2H',
            'cf.DietSparse_Bf24H',
            'cf.DietSparse_Af2H',
        ],
        'TargetField': 'TargetCGM',
        # 'TimeField':   'Time',
        'EventFields': [
            'Diet',
        ],
        'BeforePeriods': ['Bf24H'],
        'AfterPeriods': ['Af2H'],
        'InferenceMode': False, # other values noted by the author: 'NoFutureEvent', 'WithFutureEvent'
    }, 

    # ----------------- Output Part -----------------
    'Output_Part': {
        'EntryOutputMethod': 'NTP',
    },
}

# Load the stored AIData and rebuild its Name_to_Data entries with OneEntryArgs.
aidata = AIData.load_aidata(DATA_AIDATA, OneAIDataName, SPACE)
aidata.update_NameToData_with_OneEntryArgs(OneEntryArgs)
dataset = aidata.Name_to_DS
dataset

In [None]:
# aidata.Name_to_DsAIData

# Peek at the case table of the first split registered in aidata.Name_to_Data.
Name_to_Data = aidata.Name_to_Data
split_name = list(Name_to_Data)[0]
Data = Name_to_Data[split_name]
df_case = Data['df_case']

df_case.head()

In [None]:
# Pull a small batch from the transformed dataset and list the shape of each field.
ds_tfm = Data['ds_tfm']

batch_size = 4
batch = ds_tfm[:batch_size]
for field_name, field_value in batch.items():
    print(field_name, field_value.shape)
batch

# Part 2: Model Init

## Step 1: init_model

In [None]:
from nn.cgmlhm.modeling_cgmlhm import GgmLhmLMHeadModel


# Checkpoint directory, relative to the current working directory (the setup
# cell changes it to WORKSPACE_PATH).
model_path = '../_Model/CgmLhm_Bf24Af2Af2t8H_5Min_3Cohort_EventFlt_ds0p50_foodnumGE1_SubTD_bf24af2_lhm-foodL3-ds0p5-frozen-lsm/checkpoint-3600'

# Restore the trained model from the checkpoint; the last line displays it.
model = GgmLhmLMHeadModel.from_pretrained(model_path)
model

# Part 3: Forward

In [None]:
from nn.cgmlhm.inference_cgmlhm import process_a_single_batch

# Number of already-observed tokens, shared by the NTP and GEN settings.
# NOTE(review): 289 presumably corresponds to the 24h history at 5-minute
# resolution (288 steps + 1) -- confirm against the tokenizer/entry builder.
NUM_OLD_TOKENS = 289

# batch_gen['input_ids'].shape
InferenceArgs = {
    # Next-token-prediction outputs to collect.
    'NTP_Args': {
        'num_old_tokens': NUM_OLD_TOKENS, 
        'items_list': [
            'losses_each_seq',
            'losses_each_token',
            'predicted_ntp_labels',
        ],
    }, 
    # Generation outputs to collect.
    # wfe: with future events, nfe: without future events
    'GEN_Args': {
        'num_old_tokens': NUM_OLD_TOKENS,
        'max_new_tokens': 24,
        'do_sample': False,
        'items_list': [
            'hist',
            'real',
            'pred_wfe',
            'logits_wfe',
            'pred_nfe',
            'logits_nfe',
        ],
    },
}

# Smoke-test the inference routine on the small batch built earlier.
batch_output = process_a_single_batch(model, batch, InferenceArgs)

df_batch = pd.DataFrame(batch_output)
df_batch


In [None]:
# Select the first split in aidata.Name_to_Data and show its transformed
# dataset together with the head of its case table.
Split_Name = list(aidata.Name_to_Data)[0]
Data = aidata.Name_to_Data[Split_Name]

ds_tfm, df_case = Data['ds_tfm'], Data['df_case']
print(ds_tfm)
display(df_case.head())

In [None]:
import datasets 

# Evaluation sets to merge into one inference dataset; uncomment to include more.
Name_list = [
    # 'In-Train',
    # 'In-Test_T2D',
    # 'In-Test_T1D',
    # 'In-Valid_T2D',
    # 'In-Valid_T1D',
    'Out_T2D',
]

# Gather the transformed dataset and case table of every selected set.
ds_tfm_list = []
df_case_list = []
for Name in Name_list:
    entry = aidata.Name_to_Data[Name]
    ds_tfm_list.append(entry['ds_tfm'])
    df_case_list.append(entry['df_case'])

# Stack the per-set pieces in the same order so rows stay aligned.
df_case = pd.concat(df_case_list, axis = 0)
ds_case = datasets.concatenate_datasets(ds_tfm_list)
print(ds_case)
print(df_case.shape)

# Re-run the entry builder on the merged data to obtain a fresh ds_tfm.
Data = {'ds_case': ds_case, 'df_case': df_case}
CF_to_CFvocab = aidata.CF_to_CFvocab
Data = aidata.entry_builder.setup_EntryFn_to_Data(Data, CF_to_CFvocab)
ds_tfm = Data['ds_tfm']
ds_tfm

In [None]:
import numpy as np
import random 


# Seed every RNG in play. The subsample below is drawn with NumPy, so seeding
# only Python's `random` module would leave it non-reproducible.
SEED = 42
random.seed(SEED)
rng = np.random.default_rng(SEED)

#################################
max_inference_num = 2000   # cap on the number of cases to run; None = use all
save_df = False 
load_df = False 
chunk_size = 12800         # cases per chunk in the inference loop
batch_size = 16            # cases per forward pass
#################################

# case_id_columns = aidata.case_id_columns

# Draw a reproducible subsample WITHOUT replacement so no case is evaluated
# twice. The same positions are applied to ds_tfm and df_case, which keeps the
# two row-aligned (assumes they are aligned on entry -- they are built from the
# same split data in the previous cell).
if max_inference_num is not None and max_inference_num < len(ds_tfm): 
    random_indices = rng.choice(len(ds_tfm), size=max_inference_num, replace=False)
    ds_tfm = ds_tfm.select(random_indices)
    df_case = df_case.iloc[random_indices]

print(ds_tfm)
print(df_case.shape)
display(df_case.head())

In [None]:
from tqdm import tqdm
import torch 
from datetime import datetime

###################
# Inputs from earlier cells:
#   df_case, ds_tfm        -- row-aligned case table and transformed dataset
#   model, InferenceArgs   -- model and inference settings
#   chunk_size, batch_size -- chunking / batching parameters
# Output:
#   df_chunk -- case table joined column-wise with the inference outputs,
#               covering ALL chunks (not only the first one).
###################

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.to(device)
print(model.device)

# Chunk start offsets. Stepping a range by chunk_size never yields an empty
# trailing chunk, even when len(df_case) is an exact multiple of chunk_size.
num_cases = len(df_case)
chunk_starts = list(range(0, num_cases, chunk_size))
chunk_numbers = len(chunk_starts)
print(chunk_numbers)

df_chunk_list = []
for start in chunk_starts:
    end = min(start + chunk_size, num_cases)
    print(start, end)

    df_case_chunk = df_case.iloc[start:end].reset_index(drop = True)
    ds_tfm_chunk = ds_tfm.select(range(start, end))
    print(ds_tfm_chunk)
    print(df_case_chunk.shape)

    # Collect per-batch frames and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame on every batch (quadratic).
    df_batch_list = []
    # for batch_s in tqdm(range(0, len(ds_tfm_chunk), batch_size)):
    for batch_s in range(0, len(ds_tfm_chunk), batch_size):
        batch_e = min(batch_s + batch_size, len(ds_tfm_chunk))

        # Fetch the batch and move every field onto the model's device.
        s = datetime.now()
        batch = ds_tfm_chunk[batch_s: batch_e]
        for k, v in batch.items():
            batch[k] = v.to(model.device)
        e = datetime.now()
        print(f'{batch_s} prepare batch: {e - s}')

        with torch.no_grad():
            model.eval()
            s = datetime.now()
            output = process_a_single_batch(model, batch, InferenceArgs)
            e = datetime.now()
            print(f'{batch_s} forward batch: {e - s}')

        df_batch_list.append(pd.DataFrame(output))

    df_eval_chunk = pd.concat(df_batch_list, axis = 0).reset_index(drop=True)

    # Both frames carry a fresh 0..n-1 index, so axis=1 joins them row by row.
    df_chunk_list.append(pd.concat([df_case_chunk, df_eval_chunk], axis = 1))

# Combine every chunk. The previous version stopped after the first chunk,
# silently dropping all cases beyond chunk_size from the evaluation.
# NOTE(review): all chunk outputs (including logits) are held in memory here;
# persist per chunk instead if the full run does not fit.
if df_chunk_list:
    df_chunk = pd.concat(df_chunk_list, axis = 0, ignore_index = True)
else:
    # No cases at all: keep the case-table columns with zero rows.
    df_chunk = df_case.iloc[0:0].reset_index(drop = True)

In [None]:
# Alias the inference result from the chunk loop as the evaluation table
# consumed by the SeqEvalForOneEvalSet cells below, and display it.
df_case_eval = df_chunk
df_case_eval

In [None]:
from nn.eval.seqeval import SeqEvalForOneEvalSet

# Evaluate the 'pred_wfe' predictions (wfe: with future events) in df_case_eval.
setname = 'test'
x_hist_seq_name, y_real_seq_name, y_pred_seq_name = 'hist', 'real', 'pred_wfe'

metric_list = ['rMSE']

# Horizon label -> [start, end] range over the predicted sequence.
# NOTE(review): '000-030min' -> [0, 6] implies 5-minute steps; under that
# reading [0, 18] covers 90 min, [0, 24] covers 120 min and [6, 18] covers
# 30-90 min, so the '000-120min', '000-180min' and '060-120min' labels look
# inconsistent with their ranges -- TODO confirm before quoting these numbers.
horizon_to_se = {
    '000-030min': [0, 6],
    '000-060min': [0, 12],
    '000-120min': [0, 18],
    '000-180min': [0, 24],
    '060-120min': [6, 18],
}

eval_instance = SeqEvalForOneEvalSet(
    setname=setname,
    df_case_eval=df_case_eval,
    x_hist_seq_name=x_hist_seq_name,
    y_real_seq_name=y_real_seq_name,
    y_pred_seq_name=y_pred_seq_name,
    metric_list=metric_list,
    horizon_to_se=horizon_to_se,
)

eval_results = eval_instance.get_evaluation_report()
eval_results


# Results recorded from earlier runs.
# Test1 Test2 Out2
# '000-030min_rMSE': 7.525231092436974,
# '000-060min_rMSE': 13.154212184873948,
# '000-120min_rMSE': 18.734353991596638,
# '000-180min_rMSE': 24.163114495798318,
# '060-120min_rMSE': 21.933193277310927,


# Test1 Test2 Out2
# '000-030min_rMSE': 6.932545854732208,
# '000-060min_rMSE': 12.072663242846662,
# '000-120min_rMSE': 17.17441672780631,
# '000-180min_rMSE': 22.22123257520176,
# '060-120min_rMSE': 20.11437270726339,


In [None]:
from nn.eval.seqeval import SeqEvalForOneEvalSet

# Evaluate the 'pred_nfe' predictions (nfe: without future events) in df_case_eval.
setname = 'test'
x_hist_seq_name, y_real_seq_name, y_pred_seq_name = 'hist', 'real', 'pred_nfe'

metric_list = ['rMSE']

# Horizon label -> [start, end] range over the predicted sequence.
# NOTE(review): '000-030min' -> [0, 6] implies 5-minute steps; under that
# reading [0, 18] covers 90 min, [0, 24] covers 120 min and [6, 18] covers
# 30-90 min, so the '000-120min', '000-180min' and '060-120min' labels look
# inconsistent with their ranges -- TODO confirm before quoting these numbers.
horizon_to_se = {
    '000-030min': [0, 6],
    '000-060min': [0, 12],
    '000-120min': [0, 18],
    '000-180min': [0, 24],
    '060-120min': [6, 18],
}

eval_instance = SeqEvalForOneEvalSet(
    setname=setname,
    df_case_eval=df_case_eval,
    x_hist_seq_name=x_hist_seq_name,
    y_real_seq_name=y_real_seq_name,
    y_pred_seq_name=y_pred_seq_name,
    metric_list=metric_list,
    horizon_to_se=horizon_to_se,
)

eval_results = eval_instance.get_evaluation_report()
eval_results

# Results recorded from earlier runs.
# '000-030min_rMSE': 7.43746,
#  '000-060min_rMSE': 13.02185,
#  '000-120min_rMSE': 17.66305,
#  '000-180min_rMSE': 21.87347,
#  '060-120min_rMSE': 20.6729,

# '000-030min_rMSE': 7.508,
#  '000-060min_rMSE': 13.16405,
#  '000-120min_rMSE': 17.78696,
#  '000-180min_rMSE': 21.99143,
#  '060-120min_rMSE': 20.82129,