# Space

In [None]:
import os
import logging
import pandas as pd 
from pprint import pprint 
from IPython.display import display, HTML
pd.set_option('display.max_columns', None)
# Locate the workspace root: the current working directory up to and including
# the first occurrence of KEY.
KEY = 'WorkSpace'
_cwd_prefix, _key_found = os.getcwd().partition(KEY)[:2]
if not _key_found:
    # Without this guard the path below would be `<cwd>` + KEY, which most
    # likely does not exist and only fails later inside os.chdir.
    raise RuntimeError(
        f"Cannot locate the workspace root: {KEY!r} is not part of the "
        f"current working directory {os.getcwd()!r}."
    )
WORKSPACE_PATH = _cwd_prefix + KEY
# print(WORKSPACE_PATH)
# Everything below uses paths relative to the workspace root.
os.chdir(WORKSPACE_PATH)
# `sys` is needed only to extend the import search path below.
import sys
# NOTE(review): presumably `proj_space` lives at the workspace root, which would
# explain why it is imported only after the os.chdir above — confirm.
from proj_space import SPACE
# Make the location stored under SPACE['CODE_FN'] importable.
sys.path.append(SPACE['CODE_FN'])
# Record the resolved workspace root and the task label in the shared SPACE mapping.
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
SPACE['MODEL_Task'] = 'Test_ntp'
# Module-level logger; the root logger is configured at INFO with records laid
# out as [LEVEL:time:(file@line logger-name)]: message.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

# Switch off the `datasets` library cache for this session.
from datasets import disable_caching
disable_caching()

# Version label stored alongside the other SPACE settings.
SPACE['MODEL_VERSION'] = 'vTest'

# Part 1: AIData

## Step 1: Record and Case Base

In [None]:
from config.config_record.Cohort import CohortName_to_OneCohortArgs
from config.config_case.CKPD import Ckpd_to_CkpdObsConfig
from recfldtkn.record_base import Record_Base
# Cohorts known to the configuration (printed for reference only).
CohortNames = list(CohortName_to_OneCohortArgs.keys())
print(CohortNames)

###################################

# Record-feature selection handed to Record_Base. Commented entries are
# alternatives that are switched off for this run.
HumanRecordRecfeat_Args = {
    'P': {
        # 'BP': [],
        'CGM5Min': ['CGM5Min-N2Cin1'],
        # 'Carb': ['Carb-N2Cin20'],
        # 'Exercise': ['Exercise-Nume'],
        # 'Food': ['Food-NutriNume'],
        'P': ['P-DemoCate'],
        # 'Sleep': ['Sleep-Nume'],
        # 'Step': ['Step-Nume'],
        # 'Weight': ['Weight-Nume'],
        # 'PHeight': [],
    }
}

# Cohorts processed in this run; uncomment entries to include more.
CohortName_list = [
    # 'WellDoc2022CGM',
    # 'WellDoc2023CVSTDC',
    'WellDoc2023CVSDeRx',
]
Record_Proc_Config = {'save_data': True, 'load_data': True, 'via_method': 'ds'}
Inference_Entry = None  # this is not inference mode
###################################

record_base = Record_Base(
    CohortName_list,
    HumanRecordRecfeat_Args,
    CohortName_to_OneCohortArgs,
    SPACE = SPACE,
    Inference_Entry = Inference_Entry,
    Record_Proc_Config = Record_Proc_Config,
)

# Echo the object so the notebook shows its repr.
record_base

In [None]:
from config.config_case.GROUP import GROUP_TO_GROUPMethodArgs
from config.config_case.CF import CF_to_CFArgs
from config.config_case.CKPD import Ckpd_to_CkpdObsConfig
from config.config_case.TagRec import TagRec_to_TagRecArgs
from config.config_case.TagCF import TagCF_to_TagCFArgs 
from config.config_case.Flt import FltName_to_FltArgs
from config.config_case.CASE import TriggerCaseBaseName_to_TriggerCaseBaseArgs

from recfldtkn.case_base.case_base import OneCohortTrigger_CaseBase
from recfldtkn.case_base.case_base import CaseSetManager, Case_Base

# Bundle every case-level configuration registry under the same name it was
# imported with, so it can be handed to Case_Base as one argument.
Case_Args_Settings = dict(
    Ckpd_to_CkpdObsConfig = Ckpd_to_CkpdObsConfig,
    CF_to_CFArgs = CF_to_CFArgs,
    TagCF_to_TagCFArgs = TagCF_to_TagCFArgs,
    TagRec_to_TagRecArgs = TagRec_to_TagRecArgs,
    FltName_to_FltArgs = FltName_to_FltArgs,
    GROUP_TO_GROUPMethodArgs = GROUP_TO_GROUPMethodArgs,
)

In [None]:
# Processing options passed to Case_Base.
Case_Proc_Config = dict(
    max_trigger_case_num = None,
    use_task_cache = False,
    caseset_chunk_size = 200000,  # 200k for CGM, 50k for others.
    save_data = True,
    load_data = True,
    load_casecollection = True,
    via_method = 'ds',
    n_cpus = 8,
    batch_size = 1000,
)

In [None]:
TriggerCaseBaseName = 'Bf24HAf2H_CGM'

# Names shared by the two stages below; each stage gets its own list copy so
# that editing one stage's list never affects the other.
_OBS_TAGCF_NAMES = ('TagCF.Bf24hCGMinfo', 'TagCF.Af2hCGMinfo')
_TARGET_CF_NAMES = ('cf.TargetCGM_Bf24H', 'cf.TargetCGM_Af2H')

# --------- these three (TriggerName, TagRec, Group) are relatively stable ---------
_Trigger_Args = {
    'TriggerName': 'CGM5MinEntry',
    'TagRec': ['TagRec.PDemoFromP'],
    'Group': 'GrpGenderDisease',
    'Filter': 'FltBasicDemo',
    'ObsTask': {
        'TagCF_list': list(_OBS_TAGCF_NAMES),
        'CF_list': [],
    },
}

# Second stage: filter the triggered case set and attach the target CFs.
_FilterCaseSet_Args = {
    'Filter': 'FltMiniBfAfCGMRecInfo',
    'ObsTask': {
        'TagCF_list': list(_OBS_TAGCF_NAMES),
        'CF_list': list(_TARGET_CF_NAMES),
    },
}

TriggerCaseBaseArgs = {
    'Trigger': _Trigger_Args,
    'FilterCaseSet': _FilterCaseSet_Args,
}

In [None]:
TriggerCaseBaseName_to_CohortNameList = {
    TriggerCaseBaseName: CohortName_list,
}

TriggerCaseBaseName_to_CohortNameList

In [None]:
# Register this notebook's trigger definition in the shared registry and echo it.
TriggerCaseBaseName_to_TriggerCaseBaseArgs[TriggerCaseBaseName] = TriggerCaseBaseArgs
pprint(TriggerCaseBaseArgs, sort_dicts=False)

# Cohorts to use for each trigger case base.
TriggerCaseBaseName_to_CohortNameList = {TriggerCaseBaseName: CohortName_list}

# Timing notes from earlier runs: 2min with 1 cpu, 1m40s with 8 cpus.
case_base = Case_Base(
    Case_Args_Settings = Case_Args_Settings,
    Case_Proc_Config = Case_Proc_Config,
    TriggerCaseBaseName_to_TriggerCaseBaseArgs = TriggerCaseBaseName_to_TriggerCaseBaseArgs,
    TriggerCaseBaseName_to_CohortNameList = TriggerCaseBaseName_to_CohortNameList,
    record_base = record_base,
)

In [None]:
# Look the case sets up by the configured trigger name instead of repeating the
# literal, so renaming TriggerCaseBaseName cannot leave this lookup stale.
CaseSetNameToCaseset = case_base.TriggerCaseBaseName_to_CaseSetNameToCaseset[TriggerCaseBaseName]
CaseSetNameToCaseset

## Step 2: AIData Config

### Input Args

In [None]:
from config.config_aidata.ConfigInput import InputName_to_Settings

## %%%%%%%%%%%%%%%%%%%%%%%% user generation 
# ------------------------ Input Args ------------------------
# Name of the input-CF group and of the entry input method used by this run.
inputcf_name = 'before_after_26h_CGM'
EntryInputMethod = 'Mto1Period_1TknInStep'

# The group consists of the two target CGM CFs, in this order.
_input_cf_list = ['cf.TargetCGM_Bf24H', 'cf.TargetCGM_Af2H']
INPUT_CFs_Args = {inputcf_name: {'InputCFs': _input_cf_list}}

# Replace the registry's 'INPUT_CFs_Args' entry with the mapping defined above.
InputName_to_Settings['INPUT_CFs_Args'] = INPUT_CFs_Args

Input_Args = {
    'TriggerName': 'CGM5MinEntry',
    'INPUT_CFs_Args': inputcf_name,
    'EntryInputMethod': EntryInputMethod,
    # Filtering of the input data may be added here later.
}


### Task Args

In [None]:
from config.config_aidata.ConfigTasks import TasksName_to_Settings


# ------------------------ Task Args ------------------------
# Task type, series name, task name and output method all share one label here.
TaskType = TaskSeriesName = TaskName = EntryOutputMethod = 'NextOneTknPred'

Tasks_Args = {
    'TaskType': TaskType,
    'TaskSeriesName': TaskSeriesName,  # e.g. 'SurveyPred'
    'TaskName': TaskName,
    'EntryOutputMethod': EntryOutputMethod,
    # 'Filtering': [],
}

# Start the series from a fresh mapping that holds only this task.
TasksName_to_Settings[TaskSeriesName] = {TaskName: Tasks_Args}


### AIDev Args

In [None]:
from config.config_aidata.ConfigAIDev import AIDevName_to_Settings

# SplitTagging 
pprint(AIDevName_to_Settings, sort_dicts=False)

In [None]:
# Add this notebook's split definition to the 'SplitTagging' settings.
SplitTaggingName = 'Rs32out1Tail1Valid1'
SplitTagging_Args = AIDevName_to_Settings['SplitTagging']
SplitTagging_Args[SplitTaggingName] = dict(
    RANDOM_SEED = 32,
    out_ratio = 0.1,
    test_ratio = 'tail0.1',
    valid_ratio = 0.1,
)

In [None]:
# -----------------------
# Add this notebook's train/eval pairing to the 'TrainEvals' settings.
TrainEvalName = 'Train-EvaOutTestValidatio'
TrainEvals_Args = AIDevName_to_Settings['TrainEvals']
TrainEvals_Args[TrainEvalName] = dict(
    TrainSetName = 'In-Train',
    EvalSetNames = ['In-Test', 'In-Valid', 'Out'],
)

# Store both (possibly extended) sections back under their registry keys.
for _section, _section_args in (
    ('SplitTagging', SplitTagging_Args),
    ('TrainEvals', TrainEvals_Args),
):
    AIDevName_to_Settings[_section] = _section_args

In [None]:
# Selects, by name, the AIDev settings to use; the split and train/eval names
# are the ones registered in the cells above.
AIDev_Args = dict(
    NewName_to_OldNames = 'BaseAll',  # alternative: 'BaseC1'
    SplitTagging = SplitTaggingName,
    TrainEvals = TrainEvalName,
    Filtering = 'FltNone',  # alternative: 'FltBaseSMS'
)

### AIData_Job_Args

In [None]:
# -------------------------------------- #
# This one should be put in the training script.
# Job-level description: which trigger case base and cohorts to use, plus the
# AIDev, input and task-series arguments assembled in the cells above.
AIData_Job_Args = {
    'TriggerCaseBaseName': TriggerCaseBaseName,
    'CohortName_list': CohortName_list,

    'AIDev_Args': AIDev_Args,

    'Input_Args': Input_Args,

    ###################################
    # Reuse the task variables instead of repeating their literals, so the job
    # cannot drift away from what was registered in TasksName_to_Settings.
    'Tasks_Series_Args': {
        'TaskType': TaskType,
        'EntryOutputMethod': EntryOutputMethod,
        'TaskSeriesName_List': [
            TaskSeriesName,
        ],
    },
    ###################################
}


from config.config_aidata.ConfigTasks import TaskType_to_EntryOutputMethod
from recfldtkn.aidata_base.aidata import convert_AIDataSeriesArgs_to_TaskFullNameToAIDataArgs

# Convert the job-level arguments into a mapping of AIData arguments
# (presumably one entry per task full name, judging by the names — confirm),
# using the task settings registry.
TaskFullName_to_AIDataArgs = convert_AIDataSeriesArgs_to_TaskFullNameToAIDataArgs(AIData_Job_Args, 
                                                                                  TasksName_to_Settings)

# Echo the result in definition order.
pprint(TaskFullName_to_AIDataArgs, sort_dicts=False)
# AIDataArgs 

## Step 3: AIData

In [None]:
from config.config_aidata.ConfigInput import InputName_to_Settings
from config.config_aidata.ConfigTasks import TasksName_to_Settings
from config.config_aidata.ConfigAIDev import AIDevName_to_Settings
from recfldtkn.aidata_base.aidata_base import AIData_Base


# The three settings registries, bundled for AIData_Base.
AIDataSettings = dict(
    InputName_to_Settings = InputName_to_Settings,
    TasksName_to_Settings = TasksName_to_Settings,
    AIDevName_to_Settings = AIDevName_to_Settings,
)

############## inference mode ####################
# AIDataArgs_columns = ['TriggerCaseBaseName', 'Input_Args']
# CohortName_list = ['Inference']

############## training mode ####################
# NOTE(review): this rebinds the notebook-wide CohortName_list to None. Re-run
# the cell that defines the cohort list before re-running any earlier cell
# that reads it.
AIDataArgs_columns = CohortName_list = None

aidata_base = AIData_Base(
    SPACE = SPACE,
    AIDataSettings = AIDataSettings,
    CohortName_list = CohortName_list,
    AIDataArgs_columns = AIDataArgs_columns,
    TaskFullName_to_AIDataArgs = TaskFullName_to_AIDataArgs,
    case_base = case_base,
)

# Echo the finalized per-task arguments in definition order.
pprint(aidata_base.TaskFullName_to_AIDataArgsFinal, sort_dicts=False)

In [None]:
TaskFullName_list = [i for i in aidata_base.TaskFullName_to_AIDataArgsFinal]
pprint(TaskFullName_list)

In [None]:
taskfullname = TaskFullName_list[0]
taskfullname 

In [None]:
OneAIData_Args = aidata_base.TaskFullName_to_AIDataArgsFinal[taskfullname]
pprint(OneAIData_Args, sort_dicts=False)

In [None]:
aidata = aidata_base.get_aidata_from_taskfullname(taskfullname)
aidata

In [None]:
TaskFullName_list = [i for i in aidata_base.TaskFullName_to_AIDataArgsFinal]
pprint(TaskFullName_list)


In [None]:
Name_to_Data = aidata.Name_to_Data
for Name, Data in Name_to_Data.items():
    print(Name, ':', Data['ds_case'])


# hold-out
# hold-in

In [None]:
aidata

## Step 4: Prepare A Batch

In [None]:
# aidata.Name_to_DsAIData

split = 'In-Train'
dataset = aidata.Name_to_Data[split]
dataset['df_case']

In [None]:
ds_case = aidata.Name_to_Data['In-Train']['ds_case']
ds_case

In [None]:
batch_size = 4
batch = ds_case[:batch_size]
batch

In [None]:
input_ids = batch['input_ids']
input_ids.shape

In [None]:
input_ids

In [None]:
input_ids[2, :] # 313 = 288 (24h) +  1 (obspoint) + 24 (2h)

In [None]:
labels = batch['labels']
labels.shape

In [None]:
batch

# Part 2: Model Instance

## Step 1: `__init__`

In [None]:
aidata

In [None]:
# Model selection arguments; the None entries are left unset for this run.
ModelArgs = dict(
    model_name_or_path = "cgmgpt_medal",
    task = None,
    config_name = None,
    algorithm_name = None,
    model_type = 'cgmgpt_medal',
)

In [None]:
# Training arguments, grouped by concern and merged in the original key order.
_output_opts = {
    'output_dir': '_test',
    'overwrite_output_dir': False,
}
_train_opts = {
    'do_train': True,
    'num_train_epochs': 10,
    'per_device_train_batch_size': 4,  # alternative: 64
    'per_device_eval_batch_size': 4,  # alternative: 64
    'gradient_accumulation_steps': 4,
    'save_strategy': 'epoch',
    'save_total_limit': 10,
}
_logging_opts = {
    'logging_steps': 1,
}
_eval_opts = {
    'do_eval': True,
    'eval_steps': 100,
    'evaluation_strategy': 'steps',
}
_misc_opts = {
    'report_to': 'wandb',
    'remove_unused_columns': False,  # <--- must be False.
    'dataloader_drop_last': True,
}

TrainingArgs = {
    **_output_opts,
    **_train_opts,
    **_logging_opts,
    **_eval_opts,
    **_misc_opts,
}

In [None]:
# InferenceArgs = {
#     # save_df: means save the dataframe to the disk.
#     'save_df': False, 

#     # get_df: means get the dataframe from the disk.
#     'get_df': True,

#     # task: means the task name, which is used to save the dataframe.
#     'task': 'ntp',
# }

In [None]:
# Inference settings for the generation task: 24 new tokens, greedy decoding.
_gen_args = dict(max_new_tokens = 24, do_sample = False)
InferenceArgs = dict(
    save_df = False,
    get_df = True,
    task = 'gen',
    GenArgs = _gen_args,
)

In [None]:
# Cap on how many rows of each evaluation set are run through inference.
EvaluationArgs = dict(max_inference_num = 10)

In [None]:
SPACE

## Step 2: init_model

In [None]:
import transformers
import logging
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_CAUSAL_LM_MAPPING,
    AutoConfig,
    HfArgumentParser,
    TrainingArguments,
)

MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
# MODEL_CONFIG_CLASSES

MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
# MODEL_TYPES

In [None]:
############# this is the NN development that shows our novelty #############
from nn.cgmlsm.configuration_cgmgpt import CgmGptConfig
from nn.cgmlsm.instance_cgmgpt import CgmGptInstance
from nn.cgmlsm.modeling_cgmgpt import CgmGptLMHeadModel
#################################################################

In [None]:
ModelArgs

In [None]:
# ----- within the method of init_model.
CF_to_CFvocab = aidata.CF_to_CFvocab
CF = list(CF_to_CFvocab.keys())[0]
CFvocab = CF_to_CFvocab[CF]
tkn2tid = CFvocab['input_ids']['tkn2tid']

config_kwargs = {
    # "cache_dir": model_args.cache_dir,
    # "revision": model_args.model_revision,
    # "token": model_args.token,
    # "trust_remote_code": model_args.trust_remote_code,
    ###########
    'vocab_size': len(tkn2tid),
    'bos_token_id': tkn2tid['[BOS]'],
    'eos_token_id': tkn2tid['[EOS]'],
    'pad_token_id':  0,
    ###########
}

ModelArgs.update(config_kwargs)

pprint(ModelArgs)
config = CgmGptConfig(**ModelArgs)
pprint(config)

In [None]:
model = CgmGptLMHeadModel(config) 
model

In [None]:
total_params = sum(p.numel() for p in model.parameters())
total_params

In [None]:
model_instance = CgmGptInstance(aidata, ModelArgs, TrainingArgs, InferenceArgs, EvaluationArgs, SPACE= SPACE)
model_instance.init_model()

## Step 3: One Single Batch


1. Training ---> Loss
2. Inference ---> Evaluation NTP or Gen

In [None]:
import numpy as np 
import torch 

# Materialize the first `batch2dp` cases; `inputs` is an alias for the same batch.
batch2dp = 8
batch = inputs = ds_case.select(range(batch2dp))[:batch2dp]

# Only the token ids are needed to call the bare transformer.
input_batch = {'input_ids': torch.LongTensor(batch['input_ids'])}
for k in input_batch:
    v = input_batch[k]
    print(k, v.shape)

In [None]:
transformer_outputs = model.transformer(**input_batch)

transformer_outputs

In [None]:
hidden_states = transformer_outputs[0]
hidden_states.shape

In [None]:
lm_logits = model.lm_head(hidden_states)
lm_logits.shape

In [None]:
output = model(**batch)
output.loss

## Step 4: fit

In [None]:
model_instance.fit()

## Step 5: inference

### process_a_single_batch_for_ntp

In [None]:
# Move every entry of the batch onto the model's device, updating the dict in place.
batch.update({k: v.to(model.device) for k, v in batch.items()})

In [None]:
model.device

In [None]:
output = model(**batch)

In [None]:
# get predicted_labels
lm_logits = output.logits
predicted_labels = torch.argmax(lm_logits, dim=-1)

# get the loss each token
labels = batch['labels']
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()

logits_permuted = shift_logits.permute(0, 2, 1)
loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
losses = loss_fn(logits_permuted, shift_labels)

# organize the output 
output = {
    # 'loss': loss,
    'losses_each_seq': losses.mean(dim=1),
    'losses_each_seqbf24': losses[:, :288].mean(dim=1),
    'losses_each_seqaf2h': losses[:, 288:].mean(dim=1),
    'losses_each_token': losses,
    'predicted_labels': predicted_labels,
}

pprint(output)

In [None]:
def process_a_single_batch_for_ntp(model, batch, InferenceArgs = None, num_bf_tokens = 288):
    """Score one batch for next-token prediction.

    Args:
        model: callable invoked as ``model(**batch)``; its result must expose
            ``.logits`` with the vocabulary on the last axis. The caller is
            expected to have put the model in ``eval()`` mode.
        batch: mapping of keyword arguments for the model; must contain
            ``'labels'`` aligned with the logits' sequence axis.
        InferenceArgs: accepted for signature parity with the generation
            variant; not used here.
        num_bf_tokens: position at which the per-token losses are split into
            the "bf24" part (before) and the "af2h" part (from there on).
            Defaults to 288, the value that was previously hard-coded.

    Returns:
        dict with per-sequence mean losses (``losses_each_seq``,
        ``losses_each_seqbf24``, ``losses_each_seqaf2h``), the per-token
        losses (``losses_each_token``) and the argmax predictions
        (``predicted_labels``). All tensors are detached from autograd.
    """
    # Inference only: without no_grad the outputs would carry the autograd
    # graph, which costs memory and makes `.numpy()` fail on the results.
    with torch.no_grad():
        model_output = model(**batch)

        # Greedy prediction at every position.
        lm_logits = model_output.logits
        predicted_labels = torch.argmax(lm_logits, dim=-1)

        # Position t is scored against the label at t + 1.
        labels = batch['labels']
        shift_logits = lm_logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        # CrossEntropyLoss expects the class dimension at index 1.
        logits_permuted = shift_logits.permute(0, 2, 1)
        loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
        losses = loss_fn(logits_permuted, shift_labels)

    return {
        # 'loss': loss,
        'losses_each_seq': losses.mean(dim=1),
        'losses_each_seqbf24': losses[:, :num_bf_tokens].mean(dim=1),
        'losses_each_seqaf2h': losses[:, num_bf_tokens:].mean(dim=1),
        'losses_each_token': losses,
        'predicted_labels': predicted_labels,
    }

### process_a_single_batch_for_gen

In [None]:
# Inference settings for the generation walkthrough: 12 new tokens, greedy decoding.
_gen_args = dict(max_new_tokens = 12, do_sample = False)
InferenceArgs = dict(
    save_df = False,
    get_df = True,
    task = 'gen',
    GenArgs = _gen_args,
)

In [None]:
from transformers import GenerationConfig


GenArgs = InferenceArgs['GenArgs']
GenArgs['pad_token_id'] = model.config.pad_token_id
GenArgs

generation_config = GenerationConfig(**GenArgs)

In [None]:
max_new_tokens = GenArgs['max_new_tokens']
# Token count is the last dimension of the 2-D input_ids; len() would return
# the batch size (the first dimension) instead.
max_input_tokens = batch['input_ids'].shape[-1]

In [None]:
outputs = model.generate(generation_config = generation_config, **batch)

In [None]:
batch['input_ids'].shape

In [None]:
outputs.shape

In [None]:
def process_a_single_batch_for_gen(model, batch, InferenceArgs = None):
    """Generate continuations for one batch and return them with their inputs.

    Args:
        model: object exposing ``config.pad_token_id`` and
            ``generate(generation_config=..., **batch)``.
        batch: mapping passed to ``model.generate`` as keyword arguments; must
            contain ``'input_ids'`` (token axis last) and ``'labels'``.
        InferenceArgs: mapping with a ``'GenArgs'`` entry holding the
            GenerationConfig keyword arguments, including ``'max_new_tokens'``.
            Required in practice even though the default is None.

    Returns:
        dict with three entries: ``hist_<number of input tokens>`` (the input
        ids), ``real_<max_new_tokens>`` (the batch labels, unsliced) and
        ``pred_<max_new_tokens>`` (the last ``max_new_tokens`` generated ids as
        a NumPy array).

    Raises:
        TypeError: if ``InferenceArgs`` is None.
        KeyError: if ``'GenArgs'`` or ``'max_new_tokens'`` is missing.
    """
    if InferenceArgs is None:
        raise TypeError("InferenceArgs with a 'GenArgs' entry is required for generation")

    # Work on a copy so the caller's GenArgs is not modified by the pad-token entry.
    GenArgs = dict(InferenceArgs['GenArgs'])
    GenArgs['pad_token_id'] = model.config.pad_token_id

    generation_config = GenerationConfig(**GenArgs)

    max_new_tokens = GenArgs['max_new_tokens']
    # Token count is the last dimension of input_ids; len() would give the
    # batch size and mislabel the history column.
    max_input_tokens = batch['input_ids'].shape[-1]

    outputs = model.generate(generation_config = generation_config, **batch)

    # Keep only the newly generated tail of each output sequence.
    return {
        f"hist_{max_input_tokens}": batch['input_ids'],
        f"real_{max_new_tokens}": batch['labels'],
        f"pred_{max_new_tokens}": outputs.cpu().numpy()[:, -max_new_tokens:],
    }

In [None]:
output = process_a_single_batch_for_gen(model, batch, InferenceArgs)
output

### Inference

In [None]:
Data = aidata.Name_to_Data['In-Train']
dataset = Data['ds_case']
dataset

In [None]:
# SPACE

max_inference_num = 1000
# Clamp to the dataset size: selecting indices past the end raises an error.
dataset = dataset.select(range(min(max_inference_num, len(dataset))))
dataset

In [None]:
#model_instance.inference(dataset)

## Step 6: evaluation

### One df_case_eval

In [None]:
# Walk through what model_instance.evaluate does, using `self` as the alias so
# the statements match the method body.
self = model_instance

aidata = self.aidata
EvaluationArgs = self.EvaluationArgs
eval_instance = self.eval_instance
max_inference_num = EvaluationArgs['max_inference_num']

df_case_list = []
for SetName in aidata.TrainEvals['EvalSetNames']:
    if SetName == 'Out':
        # Skip only this set. NOTE(review): the original used `break`, which
        # is equivalent only while 'Out' is the last entry — confirm intent.
        continue
    logger.info(f'Evaluate on {SetName}...')
    # aidata.Name_to_Data[SetName] holds 'df_case' (meta) and 'ds_case' (CF);
    # the dataset that goes into the model comes from Name_to_DsAIData.
    DsAIData = aidata.Name_to_DsAIData[SetName]
    dataset = DsAIData['ds_case']
    # Clamp to the dataset size so small evaluation sets do not fail in select.
    dataset = dataset.select(range(min(max_inference_num, len(dataset))))
    df_case = model_instance.inference(dataset)
    df_case_list.append(df_case)

# One frame with the inference results of every evaluated set, stacked row-wise.
df_case_eval = pd.concat(df_case_list, axis = 0)

In [None]:
df_case_eval

In [None]:
max_new_tokens = 24
gen_id_col = f'pred_{max_new_tokens}'
real_id_col = f'real_{max_new_tokens}'
input_id_col = 'hist_313'

In [None]:
import numpy as np
df_case_eval['var'] = df_case_eval[input_id_col].apply(lambda x: np.var(x))

In [None]:
gen = df_case_eval[gen_id_col]
real = df_case_eval[real_id_col]

print(gen)
print(real)

In [None]:
horizon_to_se = {
    '30Min': (0, 6),
    '1stH ': (0, 12),
    '2ndH ': (12, 24), 
    '2H   ': (0, 24)
}

case_columns_id = ['PID', 'ObsDT']

In [None]:
example = df_case_eval.iloc[0]

In [None]:
self.eval_instance.plot_cgm_sensor(example, gen_id_col, real_id_col, input_id_col)


In [None]:
evals= self.eval_instance.get_complete_metrics(example, gen_id_col,real_id_col,horizon_to_se)

In [None]:
print(evals.keys())

In [None]:
for k,v in evals.items():
    print(k,":",v)

In [None]:
case_columns_id = ['PID', 'ObsDT']
# concat(axis=1) aligns on the row index. The identifier row keeps the label of
# the row it was taken from, while the metrics row is labelled 0, so reset the
# identifier frame's index to guarantee a single combined row.
_case_id_part = pd.DataFrame([example[case_columns_id]]).reset_index(drop=True)
_metrics_part = pd.DataFrame([evals])
report = pd.concat([_case_id_part, _metrics_part], axis=1)

In [None]:
report

### Looping over all df_case_evals

In [None]:
df_full_report= model_instance.evaluate()

In [None]:
df_full_report

In [None]:
columns = ['rMSE_30Min', 'rMSE_1stH ', 'rMSE_2ndH ', 'rMSE_2H   ',]

df_full_report[columns].mean()