# Space

In [None]:
import os
import logging
import pandas as pd 
from pprint import pprint 
from IPython.display import display, HTML
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Workspace root = the current path truncated right after the first occurrence of KEY.
KEY = 'WorkSpace'
WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY
# print(WORKSPACE_PATH)
# NOTE(review): if KEY does not occur in the current path, WORKSPACE_PATH is cwd + KEY,
# which most likely does not exist, and the chdir below will fail.
os.chdir(WORKSPACE_PATH)
import sys
# Imported after the chdir above — presumably proj_space lives at the workspace root (TODO confirm).
from proj_space import SPACE
# Make the code directory listed in SPACE importable.
sys.path.append(SPACE['CODE_FN'])
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

# Turn off the `datasets` cache for this session.
from datasets import disable_caching
disable_caching()

SPACE['MODEL_ENDPOINT'] = 'vTest'

# Part 1: AIData

In [None]:
# Oneday: 288, 24pd. 1/12
from datasets import load_from_disk

# 24 / 288
# Folder name of the AIData to load (CGM, 32h, 24 data per day).
AIDataName = 'CGM_32h_24pd_WellDoc_v2_v0323'

path = os.path.join(SPACE['DATA_AIDATA'], AIDataName)
print(path)
dataset = load_from_disk(path)

# The AIData settings dict is read from the `config_name` attribute of the dataset info.
config = vars(dataset.info)['config_name']
print(list(config))

CF_to_CFvocab = config['CF_to_CFvocab']
print(list(CF_to_CFvocab))

CF_to_CFArgs = config['CaseSettingInfo']['Case_Args_Settings']['CF_to_CFArgs']
print(list(CF_to_CFArgs))

# Resolve the trigger name through the arguments of the configured trigger-case base.
TriggerCaseBaseName = config['TriggerCaseBaseName']
TriggerCaseBaseArgs = config['TriggerCaseBaseName_to_TriggerCaseBaseArgs'][TriggerCaseBaseName]
TriggerName = TriggerCaseBaseArgs['Trigger']['TriggerName']
TriggerName
# print(TriggerCaseBaseArgs)

In [None]:
# df_tag.columns

from recfldtkn.base import assign_caseSplitTag_to_dsCase
from recfldtkn.base import apply_multiple_conditions
import numpy as np 


# Keep only the columns whose name contains no '--' and load them into a DataFrame.
columns = dataset.column_names
columns_tag = list(filter(lambda name: '--' not in name, columns))
df_tag = dataset.select_columns(columns_tag).to_pandas()

df_tag


In [None]:
def map_age_to_agegroup(age):
    """Map an age in years to its age-group label.

    Args:
        age: Age in years (int or float). May be missing (None, NaN or pd.NA).

    Returns:
        str: '0-17', '18-39', '40-64' or '65+'; 'Unknown' when the age is missing.
    """
    # A NaN age fails every comparison below, so without this guard a missing
    # age would silently fall through to the '65+' bucket.
    if pd.isna(age):
        return 'Unknown'
    if age < 18:
        return '0-17'
    if age < 40:
        return '18-39'
    if age < 65:
        return '40-64'
    return '65+'
    
###### additional tagging columns
df_tag['Year'] = df_tag['ObsDT'].dt.year
df_tag['Cohort'] = df_tag['PID'].astype(str).str[0]  # first character of the PID
df_tag['Age'] = df_tag['Year'] - df_tag['YearOfBirth']  # .dt.year
df_tag['AgeGroup'] = df_tag['Age'].apply(map_age_to_agegroup)
##########################


# Mirror the new tag columns onto the dataset. `add_column` rejects a column
# name that already exists, so a copy left behind by a previous run of this
# cell is dropped first; this keeps the cell re-runnable.
for tag_col in ['Age', 'Cohort', 'Year', 'AgeGroup']:
    if tag_col in dataset.column_names:
        dataset = dataset.remove_columns(tag_col)
    dataset = dataset.add_column(tag_col, df_tag[tag_col].values)

In [None]:
def _base_rules():
    """Return a fresh copy of the rules that every split starts with."""
    return [
        ['Age', '>=', 40],
        ['Cohort', 'in', ['1', '2', '3']],  # <--- add Cohort column
        ['Year', 'in', [2020, 2021, 2022, 2023]],  # <--- add Year column
    ]


def _gender_rule():
    """Return a fresh copy of the gender-group rule used by every split."""
    return ['GenderGroup', 'in', ['Gender.1', 'Gender.2']]


# The splits differ only in their ObsDT window:
#   Train: 2021-01-01 <= ObsDT < 2022-07-01
#   Val:   2022-07-01 <= ObsDT < 2023-01-01
#   Test:  2023-01-01 <= ObsDT < 2024-01-01
Split_to_Selection = {
    'Train': {
        'Rules': _base_rules() + [
            _gender_rule(),
            ['ObsDT', '<', '2022-07-01'],
            ['ObsDT', '>=', '2021-01-01'],
        ],
        'Op': 'and',
    },
    'Val': {
        'Rules': _base_rules() + [
            ['ObsDT', '<', '2023-01-01'],
            ['ObsDT', '>=', '2022-07-01'],
            _gender_rule(),
        ],
        'Op': 'and',
    },
    'Test': {
        'Rules': _base_rules() + [
            ['ObsDT', '>=', '2023-01-01'],
            ['ObsDT', '<', '2024-01-01'],
            _gender_rule(),
        ],
        'Op': 'and',
    },
}

In [None]:
# Build one sub-dataset per split from the rows of df_tag that satisfy the split's rules.
split_to_dataset = {}
for split_name, Selection in Split_to_Selection.items():
    Rules, Op = Selection['Rules'], Selection['Op']

    # Row-wise result of the rules over df_tag; positions equal to 1 are kept.
    index = apply_multiple_conditions(df_tag, Rules, Op)
    indices = np.where(index == 1)[0]
    split_to_dataset[split_name] = dataset.select(indices)

split_to_dataset

In [None]:
OneEntryArgs = dict(
    # ----------------- Input Part -----------------
    Input_Part=dict(
        EntryInputMethod='Mto1Period_MultiTknInStepNoWgt',
        CF_list=[
            'cf.TargetCGM_Bf24H',
            'cf.TargetCGM_Af2H',
            # 'cf.TargetCGM_Af2Hto8H',
        ],
        TargetField='TargetCGM',
        BeforePeriods=['Bf24H'],
        AfterPeriods=['Af2H'],
        InferenceMode=False,  # other values noted by the author: 'NoFutureEvent', 'WithFutureEvent'
    ),

    # ----------------- Output Part -----------------
    Output_Part=dict(
        EntryOutputMethod='CausalLM',
        set_transform=True,
        num_proc=4,
    ),

    # Alternative Output_Part settings kept for reference:
    #
    # 'Output_Part': {
    #     'EntryOutputMethod': 'MaskedLM',
    #     'MaskingRate': 0.15,
    #     'set_transform': True,
    #     'num_proc': 4,
    # },
    #
    # 'Output_Part': {
    #     'EntryOutputMethod': 'SupervisedFT',
    #     'AfStepNum': 24, # 12, # assert AfterPeriods Af2H,so 12 * 2 = 24
    #     'set_transform': True,
    #     'num_proc': 4,
    # },
)

from recfldtkn.aidata_base.entry import EntryAIData_Builder

# Entry builder for the trigger and entry settings defined above.
entry = EntryAIData_Builder(
    TriggerName=TriggerName,
    OneEntryArgs=OneEntryArgs,
    SPACE=SPACE,
)

In [None]:
# Wrap every split under the 'ds_case' key before handing it to the entry builder.
# A comprehension with its own variable names is used on purpose: looping with
# `for split, dataset in ...` would rebind the notebook-level `dataset` (the full
# dataset) to the last split, so re-running an earlier cell would then operate
# on the wrong data.
Name_to_Data = {
    split_name: {'ds_case': ds_split}
    for split_name, ds_split in split_to_dataset.items()
}
# Name_to_Data

Name_to_Data = entry.setup_EntryFn_to_NameToData(Name_to_Data, CF_to_CFvocab, OneEntryArgs)
# Name_to_Data

In [None]:
# aidata.Name_to_DsAIData
###############################
TrainSetName = 'Train'
EvalSetNames = [Name for Name in Name_to_Data if Name != TrainSetName]
max_train_samples = 1000  # set to None to use the full training split
max_eval_samples = 64     # set to None to use the full eval splits
###############################


# ------------ train datasets ------------
TrainData = Name_to_Data[TrainSetName]
ds_tfm_train = TrainData['ds_tfm']
if max_train_samples is not None:
    # Keep the configured cap untouched; clamp into a local so a re-run starts from the same cap.
    n_train = min(len(ds_tfm_train), max_train_samples)
    ds_tfm_train = ds_tfm_train.shuffle(seed=42).select(range(n_train))
logger.info(ds_tfm_train)


# ------------ eval datasets ------------
eval_dataset_dict = {}
for evalname in EvalSetNames:
    if evalname not in Name_to_Data:
        logger.info(f'{evalname} not in aidata.Name_to_Data')
        continue
    eval_dataset = Name_to_Data[evalname]['ds_tfm']
    if max_eval_samples is not None:
        # Clamp per split. Overwriting max_eval_samples here would let one small
        # split shrink the cap applied to every split processed after it.
        n_eval = min(len(eval_dataset), max_eval_samples)
        eval_dataset = eval_dataset.shuffle(seed=42).select(range(n_eval))
    eval_dataset_dict[evalname] = eval_dataset
logger.info('---- eval_datasets ----')
logger.info(eval_dataset_dict)


print(len(ds_tfm_train))
for k, v in eval_dataset_dict.items():
    print(k, len(v))

# Part 2: Model Init

## Step 1: init_model

In [None]:
from nn.cgmlhm.configuration_cgmlhm import CgmLhmConfig 

ModelArgs = dict(
    model_type='cgmlhm',
    OneEntryArgs=OneEntryArgs,
    CF_to_CFvocab=CF_to_CFvocab,
    sc_num_hidden_layers=0,
    tf_n_layer=0,
)

# NOTE: from here on `config` is the model configuration; it rebinds the name
# that held the AIData settings dict when the dataset was loaded.
config = CgmLhmConfig(**ModelArgs)
# print(config)
config.field_to_fieldinfo


In [None]:
from nn.cgmlhm.modeling_cgmlhm import GgmLhmLMHeadModel

# Build the model from the configuration created in the previous cell and display it.
# NOTE(review): the class is spelled `Ggm...` while the package and config use `Cgm...` — confirm this is the intended name.
model = GgmLhmLMHeadModel(config)
model

# Part 3: Forward

In [None]:
# import numpy as np 
# import torch 

# batch2dp = 8
# batch = ds_tfm.select(range(batch2dp))[:batch2dp]

In [None]:
# output = model(**batch)
# output.loss

In [None]:
# past_key_values_lsm, past_key_values_fusor = output.past_key_values# [0][0].shape
# print(past_key_values_lsm[0][0].shape)
# print(len(past_key_values_lsm), len(past_key_values_lsm[0]))

# # past_key_values_fusor could be None
# if past_key_values_fusor is not None:   
#     print(past_key_values_fusor[0][0].shape)
#     print(len(past_key_values_fusor), len(past_key_values_fusor[0]))

# Part 4: Eval

In [None]:

from nn.cgmlhm.inference_cgmlhm import inference_model_with_ds


InferenceArgs = dict(
    # Next-token-prediction settings, currently disabled:
    # 'NTP_Args': {
    #     'num_old_tokens': 289,
    #     'items_list': [
    #         'losses_each_seq',
    #         # 'losses_each_token',
    #         # 'predicted_ntp_labels'
    #     ]
    # },
    GEN_Args=dict(
        num_old_tokens=289,
        max_new_tokens=24,
        do_sample=False,
        # Disabled items: 'pred_wfe', 'logits_wfe', 'logits_nfe'
        items_list=[
            'hist',
            'real',
            'pred_nfe',
        ],
    ),
)

# Dataset used by the inference and evaluation cells below: the (possibly subsampled) 'Test' split.
ds_tfm = eval_dataset_dict['Test']
print(ds_tfm)


In [None]:
import torch 
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


In [None]:
# Run inference on ds_tfm with the settings in InferenceArgs.
results = inference_model_with_ds(model, ds_tfm, InferenceArgs)

In [None]:
# Per-case evaluation table returned by the inference call; peek at the first row.
df_case_eval = results['df_case_eval']
df_case_eval.head(1)

In [None]:
from nn.eval.seqeval import SeqPredEval

EvaluationArgs = dict(
    subgroup_config_list=['DiseaseTypeGroup'],
    x_hist_seq_name='hist',
    y_real_seq_name='real',
    y_pred_seq_name='pred_nfe',
    # 'losses_each_seq': 'losses_each_seq',
    metric_list=['rMSE'],
    # Horizon name -> [start, end] step range; one step is 5 minutes.
    # NOTE(review): 'd_3h' ends at step 36 while InferenceArgs generates
    # max_new_tokens=24 — confirm how SeqPredEval treats the missing steps.
    horizon_to_se={
        'a_0.5h': [0, 6],     # 6×5 = 30min
        'b_1h':   [0, 12],    # 12×5 = 60min
        'c_2h':   [0, 24],    # 24×5 = 120min
        'd_3h':   [0, 36],    # 36×5 = 180min
        'e_1t2h': [12, 24],   # from 60min (12*5) to 120min (24*5)
    },
)

# Unpack the evaluation settings into the individual names passed to SeqPredEval.
(
    subgroup_config_list,
    x_hist_seq_name,
    y_real_seq_name,
    y_pred_seq_name,
    metric_list,
    horizon_to_se,
) = (
    EvaluationArgs[arg_name]
    for arg_name in (
        'subgroup_config_list',
        'x_hist_seq_name',
        'y_real_seq_name',
        'y_pred_seq_name',
        'metric_list',
        'horizon_to_se',
    )
)
# losses_each_seq      = EvaluationArgs['losses_each_seq']

eval_instance = SeqPredEval(
    df_case_eval=df_case_eval,
    subgroup_config_list=subgroup_config_list,
    x_hist_seq_name=x_hist_seq_name,
    y_real_seq_name=y_real_seq_name,
    y_pred_seq_name=y_pred_seq_name,
    # losses_each_seq = losses_each_seq,
    metric_list=metric_list,
    horizon_to_se=horizon_to_se,
)

eval_instance

In [None]:
# End-to-end pass: run inference, build the evaluator, then flatten its report.
results = inference_model_with_ds(model, ds_tfm, InferenceArgs)
df_case_eval = results['df_case_eval']

(
    subgroup_config_list,
    x_hist_seq_name,
    y_real_seq_name,
    y_pred_seq_name,
    metric_list,
    horizon_to_se,
) = (
    EvaluationArgs[arg_name]
    for arg_name in (
        'subgroup_config_list',
        'x_hist_seq_name',
        'y_real_seq_name',
        'y_pred_seq_name',
        'metric_list',
        'horizon_to_se',
    )
)
# losses_each_seq      = EvaluationArgs['losses_each_seq']

eval_instance = SeqPredEval(
    df_case_eval=df_case_eval,
    subgroup_config_list=subgroup_config_list,
    x_hist_seq_name=x_hist_seq_name,
    y_real_seq_name=y_real_seq_name,
    y_pred_seq_name=y_pred_seq_name,
    # losses_each_seq = losses_each_seq,
    metric_list=metric_list,
    horizon_to_se=horizon_to_se,
)

df_report = eval_instance.df_report_neat

# One flat dict: every report column except 'setname' becomes '<column>_<setname>'.
d = {}
for _, report_row in df_report.iterrows():
    row_metrics = report_row.to_dict()
    setname = row_metrics.pop('setname')
    d.update({f'{k}_{setname}': v for k, v in row_metrics.items()})
d

In [None]:
# eval_instance.df_report_neat

# Part 5: Train

In [None]:
import os

# Restrict this process to GPU 0.
# NOTE(review): an earlier cell already calls torch.cuda.is_available() and model.to(device);
# CUDA_VISIBLE_DEVICES generally has to be set before CUDA is first used to take effect,
# so this likely belongs in the first cell — TODO confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
from transformers import Trainer, TrainingArguments, TrainerCallback


#################################
HuggingFaceTrainingArgs = dict(
    output_dir='_test',  # will be updated to model_instance.model_checkpoint_path
    overwrite_output_dir=False,

    # ------- training -------
    do_train=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,  # 64, # 4, # 64
    per_device_eval_batch_size=4,  # 64, # 4, # 64
    gradient_accumulation_steps=4,
    save_strategy='epoch',
    save_total_limit=5,

    logging_steps=1,

    # ------- evaluation / reporting -------
    do_eval=True,
    eval_steps=100,
    eval_strategy='steps',
    report_to='wandb',

    # ------- do not change these -------
    remove_unused_columns=False,  # <--- must be False.
    dataloader_drop_last=True,
    logging_first_step=True,
)
#################################

# Turn the plain settings dict into a TrainingArguments object and display it.
training_args = TrainingArguments(**HuggingFaceTrainingArgs)
training_args

In [None]:
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_CAUSAL_LM_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    is_torch_tpu_available,
    set_seed,
)

# Seed the random number generators with the seed held by training_args.
# NOTE(review): the model was already constructed in an earlier cell, so its
# initial weights are not covered by this seed — move the call up if that matters.
print(training_args.seed)
set_seed(training_args.seed)

In [None]:
from datetime import datetime
from datasets.fingerprint import Hasher 

# Experiment id = hour-resolution timestamp + hash of the model config.
timestamp = datetime.now().strftime("%Y%m%d-%H")
experiment_id = timestamp + "-" + Hasher().hash([config])

print(experiment_id)

In [None]:
class TimestampCallback(TrainerCallback):
    """Trainer callback that stamps every log record with the step and wall-clock time."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Add 'step' and 'timestamp' entries to `logs` in place.

        Args:
            args, state, control: Standard callback arguments; only
                `state.global_step` is read.
            logs: Dict of values being logged. It defaults to None, in which
                case there is nothing to annotate and the call is a no-op.
        """
        # `logs` is optional in the signature; subscripting None would raise TypeError.
        if logs is None:
            return
        # Add the current step and timestamp to the logs
        logs["step"] = state.global_step
        logs["timestamp"] = str(datetime.now())

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to predicted ids by taking the argmax over the last dimension.

    Args:
        logits: Either the logits object itself or a tuple whose first element
            is the logits (extra outputs such as past_key_values may follow).
        labels: Unused; present to match the hook signature.

    Returns:
        The result of `argmax(dim=-1)` on the logits.
    """
    # When a tuple is given, the logits always come first.
    primary = logits[0] if isinstance(logits, tuple) else logits
    return primary.argmax(dim=-1)

In [None]:


from transformers import TrainerCallback
import torch
from nn.cgmlhm.inference_cgmlhm import inference_model_with_ds
from nn.eval.seqeval import SeqPredEval  # assuming this is the evaluation class you're using

import wandb 


class GenerationEvalCallback(TrainerCallback):
    """Trainer callback that runs generation-based evaluation and logs it to W&B.

    On every evaluation event it runs inference on a fixed dataset, scores the
    result with SeqPredEval, flattens the report into '<metric>_<setname>' keys
    and logs them with wandb.
    """

    def __init__(self, ds_tfm, InferenceArgs, EvaluationArgs):
        """
        ds_tfm: processed dataset used for inference
        InferenceArgs: dictionary of inference-related arguments
        EvaluationArgs: dictionary of evaluation-related configs, including:
            - subgroup_config_list
            - x_hist_seq_name
            - y_real_seq_name
            - y_pred_seq_name
            - metric_list
            - horizon_to_se
        """
        self.ds_tfm = ds_tfm
        self.InferenceArgs = InferenceArgs
        self.EvaluationArgs = EvaluationArgs

    def on_evaluate(self, args, state, control, model=None, **kwargs):
        """Run inference + evaluation with `model` and log the flat metrics.

        Args:
            args, state, control: Standard callback arguments; only
                `state.global_step` is read.
            model: Model to evaluate. When it is None the method prints a
                warning and returns an empty dict without evaluating.
        """
        # Explicit None check instead of `model or kwargs.get("model")`:
        # truthiness of a module can go through __len__ (container modules),
        # so it is not a reliable presence test, and because `model` is a named
        # parameter it can never show up inside **kwargs anyway.
        if model is None:
            print("⚠️ Model not provided during evaluation.")
            return {}

        # Run inference using the dataset and model
        inference_results = inference_model_with_ds(model, self.ds_tfm, self.InferenceArgs)
        df_case_eval = inference_results['df_case_eval']

        # Initialize evaluator from the stored evaluation config
        eval_args = self.EvaluationArgs
        eval_instance = SeqPredEval(
            df_case_eval=df_case_eval,
            subgroup_config_list=eval_args['subgroup_config_list'],
            x_hist_seq_name=eval_args['x_hist_seq_name'],
            y_real_seq_name=eval_args['y_real_seq_name'],
            y_pred_seq_name=eval_args['y_pred_seq_name'],
            metric_list=eval_args['metric_list'],
            horizon_to_se=eval_args['horizon_to_se'],
        )

        # Create a flat dictionary of evaluation metrics: '<metric>_<setname>' -> value
        df_report = eval_instance.df_report_neat
        flat_metrics = {}
        for _, row in df_report.iterrows():
            row_dict = row.to_dict()
            setname = row_dict.pop('setname')
            for metric_name, value in row_dict.items():
                flat_metrics[f"{metric_name}_{setname}"] = value

        # Log to Weights & Biases. This is best-effort on purpose: a logging
        # failure is reported but must not interrupt training.
        try:
            # Ensure the step never goes backwards relative to what W&B has already seen
            current_wandb_step = wandb.run.step if wandb.run else 0
            log_step = max(current_wandb_step, state.global_step)
            wandb.log(flat_metrics, step=log_step)
        except Exception as e:
            print(f"⚠️ W&B logging failed: {e}")

        # return flat_metrics


In [None]:
# Display the (possibly subsampled) training set built earlier.
ds_tfm_train

In [None]:
# NOTE: only the last expression of a cell is displayed, so this bare name shows nothing.
eval_dataset_dict

# Dataset handed to the Trainer for in-training evaluation: the 'Test' split.
ds_tfm_eval = eval_dataset_dict['Test']
ds_tfm_eval


In [None]:
# Callbacks: timestamp every log record, and run generation-based evaluation on
# ds_tfm_eval at every evaluation event.
trainer_callbacks = [
    TimestampCallback,
    GenerationEvalCallback(ds_tfm_eval, InferenceArgs, EvaluationArgs),
]

# NOTE(review): ds_tfm_eval is the 'Test' split, so in-training evaluation is
# monitored on test data — confirm this is intended rather than 'Val'.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_tfm_train,  # if training_args.do_train else None,
    eval_dataset=ds_tfm_eval,  # <--- for in-training evaluation
    # Open question from the author: is it ok to ignore the tokenizer?
    # tokenizer = tokenizer, # Apr 2024: don't add tokenizer, hard to save.
    # Open question from the author: data_collator
    data_collator=default_data_collator,
    # compute_metrics = lambda x: compute_metrics_for_ntp(x, experiment_id, AfTknNum),
    # preprocess_logits_for_metrics = preprocess_logits_for_metrics,
    callbacks=trainer_callbacks,
)

logger.info(trainer)

In [None]:
# Number of training examples passed to the Trainer.
len(ds_tfm_train)

In [None]:
# Directory that prepare_last_checkpoint inspects for an existing checkpoint.
training_args.output_dir

In [None]:
from transformers.trainer_utils import get_last_checkpoint

def prepare_last_checkpoint(training_args):
    """Find the last checkpoint in `training_args.output_dir`, if one applies.

    The output directory is only inspected when it exists, `do_train` is set
    and `overwrite_output_dir` is not set; otherwise None is returned.

    Args:
        training_args: Object exposing `output_dir`, `do_train`,
            `overwrite_output_dir` and `resume_from_checkpoint`.

    Returns:
        The value returned by `get_last_checkpoint(output_dir)`, or None when
        the directory is not inspected.

    Raises:
        ValueError: If the output directory is inspected, is not empty, and
            contains no checkpoint.
    """
    output_dir = training_args.output_dir
    dont_overwrite_output_dir = not training_args.overwrite_output_dir

    # Nothing to look for unless we are training into an existing directory
    # that we are not allowed to overwrite.
    if not (os.path.isdir(output_dir) and training_args.do_train and dont_overwrite_output_dir):
        return None

    last_checkpoint = get_last_checkpoint(output_dir)

    if last_checkpoint is None and len(os.listdir(output_dir)) > 0:
        raise ValueError(
            f"Output directory ({output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
        # The trailing space keeps the two sentences apart; the adjacent string
        # literals are concatenated as-is.
        logger.info(
            f"Checkpoint detected, resuming training at {last_checkpoint}. "
            "To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
        )

    return last_checkpoint

In [None]:
# Checkpoint to resume from; None when the output directory is not inspected or holds no checkpoint.
checkpoint = prepare_last_checkpoint(training_args)
print(checkpoint)

In [None]:
# Sanity check: pull only the first batch from the training dataloader and report its input_ids shape.
batch = next(iter(trainer.get_train_dataloader()), None)
if batch is not None:
    print(f"Batch shape: {batch['input_ids'].shape}")

In [None]:
# Start training, resuming from `checkpoint` when one was found (None otherwise).
train_result = trainer.train(resume_from_checkpoint = checkpoint)