In [None]:
import sys
import os
import logging
import pandas as pd
import datasets
from pprint import pprint
# Workspace root = the part of the current working directory that precedes the
# notebook-folder marker; the notebook then runs from that root so the relative
# paths in SPACE resolve.
KEY = '2-NOTEBOOK'
WORKSPACE_PATH = os.getcwd().split(KEY)[0]
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s',
)

# Workspace-relative locations passed to the pipeline code.
SPACE = dict(
    DATA_RAW='_Data/0-Data_Raw',
    DATA_RFT='_Data/1-Data_RFT',
    DATA_CASE='_Data/2-Data_CASE',
    DATA_AIDATA='_Data/3-Data_AIDATA',
    DATA_EXTERNAL='code/external',
    CODE_FN='code/pipeline',
    MODEL_ROOT='./_Model',
)

# Fail early if the pipeline package directory is missing, then make it importable.
assert os.path.exists(SPACE['CODE_FN']), f'{SPACE["CODE_FN"]} not found'
print(SPACE['CODE_FN'])
sys.path.append(SPACE['CODE_FN'])

# Set before any CUDA-aware library is imported further down in the notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# AI Data

In [None]:
from recfldtkn.aidata_base.entry import EntryAIData_Builder

# Names identifying the AI dataset and the per-cohort case-feature datasets.
OneAIDataName = 'DietEventBench'
CF_DataName = 'DietEvent-CGM5MinEntry-1ea9d787eef20fb7'
CohortName_list = [
    'WellDoc2022CGM',
    'WellDoc2025ALS',
    'WellDoc2025CVS',
    'WellDoc2025LLY',
]
# One "<cohort>/<CF_DataName>" entry per cohort, in cohort order.
CF_DataName_list = ['/'.join((cohort, CF_DataName)) for cohort in CohortName_list]

def _split_selection(timebins):
    """Return the row-selection spec for one split.

    All splits share the same filters and differ only in which
    ``split_timebin`` values they accept, so the shared rules live here once
    instead of being copy-pasted per split.

    Args:
        timebins: iterable of ``split_timebin`` values accepted by the split.

    Returns:
        A fresh ``{'Rules': [...], 'Op': 'and'}`` dict (new list objects on
        every call, so splits never share mutable state).
    """
    return {
        'Rules': [
            ['split_timebin', 'in', tuple(timebins)],
            ['MEDInfoBf24h-DietRecNum', '>', 0],
            ['MEDInfoBf24h-DietLastToNow', '>=', 120],
            ['MEDInfoBf24h-DietLastToNow', '<=', 420],
            ['ObsDT_Minute', '==', 0],
        ],
        'Op': 'and',
    }


OneEntryArgs = {
    # ----------------- Split Part -----------------
    'Split_Part': {
        'SplitMethod': 'SplitFromColumns',
        'Split_to_Selection': {
            'train': _split_selection(('train-early', 'valid-early')),
            'valid': _split_selection(('train-middle', 'valid-middle')),
            'test-id': _split_selection(('train-late', 'valid-late')),
            'test-od': _split_selection(('test-early', 'test-middle', 'test-late')),
        },
    },
    # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': '1TknInStepWt5MinHM',
        'CF_list': [
            'CGMValueBf24h',
            # 'CGMValueAf2h',
        ],
        'BeforePeriods': ['Bf24h'],
        # 'AfterPeriods': ['Af2h'],
        'TimeIndex': True,
        'InferenceMode': False,
        'TargetField': 'CGMValue',
        'TargetRange': [40, 400],
        'HM': {'start': -24, 'unit': 'h', 'interval': '5m'},
    },
    # ----------------- Output Part -----------------
    'Output_Part': {
        'EntryOutputMethod': 'UniLabelRules',
        'CF_list': ['MEDInfoBf24h'],
        # NOTE(review): both ranges include 180; which label a value of exactly
        # 180 receives depends on how UniLabelRules evaluates 'in' - confirm.
        'label_rule': {
            1: ('MEDInfoBf24h-DietLastToNow', 'in', [120, 180]),
            0: ('MEDInfoBf24h-DietLastToNow', 'in', [180, 420]),
            -100: 'others'
        },
        'assertion': [('MEDInfoBf24h-DietLastToNow', 'in', [120, 420])],
        'set_transform': False,
        'num_proc': 4,
    },
}

# Build the project's entry helper from the split/input/output spec and the
# workspace paths.
entry = EntryAIData_Builder(OneEntryArgs=OneEntryArgs, SPACE=SPACE)
# Presumably loads and combines the per-cohort datasets named in
# CF_DataName_list - verify against the project implementation.
dataset = entry.merge_one_cf_dataset(CF_DataName_list)
# NOTE(review): `config_name` is indexed like a dict in later cells
# (data_config['CF_to_CFvocab']), so the project apparently stores its config
# dict in this field - confirm.
data_config = dataset.info.config_name 
# Presumably applies the 'Split_Part' rules of OneEntryArgs; the result is
# later passed to setup_EntryFn_to_NameToData - confirm the returned structure.
split_to_dataset = entry.split_cf_dataset(dataset)

In [None]:
# Vocabulary of "HH:MM" time-of-day tokens, one per 5-minute step of a day
# (24 * 12 = 288 tokens, "00:00" ... "23:55"). The calendar date is arbitrary;
# only the hour and minute are kept.
CFName = 'HM5MinStep'
interval_delta = pd.Timedelta(minutes=5)
idx2tkn = [
    (pd.Timestamp('2022-01-01 00:00:00') + step * interval_delta).strftime('%H:%M')
    for step in range(24 * 12)
]
tkn2idx = dict(zip(idx2tkn, range(len(idx2tkn))))
# CF_to_CFvocab is the same dict object as data_config['CF_to_CFvocab'] (no
# copy is made), so registering the new vocabulary below also mutates
# data_config in place.
CF_to_CFvocab = data_config['CF_to_CFvocab']
CF_to_CFvocab[CFName] = {'idx2tkn': idx2tkn, 'tkn2idx': tkn2idx}

In [None]:
# CGM value vocabulary: 3 special tokens, 7 "Other_k" placeholders, then the
# integer readings "10" ... "400". With this layout (10 leading tokens, values
# starting at 10) the id of a numeric token equals its numeric value.
CFName = 'CGMValue'
idx2tkn = ["PAD", "UNKNOWN", "MASK"]
idx2tkn.extend(f'Other_{k}' for k in range(7))
idx2tkn.extend(map(str, range(10, 401)))
tkn2idx = dict(zip(idx2tkn, range(len(idx2tkn))))
# Register (or overwrite) the 'CGMValue' vocabulary; relies on CF_to_CFvocab
# having been bound in the preceding cell.
CF_to_CFvocab[CFName] = {'idx2tkn': idx2tkn, 'tkn2idx': tkn2idx}

In [None]:
# Display the vocabulary names currently registered on the dataset config.
list(dataset.info.config_name['CF_to_CFvocab'])

In [None]:
# Hand the extended vocabularies (including the 'HM5MinStep' and 'CGMValue'
# entries added in the cells above) to the entry helper.
entry.CF_to_CFvocab = CF_to_CFvocab

In [None]:
# Later cells read Name_to_Data[<split>]['ds_tfm'] for the splits 'train',
# 'valid', 'test-id' and 'test-od'.
Name_to_Data = entry.setup_EntryFn_to_NameToData(split_to_dataset)

In [None]:
# Display the split datasets for a quick visual check.
split_to_dataset

In [None]:
# Name_to_Data

# Config

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast
from transformers import RobertaConfig, RobertaForSequenceClassification, Trainer, TrainingArguments, PreTrainedTokenizerFast
import torch


# Step 1: Create a vocab dict

# print(len(idx2tkn))
# print(idx2tkn[400])

# Token -> id mapping for the CGM value vocabulary, in vocabulary order.
idx2tkn = CF_to_CFvocab['CGMValue']['idx2tkn']
vocab_dict = dict(zip(idx2tkn, range(len(idx2tkn))))

# Word-level backend: every whitespace-separated piece is looked up directly in
# the vocabulary, falling back to "UNKNOWN" when it is missing.
wordlevel = WordLevel(vocab=vocab_dict, unk_token="UNKNOWN")
tokenizer_backend = Tokenizer(wordlevel)
tokenizer_backend.pre_tokenizer = Whitespace()

# Optional: persist the backend with tokenizer_backend.save("tokenizer.json")

# Wrap the backend so it can be used through the transformers tokenizer API.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer_backend,
    unk_token="UNKNOWN",
    pad_token="PAD",
    mask_token="MASK",
)

# Number of real classes: every label_rule key except the -100 entry.
num_labels = sum(
    1 for label in OneEntryArgs['Output_Part']['label_rule'] if label != -100
)

hm_idx2tkn = CF_to_CFvocab['HM5MinStep']['idx2tkn']

In [None]:
from transformers import RobertaConfig

class RobertaWithHMConfig(RobertaConfig):
    """RobertaConfig extended with the size of the "hm" vocabulary.

    Args:
        hm_vocab_size: number of distinct hm ids; stored on the config as
            ``hm_vocab_size``. Defaults to 288.
        **kwargs: forwarded unchanged to ``RobertaConfig``.
    """

    # Reuses the stock "roberta" model type rather than registering a new one.
    model_type = "roberta"       # keep HF happy

    def __init__(self, hm_vocab_size=288, **kwargs):
        super().__init__(**kwargs)
        self.hm_vocab_size = hm_vocab_size

# Model configuration derived from the tokenizer and the label spec.
# pad_token_id is passed explicitly: RobertaConfig defaults it to 1, but in this
# vocabulary id 1 is "UNKNOWN" and "PAD" is id 0. Without it, the embedding
# layer's padding index (zeroed, never-trained row, and the id skipped when
# position ids are derived from input_ids) would be the UNKNOWN token.
config = RobertaWithHMConfig(
    vocab_size=len(tokenizer),
    num_labels=num_labels,  # one output per non-ignored label
    hm_vocab_size=len(hm_idx2tkn),
    pad_token_id=tokenizer.pad_token_id,
)

# Model

In [None]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
import torch
from torch import nn
from transformers import RobertaConfig, RobertaForSequenceClassification
from transformers.models.roberta.modeling_roberta import SequenceClassifierOutput

class RobertaWithHMForSequenceClassification(RobertaForSequenceClassification):
    """RoBERTa sequence classifier with an additive per-token "hm" embedding.

    Each position carries an extra id (``hm_ids``) whose embedding is added to
    the word embedding of the token at that position before the usual RoBERTa
    embedding pipeline and encoder run.
    """

    def __init__(self, config: RobertaConfig):
        """Create the base classifier plus the hm embedding table.

        Args:
            config: a RoBERTa config that also defines ``hm_vocab_size``.
        """
        super().__init__(config)
        self.hm_embeddings = nn.Embedding(config.hm_vocab_size, config.hidden_size)
        # Created after the parent constructor ran, so initialise it explicitly.
        self._init_weights(self.hm_embeddings)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        hm_ids=None,
        labels=None,
        **kwargs
    ):
        """Classify a batch of token sequences.

        Args:
            input_ids: token ids, required.
            attention_mask: optional attention mask passed to the encoder.
            token_type_ids: optional token type ids passed to the encoder.
            position_ids: optional position ids; derived from ``input_ids``
                (skipping padding tokens) when omitted.
            hm_ids: hm ids aligned with ``input_ids``, required.
            labels: optional class ids; when given, a cross-entropy loss is
                returned (entries equal to -100 are ignored by the loss).
            **kwargs: accepted for call compatibility and ignored.

        Returns:
            SequenceClassifierOutput with ``loss`` (or None), ``logits``,
            ``hidden_states`` and ``attentions``.

        Raises:
            ValueError: if ``input_ids`` or ``hm_ids`` is missing.
        """
        if input_ids is None:
            raise ValueError("input_ids is required")
        if hm_ids is None:
            raise ValueError("hm_ids is required")

        # 1) Word embeddings only. Position/type embeddings, LayerNorm and
        #    dropout are applied exactly once, inside self.roberta below.
        #    (Feeding the output of self.roberta.embeddings(...) back in as
        #    inputs_embeds would run that pipeline twice.)
        embedding_layer = self.roberta.embeddings
        inputs_embeds = embedding_layer.word_embeddings(input_ids)

        # 2) Add the hm feature embeddings.
        inputs_embeds = inputs_embeds + self.hm_embeddings(hm_ids)

        # 3) With inputs_embeds the base model can no longer see which tokens
        #    are padding, so build padding-aware position ids here the same way
        #    RoBERTa does for input_ids: non-pad tokens count up from pad_id + 1
        #    and pad tokens keep pad_id.
        pad_id = self.config.pad_token_id
        if position_ids is None and pad_id is not None:
            not_pad = input_ids.ne(pad_id).long()
            position_ids = torch.cumsum(not_pad, dim=1) * not_pad + pad_id

        # 4) Run the full RobertaModel (embedding pipeline + encoder).
        model_outputs = self.roberta(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            return_dict=True,
        )
        sequence_output = model_outputs.last_hidden_state  # [batch, seq_len, hidden]

        # 5) The classification head takes the whole sequence and selects the
        #    first position internally.
        logits = self.classifier(sequence_output)

        # 6) Loss if labels are provided.
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=model_outputs.hidden_states,
            attentions=model_outputs.attentions,
        )


In [None]:
# Step 3: Initialize classification model
# Instantiate the classifier with freshly initialised weights and place it on
# the selected device; the trailing expression displays the module layout.
model = RobertaWithHMForSequenceClassification(config=config)
model = model.to(device)
model

In [None]:
# Take the first four training examples as a sample batch and inspect them.
ds_tfm_train = Name_to_Data['train']['ds_tfm']
batch = ds_tfm_train[:4]
pprint(batch, compact=True)

# Move every field to the model's device.
# assumes slicing returns a dict whose values all support `.to(device)` - TODO confirm
batch = {k: v.to(device) for k, v in batch.items()}
# batch
# batch

In [None]:
# Smoke test: one forward pass on the sample batch; the batch keys must match
# the parameter names of the model's forward().
model(**batch)

# Train

In [None]:
from transformers import RobertaForSequenceClassification, RobertaConfig, Trainer, TrainingArguments
from transformers import PreTrainedTokenizerFast
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Step 1: Define the compute_metrics function
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation run.

    Args:
        pred: a ``(logits, labels)`` pair (e.g. the Trainer's EvalPrediction).
            ``logits`` has the class scores on its last axis; if it is a
            tuple/list, its first element is used.

    Returns:
        Dict with keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits, labels = pred
    if isinstance(logits, (tuple, list)):
        logits = logits[0]

    labels = np.asarray(labels).reshape(-1)
    predictions = np.argmax(np.asarray(logits), axis=-1).reshape(-1)

    # -100 marks examples the loss ignores (CrossEntropyLoss default
    # ignore_index); score the model on the same examples it is trained on
    # instead of counting -100 as a class of its own.
    scored = labels != -100
    labels, predictions = labels[scored], predictions[scored]

    if labels.size == 0:
        # Nothing to score: report zeros rather than failing inside sklearn.
        return {'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    # zero_division=0 keeps the historical value (0) for classes that are never
    # predicted, without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


# ---------------------- Training Arguments ----------------------
# Checkpoints are written under <MODEL_ROOT>/roberta-classifier-diet-event;
# evaluation and checkpointing both happen once per epoch, and only the two
# most recent/best checkpoints are kept.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
# NOTE(review): `dataloader_drop_last=True` presumably also applies to the
# evaluation dataloaders, so the last incomplete batch of each eval split would
# be left out of the reported metrics - confirm this is intended.
# NOTE(review): with a dict of eval datasets the Trainer is expected to report
# metrics as "eval_<name>_<metric>", which is what "valid_f1" should resolve
# to - verify the key appears in the evaluation logs.
training_args = TrainingArguments(
    output_dir=os.path.join(SPACE['MODEL_ROOT'], "roberta-classifier-diet-event"),
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="valid_f1",
    greater_is_better=True,
    remove_unused_columns=False,
    dataloader_drop_last=True,
    report_to="wandb",  # log metrics to Weights & Biases
)

# Pull the transformed dataset of every split out of Name_to_Data.
ds_tfm_train, ds_tfm_valid, ds_tfm_testid, ds_tfm_testod = (
    Name_to_Data[split_name]['ds_tfm']
    for split_name in ('train', 'valid', 'test-id', 'test-od')
)

# Evaluation sets keyed by split name; 'train' is deliberately not included.
eval_dict = dict(zip(
    ('valid', 'test-id', 'test-od'),
    (ds_tfm_valid, ds_tfm_testid, ds_tfm_testod),
))

# Step 3: Set up Trainer with eval_dataset and metrics
# Wire the model, arguments, datasets and metric function into a Trainer.
# NOTE(review): no data_collator is passed, so the Trainer's default collation
# is used - confirm it handles every field of the dataset (e.g. hm_ids).
# NOTE(review): newer transformers releases replace the `tokenizer` argument
# with `processing_class` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_tfm_train,        # training split
    eval_dataset=eval_dict,        # dict of named evaluation splits (valid, test-id, test-od)
    # data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run training (3 epochs per training_args).
trainer.train()