# Space

In [None]:
import os
import logging
import pandas as pd 
from pprint import pprint 
from IPython.display import display, HTML
pd.set_option('display.max_columns', None)
# Locate the project root: the part of the current working directory up to and
# including the first occurrence of KEY, then make it the working directory.
# NOTE(review): if KEY does not occur in os.getcwd(), split() returns the whole
# path and the chdir below targets '<cwd>WorkSpace' — confirm the notebook is
# always started from inside the workspace.
KEY = 'WorkSpace'
WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY
# print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)
import sys
from proj_space import SPACE
# Make the directory stored under SPACE['CODE_FN'] importable.
sys.path.append(SPACE['CODE_FN'])
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

from datasets import disable_caching
# disable_caching()

SPACE['MODEL_ENDPOINT'] = 'vTest'

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from datetime import datetime

def get_timestamp_name():
    """Return the current local time as a 12-digit ``YYYYmmddHHMM`` string."""
    now = datetime.now()
    return f"{now:%Y%m%d%H%M}"

# Part 1: AIData

In [None]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
from datasets import load_from_disk


# 24 / 288

# AIDataName = 'CGM_32h_24pd_WellDoc_v2_v0323' # CGM, 32h, 24 data per day. 
# AIDataName = 'CGM_32h_24pd_WellDoc_v2_sample' # CGM, 32h, 24 data per day. 
AIDataName = 'CGM2EventFood_bf6h_WellDoc_v2_v0323'


# Resolve the on-disk location of the selected AIData set and load it.
path = os.path.join(SPACE['DATA_AIDATA'], AIDataName)
print(path)
dataset = load_from_disk(path)

# The settings mapping is read from the `config_name` attribute of the dataset info.
config = dataset.info.config_name
print(list(config))

CF_to_CFvocab = config['CF_to_CFvocab']
print(list(CF_to_CFvocab))

CF_to_CFArgs = config['CaseSettingInfo']['Case_Args_Settings']['CF_to_CFArgs']
print(list(CF_to_CFArgs))

# Walk from the trigger case-base name to the trigger name used by the entry builder.
TriggerCaseBaseName = config['TriggerCaseBaseName']
TriggerCaseBaseArgs = config['TriggerCaseBaseName_to_TriggerCaseBaseArgs'][TriggerCaseBaseName]
TriggerName = TriggerCaseBaseArgs['Trigger']['TriggerName']
# TriggerName
# print(TriggerCaseBaseArgs)


In [None]:
# df_tag.columns

from recfldtkn.base import assign_caseSplitTag_to_dsCase
from recfldtkn.base import apply_multiple_conditions
import numpy as np 


# Tag columns are those whose name does not contain '--'; materialise them as a
# pandas frame so the split rules can be evaluated on it.
columns = dataset.column_names
columns_tag = [col_name for col_name in columns if '--' not in col_name]
df_tag = dataset.select_columns(columns_tag).to_pandas()


In [None]:
def _split_rules(*split_specific_rules):
    """Return a fresh rule list: the three filters shared by every split, then the split's own rules."""
    shared_rules = [
        ['Age', '>=', 40],
        ['Cohort', 'in', ['1', '2', '3']],  # <--- add Cohort column
        ['Year', 'in', [2020, 2021, 2022, 2023]],  # <--- add Year column
    ]
    return shared_rules + list(split_specific_rules)


# Splits are separated by observation date: Train covers 2021-01-01 up to
# 2022-07-01, Val 2022-07-01 up to 2023-01-01, Test 2023-01-01 up to 2024-01-01.
Split_to_Selection = {
    'Train': {
        'Rules': _split_rules(
            ['GenderGroup', 'in', ['Gender.1', 'Gender.2']],
            ['ObsDT', '<', '2022-07-01'],
            ['ObsDT', '>=', '2021-01-01'],
        ),
        'Op': 'and',
    },
    'Val': {
        'Rules': _split_rules(
            ['ObsDT', '<', '2023-01-01'],
            ['ObsDT', '>=', '2022-07-01'],
            ['GenderGroup', 'in', ['Gender.1', 'Gender.2']],
        ),
        'Op': 'and',
    },
    'Test': {
        'Rules': _split_rules(
            ['ObsDT', '>=', '2023-01-01'],
            ['ObsDT', '<', '2024-01-01'],
            ['GenderGroup', 'in', ['Gender.1', 'Gender.2']],
        ),
        'Op': 'and',
    },
}

In [None]:
# Evaluate each split's rules on the tag frame and keep the matching rows of
# the full dataset.
split_to_dataset = {}
for split_name, Selection in Split_to_Selection.items():
    row_mask = apply_multiple_conditions(df_tag, Selection['Rules'], Selection['Op'])
    matching_rows = np.where(row_mask == 1)[0]
    split_to_dataset[split_name] = dataset.select(matching_rows)


split_to_dataset

In [None]:
# Wrap every split's case dataset under the 'ds_case' key.
# A comprehension is used so that no loop variable leaks into the notebook
# namespace: the previous loop variable was named `dataset`, which overwrote
# the full dataset loaded earlier and made the split-selection cell select
# from the wrong object when re-run.
Name_to_Data = {
    split: {'ds_case': ds_split}
    for split, ds_split in split_to_dataset.items()
}
Name_to_Data

In [None]:
# Settings handed to EntryAIData_Builder: which case features form the model
# input and how the prediction targets are described.
OneEntryArgs = {
     # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': 'Mto1Period_MultiTknInStepNoWgt',
        # Only the two Bf24H fields are active; the other entries are disabled.
        'CF_list': [
            'cf.TargetCGM_Bf24H',
            # 'cf.TargetCGM_Af2H',

            'cf.TimeSparse_Bf24H', 
            # 'cf.TimeSparse_Af2H',

            # 'cf.Diet5MinBaseLMH_Bf24H',
            # 'cf.Diet5MinBaseLMH_Af2H',
        ],
        'TargetField': 'TargetCGM',
        'TimeField':   'Time',
        # 'EventFields': [
        #     # 'Activity',
        #     'Diet5MinBaseLMH',
        # ],
        'BeforePeriods': ['Bf24H'],
        # 'AfterPeriods': ['Af2H'],
        'InferenceMode': False, # 'WithFutureEvent' #  # 'NoFutureEvent', 'WithFutureEvent', 
    }, 

    # ----------------- Output Part -----------------
    'Output_Part': {
        'EntryOutputMethod': 'EventPred',
        
        # ------------ one head for time to now ------------
        'EventTimeToNow': 'co.Bf24H_Diet5MinInfo:MinToNow',
        # Six hour-bucket labels ('0h'..'5h'), matching the 6-way classifiers built later.
        'label_to_id_head1': {'0h': 0, 
                              '1h': 1, '2h': 2, 
                              '3h': 3, '4h': 4, '5h': 5},
        'dimensions_head1': ['food_event_time'],
        # ------------ one head for food content ------------
        'EventCF_Name': 'cf.Diet5MinBaseLMH_Bf24H',
        # Three-level label per nutrient dimension listed in 'dimensions_head2'.
        'label_to_id_head2': {'low': 0, 'medium': 1, 'high': 2},
        'dimensions_head2': ['carbs', 'fiber','fat', 'protein', 'sugar'],

        'set_transform': False,
        'num_proc': 4, 
    },
}


from recfldtkn.aidata_base.entry import EntryAIData_Builder

# Builder bound to the trigger name read from the dataset config.
entry = EntryAIData_Builder(TriggerName = TriggerName, 
                            OneEntryArgs = OneEntryArgs, 
                            SPACE = SPACE)

In [None]:
# Apply the entry function to every split; later cells read the result from the
# 'ds_tfm' key of each split.
# NOTE(review): Name_to_Data is rebound from its own previous value, so running
# this cell twice feeds the processed dict back in — confirm that is harmless.
Name_to_Data = entry.setup_EntryFn_to_NameToData(Name_to_Data, CF_to_CFvocab, OneEntryArgs)
# Name_to_Data

In [None]:
# Pull the transformed dataset of each split.
ds_train = Name_to_Data['Train']['ds_tfm']
eval_dataset = Name_to_Data['Val']['ds_tfm']
test_dataset = Name_to_Data['Test']['ds_tfm']

# Expose only the model inputs and labels, as torch tensors.
for _ds_split in (ds_train, eval_dataset, test_dataset):
    _ds_split.set_format(type='torch', columns=['input_ids', 'labels'])

# Evaluation sets handed to the Trainer, keyed by the name used in metric prefixes.
eval_dataset_dict = {
    # 'train': ds_train,
    'val': eval_dataset,
    'test': test_dataset,
}



In [None]:
# #Extra process
# for split in Name_to_Data:
#     dataset_subsize = int(len(Name_to_Data[split]['ds_tfm'])*0.25)
#     Name_to_Data[split]['ds_tfm'] = Name_to_Data[split]['ds_tfm'].shuffle(seed=42).select(range(dataset_subsize)).filter(lambda x: x['labels'] >0)

In [None]:
# Checkpoint the classifier is initialised from. The path is relative to the
# current working directory, which the setup cell changes to WORKSPACE_PATH.
# NOTE(review): '../_Model' lies outside the workspace, while the training
# output_dir used later is '_Model/...' inside it — confirm the intended location.
model_path = '../_Model/mlm_c123_backup/checkpoint-5000'



In [None]:
# from transformers import RobertaConfig
# from transformers import RobertaForMaskedLM
# from transformers import RobertaForSequenceClassification

# [CF for CF in CF_to_CFvocab]
# vocab = CF_to_CFvocab['cf.TargetCGM_Af2H']['input_ids']['tid2tkn']
# vocab


# config = RobertaConfig(
#     vocab_size=len(vocab),  # Must match the tokenizer
#     hidden_size=768,
#     num_attention_heads=12,
#     num_hidden_layers=6,
#     intermediate_size=3072,
#     type_vocab_size=1,
#     max_position_embeddings=514,
#     num_labels=6
# )

# model = RobertaForSequenceClassification(config=config)
# model

In [None]:
from transformers import RobertaConfig, RobertaForSequenceClassification

# Load your vocab
# NOTE(review): this reads the vocab of 'cf.TargetCGM_Af2H', while the input
# configured in OneEntryArgs['Input_Part']['CF_list'] is 'cf.TargetCGM_Bf24H'
# (the Af2H entry is commented out there) — confirm the key exists in
# CF_to_CFvocab and that both fields share the same token vocabulary.
vocab = CF_to_CFvocab['cf.TargetCGM_Af2H']['input_ids']['tid2tkn']

# Define config
# NOTE: this rebinds `config`, which previously held the dataset settings dict.
config = RobertaConfig(
    vocab_size=len(vocab),
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    intermediate_size=3072,
    type_vocab_size=1,
    max_position_embeddings=514,
    num_labels=6  # For classification task
)

# Load the model from pretrained MLM checkpoint
# NOTE(review): the config built above is used instead of the one stored with
# the checkpoint, and ignore_mismatched_sizes=True presumably re-initialises any
# weight whose shape differs rather than raising — confirm that the sizes above
# match the checkpoint so the pretrained encoder weights are actually reused.
model = RobertaForSequenceClassification.from_pretrained(
    model_path,
    config=config,
    ignore_mismatched_sizes=True  # if classification head differs from MLM head
)


In [None]:
model

In [None]:
# Freeze all RoBERTa encoder parameters
model.roberta.requires_grad_(False)


# List every parameter that will still receive gradients.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(f"Trainable: {name}")


In [None]:
# batch = ds_train[:4]

# model(**batch)

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer evaluation.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the prediction).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = weighted_scores[2]
    metrics['precision'] = weighted_scores[0]
    metrics['recall'] = weighted_scores[1]
    return metrics

# Fine-tune the classifier, evaluating every 50 steps on each dataset in
# eval_dataset_dict.
training_args = TrainingArguments(
    output_dir='_Model/roberta_for_meal_hour_with_pretrain',
    num_train_epochs=20,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    learning_rate=5e-4,
    
    weight_decay=0.01,
    logging_steps=1,


    eval_strategy="steps",
    eval_steps=50,


    # 'eval_val_loss' presumably refers to the loss on the 'val' entry of
    # eval_dataset_dict (metric names are prefixed with the dict key) — confirm.
    load_best_model_at_end=True,
    metric_for_best_model="eval_val_loss",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=eval_dataset_dict,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model on validation set
# NOTE(review): with no argument, evaluate() uses the Trainer's eval_dataset,
# which here is the dict holding both 'val' and 'test' — confirm that printing
# both under "Validation results" is intended.
val_results = trainer.evaluate()
print(f"Validation results: {val_results}")

# Evaluate the model on test set
test_results = trainer.evaluate(eval_dataset_dict['test'])
print(f"Test results: {test_results}")

# Part 2: Model Init

In [None]:
OneEntryArgs

# Food Hour Model

## Step 1: model config

In [None]:
# from nn.cgmlhm.configuration_cgmlhm import CgmLhmConfig 
from nn.cgmevent.configuration_fieldencoder import FieldEncoderConfig

# Field-encoder settings: 6 output classes, 3 hidden layers.
ModelArgs = dict(
    model_type='cgm_encoder',
    num_classes=6,
    num_hidden_layers=3,
)
# Rebinds `config` to the field-encoder configuration.
config = FieldEncoderConfig(**ModelArgs)
config

## Step 2: model structure

In [None]:
from nn.cgmevent.modeling_fieldencoder import FieldEncoderForClassification

eventmodel = FieldEncoderForClassification(config)
eventmodel

## Step 3: forward

In [None]:
# Smoke-test the forward pass on a few training rows.
# `batch` was never assigned in this notebook (its only definition sat in a
# commented-out cell), so this cell failed with NameError on a fresh kernel.
# ds_train is formatted to return 'input_ids' and 'labels' as torch tensors.
batch = ds_train[:4]

eventmodel_input = batch
# {
#     'input_ids': batch['input_ids'],
#     'labels': batch['food_event_time_labels'],
#     # 'timestep_ids': batch['Time--timestep_orig_ids'],
#     # 'attention_mask': batch['attention_mask'],
# }


event_outputs = eventmodel(**eventmodel_input)
event_outputs


## Step 4: train

In [None]:
###############################
# Split names must match the keys of Name_to_Data, which come from
# Split_to_Selection ('Train', 'Val', 'Test'). The previous lowercase names
# raised KeyError for the train set and silently skipped every eval set.
TrainSetName = 'Train'
EvalSetNames = ['Val', 'Test']
max_train_samples = None   # optional cap on the number of training rows
max_eval_samples = None    # optional cap applied to each eval set separately
###############################


# ------------ train datasets ------------
TrainData = Name_to_Data[TrainSetName]
ds_tfm_train = TrainData['ds_tfm']
if max_train_samples is not None:
    n_train = min(len(ds_tfm_train), max_train_samples)
    ds_tfm_train = ds_tfm_train.shuffle(seed=42).select(range(n_train))
logger.info(ds_tfm_train)


# ------------ eval datasets ------------
eval_dataset_dict = {}
for evalname in EvalSetNames:
    if evalname not in Name_to_Data:
        logger.info(f'{evalname} not in aidata.Name_to_Data')
        continue
    eval_dataset = Name_to_Data[evalname]['ds_tfm']
    if max_eval_samples is not None:
        # Use a per-dataset count: clamping max_eval_samples itself would let a
        # small first eval set shrink the cap for every later one.
        n_eval = min(len(eval_dataset), max_eval_samples)
        eval_dataset = eval_dataset.shuffle(seed=42).select(range(n_eval))
    # Lowercase keys keep the 'val' / 'test' names used for the eval dict
    # elsewhere in this notebook.
    eval_dataset_dict[evalname.lower()] = eval_dataset
logger.info('---- eval_datasets ----')
logger.info(eval_dataset_dict)


In [None]:
import os

# NOTE(review): CUDA_VISIBLE_DEVICES is already set to "0" near the top of the
# notebook, and a training run has executed before this cell; this repeat
# assignment presumably has no effect on the running process — confirm whether
# the cell is still needed.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

### Step4a Train with customized event category (food hour) by CGM

In [None]:
from transformers import Trainer, TrainingArguments, TrainerCallback

# Test-run config: no checkpoints are saved; metrics are logged and reported.
HuggingFaceTrainingArgs = {
    'output_dir': '_noop',                # required field; nothing is saved (save_strategy='no')
    'overwrite_output_dir': True,

    'do_train': True,
    'num_train_epochs': 10,
    'per_device_train_batch_size': 64,
    'per_device_eval_batch_size': 64,
    'gradient_accumulation_steps': 4,

    'do_eval': True,
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument; the earlier RoBERTa training cell in this notebook already uses it.
    'eval_strategy': 'steps',            # <-- evaluate every `eval_steps` steps
    'eval_steps': 50,

    'logging_steps': 1,
    'logging_strategy': 'steps',         # <-- log every `logging_steps` steps
    # 'logging_first_step': True,

    'save_strategy': 'no',               # <-- disables checkpoint saving
    'report_to': 'wandb',                # <-- reports logs to Weights & Biases

    'remove_unused_columns': True,
    'dataloader_drop_last': True,

    'learning_rate': 5e-4,
    'warmup_steps': 100,
    'lr_scheduler_type': 'cosine',
}

#################################

training_args = TrainingArguments(**HuggingFaceTrainingArgs)
training_args

In [None]:
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_CAUSAL_LM_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    is_torch_tpu_available,
    set_seed,
)

print(training_args.seed)
set_seed(training_args.seed)

In [None]:
from datetime import datetime
from datasets.fingerprint import Hasher 

###################
AfTknNum = 24
###################

# Experiment id: hour-resolution timestamp joined with a hash of the current `config`.
timestamp = f"{datetime.now():%Y%m%d-%H}"
experiment_id = "-".join((timestamp, Hasher().hash([config])))

print(experiment_id)

class TimestampCallback(TrainerCallback):
    """Trainer callback that stamps each log record with the step and wall-clock time."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Add 'step' and 'timestamp' entries to ``logs`` in place.

        ``logs`` defaults to None in the signature, so a call without a log
        dict is ignored instead of failing on item assignment.
        """
        if logs is None:
            return
        # Add the current step and timestamp to the logs
        logs["step"] = state.global_step
        logs["timestamp"] = str(datetime.now())

In [None]:
import evaluate
import torch
import torch.nn.functional as F

def classification_softmax_mse(logits: torch.Tensor, labels: torch.Tensor) -> float:
    """Mean squared error between softmax probabilities and one-hot labels.

    Args:
        logits: class scores; softmax is taken over the last dimension.
        labels: integer class indices, one-hot encoded to the width of the
            last dimension of ``logits``.

    Returns:
        The MSE averaged over all elements, as a Python float.
    """
    n_classes = logits.size(-1)
    target = F.one_hot(labels, num_classes=n_classes).to(torch.float32)
    predicted = torch.softmax(logits, dim=-1)
    return F.mse_loss(predicted, target).item()

# Metric objects already loaded, keyed by (metric name, experiment id).
_METRIC_CACHE = {}


def _load_metric(name, experiment_id):
    """Return the `evaluate` metric for (name, experiment_id), loading it only once."""
    cache_key = (name, experiment_id)
    if cache_key not in _METRIC_CACHE:
        _METRIC_CACHE[cache_key] = evaluate.load(name, experiment_id=experiment_id)
    return _METRIC_CACHE[cache_key]


def compute_metrics(eval_preds, experiment_id):
    """Compute accuracy, macro F1 and softmax MSE for one evaluation pass.

    Args:
        eval_preds: pair ``(logits, labels)`` as passed by the Trainer.
        experiment_id: id forwarded to ``evaluate.load``.

    Returns:
        dict with keys 'accuracy', 'f1_macro' and 'softmax_mse'.
    """
    # This function runs for every eval set at every eval step, so the metric
    # objects are reused rather than reloaded on each call.
    metric_acc = _load_metric("accuracy", experiment_id)
    metric_f1 = _load_metric("f1", experiment_id)

    logits, labels = eval_preds
    logits = torch.tensor(logits)
    labels = torch.tensor(labels)

    # Convert once; both metrics consume the same plain lists.
    pred_list = logits.argmax(dim=-1).tolist()
    label_list = labels.tolist()

    # Accuracy
    d_acc = metric_acc.compute(predictions=pred_list, references=label_list)

    # F1 Macro
    d_f1 = metric_f1.compute(predictions=pred_list, references=label_list, average="macro")

    # Softmax MSE
    mse_soft = classification_softmax_mse(logits, labels)

    return {
        "accuracy": d_acc["accuracy"],
        "f1_macro": d_f1["f1"],
        "softmax_mse": mse_soft,
    }


In [None]:
# Trainer for the field-encoder classifier (`eventmodel`).
# NOTE(review): TimestampCallback is defined earlier in the notebook but is not
# passed through `callbacks` here — confirm whether it should be registered.
trainer = Trainer(
    # model to train
    model = eventmodel,
    # training configuration
    args = training_args,
    # training split
    train_dataset = ds_tfm_train, # if training_args.do_train else None,
    # dict of evaluation splits
    eval_dataset = eval_dataset_dict, # <--- for in-training evaluation
    # tokenizer is intentionally not passed
    # tokenizer = tokenizer, # Apr 2024: don't add tokenizer, hard to save.
    # default collator
    data_collator = default_data_collator,
    # wrap compute_metrics so the notebook-level experiment_id is supplied
    compute_metrics = lambda x: compute_metrics(x, experiment_id),
    # preprocess_logits_for_metrics = preprocess_logits_for_metrics,
    # callbacks = [CorrectProfilerCallback(wait=1, warmup=1, active=3)],
)


logger.info(trainer)

In [None]:
trainer.train()

### Step4b Train with Pretrained Roberta on event category (food hour) by CGM

In [None]:
from transformers import RobertaForSequenceClassification

# 6-way sequence classifier initialised from the public "roberta-base" weights.
# NOTE(review): the checkpoint's token embeddings presumably correspond to its
# own text vocabulary, while input_ids here are CGM token ids — confirm that
# reusing these weights is intended.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=6)

# Same training arguments, datasets and metrics as the field-encoder run; only the model differs.
trainer = Trainer(
    model=model,  # ⬅️ switched to pretrained
    args=training_args,
    train_dataset=ds_tfm_train,
    eval_dataset=eval_dataset_dict,
    data_collator=default_data_collator,
    compute_metrics=lambda x: compute_metrics(x, experiment_id),
)


In [None]:
trainer.args.run_name = 'exp_' + get_timestamp_name()
trainer.train()

### Step4b Train with Pretrained Roberta on Hour Regression by CGM

In [None]:
from transformers import RobertaForSequenceClassification

# NOTE(review): this section is headed "Hour Regression", but the code is
# identical to the preceding classification cell (num_labels=6, same datasets
# and metrics) — confirm whether a regression setup was intended here.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=6)

trainer = Trainer(
    model=model,  # ⬅️ switched to pretrained
    args=training_args,
    train_dataset=ds_tfm_train,
    eval_dataset=eval_dataset_dict,
    data_collator=default_data_collator,
    compute_metrics=lambda x: compute_metrics(x, experiment_id),
)


In [None]:
trainer.args.run_name = 'exp_' + get_timestamp_name()
trainer.train()

In [None]:
def train(model):
    """Placeholder for a shared training routine; does nothing and returns None."""
    return None

### Step4c Fake Dataset

In [None]:
import numpy as np
from datasets import Dataset, DatasetDict

# Define label-to-boosted-region mapping
def generate_structured_sample(label, length=289):
    """Generate one synthetic sequence whose boosted region encodes ``label``.

    The sequence is uniform integer noise in [0, 9]; the block of positions
    assigned to ``label`` is raised by 5, so the class is recoverable from
    where the elevated values sit.

    Args:
        label: class index 0-5 selecting which block is boosted.
        length: number of positions in the sequence.

    Returns:
        A list of ``length`` Python ints.

    Raises:
        KeyError: if ``label`` is not one of 0-5.
    """
    x = np.random.randint(0, 10, size=length)  # base noise

    # The last block runs to the end of the sequence. It was previously fixed at
    # 289, so for longer sequences the tail was never boosted. Slicing clamps
    # ranges that fall beyond a shorter sequence.
    block_ranges = {
        0: (0, 50),
        1: (50, 100),
        2: (100, 150),
        3: (150, 200),
        4: (200, 250),
        5: (250, length),
    }

    start, end = block_ranges[label]
    x[start:end] += 5  # inject signal
    return x.tolist()

# Dataset generator
def generate_dataset(num_samples, length=289):
    """Build a synthetic classification dataset of structured samples.

    Args:
        num_samples: number of rows to generate.
        length: sequence length of every sample (default 289, as before).

    Returns:
        A ``Dataset`` with columns 'input_ids', 'attention_mask' (all ones,
        same length as the sample) and 'labels' (random class in 0-5).
    """
    input_ids = []
    attention_mask = []
    labels = []

    for _ in range(num_samples):
        label = np.random.randint(0, 6)
        sample = generate_structured_sample(label, length=length)
        input_ids.append(sample)
        # Size the mask from the sample itself rather than a second hard-coded 289.
        attention_mask.append([1] * len(sample))
        labels.append(label)

    return Dataset.from_dict({
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    })

# Generate train/val/test datasets (800 / 100 / 100 rows, drawn in that order).
# Note: `test_dataset` here rebinds the name used earlier for the real test split.
train_dataset, val_dataset, test_dataset = (
    generate_dataset(n_rows) for n_rows in (800, 100, 100)
)

# Training set on its own; the two evaluation sets grouped in a DatasetDict.
dataset_dict_train = train_dataset
dataset_eval = DatasetDict({
    "val": val_dataset,
    "test": test_dataset,
})


In [None]:
#pretrained Roberta
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import default_data_collator

# 1. Load pretrained classification model
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=6)

# 2. Define training arguments
training_args = TrainingArguments(
    output_dir='./simulation_signal_classification',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument; the earlier RoBERTa training cell in this notebook already uses it.
    eval_strategy="epoch",
    save_strategy="no",
    num_train_epochs=5,
    logging_steps=10,
    learning_rate=5e-5,
    load_best_model_at_end=False,
    report_to="wandb",
    run_name="simulation_signal_classification",
)

# 3. Optional: compute accuracy
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred, experiment_id=None):
    """Return the accuracy of arg-max predictions.

    This definition replaces any ``compute_metrics`` defined earlier in the
    notebook. ``experiment_id`` is accepted so existing two-argument call sites
    keep working, but it is not used.

    Args:
        eval_pred: pair ``(logits, labels)``.
        experiment_id: ignored.

    Returns:
        dict with the single key 'accuracy'.
    """
    logits, labels = eval_pred
    return {"accuracy": accuracy_score(labels, logits.argmax(axis=-1))}

# 4. Initialize Trainer on the synthetic train / eval datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict_train,
    eval_dataset=dataset_eval,
    data_collator=default_data_collator,
    compute_metrics=lambda eval_pred: compute_metrics(eval_pred, experiment_id),
)


In [None]:
trainer.train()

In [None]:
def add_sum_column(example):
    """Copy the 'labels' value into a new 'food_event_time_labels' field.

    Despite the name, nothing is summed. The example is modified in place and
    the same object is returned.
    """
    label_value = example["labels"]
    example['food_event_time_labels'] = label_value
    return example

# Add the 'food_event_time_labels' column to the training set and to every eval split.
dataset_dict_train = dataset_dict_train.map(add_sum_column)
for eval_name in list(dataset_eval):
    dataset_eval[eval_name] = dataset_eval[eval_name].map(add_sum_column)


In [None]:
# from nn.cgmlhm.configuration_cgmlhm import CgmLhmConfig 
from nn.cgmevent.configuration_fieldencoder import FieldEncoderConfig

# Rebuild the field-encoder classifier with the same settings as the
# "Food Hour Model" section, so that training on the synthetic data starts from
# a freshly constructed model.
ModelArgs = {
    'model_type': 'cgm_encoder',
    'num_classes': 6,
    'num_hidden_layers': 3,
}
config = FieldEncoderConfig(**ModelArgs)

from nn.cgmevent.modeling_fieldencoder import FieldEncoderForClassification

eventmodel = FieldEncoderForClassification(config)

from transformers import Trainer, TrainingArguments, TrainerCallback

# Test-run config: no checkpoints are saved; metrics are logged and reported.
HuggingFaceTrainingArgs = {
    'output_dir': '_noop',                # required field; nothing is saved (save_strategy='no')
    'overwrite_output_dir': True,

    'do_train': True,
    'num_train_epochs': 130,
    'per_device_train_batch_size': 64,
    'per_device_eval_batch_size': 64,
    'gradient_accumulation_steps': 4,

    'do_eval': True,
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument; the earlier RoBERTa training cell in this notebook already uses it.
    'eval_strategy': 'steps',            # <-- evaluate every `eval_steps` steps
    'eval_steps': 50,

    'logging_steps': 1,
    'logging_strategy': 'steps',         # <-- log every `logging_steps` steps
    # 'logging_first_step': True,

    'save_strategy': 'no',               # <-- disables checkpoint saving
    'report_to': 'wandb',                # <-- reports logs to Weights & Biases

    'remove_unused_columns': True,
    'dataloader_drop_last': True,

    'learning_rate': 5e-4,
    'warmup_steps': 100,
    'lr_scheduler_type': 'cosine',
}

#################################

training_args = TrainingArguments(**HuggingFaceTrainingArgs)
training_args

# Train the field-encoder classifier on the synthetic datasets.
trainer = Trainer(
    # model to train
    model = eventmodel,
    # training configuration
    args = training_args,
    train_dataset=dataset_dict_train,
    eval_dataset=dataset_eval,

    data_collator = default_data_collator,
    # wrap compute_metrics so the notebook-level experiment_id is supplied
    compute_metrics = lambda x: compute_metrics(x, experiment_id),
    
)


logger.info(trainer)

In [None]:
trainer.train()


# Food and Carb Model

## Step 1: model config

In [None]:
# from nn.cgmlhm.configuration_cgmlhm import CgmLhmConfig 
from nn.cgmevent.configuration_fieldencoder import FieldEncoderConfig

# Two-head field-encoder settings: 6 event classes and 3 quantity classes,
# with quantity regression switched off.
ModelArgs = dict(
    model_type='cgm_encoder',
    num_classes=6,
    num_hidden_layers=6,
    quantity_regression=False,
    num_quantity_classes=3,
)
config = FieldEncoderConfig(**ModelArgs)
# print(config)
# config.field_to_fieldinfo
config

## Step 2: model structure

In [None]:
from nn.cgmevent.modeling_fieldencoder import FieldEncoderForClassificationAndRegression

eventmodel = FieldEncoderForClassificationAndRegression(config)
eventmodel

## Step 3: forward

In [None]:


# Forward one batch through the two-head model: event-time labels go to
# `labels`, carb-level labels to `labels_quantity`.
# NOTE(review): `batch` must provide 'input_ids', 'food_event_time_labels' and
# 'carbs_labels'; the datasets formatted earlier in this notebook keep only
# 'input_ids' and 'labels' — confirm which dataset this batch should be drawn from.
eventmodel_input = {
    'input_ids': batch['input_ids'],
    'labels': batch['food_event_time_labels'],
    # 'timestep_ids': batch['Time--timestep_orig_ids'],
    'labels_quantity': batch['carbs_labels'],
}


event_outputs = eventmodel(**eventmodel_input)
event_outputs