In [None]:
import sys
import os
import logging
import pandas as pd
import datasets
from pprint import pprint
# Workspace root = the part of the current directory path that precedes the
# notebook-folder marker; switch into it so the relative paths below resolve.
KEY = '2-NOTEBOOK'
WORKSPACE_PATH = os.getcwd().partition(KEY)[0]
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)

# Module logger plus a root handler that prints level, time, file/line and logger name.
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s',
)

# Workspace-relative locations used throughout the notebook.
SPACE = dict(
    DATA_RAW='_Data/0-Data_Raw',
    DATA_RFT='_Data/1-Data_RFT',
    DATA_CASE='_Data/2-Data_CASE',
    DATA_AIDATA='_Data/3-Data_AIDATA',
    DATA_EXTERNAL='code/external',
    DATA_HFDATA='_Data/5-Data_HFData',
    CODE_FN='code/pipeline',
    MODEL_ROOT='./_Model',
)

# The pipeline code directory must exist before it is put on the import path.
code_fn_dir = SPACE['CODE_FN']
assert os.path.exists(code_fn_dir), f'{code_fn_dir} not found'
print(code_fn_dir)
sys.path.append(code_fn_dir)

# Restrict CUDA to the device with index 1.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# AI Data

In [None]:
# Load the saved Hugging Face dataset (one entry per split) from the HFData folder.
HFDataName = 'PreTrainBench-MaskedLM-Split-v0515'
path = os.path.join(SPACE['DATA_HFDATA'], HFDataName)
split_to_dataset = datasets.load_from_disk(path)

# If using the processed dataset, set to True.
remove_unused_columns = True

print(split_to_dataset)

# Wrap every split under the 'ds_tfm' key so later cells can look it up by split name.
Name_to_Data = {}
for split_name in split_to_dataset:
    Name_to_Data[split_name] = {'ds_tfm': split_to_dataset[split_name]}


In [None]:
# Container for per-field vocabularies; data_config holds a reference to the same dict.
CF_to_CFvocab = {}
data_config = {'CF_to_CFvocab': CF_to_CFvocab}

# Vocabulary of "HH:MM" tokens covering one day in 5-minute steps (24 * 12 entries).
CFName = 'HM5MinStep'
interval_delta = pd.Timedelta(minutes=5)
day_start = pd.Timestamp('2022-01-01 00:00:00')
idx2tkn = [(day_start + step * interval_delta).strftime('%H:%M') for step in range(24 * 12)]
tkn2idx = dict(zip(idx2tkn, range(len(idx2tkn))))

CF_to_CFvocab = data_config['CF_to_CFvocab']
CF_to_CFvocab[CFName] = {'idx2tkn': idx2tkn, 'tkn2idx': tkn2idx}

In [None]:
# CGM value vocabulary: three special tokens, seven "Other_*" placeholders,
# then the integer readings 10..400 rendered as strings.
CFName = 'CGMValue'
special_tokens = ["PAD", "UNKNOWN", "MASK"]
other_tokens = [f'Other_{k}' for k in range(7)]
value_tokens = list(map(str, range(10, 401)))
idx2tkn = special_tokens + other_tokens + value_tokens
tkn2idx = dict(zip(idx2tkn, range(len(idx2tkn))))
CF_to_CFvocab[CFName] = {'idx2tkn': idx2tkn, 'tkn2idx': tkn2idx}

# Config

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast
from transformers import RobertaConfig, RobertaForSequenceClassification, Trainer, TrainingArguments, PreTrainedTokenizerFast
import torch


# Step 1: Create a vocab dict
# print(len(idx2tkn))
# print(idx2tkn[400])

# Token -> id mapping taken from the CGM value vocabulary (id = position in the list).
idx2tkn = CF_to_CFvocab['CGMValue']['idx2tkn']
vocab_dict = dict(zip(idx2tkn, range(len(idx2tkn))))

# Step 2: WordLevel model over that vocabulary, splitting input on whitespace.
wordlevel = WordLevel(vocab=vocab_dict, unk_token="UNKNOWN")
tokenizer_backend = Tokenizer(wordlevel)
tokenizer_backend.pre_tokenizer = Whitespace()

# Step 3 (optional): persist the backend with
# tokenizer_backend.save("tokenizer.json")

# Step 4: expose the backend through the Hugging Face fast-tokenizer API,
# declaring which vocabulary entries play the unk / pad / mask roles.
special_token_kwargs = {
    "unk_token": "UNKNOWN",
    "pad_token": "PAD",
    "mask_token": "MASK",
}
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer_backend,
    **special_token_kwargs,
)

# Example: tokenizer.encode("PAD 11 42 MASK")
# num_labels = len([i for i in OneEntryArgs['Output_Part']['label_rule'] if i != -100])

# Vocabulary of the 5-minute time-of-day tokens, used to size the model config.
hm_idx2tkn = CF_to_CFvocab['HM5MinStep']['idx2tkn']

In [None]:
from transformers import RobertaConfig

class RobertaWithHMConfig(RobertaConfig):
    """`RobertaConfig` that additionally stores `hm_vocab_size`.

    All other settings are forwarded unchanged to `RobertaConfig`.
    """

    model_type = "roberta"       # keep HF happy: reuse the stock "roberta" model type string

    def __init__(self, hm_vocab_size=288, **kwargs):
        """Create the config.

        Args:
            hm_vocab_size: Size of the "HM" token vocabulary. The default of 288
                matches the 24 * 12 five-minute steps of the HM5MinStep vocabulary
                built in this notebook (presumably HM = hour:minute).
            **kwargs: Passed through to `RobertaConfig.__init__`.
        """
        super().__init__(**kwargs)
        self.hm_vocab_size = hm_vocab_size

# Size the config from the tokenizer and the HM vocabulary so ids line up with the data.
config = RobertaWithHMConfig(
    vocab_size=len(tokenizer),
    # num_labels=num_labels,  # ← Change this to match your task (e.g., 3 for multi-class)
    hm_vocab_size=len(hm_idx2tkn),
    # RobertaConfig defaults pad_token_id to 1, but in this vocabulary "PAD" is id 0
    # and "UNKNOWN" is id 1. Take the id from the tokenizer so padding-aware code
    # (embedding padding_idx, position ids) keys off the real PAD token.
    pad_token_id=tokenizer.pad_token_id,
)

config

# Model

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

In [None]:
import torch
from torch import nn
from transformers import RobertaConfig, RobertaForSequenceClassification
from transformers.models.roberta.modeling_roberta import SequenceClassifierOutput

from nn.cgmencoder.modeling_cgmencoder import RobertaWithHMForMaskedLM

In [None]:
# Step 3: Initialize the masked-LM model (RobertaWithHMForMaskedLM) from the config
# and move it onto the selected device.
model = RobertaWithHMForMaskedLM(config=config)
model.to(device)

In [None]:
# Take the first two training examples as a small batch.
ds_tfm_train = Name_to_Data['train']['ds_tfm']
model_input_keys = ('input_ids', 'hm_ids', 'labels')

batch = ds_tfm_train[:2]
# Keep only the columns that are passed to the model.
batch = {key: batch[key] for key in batch if key in model_input_keys}

pprint(batch, compact=True)

# Convert each column to an int64 tensor on the selected device.
batch = {
    key: torch.tensor(values, dtype=torch.long).to(device)
    for key, values in batch.items()
}
batch

In [None]:
# Run one forward pass on the sample batch and display the model output.
model(**batch)

# Train

In [None]:
# Show the available split names.
list(Name_to_Data)

In [None]:
from transformers import RobertaForSequenceClassification, RobertaConfig, Trainer, TrainingArguments
from transformers import PreTrainedTokenizerFast
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Step 1: Define the compute_metrics function for masked language modeling
def compute_metrics(pred):
    """Compute masked-language-modeling metrics over the masked positions only.

    Hugging Face `Trainer` hands this function NumPy arrays, so the inputs are
    converted to tensors first; torch tensors are accepted as well.

    Args:
        pred: An `(logits, labels)` pair (e.g. an `EvalPrediction`). `logits` has
            the vocabulary on its last axis; if it is a tuple of model outputs,
            its first element is used. `labels` holds target ids, with -100 at
            positions that must be ignored.

    Returns:
        dict with `masked_lm_loss` (mean cross-entropy), `perplexity`
        (exp of that loss), `accuracy` and `num_masked_tokens`. The three
        float metrics are NaN when no position is masked.
    """
    logits, labels = pred
    # Models that return several outputs produce a tuple; logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    # as_tensor accepts both numpy arrays and tensors; upcast so the loss is computed in float32.
    logits = torch.as_tensor(logits).float()
    labels = torch.as_tensor(labels).long()

    # For MLM, only positions with a real label (labels != -100) are evaluated.
    mask = labels != -100
    num_masked = int(mask.sum().item())
    if num_masked == 0:
        # Nothing to score: report NaN instead of dividing by zero.
        nan = float('nan')
        return {
            'masked_lm_loss': nan,
            'perplexity': nan,
            'accuracy': nan,
            'num_masked_tokens': 0,
        }

    flat_mask = mask.reshape(-1)
    masked_logits = logits.reshape(-1, logits.shape[-1])[flat_mask]
    masked_labels = labels.reshape(-1)[flat_mask]

    masked_lm_loss = torch.nn.functional.cross_entropy(masked_logits, masked_labels)
    predictions = masked_logits.argmax(dim=-1)
    accuracy = (predictions == masked_labels).float().mean()
    perplexity = torch.exp(masked_lm_loss)

    return {
        'masked_lm_loss': masked_lm_loss.item(),
        'perplexity': perplexity.item(),
        'accuracy': accuracy.item(),
        'num_masked_tokens': num_masked,
    }


# ---------------------- Training Arguments ----------------------
# Checkpoints are written under MODEL_ROOT/<model_name>; evaluation and saving
# both happen once per epoch so the best checkpoint can be reloaded at the end.
model_name = "roberta-cgm-mlm"
training_args = TrainingArguments(
    output_dir=os.path.join(SPACE['MODEL_ROOT'], model_name),
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=100,
    load_best_model_at_end=True,
    # With a dict of eval datasets, Trainer reports each metric as
    # "eval_<dataset name>_<metric>". compute_metrics returns 'accuracy' (there is
    # no f1), so select the best checkpoint by accuracy on the 'valid' set.
    metric_for_best_model="eval_valid_accuracy",
    greater_is_better=True,
    # NOTE(review): a `remove_unused_columns = True` variable is defined in the data
    # loading cell but is not used here — confirm which value is intended.
    remove_unused_columns=False,
    dataloader_drop_last=True,
    report_to="wandb",  # <<--- wandb integration,
    # eval_steps=1042,  # Number of datapoints used in evaluation set
)


eval_set_size = 1042
random_seed = 42


def _sample_eval_subset(ds, max_rows, seed):
    """Return a seeded random subset of `ds` containing at most `max_rows` rows.

    The size is capped at `len(ds)` so that a split smaller than `max_rows`
    is used in full instead of raising on out-of-range indices.
    """
    n_rows = min(max_rows, len(ds))
    return ds.shuffle(seed=seed).select(range(n_rows))


# Train on the full training split; evaluate on fixed-size random subsets.
ds_tfm_train  = Name_to_Data['train']['ds_tfm']
ds_tfm_valid  = _sample_eval_subset(Name_to_Data['valid']['ds_tfm'], eval_set_size, random_seed)
ds_tfm_testid = _sample_eval_subset(Name_to_Data['test-id']['ds_tfm'], eval_set_size, random_seed)
ds_tfm_testod = _sample_eval_subset(Name_to_Data['test-od']['ds_tfm'], eval_set_size, random_seed)


# Evaluation sets keyed by name; the key becomes part of each reported metric name.
eval_dict = {
    'valid': ds_tfm_valid,
    'test-id': ds_tfm_testid,
    'test-od': ds_tfm_testod,
}

# Step 3: Set up Trainer with eval_dataset and metrics
# Collect the Trainer inputs: model, arguments, the training split, the named
# evaluation sets, the tokenizer and the metric function.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=ds_tfm_train,
    eval_dataset=eval_dict,
    # data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Step 7: Run training.
trainer.train()