# Space

In [None]:

import torch

# Ask PyTorch's CUDA caching allocator to release cached memory it is not currently using,
# so a re-run of the notebook in the same kernel starts with as much free GPU memory as possible.
torch.cuda.empty_cache()


In [None]:
import os
import logging
import pandas as pd
from pprint import pprint
from IPython.display import display, HTML

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Locate the workspace root: everything in the current working directory up to and
# including the first occurrence of KEY.
KEY = 'WorkSpace'
if KEY not in os.getcwd():
    # Without this guard, split(KEY)[0] + KEY would yield "<cwd>WorkSpace", and the
    # chdir below would either fail with an unrelated-looking path or silently enter
    # a directory that merely happens to have that name.
    raise FileNotFoundError(
        f"Cannot locate the workspace root: '{KEY}' does not appear in the current "
        f"working directory '{os.getcwd()}'. Start the notebook from inside the workspace."
    )
WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY
os.chdir(WORKSPACE_PATH)

# proj_space is imported only after the chdir above, so it is resolved relative to the workspace root.
import sys
from proj_space import SPACE
sys.path.append(SPACE['CODE_FN'])  # make the project code directory importable
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

# Turn off the Hugging Face `datasets` cache for this session.
from datasets import disable_caching
disable_caching()

SPACE['MODEL_ENDPOINT'] = 'vTest'

# Part 1: AIData

In [None]:
from recfldtkn.aidata_base.aidata import AIData

# AIData location, taken from the project SPACE configuration.
DATA_AIDATA = SPACE['DATA_AIDATA']
# Name of the AIData set to load.
OneAIDataName = 'CgmLhm_Bf24Af2Af2t8H_5Min_3Cohort_EventFlt_Sample'

# Load the named AIData object through the project loader; SPACE is passed along as its configuration.
aidata = AIData.load_aidata(DATA_AIDATA, OneAIDataName, SPACE)
aidata  # bare expression: display the loaded object as the cell output

In [None]:
# Entry configuration handed to aidata.update_NameToData_with_OneEntryArgs below.
# It has three sections: 'Task_Part' (tagging / filtering / splitting of cases),
# 'Input_Part' (which fields form the model input) and 'Output_Part' (how labels are built).
# NOTE(review): the meaning of each option is defined by the recfldtkn project code, which
# is not visible in this notebook — confirm semantics there before changing values.
# TaskType = 'MLUniLabel'
SeriesName  = 'Bf24.Af2H'
OneTaskName = 'cgm_lhm_bf24h_af2h_5min'
OneEntryArgs = {
    # ----------------- Task Part -----------------
    'Task_Part': {

        'Tagging': {
            # 'TagName_to_TaggingMethod': {
            #     # TagName: TaggingMethod {Rules: [(x,x,x)], Op: and or}
            # },
            # 'ColumnsAddToDsCase': [],
            'TagFilter': True, # <--- still need to add the Filter Tag, as we need to do the RandomDownSample.
            'TagSplit': False, # <--- no need to add the Split Tag anymore, as we already have it.
        },

        'Filtering': {
            # 'FilterTagging': None,
            # Both rules must hold ('Op': 'and') for a case to pass the filter.
            'FilterTagging': {
                "Rules": [
                    ['RandDownSample', '<=', 0.5],
                    ['co.Bf24H_Food_recnum:recnum', '>=', 1], 
                    ], 
                'Op': 'and',
            }
        }, 
        
        'Splitting': {
            # 'SplitTagging': { # <----- for the Tagging part.
            #     'RANDOM_SEED': 32,
            #     'out_ratio': 0.1,
            #     'test_ratio': 'tail0.1',
            #     'valid_ratio': 0.1
            # },
            # Names of the training split and of the splits used for evaluation.
            'TrainEvals': {
                'TrainSetName': 'In-Train', 
                'EvalSetNames': ['In-Test', 'In-Valid', 'Out']
            },
        }
    },

    # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': 'Mto1Period_MultiTknInStep',
        # Case features to include; each field appears once per period (Bf24H and Af2H).
        'CF_list': [
            'cf.TargetCGM_Bf24H',
            'cf.TargetCGM_Af2H',

            'cf.ActivitySparse_Bf24H',
            'cf.ActivitySparse_Af2H',

            # 'cf.TimeSparse_Bf24H', 
            # 'cf.TimeSparse_Af2H',


            'cf.DietSparse_Bf24H',
            'cf.DietSparse_Af2H',
        ],
        'TargetField': 'TargetCGM',
        # 'TimeField':   'Time',
        'EventFields': [
            'Activity',
            'Diet',
        ],
        'BeforePeriods': ['Bf24H'],
        'AfterPeriods': ['Af2H'],
        'InferenceMode': False, # 'WithFutureEvent' #  # 'NoFutureEvent', 'WithFutureEvent', 
    }, 

    # ----------------- Output Part -----------------
    'Output_Part': {
        'EntryOutputMethod': 'EventPrediction',
        'MaskingRate': 0,
        'Task_Label': 'Diet',
        # other parameters toward X and Y value
        'agg_function':None,
        'label_process': None, 
    },
}


# Apply the entry configuration to the loaded AIData, then show the resulting named datasets.
aidata.update_NameToData_with_OneEntryArgs(OneEntryArgs)
# NOTE(review): the name `dataset` is reassigned in the later data-subset cell to a
# Hugging Face Dataset, so this binding only holds until that cell runs.
dataset = aidata.Name_to_DS
dataset

In [None]:
# aidata.Name_to_DsAIData
# Pick the first split exposed by aidata.Name_to_Data and preview its case table.
Name_to_Data = aidata.Name_to_Data
split_name = list(Name_to_Data)[0]
Data = Name_to_Data[split_name]
df_case = Data['df_case']
df_case.head()

In [None]:
# 'ds_tfm' entry of the selected split; later cells slice it and read its
# "input_ids" and "labels" columns to build the training datasets.
ds_tfm = Data['ds_tfm']
ds_tfm

In [None]:
# List the column names of the case table for the selected split.
Data['df_case'].columns

In [None]:
# Preview a small batch: the first `batch_size` rows of ds_tfm.
batch_size = 4
batch = ds_tfm[:batch_size]
batch

In [None]:
import numpy as np
import evaluate  # New Hugging Face library for evaluation metrics

# Load the "accuracy" metric through the Hugging Face `evaluate` library; `metric` is
# read as a global by compute_metrics below.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute token-level accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` carries the class scores
            on its last axis; ``labels`` holds the integer targets with the same
            leading shape as ``logits`` minus that last axis.

    Returns:
        Whatever ``metric.compute`` returns for the flattened predictions and
        references (``metric`` is the module-level accuracy metric).
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)  # index of the highest-scoring class per position

    # Flatten so every position counts as one prediction.
    preds = np.asarray(preds).flatten()
    labels = np.asarray(labels).flatten()

    # -100 is the default ignore index of the cross-entropy loss used by the models in
    # this notebook; such positions carry no target, so they must not be scored as errors.
    scored = labels != -100
    return metric.compute(predictions=preds[scored], references=labels[scored])


Data subset preparation

In [None]:
from datasets import Dataset
import torch


# Build the training set from rows [0, 8000) of ds_tfm and the test set from rows [8000, 10000).
# NOTE(review): `dataset` rebinds the name used earlier for aidata.Name_to_DS, and
# `dataset_test` is rebound again in the Part 3 data cell — results depend on cell execution order.
dataset = Dataset.from_dict(ds_tfm[:8000])
dataset_test = Dataset.from_dict(ds_tfm[8000:10000])


def tokenize_function(examples):
    """Convert the id and label columns of `examples` to long tensors and add a mask.

    Args:
        examples: Mapping with "input_ids" and "labels" entries that
            ``torch.tensor`` can convert.

    Returns:
        Dict with "input_ids" and "labels" as ``torch.long`` tensors and an
        "attention_mask" of ones shaped like "input_ids".
    """
    ids = torch.tensor(examples["input_ids"], dtype=torch.long)
    targets = torch.tensor(examples["labels"], dtype=torch.long)
    return {
        "input_ids": ids,
        "labels": targets,
        # Every position is attended to: the mask is all ones.
        "attention_mask": torch.ones_like(ids),
    }




# Part 2: Baseline Model - Simple Embedding and FC

## 2a Model Definition

In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim

# # Device setup (GPU if available)
# device = torch.device( "cpu")

# # Define vocab size and embedding dimensions
# vocab_size = 1000   # Adjust based on vocab
# embedding_dim = 128  # Size of the embedding vectors
# seq_length = 313  # Length of each sequence in the batch

# # Create an Embedding layer (trainable)
# embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# # dataset change
# #[TODO]: build this into the transformation
# batch_size = 4
# token_ids = torch.tensor(batch['input_ids'], dtype=torch.long, device=device)  # (batch_size, seq_length)
# target_labels = torch.tensor(batch['labels'], dtype=torch.long, device=device)  # (batch_size, seq_length)

# # model
# class SimpleModel(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, hidden_dim):
#         super(SimpleModel, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
#         self.fc = nn.Linear(embedding_dim, vocab_size)  # Predict token classes for each position
#         self.relu = nn.ReLU()

#     def forward(self, x):
#         x = self.embedding(x)  # Convert token IDs to embeddings (batch_size, seq_len, embed_dim)
#         x = self.relu(x)
#         x = self.fc(x)  # Output shape: (batch_size, seq_len, vocab_size) -> token classification
#         return x



In [None]:
import torch
import torch.nn as nn

class SimpleModel(nn.Module):
    """Per-position token classifier: embedding lookup -> ReLU -> linear projection.

    Args:
        vocab_size: Size of the embedding table and number of output classes per position.
        embedding_dim: Width of each embedding vector.
        hidden_dim: Accepted by the constructor but not used by any layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # Projects each position's embedding onto vocab_size class scores.
        self.fc = nn.Linear(in_features=embedding_dim, out_features=vocab_size)
        self.relu = nn.ReLU()

    def forward(self, input_ids, labels=None):
        """Return ``{"logits"}``, plus ``"loss"`` first when `labels` is given.

        The loss is the mean cross-entropy over all positions after flattening
        the logits to (N, vocab_size) and the labels to (N,).
        """
        activated = self.relu(self.embedding(input_ids))
        logits = self.fc(activated).float()

        # Without labels there is nothing to score: return the logits alone.
        if labels is None:
            return {"logits": logits}

        flat_logits = logits.view(-1, logits.size(-1))
        flat_targets = labels.to(torch.long).view(-1)
        loss = nn.functional.cross_entropy(flat_logits, flat_targets)
        return {"loss": loss, "logits": logits}


## 2b Forward and Trainer


In [None]:
from transformers import TrainingArguments, Trainer

# Baseline model. SimpleModel does not use hidden_dim; it is passed only because the
# constructor requires the argument.
model = SimpleModel(vocab_size=1000, embedding_dim=128, hidden_dim=256)

# Training arguments: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=2,
    # The string "none" is what disables reporting integrations (wandb, tensorboard, ...).
    # Passing the Python value None does not: transformers 4.x treats it as "all".
    report_to="none",
)

# Trainer: trains on `dataset` and evaluates on `dataset_test`, a separate slice of ds_tfm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()



# Part 3: Roberta MLM

Masked

In [None]:
from datasets import Dataset
import torch

# MLM data: rows [0, 8000) of ds_tfm for training and rows [12000, 14000) for testing.
# NOTE(review): this rebinds `dataset_test`, which the Part 2 data cell set to rows
# [8000, 10000); re-running the Part 2 trainer after this cell evaluates on different rows.
dataset_train = Dataset.from_dict(ds_tfm[:8000])
dataset_test = Dataset.from_dict(ds_tfm[12000:14000])
def tokenize_function_formaksed(examples):
    """Build masked-LM style features where the labels are a copy of the inputs.

    Args:
        examples: Mapping with an "input_ids" entry that ``torch.tensor`` can convert.

    Returns:
        Dict with "input_ids" and "labels" (two separate ``torch.long`` tensors, both
        built from ``examples["input_ids"]``) and an all-ones "attention_mask" shaped
        like "input_ids".
    """
    # Both fields come from the same source column; each gets its own tensor.
    features = {
        field: torch.tensor(examples["input_ids"], dtype=torch.long)
        for field in ("input_ids", "labels")
    }
    features["attention_mask"] = torch.ones_like(features["input_ids"])
    return features

# Apply tokenize_function_formaksed to every example (map is called without batched=True,
# so the function receives one example at a time).
# NOTE(review): the MLM Trainer cell below currently passes `dataset` / `dataset_test`,
# not these *_masked datasets — confirm which pair is intended.
dataset_train_masked = dataset_train.map(tokenize_function_formaksed)
dataset_test_masked = dataset_test.map(tokenize_function_formaksed)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import RobertaForMaskedLM, RobertaTokenizer

from transformers import RobertaConfig


# Start from the published 'roberta-base' configuration.
config = RobertaConfig.from_pretrained('roberta-base')

# Shrink the vocabulary to 500 entries; this sizes the model's token embedding and MLM head.
config.vocab_size = 500

# Write the modified configuration to ./new_model_config.
config.save_pretrained('./new_model_config')

# Load the stock 'roberta-base' tokenizer.
# NOTE(review): this tokenizer's vocabulary (including its <mask> id) is presumably far
# larger than config.vocab_size = 500. The MLM data collator built from it would then
# write token ids the 500-entry embedding cannot index — confirm and, if so, supply a
# tokenizer/collator whose ids fit the model vocabulary.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Built from the config only (no from_pretrained), so no pretrained weights are loaded here.
model = RobertaForMaskedLM(config)

# Use the first CUDA device when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

## 3b Forward and Trainer

Masking the dataset

In [None]:
from transformers import DataCollatorForLanguageModeling

# Data Collator for Masked Language Modeling (MLM): masks tokens at batch time and
# generates the corresponding `labels`, replacing any `labels` column in the dataset.
# NOTE(review): masking uses ids from `tokenizer` (roberta-base); verify they are valid
# for the 500-entry model vocabulary configured above.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,  # Enable masked language modeling
    mlm_probability=0.15  # Default 15% masking probability
)

**Trainer**
```
#[TODO]: define data_collator (DataCollator, optional) to specify how a batch is assembled
#[TODO]: fill in compute_metrics (Callable[[EvalPrediction], Dict], optional); consider metrics other than accuracy
#[TODO]: optimizers (Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR], optional) and related arguments
```

**TrainingArguments**
- Possible parameters: https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/trainer#transformers.TrainingArguments


In [None]:
from transformers import TrainingArguments, Trainer

# Training arguments for the masked-LM run: evaluate and checkpoint once per epoch,
# keeping at most two checkpoints.
# NOTE(review): output_dir "results" is shared with the other Trainer cells in this
# notebook and overwrite_output_dir=True, so runs overwrite each other's outputs.
training_args = TrainingArguments(
    output_dir="results",  
    overwrite_output_dir=True,  # Overwrite previous model output
    evaluation_strategy="epoch",  
    save_strategy="epoch", 
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=5,  
    weight_decay=0.01,
    logging_dir="logs",  
    logging_steps=10,
    save_total_limit=2,  # Keep only the last 2 checkpoints to save space
    push_to_hub=False,  # Set True if you want to upload to Hugging Face Hub
    # report_to=None,  # NOTE(review): to disable reporting use the string "none"; None means the library default
)


# NOTE(review): `dataset` is defined in the Part 2 data cell, so this cell depends on
# that cell having run; the Part 3 `dataset_train` / `*_masked` datasets are not used here.
# The MLM data_collator supplies the labels for each batch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset_test,
    # train_dataset=dataset_train_masked,
    # eval_dataset=dataset_test_masked,
    # tokenizer=tokenizer,
    # compute_metrics=compute_metrics, 
    data_collator=data_collator,
)


In [None]:
# Run masked-LM training with the Trainer configured in the previous cell.
trainer.train()


## 3c  Masked LM Embedding + Downstream

Load the pretrained masked LM, then train it for the downstream task.

In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Load the tokenizer for `model_path`. The active value is the "roberta-base" identifier;
# the commented-out alternative is a path under the local "results" directory.
# NOTE(review): the section text says the pretrained masked LM from this notebook should
# be loaded here — confirm whether model_path should point at a saved checkpoint instead.
model_path = "roberta-base" #"results/rng_state.pth"
tokenizer = RobertaTokenizer.from_pretrained(model_path)

# Load RoBERTa with a 2-label sequence-classification head.
# NOTE(review): `model` is reassigned by the custom RobertaForTokenClassification cell
# below, so this instance appears to be unused — confirm and remove if so.
model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels=2)


In [None]:
import torch
import torch.nn as nn
from transformers import RobertaModel

class RobertaForTokenClassification(nn.Module):
    """RoBERTa encoder followed by a linear head applied at every token position.

    Args:
        model_path: Identifier or path handed to ``RobertaModel.from_pretrained``.
        num_labels: Number of classes predicted per token (default 2).
    """

    def __init__(self, model_path, num_labels=2):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(model_path)
        encoder_width = self.roberta.config.hidden_size
        # Maps each token's hidden state to num_labels class scores.
        self.classifier = nn.Linear(encoder_width, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return ``{"logits"}``, plus ``"loss"`` first when `labels` is given.

        The loss is the mean cross-entropy over all token positions after
        flattening logits to (N, num_labels) and labels to (N,).
        """
        encoded = self.roberta(input_ids, attention_mask=attention_mask)
        logits = self.classifier(encoded.last_hidden_state)

        # Inference path: no targets, so only the logits are returned.
        if labels is None:
            return {"logits": logits}

        per_token_logits = logits.view(-1, logits.size(-1))
        loss = nn.functional.cross_entropy(per_token_logits, labels.view(-1))
        return {"loss": loss, "logits": logits}

# Instantiate the custom token-classification model from `model_path` with two labels.
# This rebinds `model`, replacing the RobertaForSequenceClassification loaded above.
model = RobertaForTokenClassification(model_path, num_labels=2)


In [None]:
from transformers import TrainingArguments, Trainer

# Training arguments for the downstream token-classification run: evaluate and
# checkpoint once per epoch, keeping at most two checkpoints.
training_args = TrainingArguments(
    output_dir="results",
    overwrite_output_dir=True,  # Overwrite previous model output
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=10,
    save_total_limit=2,  # Keep only the last 2 checkpoints to save space
    push_to_hub=False,  # Set True if you want to upload to Hugging Face Hub
    # The string "none" is what turns reporting (wandb etc.) off. The Python value None
    # does not: transformers 4.x treats it as "all".
    report_to="none",
)

# No data_collator is passed, so the Trainer uses its default collator and the dataset's
# own `labels` reach the model unchanged. The MLM collator from Part 3b must not be
# reused here: it masks the inputs and overwrites `labels` with the original token ids
# of the masked positions, which are not valid targets for the 2-class head.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset_test,
    # tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
)
