In [56]:
%load_ext autoreload
%autoreload 2
from datasets import Dataset
from transformers import TrainingArguments, Trainer
from theorder import (
    LoBertModel,
    LoBertConfig,
    LoBertForMaskedLM,
    LoBertTokenizer,
    DataCollatorForMessageModeling
    )

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
# Location of the sample dataset on disk.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs elsewhere.
path_data_sample = "/workspaces/2025 LoBERT/data/LOBSTER_SampleFile_AAPL_2012-06-21_10/ArrowDataset"
# Load the dataset stored at that path.
ds = Dataset.load_from_disk(path_data_sample)

# Display the dataset summary (feature names and row count).
ds

Dataset({
    features: ['input_ids', 'time_ids', 'volume_ids'],
    num_rows: 3040
})

In [58]:
# Inspect the dataset metadata, in particular the dtype of each feature column.
ds.info

DatasetInfo(description='', citation='', homepage='', license='', features={'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'time_ids': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'volume_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)

In [59]:
# Hyperparameters for a deliberately small LoBERT variant (2 layers, 2 heads,
# hidden size 128) used for quick experiments in this notebook.
# NOTE(review): `vocab_size` and `max_position_embeddings` bound the embedding
# tables; every id fed to the model must be below them — confirm against the
# tokenizer and collator, since training later fails with an IndexError.
config_lobert_tiny = dict(
    hidden_size=128,
    intermediate_size=512,
    layer_norm_eps=1e-12,
    max_position_embeddings=512,
    model_type="lobert",
    num_attention_heads=2,
    num_hidden_layers=2,
    vocab_size=3200,
)
# Build the config object; settings not listed above keep their defaults.
config = LoBertConfig(**config_lobert_tiny)
# Display the resolved configuration.
config

LoBertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "lobert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3200
}

In [60]:
# Check whether the config defines an end-of-sequence token id
# (the cell shows no output, i.e. the value is presumably None).
config.eos_token_id

In [61]:
# Check whether the config defines a beginning-of-sequence token id
# (the cell shows no output, i.e. the value is presumably None).
config.bos_token_id

# Base LoBERT model

In [62]:
# Instantiate the base LoBERT model from the tiny config (freshly initialised
# weights — nothing is loaded from a checkpoint here).
base_model = LoBertModel(config)
# Report the parameter count as a quick size sanity check.
print(f"LoBERT model has {base_model.num_parameters():,.0f} parameters")
# Display the module tree.
base_model

LoBERT model has 889,216 parameters


LoBertModel(
  (embeddings): LoBertEmbeddings(
    (message_embeddings): Embedding(3200, 128, padding_idx=0)
    (time_embeddings): ValueProjectionEmbedding(
      (value_projection): Linear(in_features=1, out_features=128, bias=True)
    )
    (volume_embeddings): ValueProjectionEmbedding(
      (value_projection): Linear(in_features=1, out_features=128, bias=True)
    )
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-1): 2 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=128, out_features=128, bias=True)
            (key): Linear(in_features=128, out_features=128, bias=True)
            (value): Linear(in_features=128, out_features=128, bias=True)
            (dropout): Dropout(p=0.

In [63]:
# Take the first four rows, formatted as torch tensors, as a small batch for a
# forward-pass smoke test.
input_features = ds.with_format("torch")[:4]

In [64]:
# Forward pass of the base model on the sample batch; the dataset columns are
# passed through as keyword arguments.
# NOTE(review): the model is not switched to eval mode and gradients are not
# disabled here — presumably dropout is active, so outputs may differ between
# runs; confirm whether that is intended for this check.
base_model(**input_features)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-1.6157e+00, -1.9908e-01, -2.1255e-01,  ..., -2.7877e-01,
          -1.4077e+00, -1.1687e+00],
         [-1.3132e+00, -3.6295e-01,  1.1769e-01,  ...,  1.4646e+00,
          -1.6474e+00, -7.3820e-01],
         [-1.2402e+00, -3.0582e-01,  1.1329e-01,  ...,  1.6507e+00,
          -1.6379e+00, -6.8953e-01],
         ...,
         [-8.2495e-01,  4.1337e-02, -2.9983e-01,  ..., -1.8727e+00,
          -2.7236e-01, -9.9052e-01],
         [-8.4577e-01, -5.1273e-02,  5.6863e-02,  ..., -1.9961e+00,
          -3.1062e-01, -9.8232e-01],
         [-9.3413e-01,  1.5333e-01,  1.5451e-01,  ..., -2.0823e+00,
          -2.8003e-01, -1.0254e+00]],

        [[-1.5022e+00,  2.3214e-01,  1.8370e-01,  ..., -2.2086e+00,
          -2.3672e-01, -8.8510e-01],
         [-1.2335e+00,  8.2390e-01, -8.1799e-01,  ..., -4.3834e-01,
          -2.9417e-01,  6.1280e-01],
         [-2.0858e-03, -8.4708e-02, -3.3627e-01,  ..., -2.0478e+00,
          -2.

# Masked message modeling

In [65]:
# Instantiate the masked-message-modeling variant from the same tiny config
# (freshly initialised weights).
mmmodel = LoBertForMaskedLM(config)
# Display the module tree.
mmmodel

LoBertForMaskedLM(
  (bert): LoBertModel(
    (embeddings): LoBertEmbeddings(
      (message_embeddings): Embedding(3200, 128, padding_idx=0)
      (time_embeddings): ValueProjectionEmbedding(
        (value_projection): Linear(in_features=1, out_features=128, bias=True)
      )
      (volume_embeddings): ValueProjectionEmbedding(
        (value_projection): Linear(in_features=1, out_features=128, bias=True)
      )
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_feature

In [66]:
# Default-constructed tokenizer, used by the collator below.
tokenizer = LoBertTokenizer()
# Collator that prepares training batches with mlm=True.
# NOTE(review): presumably this masks/replaces input ids using the tokenizer's
# special tokens and vocabulary — verify those ids stay below config.vocab_size.
data_collator = DataCollatorForMessageModeling(tokenizer=tokenizer, mlm=True)


In [73]:
# Training configuration: a single epoch, with outputs written to the models
# directory; every other argument keeps its default value.
args = TrainingArguments(
    output_dir="/workspaces/2025 LoBERT/models",
    num_train_epochs=1,
)

In [74]:
# Hold out 10% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same train/test partition (without a seed
# the shuffle differs on every run).
# The variable name is kept as-is because later cells reference it.
tokenized_dataset = ds.train_test_split(test_size=0.1, seed=42)

In [75]:
# Assemble the Trainer for masked message modeling.
# The tokenizer is passed via `processing_class`: the `tokenizer=` keyword is
# deprecated in the installed transformers version (4.48.2) and triggers a
# FutureWarning on every construction.
trainer = Trainer(
    model=mmmodel,
    processing_class=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

  trainer = Trainer(


In [76]:
# Run the training loop.
# FIXME(review): this cell currently fails with
# "IndexError: index out of range in self", which typically comes from an
# embedding lookup receiving an index outside its table. Presumably a collated
# batch contains an id >= config.vocab_size (e.g. a mask/special token or a
# random replacement id) or a sequence longer than
# config.max_position_embeddings — inspect one batch from `data_collator` to
# confirm before re-running.
trainer.train()

IndexError: index out of range in self