In [9]:
%load_ext autoreload
%autoreload 2
from datasets import Dataset
from transformers import BertForMaskedLM
from theorder import LoBertModel, LoBertConfig, LoBertForMaskedLM


In [10]:
path_data_sample = "/workspaces/2025 LoBERT/data/LOBSTER_SampleFile_AAPL_2012-06-21_10/ArrowDataset"
ds = Dataset.load_from_disk(path_data_sample)

ds

Dataset({
    features: ['input_ids', 'time_ids', 'volume_ids'],
    num_rows: 3040
})

In [11]:
ds.info

DatasetInfo(description='', citation='', homepage='', license='', features={'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'time_ids': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'volume_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)

In [12]:
config_lobert_tiny = {
    "hidden_size": 128,
    "intermediate_size": 512,
    "layer_norm_eps": 1e-12,
    "max_position_embeddings": 512,
    "model_type": "lobert",
    "num_attention_heads": 2,
    "num_hidden_layers": 2,
    "vocab_size": 3200
}
config = LoBertConfig(**config_lobert_tiny)
config

LoBertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "lobert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3200
}

In [13]:
config.eos_token_id

In [14]:
config.bos_token_id

# Base LoBERT model

In [15]:
base_model = LoBertModel(config)
print(f"LoBERT model has {base_model.num_parameters():,.0f} parameters")
base_model

LoBERT model has 889,216 parameters


LoBertModel(
  (embeddings): LoBertEmbeddings(
    (message_embeddings): Embedding(3200, 128, padding_idx=0)
    (time_embeddings): ValueProjectionEmbedding(
      (value_projection): Linear(in_features=1, out_features=128, bias=True)
    )
    (volume_embeddings): ValueProjectionEmbedding(
      (value_projection): Linear(in_features=1, out_features=128, bias=True)
    )
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-1): 2 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=128, out_features=128, bias=True)
            (key): Linear(in_features=128, out_features=128, bias=True)
            (value): Linear(in_features=128, out_features=128, bias=True)
            (dropout): Dropout(p=0.

In [16]:
input_features = ds.with_format("torch")[:4]

In [17]:
base_model(**input_features)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.7275, -0.1028, -0.1403,  ..., -1.9343, -0.2860, -0.3965],
         [ 1.1002,  2.4706,  0.7439,  ..., -2.0784,  0.0186, -1.0983],
         [ 1.3703,  3.0149,  1.0603,  ..., -2.1795,  0.1434, -1.2215],
         ...,
         [-0.3715, -0.8666, -1.3562,  ..., -0.5026, -0.7140,  0.7688],
         [-0.2333, -0.8788, -1.1629,  ..., -0.4724, -0.5513,  0.6603],
         [-0.1844, -0.8188, -1.2360,  ..., -0.4445, -0.5595,  0.7353]],

        [[-0.1977, -0.6423, -1.1199,  ..., -0.2547, -0.7114,  0.7132],
         [-0.1733,  0.9481,  0.0073,  ...,  0.8994, -1.1154,  0.8086],
         [ 0.1012, -0.8389,  0.1236,  ..., -0.5018, -0.6408,  0.7601],
         ...,
         [ 0.0247,  1.5670,  0.1220,  ..., -1.9427, -0.4349, -0.4982],
         [ 1.2271,  2.6801,  0.9897,  ..., -2.0800,  0.0821, -1.1131],
         [ 1.0642,  2.4116,  0.5120,  ..., -2.0707, -0.0940, -1.0699]],

        [[ 0.0841,  2.6060,  0.9784,  ..., -1.9265,  

# Masked message modelling

In [18]:
mmmodel = LoBertForMaskedLM(config)
mmmodel

LoBertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From ðŸ‘‰v4.50ðŸ‘ˆ onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


LoBertForMaskedLM(
  (bert): LoBertModel(
    (embeddings): LoBertEmbeddings(
      (message_embeddings): Embedding(3200, 128, padding_idx=0)
      (time_embeddings): ValueProjectionEmbedding(
        (value_projection): Linear(in_features=1, out_features=128, bias=True)
      )
      (volume_embeddings): ValueProjectionEmbedding(
        (value_projection): Linear(in_features=1, out_features=128, bias=True)
      )
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_feature

In [19]:
from theorder import DataCollatorForMessageModeling

In [20]:
from dataclasses import dataclass

@dataclass
class LoBertTokenizer:
    mask_token: str = "[MASK]"

tokenizer = LoBertTokenizer()

In [21]:
data_collator = DataCollatorForMessageModeling(tokenizer=tokenizer, mlm=True)

Dataset({
    features: ['input_ids', 'time_ids', 'volume_ids'],
    num_rows: 3040
})

In [27]:
out = data_collator([ds[i] for i in range(4)])

AttributeError: 'LoBertTokenizer' object has no attribute 'pad'