In [12]:
%load_ext autoreload
%autoreload 2
from datasets import Dataset
from transformers import BertForMaskedLM
from theorder import LoBertModel, LoBertConfig, LoBertForMaskedLM


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
path_data_sample = "/workspaces/2025 LoBERT/data/LOBSTER_SampleFile_AAPL_2012-06-21_10/ArrowDataset"
ds = Dataset.load_from_disk(path_data_sample)

ds

Dataset({
    features: ['input_ids', 'time_ids', 'volume_ids'],
    num_rows: 3040
})

In [14]:
ds.info

DatasetInfo(description='', citation='', homepage='', license='', features={'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'time_ids': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'volume_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)

In [15]:
config_lobert_tiny = {
    "hidden_size": 128,
    "intermediate_size": 512,
    "layer_norm_eps": 1e-12,
    "max_position_embeddings": 512,
    "model_type": "lobert",
    "num_attention_heads": 2,
    "num_hidden_layers": 2,
    "vocab_size": 3200
}
config = LoBertConfig(**config_lobert_tiny)
config

LoBertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "lobert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3200
}

In [21]:
config.eos_token_id

In [22]:
config.bos_token_id

# Base LoBERT model

In [16]:
base_model = LoBertModel(config)
print(f"LoBERT model has {base_model.num_parameters():,.0f} parameters")
base_model

LoBERT model has 889,216 parameters


LoBertModel(
  (embeddings): LoBertEmbeddings(
    (message_embeddings): Embedding(3200, 128, padding_idx=0)
    (time_embeddings): ValueProjectionEmbedding(
      (value_projection): Linear(in_features=1, out_features=128, bias=True)
    )
    (volume_embeddings): ValueProjectionEmbedding(
      (value_projection): Linear(in_features=1, out_features=128, bias=True)
    )
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-1): 2 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=128, out_features=128, bias=True)
            (key): Linear(in_features=128, out_features=128, bias=True)
            (value): Linear(in_features=128, out_features=128, bias=True)
            (dropout): Dropout(p=0.

In [17]:
input_features = ds.with_format("torch")[:4]

In [18]:
base_model(**input_features)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.4504,  1.1949,  1.2456,  ..., -1.0610, -0.1199, -0.6303],
         [ 0.7369,  0.9356,  0.7063,  ..., -0.6261,  0.2679, -0.8705],
         [ 0.8703,  1.0277,  0.6671,  ..., -0.5746,  0.0114, -0.8967],
         ...,
         [ 0.1019,  0.8035, -0.0128,  ..., -1.0803, -0.0914, -0.1595],
         [ 0.1433,  0.6966,  1.1382,  ..., -1.1918,  0.0935, -0.1655],
         [ 0.0953,  0.5953,  1.0202,  ..., -1.2073, -0.1533, -0.0633]],

        [[ 0.0118,  0.7161,  1.2969,  ..., -0.9773, -0.2765, -0.0415],
         [ 0.3487,  0.7441,  1.2599,  ..., -0.8139,  0.0673, -0.1321],
         [ 0.1388,  0.6827,  1.0154,  ..., -1.0839, -0.0251, -0.1003],
         ...,
         [ 0.6784,  1.1624,  0.9364,  ..., -0.0115, -0.0384, -0.6741],
         [ 0.7300, -0.1894,  0.5839,  ..., -0.4427,  0.0804, -0.8113],
         [ 0.7470,  1.0561,  0.8367,  ..., -0.7418,  0.0226, -0.7400]],

        [[ 0.6761,  0.8791,  0.5708,  ..., -0.4479,  

# Masked message modelling

In [20]:
mmmodel = LoBertForMaskedLM(config)
mmmodel

LoBertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


LoBertForMaskedLM(
  (bert): LoBertModel(
    (embeddings): LoBertEmbeddings(
      (message_embeddings): Embedding(3200, 128, padding_idx=0)
      (time_embeddings): ValueProjectionEmbedding(
        (value_projection): Linear(in_features=1, out_features=128, bias=True)
      )
      (volume_embeddings): ValueProjectionEmbedding(
        (value_projection): Linear(in_features=1, out_features=128, bias=True)
      )
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_feature

In [23]:
from theorder import DataCollatorForLanguageModeling

[autoreload of theorder failed: Traceback (most recent call last):
  File "/opt/conda/envs/lob001/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/opt/conda/envs/lob001/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
             ^^^^^^^^^^^^^^
  File "/opt/conda/envs/lob001/lib/python3.11/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 621, in _exec
  File "<frozen importlib._bootstrap_external>", line 940, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/workspaces/2025 LoBERT/src/package/theorder/__init__.py", line 4, in <module>
    from .data import DataCollatorForMessageModeling
  File "/workspaces/2025 LoBERT/src/package/theorder/data/__init__.py", line 1, in <module>
    from .data_collator impor

ImportError: cannot import name 'DataCollatorForLanguageModeling' from 'theorder' (/workspaces/2025 LoBERT/src/package/theorder/__init__.py)