In [1]:
import torchinfo


In [2]:
from transformers import RobertaTokenizer, RobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
torchinfo.summary(model,shape=(4,100))

Layer (type:depth-idx)                                  Param #
RobertaModel                                            --
├─RobertaEmbeddings: 1-1                                --
│    └─Embedding: 2-1                                   38,603,520
│    └─Embedding: 2-2                                   394,752
│    └─Embedding: 2-3                                   768
│    └─LayerNorm: 2-4                                   1,536
│    └─Dropout: 2-5                                     --
├─RobertaEncoder: 1-2                                   --
│    └─ModuleList: 2-6                                  --
│    │    └─RobertaLayer: 3-1                           7,087,872
│    │    └─RobertaLayer: 3-2                           7,087,872
│    │    └─RobertaLayer: 3-3                           7,087,872
│    │    └─RobertaLayer: 3-4                           7,087,872
│    │    └─RobertaLayer: 3-5                           7,087,872
│    │    └─RobertaLayer: 3-6                           7,

In [4]:
for name, module in model.named_modules():
    print(name, module.__class__)

 <class 'transformers.models.roberta.modeling_roberta.RobertaModel'>
embeddings <class 'transformers.models.roberta.modeling_roberta.RobertaEmbeddings'>
embeddings.word_embeddings <class 'torch.nn.modules.sparse.Embedding'>
embeddings.position_embeddings <class 'torch.nn.modules.sparse.Embedding'>
embeddings.token_type_embeddings <class 'torch.nn.modules.sparse.Embedding'>
embeddings.LayerNorm <class 'torch.nn.modules.normalization.LayerNorm'>
embeddings.dropout <class 'torch.nn.modules.dropout.Dropout'>
encoder <class 'transformers.models.roberta.modeling_roberta.RobertaEncoder'>
encoder.layer <class 'torch.nn.modules.container.ModuleList'>
encoder.layer.0 <class 'transformers.models.roberta.modeling_roberta.RobertaLayer'>
encoder.layer.0.attention <class 'transformers.models.roberta.modeling_roberta.RobertaAttention'>
encoder.layer.0.attention.self <class 'transformers.models.roberta.modeling_roberta.RobertaSelfAttention'>
encoder.layer.0.attention.self.query <class 'torch.nn.modules

In [5]:
from minmonarch.modular_monarch import inject_trainable_monarch, monkeypatch_remove_lora
import torch

model = RobertaModel.from_pretrained('roberta-base')
model.requires_grad_(False)

# Set OFT parameters
oft_r=4
oft_eps=1e-3
oft_coft=False
oft_block_share=False
normalize=False
search_class=[torch.nn.Linear] # Default is only nn.Linear, but you can also pass nn.Conv2d

# Set training and optimization parameters
learning_rate=2e-5
weight_decay=0.01
beta1 = 0.9
beta2 = 0.95

# Replace modules with trainable OFT linear modules
ft_modules = ["RobertaAttention"] # Modules will be specific to your model, but you can target any number of them
oft_params, train_names = inject_trainable_monarch(
                              model, 
                              num_blocks=4,
                              adapt=True,
                              verbose=True, 
                              target_replace_module = ft_modules
                          )
# Set optimizer
optim_groups = [
    {
        "params": oft_params,
        "weight_decay": weight_decay
    }
]
optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(beta1, beta2))

for _ in range(5):
    optimizer.zero_grad()
    loss = torch.sum( model(torch.zeros(4,100, dtype=torch.long) ).last_hidden_state ) # trash loss, just for toy example backprop
    loss.backward()
    optimizer.step()
    print(loss.item())

print(len(oft_params))

[2024-04-24 19:30:55,374] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Monarch Injection : injecting monarch into  query
Monarch Injection : weight shape torch.Size([768, 768])
Monarch Injection : injecting monarch into  key
Monarch Injection : weight shape torch.Size([768, 768])
Monarch Injection : injecting monarch into  value
Monarch Injection : weight shape torch.Size([768, 768])
Monarch Injection : injecting monarch into  dense
Monarch Injection : weight shape torch.Size([768, 768])
Monarch Injection : injecting monarch into  query
Monarch Injection : weight shape torch.Size([768, 768])
Monarch Injection : injecting monarch into  key
Monarch Injection : weight shape torch.Size([768, 768])
Monarch Injection : injecting monarch into  value
Monarch Injection : weight shape torch.Size([768, 768])
Monarch Injection : injecting monarch into  dense
Monarch Injection : weight shape torch.Size([768, 768])
Monarch Injection : injecting monarch into  query
Monarch Injection : weight shape torch.Size([768, 768])
Monarch Injection : injecting monarch into  key
Mo

In [6]:
for name, module in model.named_modules():
    print(name, module.__class__)

 <class 'transformers.models.roberta.modeling_roberta.RobertaModel'>
embeddings <class 'transformers.models.roberta.modeling_roberta.RobertaEmbeddings'>
embeddings.word_embeddings <class 'torch.nn.modules.sparse.Embedding'>
embeddings.position_embeddings <class 'torch.nn.modules.sparse.Embedding'>
embeddings.token_type_embeddings <class 'torch.nn.modules.sparse.Embedding'>
embeddings.LayerNorm <class 'torch.nn.modules.normalization.LayerNorm'>
embeddings.dropout <class 'torch.nn.modules.dropout.Dropout'>
encoder <class 'transformers.models.roberta.modeling_roberta.RobertaEncoder'>
encoder.layer <class 'torch.nn.modules.container.ModuleList'>
encoder.layer.0 <class 'transformers.models.roberta.modeling_roberta.RobertaLayer'>
encoder.layer.0.attention <class 'transformers.models.roberta.modeling_roberta.RobertaAttention'>
encoder.layer.0.attention.self <class 'transformers.models.roberta.modeling_roberta.RobertaSelfAttention'>
encoder.layer.0.attention.self.query <class 'minmonarch.modul

In [7]:
torchinfo.summary(model,shape=(4,100))

Layer (type:depth-idx)                                       Param #
RobertaModel                                                 --
├─RobertaEmbeddings: 1-1                                     --
│    └─Embedding: 2-1                                        (38,603,520)
│    └─Embedding: 2-2                                        (394,752)
│    └─Embedding: 2-3                                        (768)
│    └─LayerNorm: 2-4                                        (1,536)
│    └─Dropout: 2-5                                          --
├─RobertaEncoder: 1-2                                        --
│    └─ModuleList: 2-6                                       --
│    │    └─RobertaLayer: 3-1                                8,267,520
│    │    └─RobertaLayer: 3-2                                8,267,520
│    │    └─RobertaLayer: 3-3                                8,267,520
│    │    └─RobertaLayer: 3-4                                8,267,520
│    │    └─RobertaLayer: 3-5                 

In [8]:
monkeypatch_remove_lora(model) # TODO: this doesn't seem to do anything for monarch yet

In [9]:
for name, module in model.named_modules():
    print(name, module.__class__)

 <class 'transformers.models.roberta.modeling_roberta.RobertaModel'>
embeddings <class 'transformers.models.roberta.modeling_roberta.RobertaEmbeddings'>
embeddings.word_embeddings <class 'torch.nn.modules.sparse.Embedding'>
embeddings.position_embeddings <class 'torch.nn.modules.sparse.Embedding'>
embeddings.token_type_embeddings <class 'torch.nn.modules.sparse.Embedding'>
embeddings.LayerNorm <class 'torch.nn.modules.normalization.LayerNorm'>
embeddings.dropout <class 'torch.nn.modules.dropout.Dropout'>
encoder <class 'transformers.models.roberta.modeling_roberta.RobertaEncoder'>
encoder.layer <class 'torch.nn.modules.container.ModuleList'>
encoder.layer.0 <class 'transformers.models.roberta.modeling_roberta.RobertaLayer'>
encoder.layer.0.attention <class 'transformers.models.roberta.modeling_roberta.RobertaAttention'>
encoder.layer.0.attention.self <class 'transformers.models.roberta.modeling_roberta.RobertaSelfAttention'>
encoder.layer.0.attention.self.query <class 'torch.nn.modules