## How to sparsify a PyTorch model

In [2]:
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from pytorch_block_sparse import BlockSparseModelPatcher
import re

# Hyper-parameters of the RoBERTa model used throughout this notebook:
# 6 hidden layers, 12 attention heads and a 52k-entry vocabulary.
roberta_settings = dict(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
config = RobertaConfig(**roberta_settings)

# Build the masked-LM model straight from the config (no pretrained
# checkpoint is loaded here), then move it to the GPU.
model = RobertaForMaskedLM(config=config)
model = model.cuda()

# Baseline size before any sparsification: about 84 million parameters.
print(f"Initial model parameters count={model.num_parameters()}")
 

  from .autonotebook import tqdm as notebook_tqdm


Initial model parameters count=83504416


In [3]:
# The patcher is the object that will later swap selected layers of the model.
mp = BlockSparseModelPatcher()

# Ask the patcher which layers it can patch. Each entry carries the layer
# name already escaped as a regexp (usable with mp.add_pattern()) together
# with the layer object itself.
patchables = mp.get_patchable_layers(model)

# For readability, collapse every layer index into a "[0-9]+" regexp so the
# per-layer names fold into one line each, and print every distinct pattern
# once. This is a bit Roberta-specific but should suit most transformers.
layer_index = re.compile(r'[0-9]+')
dedup_layers = []

for entry in patchables:
    r = layer_index.sub('[0-9]+', entry["regexp"])
    if r in dedup_layers:
        # This pattern was already reported for an earlier layer index.
        continue
    dedup_layers.append(r)
    layer = entry['layer']
    print(f"{r}\n   => {layer.in_features}x{layer.out_features}, bias={layer.bias is not None}")

roberta\.encoder\.layer\.[0-9]+\.attention\.self\.query
   => 768x768, bias=True
roberta\.encoder\.layer\.[0-9]+\.attention\.self\.key
   => 768x768, bias=True
roberta\.encoder\.layer\.[0-9]+\.attention\.self\.value
   => 768x768, bias=True
roberta\.encoder\.layer\.[0-9]+\.attention\.output\.dense
   => 768x768, bias=True
roberta\.encoder\.layer\.[0-9]+\.intermediate\.dense
   => 768x3072, bias=True
roberta\.encoder\.layer\.[0-9]+\.output\.dense
   => 3072x768, bias=True
lm_head\.dense
   => 768x768, bias=True
lm_head\.decoder
   => 768x52000, bias=True


In [4]:


# Selecting some layers to sparsify.
# This is the "artful" part: some layers tolerate sparsification well, while
# sparsifying others may hurt model precision too much.

# Layers are matched by regexp. The patterns are written as raw strings so
# that "\." reaches the regexp engine as an escaped literal dot; in a normal
# string literal "\." is an invalid escape sequence that Python warns about.
# The [0-9]+ part matches any layer number.
sparse_layer_patterns = (
    r"roberta\.encoder\.layer\.[0-9]+\.intermediate\.dense",
    r"roberta\.encoder\.layer\.[0-9]+\.output\.dense",
    r"roberta\.encoder\.layer\.[0-9]+\.attention\.output\.dense",
)

# We set a density of 0.5 on these layers; you can test other layers / densities.
density = 0.5
for pattern in sparse_layer_patterns:
    mp.add_pattern(pattern, {"density": density})

# Apply every registered pattern to the model.
mp.patch_model(model)

print(f"Final model parameters count={model.num_parameters()}")

# => 68 million parameters instead of 84 million parameters (embeddings are taking a lot of space in Roberta)

Patching 'roberta.encoder.layer.0.attention.output.dense' with density=0.5, in=768, out=768,bias=True 
Patching 'roberta.encoder.layer.0.intermediate.dense' with density=0.5, in=768, out=3072,bias=True 
Patching 'roberta.encoder.layer.0.output.dense' with density=0.5, in=3072, out=768,bias=True 
Patching 'roberta.encoder.layer.1.attention.output.dense' with density=0.5, in=768, out=768,bias=True 
Patching 'roberta.encoder.layer.1.intermediate.dense' with density=0.5, in=768, out=3072,bias=True 
Patching 'roberta.encoder.layer.1.output.dense' with density=0.5, in=3072, out=768,bias=True 
Patching 'roberta.encoder.layer.2.attention.output.dense' with density=0.5, in=768, out=768,bias=True 
Patching 'roberta.encoder.layer.2.intermediate.dense' with density=0.5, in=768, out=3072,bias=True 
Patching 'roberta.encoder.layer.2.output.dense' with density=0.5, in=3072, out=768,bias=True 
Patching 'roberta.encoder.layer.3.attention.output.dense' with density=0.5, in=768, out=768,bias=True 
Patchi