# SQuAD v1 inference with a pruned network

In [None]:
from transformers import pipeline
from nn_pruning.inference_model_patcher import optimize_model

# Checkpoint used throughout this cell. Author's description of it:
# BERT-base uncased finetuned on SQuAD, speedup is 2.44, F1=87.7, 26% of linear layer parameters remaining,
# with hybrid-pruning + final fill -> dense matrices
MODEL_NAME = "madlag/bert-base-uncased-squadv1-x2.44-f87.7-d26-hybrid-filled-v1"

# Build a question-answering pipeline backed by that checkpoint.
qa_pipeline = pipeline(
    "question-answering",
    model=MODEL_NAME,
)

# Original BERT-base size: a hard-coded reference figure (not measured here),
# used as the baseline for the reduction ratio printed further down.
original_bert = 110E6
print(f"BERT-base parameters: {original_bert/1E6:0.1f}M")

# Optimize the model: this just removes the empty parts of the model (lines/columns), as we
# cannot currently store the shrunk version on disk in a huggingface transformers compatible format.
# The pipeline's model is replaced by whatever optimize_model returns.
qa_pipeline.model = optimize_model(qa_pipeline.model, "dense")

# Check the new size: total parameter count reported by the optimized model,
# then the ratio of the BERT-base reference figure to that count.
new_count = int(qa_pipeline.model.num_parameters())
print(f"Parameters count after optimization={new_count / 1E6:0.1f}M")
print(f"Reduction of the total number of parameters compared to BERT-base:{original_bert / new_count:0.2f}X")

# Use the model as usual, it's just 2.44X faster!
# (The speedup is the figure advertised for this checkpoint; nothing is timed in this cell.)
predictions = qa_pipeline({
    'context': "Frédéric François Chopin, born Fryderyk Franciszek Chopin (1 March 1810 – 17 October 1849), was a Polish composer and virtuoso pianist of the Romantic era who wrote primarily for solo piano.",
    'question': "Who is Frederic Chopin?",
})
# Blank line to separate the prediction from the output printed above.
print()
print("Predictions", predictions)

Some weights of BertModel were not initialized from the model checkpoint at madlag/bert-base-uncased-squadv1-x2.44-f87.7-d26-hybrid-filled-v1 and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT-base parameters: 110.0M
removed heads 0, total_heads=64, percentage removed=0.0
bert.encoder.layer.0.intermediate.dense, sparsity = 82.03
bert.encoder.layer.0.output.dense, sparsity = 82.03
bert.encoder.layer.1.intermediate.dense, sparsity = 76.66
bert.encoder.layer.1.output.dense, sparsity = 76.66
bert.encoder.layer.2.intermediate.dense, sparsity = 74.12
bert.encoder.layer.2.output.dense, sparsity = 74.12
bert.encoder.layer.3.intermediate.dense, sparsity = 74.32
bert.encoder.layer.3.output.dense, sparsity = 74.32
bert.encoder.layer.4.intermediate.dense, sparsity = 72.88
bert.encoder.layer.4.output.dense, sparsity = 72.88
bert.encoder.layer.5.intermediate.dense, sparsity = 74.22
bert.encoder.layer.5.output.dense, sparsity = 74.22
bert.encoder.layer.6.intermediate.dense, sparsity = 79.82
bert.encoder.layer.6.output.dense, sparsity = 79.82
bert.encoder.layer.7.intermediate.dense, sparsity = 84.83
bert.encoder.layer.7.output.dense, sparsity = 84.83
bert.encoder.layer.8.intermediate.d

# Checking the size of the model's linear layers

In [None]:
from transformers import AutoModelForQuestionAnswering
from nn_pruning.inference_model_patcher import optimize_model

# Load the pruned question-answering checkpoint directly (no pipeline wrapper)
# so that its parameters can be inspected before and after optimization.
model = AutoModelForQuestionAnswering.from_pretrained("madlag/bert-base-uncased-squadv1-x2.44-f87.7-d26-hybrid-filled-v1")


def compute_size(model):
    """Return the total element count of the encoder's weight tensors.

    A parameter is counted when its name contains both "encoder" and
    "weight" and does not contain "LayerNorm"; biases, LayerNorm parameters
    and everything outside the encoder are therefore left out.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter has ``numel()``.

    Returns:
        The sum of ``numel()`` over the selected parameters (0 if none match).
    """
    def is_counted(name):
        # Same three substring tests as the selection rule described above.
        return "encoder" in name and "weight" in name and "LayerNorm" not in name

    return sum(
        param.numel()
        for name, param in model.named_parameters()
        if is_counted(name)
    )

# Encoder weight element count of the checkpoint as loaded, i.e. before optimization.
original_count = compute_size(model)

# Optimize the model
# NOTE(review): clone=False presumably patches `model` in place instead of copying it —
# confirm against nn_pruning. Either way, `model` is rebound to the returned object.
model = optimize_model(model, mode="dense", clone=False)

# Same measurement on the optimized model.
new_count = compute_size(model)

# Blank line to separate the ratio from the output printed above.
print()
print(f"Reduction of linear layers:{original_count / new_count:0.2f}X")


removed heads 0, total_heads=64, percentage removed=0.0
bert.encoder.layer.0.intermediate.dense, sparsity = 82.03
bert.encoder.layer.0.output.dense, sparsity = 82.03
bert.encoder.layer.1.intermediate.dense, sparsity = 76.66
bert.encoder.layer.1.output.dense, sparsity = 76.66
bert.encoder.layer.2.intermediate.dense, sparsity = 74.12
bert.encoder.layer.2.output.dense, sparsity = 74.12
bert.encoder.layer.3.intermediate.dense, sparsity = 74.32
bert.encoder.layer.3.output.dense, sparsity = 74.32
bert.encoder.layer.4.intermediate.dense, sparsity = 72.88
bert.encoder.layer.4.output.dense, sparsity = 72.88
bert.encoder.layer.5.intermediate.dense, sparsity = 74.22
bert.encoder.layer.5.output.dense, sparsity = 74.22
bert.encoder.layer.6.intermediate.dense, sparsity = 79.82
bert.encoder.layer.6.output.dense, sparsity = 79.82
bert.encoder.layer.7.intermediate.dense, sparsity = 84.83
bert.encoder.layer.7.output.dense, sparsity = 84.83
bert.encoder.layer.8.intermediate.dense, sparsity = 90.79
bert.e

In [None]:
def print_sizes(model, hidden_size=768, intermediate_size=None):
    """Print, layer by layer, the reference shape and the actual shape of each encoder parameter.

    Every parameter whose name contains "encoder" and does not contain
    "LayerNorm" produces one line of the form
    ``<reference shape> => <actual shape>, <parameter name>``, where the name
    has its first four dot-separated components removed. A ``Layer <index>``
    heading (preceded by a blank line) is printed each time the layer index,
    taken from the fourth name component, changes.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter has ``shape``.
            Names are expected to look like
            ``bert.encoder.layer.<index>.<rest>``.
        hidden_size: reference hidden width. The default, 768, is BERT-base.
        intermediate_size: reference feed-forward width. Defaults to
            ``4 * hidden_size`` (3072 for BERT-base).

    Raises:
        ValueError: if a selected parameter is neither a weight nor a bias.
    """
    if intermediate_size is None:
        intermediate_size = 4 * hidden_size

    current_layer_index = None
    for full_name, param in model.named_parameters():
        if "encoder" not in full_name or "LayerNorm" in full_name:
            continue

        parts = full_name.split(".")
        layer_index = parts[3]
        if layer_index != current_layer_index:
            print(f"\nLayer {layer_index}")
            current_layer_index = layer_index

        # Name relative to the layer, e.g. "attention.self.query.weight".
        short_name = ".".join(parts[4:])
        is_attention = "attention" in short_name
        is_intermediate = "intermediate" in short_name

        if "weight" in short_name:
            if is_attention:
                th_size = [hidden_size, hidden_size]
            elif is_intermediate:
                th_size = [intermediate_size, hidden_size]
            else:
                th_size = [hidden_size, intermediate_size]
        elif "bias" in short_name:
            # A bias has one element per output feature, so the intermediate
            # dense layer's bias is intermediate_size wide. Every other bias
            # here belongs to a layer that outputs hidden_size features.
            if is_intermediate and not is_attention:
                th_size = [intermediate_size]
            else:
                th_size = [hidden_size]
        else:
            raise ValueError("unsupported case")
        print(f"{th_size} => {list(param.shape)}, {short_name}")

# Column legend for the rows printed by print_sizes, then the per-layer report.
# `model` is a kernel global; it is expected to be the optimized model left by the earlier cell.
print("BERT-base Size, Model Size, Layer Name")
print_sizes(model)

BERT-base Size, Model Size, Layer Name

Layer 0
[768, 768] => [256, 768], attention.self.query.weight
[768] => [256], attention.self.query.bias
[768, 768] => [256, 768], attention.self.key.weight
[768] => [256], attention.self.key.bias
[768, 768] => [256, 768], attention.self.value.weight
[768] => [256], attention.self.value.bias
[768, 768] => [768, 256], attention.output.dense.weight
[768] => [768], attention.output.dense.bias
[3072, 768] => [552, 768], intermediate.dense.weight
[768] => [552], intermediate.dense.bias
[768, 3072] => [768, 552], output.dense.weight
[768] => [768], output.dense.bias

Layer 1
[768, 768] => [256, 768], attention.self.query.weight
[768] => [256], attention.self.query.bias
[768, 768] => [256, 768], attention.self.key.weight
[768] => [256], attention.self.key.bias
[768, 768] => [256, 768], attention.self.value.weight
[768] => [256], attention.self.value.bias
[768, 768] => [768, 256], attention.output.dense.weight
[768] => [768], attention.output.dense.bias
[