In [3]:
import torch
from torchinfo import summary
from transformers import AutoModelForSequenceClassification


# DistilBERT checkpoint fine-tuned on SST-2; its stored classifier has 2 outputs.
checkpoint = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

# Request a 7-label head. ignore_mismatched_sizes=True lets loading continue
# even though the checkpoint's classifier shapes differ; the mismatched
# classifier weights are newly initialized (see the warning printed below).
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=7,
    ignore_mismatched_sizes=True,
)

# Random integer tensor of shape (8, 128) used as the model input so that
# torchinfo can trace shapes — presumably (batch, sequence length) token ids.
dummy_input_ids = torch.randint(1, 30000, (8, 128))

# Keep the summary call as the last expression so the table is displayed.
summary(
    model,
    input_data=dummy_input_ids,
    col_names=["input_size", "output_size", "num_params", "mult_adds", "trainable"],
    row_settings=["depth", "var_names"],
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Layer (type (var_name):depth-idx)                                           Input Shape               Output Shape              Param #                   Mult-Adds                 Trainable
DistilBertForSequenceClassification (DistilBertForSequenceClassification)   [8, 128]                  [8, 7]                    --                        --                        True
├─DistilBertModel (distilbert): 1-1                                         --                        [8, 128, 768]             --                        --                        True
│    └─Embeddings (embeddings): 2-1                                         [8, 128]                  [8, 128, 768]             --                        --                        True
│    │    └─Embedding (word_embeddings): 3-1                                [8, 128]                  [8, 128, 768]             23,440,896                187,527,168               True
│    │    └─Embedding (position_embeddings): 3-2                      

I want to freeze the whole model except for the classifier and see whether training only that layer is enough. The model summary helped me find the name of the last layer (`classifier`).

In [2]:
# Switch gradients off for every parameter of the model ...
for param in model.parameters():
    param.requires_grad_(False)
# ... then switch them back on for the parameters of `classifier` only.
for head_param in model.classifier.parameters():
    head_param.requires_grad_(True)

# Summarize again so the "Trainable" column reflects the freeze.
summary(
    model,
    input_data=torch.randint(low=1, high=30000, size=(8, 128)),
    col_names=["input_size", "output_size", "num_params", "mult_adds", "trainable"],
    row_settings=["depth", "var_names"],
)


Layer (type (var_name):depth-idx)                                           Input Shape               Output Shape              Param #                   Mult-Adds                 Trainable
DistilBertForSequenceClassification (DistilBertForSequenceClassification)   [8, 128]                  [8, 7]                    --                        --                        Partial
├─DistilBertModel (distilbert): 1-1                                         --                        [8, 128, 768]             --                        --                        False
│    └─Embeddings (embeddings): 2-1                                         [8, 128]                  [8, 128, 768]             --                        --                        False
│    │    └─Embedding (word_embeddings): 3-1                                [8, 128]                  [8, 128, 768]             (23,440,896)              187,527,168               False
│    │    └─Embedding (position_embeddings): 3-2                

Thus the total number of trainable parameters is 5,383: the `classifier` layer's 7 × 768 = 5,376 weights plus its 7 biases.