In [1]:
from transformers import SwinForImageClassification

# Load the Swin-Tiny image-classification checkpoint from the Hugging Face Hub.
# num_labels=1000 requests a 1000-way classification head.
# ignore_mismatched_sizes=True is kept from the original call; it presumably
# only matters if the checkpoint's head size differs from num_labels -- TODO confirm.
model = SwinForImageClassification.from_pretrained(
    'microsoft/swin-tiny-patch4-window7-224',
    ignore_mismatched_sizes=True,
    num_labels=1000,
)

# Print the first three levels of the model's submodule hierarchy as a tree.
# named_children() yields (name, module) pairs for direct children only, so
# each nesting level of the loop descends exactly one level in the model.
for top_label, top_block in model.named_children():
    # Level 1: top-level components, each introduced by a blank line.
    print("\n【", top_label, "】")
    for mid_label, mid_block in top_block.named_children():
        # Level 2: direct children of the top-level component.
        print(" ┠", mid_label)
        for leaf_label, _ in mid_block.named_children():
            # Level 3: grandchildren; only the name is needed here.
            print(" ┃ └", leaf_label)

  from .autonotebook import tqdm as notebook_tqdm



【 swin 】
 ┠ embeddings
 ┃ └ patch_embeddings
 ┃ └ norm
 ┃ └ dropout
 ┠ encoder
 ┃ └ layers
 ┠ layernorm
 ┠ pooler

【 classifier 】


In [2]:
model.swin.embeddings

SwinEmbeddings(
  (patch_embeddings): SwinPatchEmbeddings(
    (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
  )
  (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

In [3]:
import torch
# Seed the global RNG so the random batch below -- and every value computed
# from it in later cells -- is identical on each Restart & Run All.
torch.manual_seed(0)

# Dummy input batch: 2 images, 3 channels, 224x224 pixels, uniform in [0, 1).
sample = torch.rand((2, 3, 224, 224), dtype=torch.float32)

# Run only the patch-embedding stage. It returns a pair: the embedded tensor
# and a second value unpacked into `shape` (presumably the spatial size of the
# resulting patch grid -- TODO confirm against the Swin implementation).
output, shape = model.swin.embeddings.patch_embeddings(sample)
output

tensor([[[ 1.8002e-01,  1.1153e+00, -7.3518e-02,  ...,  5.4042e-02,
           1.4729e-01,  2.8756e-01],
         [ 2.4254e-01,  1.1437e+00,  2.0761e-01,  ..., -1.1001e-03,
           1.2255e-01,  2.7576e-01],
         [ 1.3412e-01,  1.1595e+00, -6.9756e-03,  ..., -5.6498e-02,
           1.7086e-01,  1.2430e-01],
         ...,
         [ 1.2358e-01,  1.1057e+00,  1.3460e-01,  ..., -4.4163e-03,
           2.1940e-01,  3.3055e-01],
         [ 2.8840e-01,  1.1491e+00,  2.5101e-02,  ..., -7.4145e-02,
           1.2188e-01,  2.4401e-01],
         [ 1.8269e-01,  1.1510e+00, -8.2024e-02,  ..., -2.2320e-01,
           1.6736e-01,  2.7984e-01]],

        [[ 3.9434e-01,  1.1263e+00, -1.5684e-02,  ..., -9.8119e-02,
           2.0059e-01,  2.5339e-01],
         [ 1.9995e-01,  1.1400e+00,  8.0658e-02,  ..., -1.0614e-01,
           2.5817e-01,  2.6667e-01],
         [ 3.5540e-01,  1.1370e+00,  8.5421e-02,  ..., -1.3956e-03,
           3.2202e-02,  8.3148e-02],
         ...,
         [ 2.7143e-01,  1

In [4]:
# Full forward pass through backbone and classification head on the dummy batch.
# The input is passed by keyword to make explicit which argument it fills.
# `output` is deliberately rebound here: later cells read `output.logits`.
output = model(pixel_values=sample)
output

SwinImageClassifierOutput(loss=None, logits=tensor([[-0.1210,  0.4052,  0.2679,  ..., -0.2080,  0.1818,  0.3028],
        [-0.1520,  0.4044,  0.3013,  ..., -0.2082,  0.1786,  0.3034]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, reshaped_hidden_states=None)

In [5]:
# Shape of the classification logits returned by the full forward pass.
output.logits.shape

torch.Size([2, 1000])

In [6]:
# Inspect the pooling module attached to the Swin backbone.
model.swin.pooler

AdaptiveAvgPool1d(output_size=1)

In [7]:
# Inspect the classification head that sits on top of the backbone.
model.classifier

Linear(in_features=768, out_features=1000, bias=True)