In [9]:
from transformers import SwinForImageClassification

model = SwinForImageClassification.from_pretrained(
    pretrained_model_name_or_path='microsoft/swin-tiny-patch4-window7-224',
    num_labels=1000,
    ignore_mismatched_sizes=True,    
)

for main_name, main_module in model.named_children():
    print("\n【", main_name, "】")
    for sub_name, sub_module in main_module.named_children():
        print(" ┠", sub_name)
        for ssub_name, ssub_module in sub_module.named_children():
            print(" ┃ └", ssub_name)


【 swin 】
 ┠ embeddings
 ┃ └ patch_embeddings
 ┃ └ norm
 ┃ └ dropout
 ┠ encoder
 ┃ └ layers
 ┠ layernorm
 ┠ pooler

【 classifier 】


In [2]:
model.swin.embeddings

SwinEmbeddings(
  (patch_embeddings): SwinPatchEmbeddings(
    (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
  )
  (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

In [3]:
import torch
sample = torch.rand((2, 3,224,224), dtype=torch.float32)
output, shape = model.swin.embeddings.patch_embeddings(sample)
output

tensor([[[ 0.1505,  1.0869,  0.1189,  ..., -0.0191,  0.2263,  0.2068],
         [ 0.2524,  1.1541,  0.1148,  ..., -0.2006,  0.1522,  0.1650],
         [ 0.3534,  1.1188,  0.2942,  ..., -0.0823,  0.1668,  0.1688],
         ...,
         [ 0.1196,  1.1145, -0.0756,  ..., -0.0066,  0.2087,  0.3351],
         [ 0.4216,  1.1472,  0.0841,  ..., -0.0204,  0.0805,  0.2316],
         [ 0.2548,  1.0845, -0.0749,  ..., -0.0128,  0.2179,  0.2229]],

        [[ 0.3481,  1.1500,  0.1115,  ..., -0.0666,  0.1108,  0.2293],
         [ 0.3223,  1.1202, -0.0676,  ..., -0.1211,  0.2717,  0.3278],
         [ 0.5045,  1.1424,  0.2634,  ..., -0.1436,  0.1297,  0.1881],
         ...,
         [ 0.0949,  1.1451,  0.1221,  ..., -0.0654,  0.1832,  0.3275],
         [ 0.3128,  1.1283,  0.0912,  ..., -0.1778,  0.2439,  0.2220],
         [ 0.0840,  1.1028, -0.0672,  ...,  0.0924,  0.1846,  0.3118]]],
       grad_fn=<TransposeBackward0>)

In [4]:
output = model(sample)
output

SwinImageClassifierOutput(loss=None, logits=tensor([[-0.1607,  0.3856,  0.2832,  ..., -0.1943,  0.2003,  0.3263],
        [-0.1286,  0.4125,  0.3092,  ..., -0.1956,  0.2415,  0.2810]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, reshaped_hidden_states=None)

In [5]:
output.logits.shape

torch.Size([2, 1000])

In [6]:
model.swin.pooler

AdaptiveAvgPool1d(output_size=1)

In [7]:
model.classifier

Linear(in_features=768, out_features=1000, bias=True)

In [8]:
model

SwinForImageClassification(
  (swin): SwinModel(
    (embeddings): SwinEmbeddings(
      (patch_embeddings): SwinPatchEmbeddings(
        (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): SwinEncoder(
      (layers): ModuleList(
        (0): SwinStage(
          (blocks): ModuleList(
            (0-1): 2 x SwinLayer(
              (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (attention): SwinAttention(
                (self): SwinSelfAttention(
                  (query): Linear(in_features=96, out_features=96, bias=True)
                  (key): Linear(in_features=96, out_features=96, bias=True)
                  (value): Linear(in_features=96, out_features=96, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
                )
                (output): SwinSelfOutput(
  