Ersi lig 3912 refactor mae to use timm vit #1461

Merged Mar 6, 2024 · 59 commits (changes shown from 17 commits)

Commits
951d43e
Add MAE evaluation
guarin May 30, 2023
503cc44
Add stochastic depth dropout
guarin May 31, 2023
ac43499
Add MAE
guarin May 31, 2023
15bfe3a
Drop assertion
guarin May 31, 2023
49c85c0
Fix smooth cross entropy loss and mixup
guarin May 31, 2023
9d95783
Update comments
guarin May 31, 2023
0bb601d
Add layer lr decay and weight decay
guarin Jun 5, 2023
d7d69af
Update comment
guarin Jun 5, 2023
ec05437
Add test for MAE images_to_tokens
guarin Jun 5, 2023
923a606
Disable BN update
guarin Jun 5, 2023
bdce8a6
Add BN before classification head
guarin Jun 6, 2023
316f918
Format
guarin Jun 6, 2023
a6943fd
Fix BN freezing
guarin Jun 6, 2023
1a2b454
Cleanup
guarin Jun 6, 2023
bc066ae
Use torch.no_grad instead of deactivating gradients manually
guarin Jun 6, 2023
d56e340
Create new stochastic depth instances
guarin Jun 6, 2023
5ed6803
Add mask token to learnable params
guarin Jun 6, 2023
4f0baf1
Add sine-cosine positional embedding
guarin Jun 6, 2023
9c4a8cf
Initialize parameters as in paper
guarin Jun 6, 2023
9904c10
Merge branch 'master' into guarin-lig-3056-add-mae-imagenet-benchmark
guarin Dec 6, 2023
83edd1c
Fix types
guarin Dec 6, 2023
e27946e
Format
guarin Dec 6, 2023
0672b0a
Merge branch 'guarin-lig-3056-add-mae-imagenet-benchmark' of github.c…
ersi-lightly Dec 17, 2023
45433c5
adjusted to existing interface
ersi-lightly Dec 18, 2023
c5cab9e
draft
ersi-lightly Dec 19, 2023
017168e
remove
ersi-lightly Dec 19, 2023
278423b
added modifications
ersi-lightly Jan 4, 2024
fde116c
added mae implementation with timm and example
ersi-lightly Jan 5, 2024
f008645
formatted
ersi-lightly Jan 5, 2024
c97112e
fixed import
ersi-lightly Jan 5, 2024
2e55d6b
removed
ersi-lightly Jan 5, 2024
484add1
fixed typing
ersi-lightly Jan 5, 2024
971c19a
addressed comments
ersi-lightly Jan 9, 2024
1ec7470
fixed typing and formatted
ersi-lightly Jan 9, 2024
76ee356
addressed comments
ersi-lightly Jan 9, 2024
edb2d42
added docstring and formatted
ersi-lightly Jan 9, 2024
f00d320
removed images to tokens method
ersi-lightly Jan 10, 2024
cc263fe
Ersi lig 3910 update mae benchmark code (#1468)
ersi-lightly Feb 23, 2024
f7a532b
resolved conflict
ersi-lightly Feb 23, 2024
cc9d4ac
resolved conflicts
ersi-lightly Feb 23, 2024
eda5ac0
formatted
ersi-lightly Feb 23, 2024
0993ec5
adjusted examples
ersi-lightly Feb 25, 2024
4842901
removed comment
ersi-lightly Feb 25, 2024
bc9c6c3
added test
ersi-lightly Feb 26, 2024
73cb2ec
added message in case of ImportError
ersi-lightly Feb 26, 2024
12eeb14
fixed skipping of test
ersi-lightly Feb 26, 2024
ab15124
removed example
ersi-lightly Feb 26, 2024
c22b2ff
handling the TIMM dependency
ersi-lightly Feb 26, 2024
3fb9e86
added note to docs for MAE installation
ersi-lightly Feb 26, 2024
17760fd
added unit tests for MAE with torchvision
ersi-lightly Feb 26, 2024
44cf4e8
removed unnecessary mask token definition
ersi-lightly Feb 26, 2024
07fdae4
addressed comments
ersi-lightly Feb 29, 2024
a0f87ac
moved test to separate file
ersi-lightly Feb 29, 2024
f61b708
added typing
ersi-lightly Feb 29, 2024
0f8a927
fixed import
ersi-lightly Feb 29, 2024
a18bccd
fixes typing
ersi-lightly Feb 29, 2024
b16bba8
fixed typing
ersi-lightly Feb 29, 2024
90a2ee0
fixed typing
ersi-lightly Feb 29, 2024
d89808f
Ersi lig 4471 cleanup and merge mae branch (#1510)
ersi-lightly Mar 5, 2024
9 changes: 8 additions & 1 deletion benchmarks/imagenet/vitb16/mae.py
@@ -1,12 +1,19 @@
+import sys
 from typing import List, Tuple
 
 import torch
 from pytorch_lightning import LightningModule
-from timm.models.vision_transformer import vit_base_patch16_224
 from torch import Tensor
 from torch.nn import MSELoss, Parameter
 from torch.optim import AdamW
 
+from lightly.utils import dependency
+
+if dependency.timm_vit_available():
+    from timm.models.vision_transformer import vit_base_patch16_224
+else:
+    sys.exit(1)
+
 from lightly.models import utils
 from lightly.models.modules import (
     masked_autoencoder_timm,
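The conditional import above makes the optional TIMM dependency explicit: the benchmark exits early when timm is missing instead of failing later with an ImportError. As a rough illustration, a helper like `timm_vit_available` can be built on an import probe; this is a hypothetical sketch, not the actual body of `lightly.utils.dependency`:

    import importlib.util


    def timm_vit_available() -> bool:
        # Hypothetical sketch: report whether the optional timm package can be
        # imported. The real lightly helper may additionally check the version.
        return importlib.util.find_spec("timm") is not None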
7 changes: 7 additions & 0 deletions docs/source/getting_started/install.rst
@@ -32,6 +32,13 @@ If you want to work with video files you need to additionally install
 
     pip install av
 
+If you want to use the Masked Autoencoder you need to additionally install
+`TIMM <https://github.com/huggingface/pytorch-image-models>`_.
+
+.. code-block:: bash
+
+    pip install timm
+
 Next Steps
 ------------
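Once timm is installed, the ViT backbones used throughout this PR become importable. A minimal sanity check, assuming a default timm installation (for a 224×224 input, the 197 output tokens are 196 patches plus the class token):

    import torch
    from timm.models.vision_transformer import vit_base_patch16_224

    vit = vit_base_patch16_224()
    images = torch.randn(1, 3, 224, 224)
    features = vit.forward_features(images)
    print(features.shape)  # torch.Size([1, 197, 768]) with default settings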
44 changes: 23 additions & 21 deletions examples/pytorch/mae.py
@@ -1,13 +1,13 @@
 # Note: The model and training settings do not follow the reference settings
 # from the paper. The settings are chosen such that the example can easily be
 # run on a small dataset with a single GPU.
 
 import torch
-import torchvision
+from timm.models.vision_transformer import vit_base_patch32_224
 from torch import nn
 
 from lightly.models import utils
-from lightly.models.modules import masked_autoencoder
+from lightly.models.modules import MAEDecoderTIMM, MaskedVisionTransformerTIMM
 from lightly.transforms.mae_transform import MAETransform
 
@@ -17,31 +17,31 @@ def __init__(self, vit):
 
         decoder_dim = 512
         self.mask_ratio = 0.75
-        self.patch_size = vit.patch_size
-        self.sequence_length = vit.seq_length
-        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_dim))
-        self.backbone = masked_autoencoder.MAEBackbone.from_vit(vit)
-        self.decoder = masked_autoencoder.MAEDecoder(
-            seq_length=vit.seq_length,
-            num_layers=1,
-            num_heads=16,
-            embed_input_dim=vit.hidden_dim,
-            hidden_dim=decoder_dim,
-            mlp_dim=decoder_dim * 4,
-            out_dim=vit.patch_size**2 * 3,
-            dropout=0,
-            attention_dropout=0,
+        self.patch_size = vit.patch_embed.patch_size[0]
+
+        self.backbone = MaskedVisionTransformerTIMM(vit=vit)
+        self.sequence_length = self.backbone.sequence_length
+        self.decoder = MAEDecoderTIMM(
+            num_patches=vit.patch_embed.num_patches,
+            patch_size=self.patch_size,
+            embed_dim=vit.embed_dim,
+            decoder_embed_dim=decoder_dim,
+            decoder_depth=1,
+            decoder_num_heads=16,
+            mlp_ratio=4.0,
+            proj_drop_rate=0.0,
+            attn_drop_rate=0.0,
         )
 
     def forward_encoder(self, images, idx_keep=None):
-        return self.backbone.encode(images, idx_keep)
+        return self.backbone.encode(images=images, idx_keep=idx_keep)
 
     def forward_decoder(self, x_encoded, idx_keep, idx_mask):
         # build decoder input
         batch_size = x_encoded.shape[0]
         x_decode = self.decoder.embed(x_encoded)
         x_masked = utils.repeat_token(
-            self.mask_token, (batch_size, self.sequence_length)
+            self.decoder.mask_token, (batch_size, self.sequence_length)
         )
         x_masked = utils.set_at_index(x_masked, idx_keep, x_decode.type_as(x_masked))
@@ -60,8 +60,10 @@ def forward(self, images):
             mask_ratio=self.mask_ratio,
             device=images.device,
         )
-        x_encoded = self.forward_encoder(images, idx_keep)
-        x_pred = self.forward_decoder(x_encoded, idx_keep, idx_mask)
+        x_encoded = self.forward_encoder(images=images, idx_keep=idx_keep)
+        x_pred = self.forward_decoder(
+            x_encoded=x_encoded, idx_keep=idx_keep, idx_mask=idx_mask
+        )
 
         # get image patches for masked tokens
         patches = utils.patchify(images, self.patch_size)
@@ -70,7 +72,7 @@ def forward(self, images):
         return x_pred, target
 
 
-vit = torchvision.models.vit_b_32(pretrained=False)
+vit = vit_base_patch32_224()
 model = MAE(vit)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
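As a reference for the refactored example, a minimal smoke test might look as follows; it assumes the MAE class defined above and a lightly installation with timm available:

    import torch
    from timm.models.vision_transformer import vit_base_patch32_224

    vit = vit_base_patch32_224()
    model = MAE(vit)  # the MAE class from the example above

    images = torch.randn(2, 3, 224, 224)
    x_pred, target = model(images)
    # Both tensors cover only the masked patches, i.e. roughly
    # (batch_size, num_masked, patch_size**2 * 3).
    print(x_pred.shape, target.shape)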
121 changes: 0 additions & 121 deletions examples/pytorch/mae_timm.py

This file was deleted.

10 changes: 5 additions & 5 deletions examples/pytorch/msn.py
@@ -9,8 +9,8 @@
 
 from lightly.loss import MSNLoss
 from lightly.models import utils
+from lightly.models.modules import MaskedVisionTransformerTorchvision
 from lightly.models.modules.heads import MSNProjectionHead
-from lightly.models.modules.masked_autoencoder import MAEBackbone
 from lightly.transforms.msn_transform import MSNTransform
 
@@ -19,7 +19,7 @@ def __init__(self, vit):
         super().__init__()
 
         self.mask_ratio = 0.15
-        self.backbone = MAEBackbone.from_vit(vit)
+        self.backbone = MaskedVisionTransformerTorchvision(vit=vit)
         self.projection_head = MSNProjectionHead(input_dim=384)
 
         self.anchor_backbone = copy.deepcopy(self.backbone)
@@ -31,18 +31,18 @@ def __init__(self, vit):
         self.prototypes = nn.Linear(256, 1024, bias=False).weight
 
     def forward(self, images):
-        out = self.backbone(images)
+        out = self.backbone(images=images)
         return self.projection_head(out)
 
     def forward_masked(self, images):
         batch_size, _, _, width = images.shape
-        seq_length = (width // self.anchor_backbone.patch_size) ** 2
+        seq_length = (width // self.anchor_backbone.vit.patch_size) ** 2
         idx_keep, _ = utils.random_token_mask(
             size=(batch_size, seq_length),
             mask_ratio=self.mask_ratio,
             device=images.device,
         )
-        out = self.anchor_backbone(images, idx_keep)
+        out = self.anchor_backbone(images=images, idx_keep=idx_keep)
         return self.anchor_projection_head(out)
 
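The change from `self.anchor_backbone.patch_size` to `self.anchor_backbone.vit.patch_size` reflects that the backbone is now a wrapper that keeps the raw torchvision ViT reachable as `.vit`. A hedged sketch of the sequence-length computation, using a ViT-B/32 for illustration (the example itself uses a smaller ViT):

    import torchvision
    from lightly.models.modules import MaskedVisionTransformerTorchvision

    vit = torchvision.models.vit_b_32()
    backbone = MaskedVisionTransformerTorchvision(vit=vit)
    # Attributes of the wrapped model are read through backbone.vit.
    seq_length = (224 // backbone.vit.patch_size) ** 2  # (224 // 32) ** 2 = 49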
12 changes: 6 additions & 6 deletions examples/pytorch/pmsn.py
@@ -9,8 +9,8 @@
 
 from lightly.loss import PMSNLoss
 from lightly.models import utils
+from lightly.models.modules import MaskedVisionTransformerTorchvision
 from lightly.models.modules.heads import MSNProjectionHead
-from lightly.models.modules.masked_autoencoder import MAEBackbone
 from lightly.transforms import MSNTransform
 
@@ -19,7 +19,7 @@ def __init__(self, vit):
         super().__init__()
 
         self.mask_ratio = 0.15
-        self.backbone = MAEBackbone.from_vit(vit)
+        self.backbone = MaskedVisionTransformerTorchvision(vit=vit)
         self.projection_head = MSNProjectionHead(384)
 
         self.anchor_backbone = copy.deepcopy(self.backbone)
@@ -31,18 +31,18 @@ def __init__(self, vit):
         self.prototypes = nn.Linear(256, 1024, bias=False).weight
 
     def forward(self, images):
-        out = self.backbone(images)
+        out = self.backbone(images=images)
         return self.projection_head(out)
 
     def forward_masked(self, images):
         batch_size, _, _, width = images.shape
-        seq_length = (width // self.anchor_backbone.patch_size) ** 2
+        seq_length = (width // self.anchor_backbone.vit.patch_size) ** 2
         idx_keep, _ = utils.random_token_mask(
             size=(batch_size, seq_length),
             mask_ratio=self.mask_ratio,
             device=images.device,
         )
-        out = self.anchor_backbone(images, idx_keep)
+        out = self.anchor_backbone(images=images, idx_keep=idx_keep)
         return self.anchor_projection_head(out)
@@ -106,7 +106,7 @@ def forward_masked(self, images):
     anchors = views[1]
     anchors_focal = torch.concat(views[2:], dim=0)
 
-    targets_out = model.backbone(targets)
+    targets_out = model.backbone(images=targets)
     targets_out = model.projection_head(targets_out)
     anchors_out = model.forward_masked(anchors)
    anchors_focal_out = model.forward_masked(anchors_focal)
14 changes: 6 additions & 8 deletions examples/pytorch/simmim.py
@@ -3,7 +3,9 @@
 from torch import nn
 
 from lightly.models import utils
-from lightly.models.modules import masked_autoencoder
+from lightly.models.modules.masked_vision_transformer_torchvision import (
+    MaskedVisionTransformerTorchvision,
+)
 from lightly.transforms.mae_transform import MAETransform  # Same transform as MAE
 
@@ -15,19 +17,15 @@ def __init__(self, vit):
         self.mask_ratio = 0.75
         self.patch_size = vit.patch_size
         self.sequence_length = vit.seq_length
-        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_dim))
 
         # same backbone as MAE
-        self.backbone = masked_autoencoder.MAEBackbone.from_vit(vit)
+        self.backbone = MaskedVisionTransformerTorchvision(vit=vit)
 
         # the decoder is a simple linear layer
-        self.decoder = nn.Linear(vit.hidden_dim, vit.patch_size**2 * 3)
+        self.decoder = nn.Linear(decoder_dim, vit.patch_size**2 * 3)
 
     def forward_encoder(self, images, batch_size, idx_mask):
-        # pass all the tokens to the encoder, both masked and non masked ones
-        tokens = self.backbone.images_to_tokens(images, prepend_class_token=True)
-        tokens_masked = utils.mask_at_index(tokens, idx_mask, self.mask_token)
-        return self.backbone.encoder(tokens_masked)
+        return self.backbone.encode(images=images, idx_mask=idx_mask)
 
     def forward_decoder(self, x_encoded):
         return self.decoder(x_encoded)
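With the wrapper, masking moves inside `encode`: passing `idx_mask` lets the backbone substitute its own learnable mask token before running the encoder, which is why the example drops its separate `self.mask_token`. A rough usage sketch under the same assumptions as the example (torchvision ViT, 75% mask ratio); `vit.seq_length` counts the class token:

    import torch
    import torchvision
    from lightly.models import utils
    from lightly.models.modules import MaskedVisionTransformerTorchvision

    vit = torchvision.models.vit_b_32()
    backbone = MaskedVisionTransformerTorchvision(vit=vit)

    images = torch.randn(2, 3, 224, 224)
    idx_keep, idx_mask = utils.random_token_mask(
        size=(2, vit.seq_length),
        mask_ratio=0.75,
        device=images.device,
    )
    # All tokens pass through the encoder; the ones at idx_mask are
    # replaced by the wrapper's mask token (SimMIM-style masking).
    x_encoded = backbone.encode(images=images, idx_mask=idx_mask)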