In [21]:
from transformers import (
    AlbertForSequenceClassification,
    AutoConfig,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from colossalai.nn.optimizer import HybridAdam
from torch.optim import Adam

In [24]:
model_name = "albert-xxlarge-v2"

In [25]:
cfg = AutoConfig.from_pretrained(model_name)

In [59]:
model = AlbertForSequenceClassification.from_pretrained(model_name, config=cfg).cuda()

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-xxlarge-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
model

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=4096, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((4096,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=4096, out_features=4096, bias=True)
                (key): Linear(in_features=4096, out_features=4096, bias=True)
                (value): Linear(in_features=4096, out_featur

In [7]:
no_decay = ["bias", "LayerNorm.weight"]
WEIGHT_DECAY = 0.01

In [8]:
optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": WEIGHT_DECAY,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

In [36]:
[p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)]

[Parameter containing:
 tensor([[ 0.0771, -0.0577,  0.0301,  ..., -0.0146,  0.0590, -0.0838],
         [ 0.1137,  0.1374, -0.0161,  ..., -0.0743, -0.1042, -0.1193],
         [ 0.0006,  0.0052, -0.0525,  ..., -0.0322,  0.0433,  0.0114],
         ...,
         [-0.0059,  0.0670,  0.0760,  ..., -0.2300,  0.1185,  0.0336],
         [ 0.0391, -0.1243,  0.0297,  ..., -0.0472, -0.1246, -0.1053],
         [-0.1347, -0.0608,  0.0758,  ...,  0.0126, -0.0360,  0.0759]],
        requires_grad=True),
 Parameter containing:
 tensor([[-0.1195,  0.0082, -0.0661,  ..., -0.0294,  0.1309,  0.0990],
         [-0.0426, -0.0210, -0.0504,  ..., -0.0627, -0.0666,  0.0050],
         [-0.0409,  0.0137, -0.0800,  ...,  0.0058, -0.0570,  0.0234],
         ...,
         [-0.1007,  0.0612, -0.0024,  ..., -0.0005, -0.0056,  0.0993],
         [-0.0674,  0.0029, -0.0579,  ...,  0.0551, -0.0326, -0.0098],
         [-0.0861, -0.0716,  0.0103,  ...,  0.2564, -0.1340, -0.0281]],
        requires_grad=True),
 Parameter con

In [9]:
optimizer_grouped_parameters

[{'params': [Parameter containing:
   tensor([[-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
           [-0.0117, -0.0600, -0.0323,  ..., -0.0168, -0.0401, -0.0107],
           [-0.0198, -0.0627, -0.0326,  ..., -0.0165, -0.0420, -0.0032],
           ...,
           [-0.0218, -0.0556, -0.0135,  ..., -0.0043, -0.0151, -0.0249],
           [-0.0462, -0.0565, -0.0019,  ...,  0.0157, -0.0139, -0.0095],
           [ 0.0015, -0.0821, -0.0160,  ..., -0.0081, -0.0475,  0.0753]],
          requires_grad=True),
   Parameter containing:
   tensor([[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
             6.8312e-04,  1.5441e-02],
           [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
             2.9753e-02, -5.3247e-03],
           [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
             1.8741e-02, -7.3140e-03],
           ...,
           [ 1.7418e-02,  3.4903e-03, -9.5621e-03,  ...,  2.9599e-03,
             4.3435e-04, -2.6949e-02],
  

In [10]:
model.named_parameters()

<generator object Module.named_parameters at 0x7f4878c28c10>

In [61]:
optimizer1 = HybridAdam(model.parameters(), eps=1e-8)



In [45]:
optimizer1

HybridAdam (
Parameter Group 0
    betas: (0.9, 0.999)
    bias_correction: True
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [47]:
optimizer2 = Adam(optimizer_grouped_parameters, lr= 1e-3, eps=1e-8)

In [62]:
optimizer1.param_groups[0]['params'][0].dtype

torch.float32

In [49]:
from colossalai.zero.low_level import LowLevelZeroOptimizer

In [52]:
from colossalai.interface import ModelWrapper, OptimizerWrapper

In [54]:
zero_opt = OptimizerWrapper(optim=optimizer2)

In [58]:
zero_opt.optim.param_groups

[{'params': [Parameter containing:
   tensor([[-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
           [-0.0117, -0.0600, -0.0323,  ..., -0.0168, -0.0401, -0.0107],
           [-0.0198, -0.0627, -0.0326,  ..., -0.0165, -0.0420, -0.0032],
           ...,
           [-0.0218, -0.0556, -0.0135,  ..., -0.0043, -0.0151, -0.0249],
           [-0.0462, -0.0565, -0.0019,  ...,  0.0157, -0.0139, -0.0095],
           [ 0.0015, -0.0821, -0.0160,  ..., -0.0081, -0.0475,  0.0753]],
          requires_grad=True),
   Parameter containing:
   tensor([[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
             6.8312e-04,  1.5441e-02],
           [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
             2.9753e-02, -5.3247e-03],
           [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
             1.8741e-02, -7.3140e-03],
           ...,
           [ 1.7418e-02,  3.4903e-03, -9.5621e-03,  ...,  2.9599e-03,
             4.3435e-04, -2.6949e-02],
  

In [51]:
zero_opti = LowLevelZeroOptimizer(optim=optimizer2)

TypeError: __init__() got an unexpected keyword argument 'optim'

In [29]:
param_group = [{'params': [], 'weight_decay': 0.01, 'lr': 0.0, 'betas': (0.9, 0.999), 'eps': 1e-08, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'initial_lr': 0.000192}, {'params': [], 'weight_decay': 0.0, 'lr': 0.0, 'betas': (0.9, 0.999), 'eps': 1e-08, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None, 'initial_lr': 0.000192}]

In [33]:
param_group

[{'params': [],
  'weight_decay': 0.01,
  'lr': 0.0,
  'betas': (0.9, 0.999),
  'eps': 1e-08,
  'amsgrad': False,
  'maximize': False,
  'foreach': None,
  'capturable': False,
  'differentiable': False,
  'fused': None,
  'initial_lr': 0.000192},
 {'params': [],
  'weight_decay': 0.0,
  'lr': 0.0,
  'betas': (0.9, 0.999),
  'eps': 1e-08,
  'amsgrad': False,
  'maximize': False,
  'foreach': None,
  'capturable': False,
  'differentiable': False,
  'fused': None,
  'initial_lr': 0.000192}]