In [1]:
import whisper

# Load the reference OpenAI "medium" checkpoint and count every scalar held in
# its parameter tensors. The bare name on the last line displays the count.
model = whisper.load_model('medium')
total_params = sum(weight.numel() for weight in model.parameters())
total_params

762321920

In [2]:
# Shape of the decoder's token-embedding matrix in the OpenAI model.
model.decoder.token_embedding.weight.shape

torch.Size([51865, 1024])

In [4]:
from transformers import WhisperForConditionalGeneration

# Load the Hugging Face port of the same checkpoint and count its parameters
# the same way as for the OpenAI model.
# The recorded count is 1,536,000 larger than the OpenAI one (763857920 vs
# 762321920), which equals 1500 * 1024 -- the size of
# `model.encoder.embed_positions.weight`, a tensor that shows up in this
# model's parameter listing but not in the OpenAI model's listing.
model_hf = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
total_params = sum(weight.numel() for weight in model_hf.parameters())
total_params

763857920

In [5]:
# Display the module tree of the OpenAI Whisper model (layer types and sizes).
model

Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-23): 24 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1024, out_features=1024, bias=True)
          (key): Linear(in_features=1024, out_features=1024, bias=False)
          (value): Linear(in_features=1024, out_features=1024, bias=True)
          (out): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
        (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((

In [6]:
# Display the module tree of the Hugging Face Whisper model for a side-by-side
# comparison with the OpenAI layout.
# NOTE(review): the saved output starts with `<bound method Module.parameters of ...`,
# which looks like it was produced by evaluating `model_hf.parameters` (no call)
# rather than this expression -- re-run the cell to refresh it.
model_hf

<bound method Module.parameters of WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1024)
      (layers): ModuleList(
        (0-23): 24 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_fea

In [14]:
# Fetch the submodules inspected in the following cells directly by their
# qualified names. `get_submodule` raises AttributeError for an unknown path,
# whereas scanning `named_modules()` and matching on the name would silently
# leave the variable unbound (or stale from an earlier run) on a typo.
proj_out = model_hf.get_submodule('proj_out')
embed_tokens = model_hf.get_submodule('model.decoder.embed_tokens')
# NOTE(review): this variable is called `fc1` but it points at encoder layer 7's
# `fc2` module. The name is kept because later cells read `fc1`; confirm which
# layer was actually intended.
fc1 = model_hf.get_submodule('model.encoder.layers.7.fc2')

In [7]:
# Dump each parameter of the output projection: its shape first, then its values.
for p1 in proj_out.parameters():
    print(p1.shape, p1, sep="\n")

torch.Size([51865, 1024])
Parameter containing:
tensor([[-0.0026, -0.0016,  0.0062,  ...,  0.0007, -0.0092, -0.0009],
        [-0.0114,  0.0123,  0.0081,  ..., -0.0173, -0.0045, -0.0050],
        [ 0.0072, -0.0045, -0.0065,  ..., -0.0025,  0.0165, -0.0018],
        ...,
        [ 0.0015,  0.0010,  0.0010,  ..., -0.0027,  0.0107,  0.0038],
        [ 0.0039,  0.0015,  0.0009,  ..., -0.0072,  0.0132,  0.0083],
        [ 0.0097, -0.0006, -0.0127,  ...,  0.0075,  0.0166,  0.0010]],
       requires_grad=True)


In [8]:
# Dump each parameter of the decoder token embedding: shape first, then values,
# so it can be compared by eye with the output-projection dump.
for p2 in embed_tokens.parameters():
    print(p2.shape, p2, sep="\n")

torch.Size([51865, 1024])
Parameter containing:
tensor([[-0.0026, -0.0016,  0.0062,  ...,  0.0007, -0.0092, -0.0009],
        [-0.0114,  0.0123,  0.0081,  ..., -0.0173, -0.0045, -0.0050],
        [ 0.0072, -0.0045, -0.0065,  ..., -0.0025,  0.0165, -0.0018],
        ...,
        [ 0.0015,  0.0010,  0.0010,  ..., -0.0027,  0.0107,  0.0038],
        [ 0.0039,  0.0015,  0.0009,  ..., -0.0072,  0.0132,  0.0083],
        [ 0.0097, -0.0006, -0.0127,  ...,  0.0075,  0.0166,  0.0010]],
       requires_grad=True)


In [15]:
# Dump each parameter (shape, then values) of the module bound to `fc1`.
# NOTE(review): the lookup cell binds `fc1` to 'model.encoder.layers.7.fc2',
# so this prints the second feed-forward layer of encoder layer 7.
for p3 in fc1.parameters():
    print(p3.shape, p3, sep="\n")

torch.Size([1024, 4096])
Parameter containing:
tensor([[-0.0014,  0.0094, -0.0125,  ..., -0.0019, -0.0003,  0.0043],
        [-0.0028, -0.0017,  0.0004,  ..., -0.0185,  0.0012, -0.0071],
        [-0.0091,  0.0173,  0.0221,  ...,  0.0015, -0.0062,  0.0131],
        ...,
        [-0.0167,  0.0057, -0.0172,  ..., -0.0072,  0.0224, -0.0065],
        [ 0.0151, -0.0184, -0.0029,  ..., -0.0014, -0.0027, -0.0003],
        [-0.0319,  0.0036, -0.0266,  ..., -0.0122,  0.0123, -0.0112]],
       requires_grad=True)
torch.Size([1024])
Parameter containing:
tensor([ 0.0164,  0.0142, -0.0358,  ..., -0.0742,  0.0017, -0.0022],
       requires_grad=True)


In [15]:
import torch

# NOTE: each `.parameters()` call returns a new generator object, and generators
# have no value-based equality, so this comparison is False no matter what the
# weights are. It says nothing about whether the two modules share a tensor.
embed_tokens.parameters() == proj_out.parameters()

False

In [17]:
# Weight-tying check: are the output projection and the decoder token embedding
# backed by the very same Parameter object?
# Comparing `list(...parameters())` with `==` only yields True because list
# equality short-circuits on identical elements. For two distinct tensors it
# would fall back to an elementwise `==`, whose truth value is ambiguous and
# raises. An identity test states the intent directly and returns a plain bool
# in both cases.
proj_out.weight is embed_tokens.weight

True

In [3]:
# List the qualified name of every parameter in the OpenAI model, one per line.
# Only the names are needed, so the tensors are discarded inside the generator
# expression and no loop variables are left behind in the kernel namespace.
print("\n".join(param_name for param_name, _ in model.named_parameters()))

encoder.conv1.weight
encoder.conv1.bias
encoder.conv2.weight
encoder.conv2.bias
encoder.blocks.0.attn.query.weight
encoder.blocks.0.attn.query.bias
encoder.blocks.0.attn.key.weight
encoder.blocks.0.attn.value.weight
encoder.blocks.0.attn.value.bias
encoder.blocks.0.attn.out.weight
encoder.blocks.0.attn.out.bias
encoder.blocks.0.attn_ln.weight
encoder.blocks.0.attn_ln.bias
encoder.blocks.0.mlp.0.weight
encoder.blocks.0.mlp.0.bias
encoder.blocks.0.mlp.2.weight
encoder.blocks.0.mlp.2.bias
encoder.blocks.0.mlp_ln.weight
encoder.blocks.0.mlp_ln.bias
encoder.blocks.1.attn.query.weight
encoder.blocks.1.attn.query.bias
encoder.blocks.1.attn.key.weight
encoder.blocks.1.attn.value.weight
encoder.blocks.1.attn.value.bias
encoder.blocks.1.attn.out.weight
encoder.blocks.1.attn.out.bias
encoder.blocks.1.attn_ln.weight
encoder.blocks.1.attn_ln.bias
encoder.blocks.1.mlp.0.weight
encoder.blocks.1.mlp.0.bias
encoder.blocks.1.mlp.2.weight
encoder.blocks.1.mlp.2.bias
encoder.blocks.1.mlp_ln.weight
encoder

In [6]:
# List the qualified name of every parameter in the Hugging Face model, one per
# line, for comparison with the OpenAI naming scheme.
# Only the names are needed, so the tensors are discarded inside the generator
# expression and no loop variables are left behind in the kernel namespace.
print("\n".join(param_name for param_name, _ in model_hf.named_parameters()))

model.encoder.conv1.weight
model.encoder.conv1.bias
model.encoder.conv2.weight
model.encoder.conv2.bias
model.encoder.embed_positions.weight
model.encoder.layers.0.self_attn.k_proj.weight
model.encoder.layers.0.self_attn.v_proj.weight
model.encoder.layers.0.self_attn.v_proj.bias
model.encoder.layers.0.self_attn.q_proj.weight
model.encoder.layers.0.self_attn.q_proj.bias
model.encoder.layers.0.self_attn.out_proj.weight
model.encoder.layers.0.self_attn.out_proj.bias
model.encoder.layers.0.self_attn_layer_norm.weight
model.encoder.layers.0.self_attn_layer_norm.bias
model.encoder.layers.0.fc1.weight
model.encoder.layers.0.fc1.bias
model.encoder.layers.0.fc2.weight
model.encoder.layers.0.fc2.bias
model.encoder.layers.0.final_layer_norm.weight
model.encoder.layers.0.final_layer_norm.bias
model.encoder.layers.1.self_attn.k_proj.weight
model.encoder.layers.1.self_attn.v_proj.weight
model.encoder.layers.1.self_attn.v_proj.bias
model.encoder.layers.1.self_attn.q_proj.weight
model.encoder.layers.1

In [18]:
# Peek at the first (name, tensor) pair of the OpenAI model to compare its
# values with the first parameter of the Hugging Face model.
next(iter(model.named_parameters()))

('encoder.conv1.weight',
 Parameter containing:
 tensor([[[-0.0040, -0.0121, -0.0248],
          [ 0.0225,  0.0288,  0.0167],
          [ 0.0295,  0.0192, -0.0166],
          ...,
          [-0.0424, -0.0015,  0.0562],
          [-0.0447, -0.0056,  0.0519],
          [-0.0721, -0.0318,  0.0418]],
 
         [[-0.0002, -0.0023, -0.0048],
          [-0.0082, -0.0153, -0.0153],
          [ 0.0084,  0.0279,  0.0320],
          ...,
          [-0.0065, -0.0031, -0.0032],
          [-0.0088,  0.0019,  0.0014],
          [-0.0134, -0.0021, -0.0022]],
 
         [[ 0.0083,  0.0136,  0.0189],
          [-0.0099, -0.0051, -0.0089],
          [ 0.0188,  0.0259,  0.0311],
          ...,
          [ 0.0088, -0.0045, -0.0211],
          [ 0.0067, -0.0074, -0.0190],
          [-0.0006, -0.0187, -0.0304]],
 
         ...,
 
         [[ 0.0019,  0.0037,  0.0033],
          [ 0.0047,  0.0044, -0.0022],
          [ 0.0015,  0.0018, -0.0069],
          ...,
          [-0.0493,  0.0188, -0.0026],
         

In [17]:
# Peek at the first (name, tensor) pair of the Hugging Face model to compare its
# values with the first parameter of the OpenAI model.
next(iter(model_hf.named_parameters()))

('model.encoder.conv1.weight',
 Parameter containing:
 tensor([[[-0.0040, -0.0121, -0.0248],
          [ 0.0225,  0.0288,  0.0167],
          [ 0.0295,  0.0192, -0.0166],
          ...,
          [-0.0424, -0.0015,  0.0562],
          [-0.0447, -0.0056,  0.0519],
          [-0.0721, -0.0318,  0.0418]],
 
         [[-0.0002, -0.0023, -0.0048],
          [-0.0082, -0.0153, -0.0153],
          [ 0.0084,  0.0279,  0.0320],
          ...,
          [-0.0065, -0.0031, -0.0032],
          [-0.0088,  0.0019,  0.0014],
          [-0.0134, -0.0021, -0.0022]],
 
         [[ 0.0083,  0.0136,  0.0189],
          [-0.0099, -0.0051, -0.0089],
          [ 0.0188,  0.0259,  0.0311],
          ...,
          [ 0.0088, -0.0045, -0.0211],
          [ 0.0067, -0.0074, -0.0190],
          [-0.0006, -0.0187, -0.0304]],
 
         ...,
 
         [[ 0.0019,  0.0037,  0.0033],
          [ 0.0047,  0.0044, -0.0022],
          [ 0.0015,  0.0018, -0.0069],
          ...,
          [-0.0493,  0.0188, -0.0026],
   