In [1]:
import whisper

# Load the OpenAI `whisper` "medium" checkpoint and add up the element count
# of every tensor yielded by model.parameters().
model = whisper.load_model('medium')
total_params = sum(map(lambda weight: weight.numel(), model.parameters()))
total_params

762321920

In [2]:
# Dimensions of the decoder's token-embedding matrix (`.shape` is the
# attribute form of `.size()` and yields the same torch.Size object).
model.decoder.token_embedding.weight.shape

torch.Size([51865, 1024])

In [3]:
from transformers import WhisperForConditionalGeneration

# Load the "openai/whisper-medium" checkpoint through transformers and add up
# the element count of every tensor yielded by model_hf.parameters().
model_hf = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
total_params = sum(map(lambda weight: weight.numel(), model_hf.parameters()))
total_params

763857920

In [4]:
# Display the module tree (repr) of the model loaded with whisper.load_model.
model

Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-23): 24 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1024, out_features=1024, bias=True)
          (key): Linear(in_features=1024, out_features=1024, bias=False)
          (value): Linear(in_features=1024, out_features=1024, bias=True)
          (out): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
        (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((

In [5]:
# Display the module tree (repr) of the transformers WhisperForConditionalGeneration model.
model_hf

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1024)
      (layers): ModuleList(
        (0-23): 24 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias

In [11]:
# Fetch the two modules whose weights are compared below directly by their
# dotted path instead of scanning every entry of named_modules().
# get_submodule raises AttributeError when a path does not exist, so a
# misspelt name fails here rather than as a NameError in a later cell.
proj_out = model_hf.get_submodule('proj_out')
embed_tokens = model_hf.get_submodule('model.decoder.embed_tokens')

In [12]:
# Show the size and the values of each parameter owned by the output projection.
for weight in proj_out.parameters():
    print(weight.size(), weight, sep='\n')

torch.Size([51865, 1024])
Parameter containing:
tensor([[-0.0026, -0.0016,  0.0062,  ...,  0.0007, -0.0092, -0.0009],
        [-0.0114,  0.0123,  0.0081,  ..., -0.0173, -0.0045, -0.0050],
        [ 0.0072, -0.0045, -0.0065,  ..., -0.0025,  0.0165, -0.0018],
        ...,
        [ 0.0015,  0.0010,  0.0010,  ..., -0.0027,  0.0107,  0.0038],
        [ 0.0039,  0.0015,  0.0009,  ..., -0.0072,  0.0132,  0.0083],
        [ 0.0097, -0.0006, -0.0127,  ...,  0.0075,  0.0166,  0.0010]],
       requires_grad=True)


In [13]:
# Show the size and the values of each parameter owned by the decoder token embedding.
for weight in embed_tokens.parameters():
    print(weight.size(), weight, sep='\n')

torch.Size([51865, 1024])
Parameter containing:
tensor([[-0.0026, -0.0016,  0.0062,  ...,  0.0007, -0.0092, -0.0009],
        [-0.0114,  0.0123,  0.0081,  ..., -0.0173, -0.0045, -0.0050],
        [ 0.0072, -0.0045, -0.0065,  ..., -0.0025,  0.0165, -0.0018],
        ...,
        [ 0.0015,  0.0010,  0.0010,  ..., -0.0027,  0.0107,  0.0038],
        [ 0.0039,  0.0015,  0.0009,  ..., -0.0072,  0.0132,  0.0083],
        [ 0.0097, -0.0006, -0.0127,  ...,  0.0075,  0.0166,  0.0010]],
       requires_grad=True)


In [15]:
import torch

# NOTE: Module.parameters() hands back a new iterator object on every call, so
# this `==` compares two distinct iterator objects rather than the tensors they
# yield. It evaluates to False whether or not the two modules share weights.
embed_tokens.parameters() == proj_out.parameters()

False

In [17]:
# Check weight tying by object identity: the two modules share weights only if
# their parameter lists have the same length and every pair is the very same
# Parameter object. Comparing the lists with `==` only succeeds through the
# per-element identity shortcut; for two distinct multi-element tensors it
# falls back to elementwise `==` and raises
# "Boolean value of Tensor with more than one value is ambiguous".
proj_out_params = list(proj_out.parameters())
embed_tokens_params = list(embed_tokens.parameters())
len(proj_out_params) == len(embed_tokens_params) and all(
    p is q for p, q in zip(proj_out_params, embed_tokens_params)
)

True