diff --git a/src/transformers/models/bloom/configuration_bloom.py b/src/transformers/models/bloom/configuration_bloom.py
index 1103a8148ae1e..4f973d93ae48a 100644
--- a/src/transformers/models/bloom/configuration_bloom.py
+++ b/src/transformers/models/bloom/configuration_bloom.py
@@ -53,14 +53,16 @@ class BloomConfig(PretrainedConfig):


     Args:
-        vocab_size (`int`, *optional*, defaults to 50257):
-            Vocabulary size of the Bloom model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`BloomModel`].
-        hidden_size (`int`, *optional*, defaults to 768):
+        vocab_size (`int`, *optional*, defaults to 250880):
+            Vocabulary size of the Bloom model. Defines the maximum number of different tokens that can be represented
+            by the `inputs_ids` passed when calling [`BloomModel`]. Check [this
+            discussion](https://huggingface.co/bigscience/bloom/discussions/120#633d28389addb8530b406c2a) on how the
+            `vocab_size` has been defined.
+        hidden_size (`int`, *optional*, defaults to 64):
             Dimensionality of the embeddings and hidden states.
-        n_layer (`int`, *optional*, defaults to 12):
+        n_layer (`int`, *optional*, defaults to 2):
             Number of hidden layers in the Transformer encoder.
-        n_head (`int`, *optional*, defaults to 12):
+        n_head (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer encoder.
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
             The epsilon to use in the layer normalization layers.
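
A minimal sanity check for reviewers, assuming a `transformers` install that includes this change: instantiating `BloomConfig` with no arguments should now print the same default values the updated docstring documents.

```python
from transformers import BloomConfig

# Instantiate with no arguments to pick up the class defaults;
# the printed values should match the docstring above.
config = BloomConfig()
print(config.vocab_size)          # expected: 250880
print(config.hidden_size)         # expected: 64
print(config.n_layer)             # expected: 2
print(config.n_head)              # expected: 8
print(config.layer_norm_epsilon)  # expected: 1e-05
```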