Gemma bug fixes - Approx GELU, Layernorms, Sqrt(hd) #29402

Closed · wants to merge 13 commits
7 changes: 4 additions & 3 deletions src/transformers/models/gemma/configuration_gemma.py
@@ -57,8 +57,9 @@ class GemmaConfig(PretrainedConfig):
`num_attention_heads`.
head_dim (`int`, *optional*, defaults to 256):
The attention head dimension.
- hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
-     The non-linear activation function (function or string) in the decoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+     The non-linear activation function (function or string) in the decoder. "gelu_pytorch_tanh" uses an
+     approximation of the more exact "gelu" activation function.
max_position_embeddings (`int`, *optional*, defaults to 8192):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
@@ -108,7 +109,7 @@ def __init__(
num_attention_heads=16,
num_key_value_heads=16,
head_dim=256,
hidden_act="gelu",
hidden_act="gelu_pytorch_tanh",
max_position_embeddings=8192,
initializer_range=0.02,
rms_norm_eps=1e-6,
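For reference (not part of the diff): exact GELU computes x * Φ(x) with the Gaussian CDF, while "gelu_pytorch_tanh" computes the tanh approximation 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))), which is what Gemma expects. A minimal PyTorch sketch comparing the two (assumes only torch; not taken from this PR):

    import torch

    x = torch.linspace(-4.0, 4.0, steps=9)

    # Exact GELU: x * Phi(x), computed via erf.
    gelu_exact = torch.nn.GELU()

    # Tanh approximation; this is what transformers' ACT2FN["gelu_pytorch_tanh"] resolves to.
    gelu_tanh = torch.nn.GELU(approximate="tanh")

    # The two curves are close but not identical, so using the wrong one shifts Gemma's activations.
    print((gelu_exact(x) - gelu_tanh(x)).abs().max())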
11 changes: 10 additions & 1 deletion src/transformers/models/gemma/modeling_gemma.py
@@ -169,7 +169,16 @@ def __init__(self, config):
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
- self.act_fn = ACT2FN[config.hidden_act]
+ hidden_act = config.hidden_act
+ if hidden_act != "gelu_pytorch_tanh":
Contributor:

This way, even if a model has the old gelu in its config, we're force-setting hidden_act to "gelu_pytorch_tanh", right?
I think we should either use a new config name or create a new attribute in the config, force_use_exact_gelu, initialized to False, so that users keep the flexibility to switch back to the old activation function in case they fine-tuned with the old GeLU. What do you think?

Contributor Author:

Hmm, I like that approach, i.e. getattr(config, "force_use_exact_gelu", False): it returns True when the attribute is set to True, False when it is set to False, and False when the attribute is absent.

Contributor:

Yes! I think we can add that directly to the GemmaConfig class and default it to False.

Contributor Author:

I added force_use_exact_gelu!

+     logger.warning_once(
+         "Gemma's activation function should be approximate GeLU and not exact GeLU.\n"
+         "Please edit your model config to use `gelu_pytorch_tanh` and not `gelu`.\n"
+         "For now, we shall use `gelu_pytorch_tanh` temporarily.\n"
+         "See https://github.com/huggingface/transformers/pull/29402 for more details."
+     )
+     hidden_act = "gelu_pytorch_tanh"
+ self.act_fn = ACT2FN[hidden_act]
Collaborator:

I don't mind automatically switching, but it's best if users still have a way to use the legacy gelu! Either a big warning or another config name.

Collaborator:

So we need a self.hidden_activation attribute that defaults to None: if it is None, warn that we will use the new approximate GeLU; otherwise, use whatever was given.

Contributor Author:

Oh OK, good point! Sorry, I didn't work on this in the meantime. I found a few more issues and will push them here tomorrow :)


def forward(self, x):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
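Following the thread above, here is a rough sketch of the None-defaulting config attribute the reviewers suggest; the attribute name hidden_activation and the warning text are illustrative assumptions based on the comments, not the code merged in this PR:

    from torch import nn
    from transformers.activations import ACT2FN
    from transformers.utils import logging

    logger = logging.get_logger(__name__)


    class GemmaMLP(nn.Module):
        # Sketch only: a config attribute (assumed name `hidden_activation`) defaulting to None,
        # with a warn-and-fallback to the approximate GeLU when it is unset.
        def __init__(self, config):
            super().__init__()
            self.hidden_size = config.hidden_size
            self.intermediate_size = config.intermediate_size
            self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
            self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
            self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
            hidden_activation = getattr(config, "hidden_activation", None)
            if hidden_activation is None:
                logger.warning_once(
                    "`config.hidden_activation` is not set; falling back to the approximate GeLU "
                    "(`gelu_pytorch_tanh`) that Gemma expects. Set `hidden_activation` explicitly "
                    "to override this."
                )
                hidden_activation = "gelu_pytorch_tanh"
            self.act_fn = ACT2FN[hidden_activation]

        def forward(self, x):
            return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

With an approach like this, users who fine-tuned with the exact GeLU could keep their behavior by setting the attribute to "gelu" in their config, while unmodified configs get the warning and the approximate activation.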