Commit

fix
OlivierDehaene committed Jul 18, 2023
1 parent bc2f351 commit d186b13
Showing 4 changed files with 38 additions and 29 deletions.
4 changes: 2 additions & 2 deletions server/Makefile-flash-att-v2
@@ -1,12 +1,12 @@
-flash_att_commit := 4f285b354796fb17df8636485b9a04df3ebbb7dc
+flash_att_v2_commit := 4f285b354796fb17df8636485b9a04df3ebbb7dc
 
 flash-attention-v2:
 	# Clone flash attention
 	pip install packaging
 	git clone https://github.com/HazyResearch/flash-attention.git flash-attention-v2
 
 build-flash-attention-v2: flash-attention-v2
-	cd flash-attention-v2 && git fetch && git checkout $(flash_att_commit)
+	cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit)
 	cd flash-attention-v2 && python setup.py build
 
 install-flash-attention-v2: build-flash-attention-v2
6 changes: 2 additions & 4 deletions server/text_generation_server/models/__init__.py
@@ -55,10 +55,8 @@
         FlashSantacoderSharded,
     )
 
-except ImportError:
-    logger.opt(exception=True).warning(
-        "Could not import Flash Attention enabled models"
-    )
+except ImportError as e:
+    logger.warning(f"Could not import Flash Attention enabled models: {e}")
     FLASH_ATTENTION = False
 
 if FLASH_ATTENTION:
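The rewritten handler drops the full traceback that logger.opt(exception=True) would attach and logs only the exception message before falling back. A minimal sketch of the resulting optional-import pattern, with one illustrative import standing in for the real block of flash-attention model imports (FlashSantacoderSharded and friends):

from loguru import logger

FLASH_ATTENTION = True
try:
    # Stand-in for the real block of flash-attention model imports.
    from text_generation_server.models.flash_llama import FlashLlama  # noqa: F401
except ImportError as e:
    # Missing flash attention is an expected condition on many machines,
    # so a one-line warning is enough; no traceback is logged.
    logger.warning(f"Could not import Flash Attention enabled models: {e}")
    FLASH_ATTENTION = False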
@@ -188,7 +188,7 @@ def forward(
                 attn_output,
                 cu_seqlen_prefill,
                 max_s,
-                self.softmax_scale
+                self.softmax_scale,
             )
         # Decode
         else:
@@ -308,7 +308,7 @@ def forward(
                 attn_output,
                 cu_seqlen_prefill,
                 max_s,
-                self.softmax_scale
+                self.softmax_scale,
             )
         # Decode
         else:
53 changes: 32 additions & 21 deletions server/text_generation_server/utils/flash_attn.py
@@ -1,6 +1,8 @@
 import os
 import torch
 
+from loguru import logger
+
 if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
     raise ImportError("`USE_FLASH_ATTENTION` is false.")
 
@@ -18,10 +20,11 @@
     try:
         import flash_attn_2_cuda
     except ImportError:
-        raise ImportError("Flash Attention V2 is not installed.\n"
-                          "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
-                          "or install flash attention v2 with `cd server && make install install-flash-attention-v2`"
-                          )
+        raise ImportError(
+            "Flash Attention V2 is not installed.\n"
+            "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
+            "or install flash attention v2 with `cd server && make install install-flash-attention-v2`"
+        )
     if not (is_sm8x or is_sm90):
         raise ImportError(
             f"GPU with CUDA capability {major} {minor} is not supported for "
@@ -32,26 +35,28 @@
     try:
         import flash_attn_cuda
     except ImportError:
-        raise ImportError("Flash Attention is not installed.\n"
-                          "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
-                          "or install flash attention with `cd server && make install install-flash-attention`"
-                          ) from e
+        raise ImportError(
+            "Flash Attention is not installed.\n"
+            "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
+            "or install flash attention with `cd server && make install install-flash-attention`"
+        ) from e
 
     if not (is_sm75 or is_sm8x or is_sm90):
         raise ImportError(
             f"GPU with CUDA capability {major} {minor} is not supported"
         ) from e
+    logger.warning(f"Unable to use Flash Attention V2: {e}")
     HAS_FLASH_ATTN = True
 
 
 def attention(
-        q,
-        k,
-        v,
-        out,
-        cu_seqlens,
-        max_s,
-        softmax_scale,
+    q,
+    k,
+    v,
+    out,
+    cu_seqlens,
+    max_s,
+    softmax_scale,
 ):
     if HAS_FLASH_ATTN_V2:
         return flash_attn_2_cuda.varlen_fwd(
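Taken together, the two guarded import blocks in this file implement a graceful fallback: try the Flash Attention V2 kernel first and, if that raises, fall back to V1 with a warning instead of crashing. A condensed sketch of that flow, assuming the sm75/sm8x/sm90 flags are derived from torch.cuda.get_device_capability():

import torch
from loguru import logger

HAS_FLASH_ATTN = False
HAS_FLASH_ATTN_V2 = False

# Assumed derivation of the capability flags used below.
major, minor = torch.cuda.get_device_capability()
is_sm75 = major == 7 and minor == 5
is_sm8x = major == 8 and minor >= 0
is_sm90 = major == 9 and minor == 0

try:
    try:
        import flash_attn_2_cuda
    except ImportError:
        raise ImportError("Flash Attention V2 is not installed.")
    if not (is_sm8x or is_sm90):
        raise ImportError(
            f"GPU with CUDA capability {major} {minor} is not supported for Flash Attention V2"
        )
    HAS_FLASH_ATTN_V2 = True
except ImportError as e:
    # V2 is unavailable: try the V1 kernel and keep serving, but log why.
    try:
        import flash_attn_cuda
    except ImportError:
        raise ImportError("Flash Attention is not installed.") from e
    if not (is_sm75 or is_sm8x or is_sm90):
        raise ImportError(
            f"GPU with CUDA capability {major} {minor} is not supported"
        ) from e
    logger.warning(f"Unable to use Flash Attention V2: {e}")
    HAS_FLASH_ATTN = True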
@@ -76,21 +81,27 @@ def attention(
     if k.shape[1] != q.shape[1]:
         # MQA expand
         if k.shape[1] == 1:
-            k = k.expand(-1, q.shape[1], -1, -1)
+            k = k.expand(-1, q.shape[1], -1)
         # Grouped attention reshape
         else:
             original_shape = k.shape
-            k = k.unsqueeze(2).expand(-1, -1, q.shape[1], -1, -1) \
-                .reshape(original_shape[0], -1, original_shape[1], original_shape[2])
+            k = (
+                k.unsqueeze(2)
+                .expand(-1, -1, q.shape[1] // k.shape[1], -1)
+                .reshape(original_shape[0], -1, original_shape[2])
+            )
     if v.shape[1] != q.shape[1]:
         # MQA expand
         if v.shape[1] == 1:
-            v = v.expand(-1, q.shape[1], -1, -1)
+            v = v.expand(-1, q.shape[1], -1)
         # Grouped attention reshape
         else:
             original_shape = v.shape
-            v = v.unsqueeze(2).expand(-1, -1, q.shape[1], -1, -1) \
-                .reshape(original_shape[0], -1, original_shape[1], original_shape[2])
+            v = (
+                v.unsqueeze(2)
+                .expand(-1, -1, q.shape[1] // v.shape[1], -1)
+                .reshape(original_shape[0], -1, original_shape[2])
+            )
 
     return flash_attn_cuda.fwd(
         q,
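This is the substantive fix in the commit: inside attention(), k and v are laid out as [total_tokens, num_heads, head_dim], so the previous 4-D expand/reshape indexed the wrong dimensions. A standalone sketch of the corrected grouped-attention expansion (shapes illustrative), checked against torch.repeat_interleave:

import torch

total_tokens, num_q_heads, num_kv_heads, head_dim = 16, 8, 2, 64
q = torch.randn(total_tokens, num_q_heads, head_dim)
k = torch.randn(total_tokens, num_kv_heads, head_dim)

if k.shape[1] != q.shape[1]:
    if k.shape[1] == 1:
        # MQA: broadcast the single kv head to every query head.
        k = k.expand(-1, q.shape[1], -1)
    else:
        # GQA: repeat each kv head (num_q_heads // num_kv_heads) times.
        original_shape = k.shape
        k = (
            k.unsqueeze(2)                                   # [tokens, kv_heads, 1, dim]
            .expand(-1, -1, q.shape[1] // k.shape[1], -1)    # [tokens, kv_heads, groups, dim]
            .reshape(original_shape[0], -1, original_shape[2])  # [tokens, q_heads, dim]
        )
assert k.shape == q.shape

# Sanity check: the expand/reshape matches repeat_interleave on the head axis.
k_ref = torch.randn(total_tokens, num_kv_heads, head_dim)
groups = num_q_heads // num_kv_heads
expanded = (
    k_ref.unsqueeze(2)
    .expand(-1, -1, groups, -1)
    .reshape(total_tokens, -1, head_dim)
)
assert torch.equal(expanded, k_ref.repeat_interleave(groups, dim=1))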
