Fix AMD FP8 Test and use native rowwise quantization in benchmark (pytorch#2849)

Summary:
Pull Request resolved: pytorch#2849

Fix a minor test issue where Triton blockwise quantization was running on AMD despite not being supported. I also switch rowwise quantization in our FP8 benchmarks to the native HIP implementation.

Reviewed By: jianyuh

Differential Revision: D59771162
jwfromm · Jul 16, 2024 · 2cbdfbe · 2cbdfbe
1 parent 903e928
commit 2cbdfbe
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 3 deletions.
diff --git a/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py b/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
@@ -271,8 +271,8 @@ class FP8RowwiseGemm(QuantizeOpBase):
 
     def quantize(self, x, w):
         # Quantize both input tensors.
-        xq, x_scale = quantize_fp8_row(x)
-        wq, w_scale = quantize_fp8_row(w)
+        xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(x)
+        wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
         return xq, wq, x_scale, w_scale
 
     def compute(self, xq, wq, x_scale, w_scale):

diff --git a/fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py b/fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py
@@ -165,7 +165,7 @@ def test_f8f8bf16(self, kernel: str, use_fast_accum: bool) -> None:
         QType=st.sampled_from([fp8_e4m3, fp8_e5m2]),
         Bias=st.sampled_from([True, False]),
         CudaGraph=st.sampled_from([True, False]),
-        UseTriton=st.sampled_from([True, False]),
+        UseTriton=st.sampled_from([False] + ([True] if torch.version.cuda else [])),
         InputMultiDim=st.booleans(),
     )
     def test_quantize_fp8_matmul(