Skip to content

Commit

Permalink
Fix AMD FP8 Test and use native rowwise quantization in benchmark (pytorch#2849)
Browse files Browse the repository at this point in the history

Summary:
Pull Request resolved: pytorch#2849

Fix a minor test issue where Triton blockwise quantization was running on AMD despite not being supported there.

I also switch rowwise quantization in our FP8 benchmarks to the native HIP implementation.

Reviewed By: jianyuh

Differential Revision: D59771162
  • Loading branch information
jwfromm authored and facebook-github-bot committed Jul 16, 2024
1 parent 903e928 commit 2cbdfbe
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
4 changes: 2 additions & 2 deletions fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,8 @@ class FP8RowwiseGemm(QuantizeOpBase):

def quantize(self, x, w):
# Quantize both input tensors.
xq, x_scale = quantize_fp8_row(x)
wq, w_scale = quantize_fp8_row(w)
xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(x)
wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
return xq, wq, x_scale, w_scale

def compute(self, xq, wq, x_scale, w_scale):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def test_f8f8bf16(self, kernel: str, use_fast_accum: bool) -> None:
QType=st.sampled_from([fp8_e4m3, fp8_e5m2]),
Bias=st.sampled_from([True, False]),
CudaGraph=st.sampled_from([True, False]),
UseTriton=st.sampled_from([True, False]),
UseTriton=st.sampled_from([False] + ([True] if torch.version.cuda else [])),
InputMultiDim=st.booleans(),
)
def test_quantize_fp8_matmul(
Expand Down

0 comments on commit 2cbdfbe

Please sign in to comment.