[X86][AVX] EltsFromConsecutiveLoads - Add BROADCAST lowering support
This patch adds scalar/subvector BROADCAST handling to EltsFromConsecutiveLoads.

It mainly shows codegen changes to 32-bit code, which previously failed to handle i64 loads, although 64-bit code also uses this new path to combine to a broadcast load more efficiently.
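
For example, splatting an i64 loaded from memory across a <4 x i64> vector, the pattern exercised by the @A test in avx-vbroadcast.ll below, now goes through this path. A minimal IR sketch of that pattern (the function name and body here are reconstructed for illustration; only the updated CHECK lines appear in the diff):

define <4 x i64> @splat_i64_load(i64* %ptr) {
entry:
  ; Load a single i64 and splat it into all four lanes.
  %q = load i64, i64* %ptr, align 8
  %ins = insertelement <4 x i64> undef, i64 %q, i32 0
  %splat = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %splat
}

On 32-bit AVX targets this previously lowered to a scalar vmovsd followed by vpermilps and vinsertf128; with this patch it becomes a single vbroadcastsd from memory.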

Differential Revision: https://reviews.llvm.org/D58053

llvm-svn: 354340
RKSimon committed Feb 19, 2019
1 parent baff199 commit 952abce
Showing 11 changed files with 129 additions and 145 deletions.
73 changes: 70 additions & 3 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7384,12 +7384,15 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
VT.is256BitVector() && !Subtarget.hasInt256())
return SDValue();

if (NumElems == 1)
return DAG.getBitcast(VT, Elts[FirstLoadedElt]);

if (IsConsecutiveLoad)
return CreateLoad(VT, LDBase);

// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
if (!isAfterLegalize && VT.isVector() && NumElems == VT.getVectorNumElements()) {
SmallVector<int, 4> ClearMask(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
if (ZeroMask[i])
@@ -7404,8 +7407,23 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}

int LoadSize =
(1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
unsigned BaseSize = LDBaseVT.getStoreSizeInBits();
int LoadSize = (1 + LastLoadedElt - FirstLoadedElt) * BaseSize;

// If the upper half of a ymm/zmm load is undef then just load the lower half.
if (VT.is256BitVector() || VT.is512BitVector()) {
unsigned HalfNumElems = NumElems / 2;
if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
EVT HalfVT =
EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
SDValue HalfLD =
EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
DAG, Subtarget, isAfterLegalize);
if (HalfLD)
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
HalfLD, DAG.getIntPtrConstant(0, DL));
}
}

// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
@@ -7428,6 +7446,55 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}

// BROADCAST - match the smallest possible repetition pattern, load that
// scalar/subvector element and then broadcast to the entire vector.
if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) &&
(BaseSize % 8) == 0 && Subtarget.hasAVX() &&
(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
unsigned RepeatSize = SubElems * BaseSize;
unsigned ScalarSize = std::min(RepeatSize, 64u);
if (!Subtarget.hasAVX2() && ScalarSize < 32)
continue;

bool Match = true;
SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(LDBaseVT));
for (unsigned i = 0; i != NumElems && Match; ++i) {
if (!LoadMask[i])
continue;
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (RepeatedLoads[i % SubElems].isUndef())
RepeatedLoads[i % SubElems] = Elt;
else
Match &= (RepeatedLoads[i % SubElems] == Elt);
}

// We must have loads at both ends of the repetition.
Match &= !RepeatedLoads.front().isUndef();
Match &= !RepeatedLoads.back().isUndef();
if (!Match)
continue;

EVT RepeatVT =
VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
: EVT::getFloatingPointVT(ScalarSize);
if (RepeatSize > ScalarSize)
RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
RepeatSize / ScalarSize);
if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
VT.getSizeInBits() / ScalarSize);
unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
: X86ISD::VBROADCAST;
SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
return DAG.getBitcast(VT, Broadcast);
}
}
}

return SDValue();
}

14 changes: 4 additions & 10 deletions llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -6,9 +6,7 @@ define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: A:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: A:
@@ -34,11 +32,9 @@ define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %edx
; X32-NEXT: movl 4(%ecx), %esi
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vbroadcastsd (%ecx), %ymm0
; X32-NEXT: movl %edx, (%eax)
; X32-NEXT: movl %esi, 4(%eax)
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
@@ -590,8 +586,7 @@ define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: G:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: G:
@@ -615,10 +610,9 @@ define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %edx
; X32-NEXT: movl 4(%ecx), %esi
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: movl %edx, (%eax)
; X32-NEXT: movl %esi, 4(%eax)
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -207,8 +207,7 @@ define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: QQ64:
; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vbroadcastsd %xmm0, %ymm0
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: QQ64:
@@ -1368,8 +1367,7 @@ define void @isel_crash_4q(i64* %cV_R.addr) {
; X32-NEXT: movl 8(%ebp), %eax
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: vmovaps %ymm0, (%esp)
; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: vbroadcastsd %xmm1, %ymm1
; X32-NEXT: vbroadcastsd (%eax), %ymm1
; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
; X32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
; X32-NEXT: movl %ebp, %esp
36 changes: 11 additions & 25 deletions llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -60,15 +60,13 @@ declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i
define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
; X86-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
; X86: ## %bb.0:
; X86-NEXT: vmovq {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x4c,0x24,0x04]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd1]
; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x8c,0x24,0x04,0x00,0x00,0x00]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x59,0xc1]
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x59,0xc9]
; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc1]
; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
; X86-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0xc1]
; X86-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0xd1]
; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
; X86-NEXT: retl ## encoding: [0xc3]
;
; X64-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
@@ -2253,9 +2251,7 @@ define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
; X86-LABEL: test_mask_add_epi64_rmb:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08]
; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc1]
; X86-NEXT: retl ## encoding: [0xc3]
;
@@ -2274,9 +2270,7 @@ define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64>
; X86-LABEL: test_mask_add_epi64_rmbk:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
; X86-NEXT: ## xmm2 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
; X86-NEXT: vpbroadcastq (%eax), %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0xca]
@@ -2300,9 +2294,7 @@ define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask)
; X86-LABEL: test_mask_add_epi64_rmbkz:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0xc1]
@@ -2427,9 +2419,7 @@ define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
; X86-LABEL: test_mask_sub_epi64_rmb:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08]
; X86-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xfb,0xc1]
; X86-NEXT: retl ## encoding: [0xc3]
;
@@ -2448,9 +2438,7 @@ define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64>
; X86-LABEL: test_mask_sub_epi64_rmbk:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
; X86-NEXT: ## xmm2 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
; X86-NEXT: vpbroadcastq (%eax), %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpsubq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0xca]
@@ -2474,9 +2462,7 @@ define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask)
; X86-LABEL: test_mask_sub_epi64_rmbkz:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0xc1]
29 changes: 8 additions & 21 deletions llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -2011,9 +2011,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmb_512(<8 x i64> %a, i64* %ptr_b) {
; X86-LABEL: test_mask_mullo_epi64_rmb_512:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: # xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastq (%eax), %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08]
; X86-NEXT: vpmullq %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x40,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -2032,9 +2030,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x
; X86-LABEL: test_mask_mullo_epi64_rmbk_512:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
; X86-NEXT: # xmm2 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm2, %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10]
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
; X86-NEXT: vpmullq %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x40,0xca]
; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
@@ -2057,9 +2053,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmbkz_512(<8 x i64> %a, i64* %ptr_b, i8
; X86-LABEL: test_mask_mullo_epi64_rmbkz_512:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: # xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastq (%eax), %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08]
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
; X86-NEXT: vpmullq %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x40,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
@@ -2178,9 +2172,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
; X86-LABEL: test_mask_mullo_epi64_rmb_256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: # xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc9]
; X86-NEXT: vpbroadcastq (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x08]
; X86-NEXT: vpmullq %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x40,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -2199,9 +2191,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x
; X86-LABEL: test_mask_mullo_epi64_rmbk_256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
; X86-NEXT: # xmm2 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd2]
; X86-NEXT: vpbroadcastq (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x10]
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
; X86-NEXT: vpmullq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x40,0xca]
; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2224,9 +2214,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8
; X86-LABEL: test_mask_mullo_epi64_rmbkz_256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: # xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc9]
; X86-NEXT: vpbroadcastq (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x08]
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
; X86-NEXT: vpmullq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x40,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
@@ -2718,10 +2706,9 @@ define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x2_256(<4 x i32> %x0, <8 x
; X86: # %bb.0:
; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT: vmovq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
; X86-NEXT: # xmm2 = mem[0],zero
; X86-NEXT: vpbroadcastq (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x10]
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vinserti32x4 $1, %xmm2, %ymm2, %ymm1 {%k1} # encoding: [0x62,0xf3,0x6d,0x29,0x38,0xca,0x01]
; X86-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xca]
; X86-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01]
; X86-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc0,0x01]
; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]