transpose syntax sucks #16

Closed
abadams opened this issue Aug 17, 2012 · 1 comment
Labels: enhancement (New user-visible features or improvements to existing features), usability

Comments

abadams (Member) commented Aug 17, 2012

It's quite confusing. We need some better way to handle loop nesting order.

abadams (Member, Author) commented Feb 16, 2013

transpose has been replaced with reorder
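
For reference, a minimal sketch of what the replacement looks like in a schedule (assuming a reasonably recent Halide API; the pipeline itself is illustrative, not from this issue):

	#include "Halide.h"
	using namespace Halide;

	int main() {
	    Var x("x"), y("y");
	    Func f("f");
	    f(x, y) = x + y;

	    // reorder() lists loop variables from innermost to outermost, so this
	    // makes y the innermost loop and x the outermost -- the loop-nest swap
	    // that the old transpose() call used to express.
	    f.reorder(y, x);

	    // Prints the resulting loop nest: for x { for y { ... } }
	    f.print_loop_nest();
	    return 0;
	}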

abadams closed this as completed Feb 16, 2013
abadams added a commit that referenced this issue Dec 7, 2021
Let lerp lowering incorporate a final cast

This lets it save a few instructions on x86 and ARM.

cast(UInt(16), lerp(some_u8s)) produces the following code, before and after
this PR:
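
For concreteness, a pipeline of roughly this shape reproduces the loads at x, x-1, and x-2 visible in the inner loops below (a sketch only, assuming a recent Halide API; the actual inputs and schedule are not given in the commit message, and the names here are made up):

	#include "Halide.h"
	using namespace Halide;

	int main() {
	    // Illustrative stand-in for "some_u8s": a lerp of adjacent u8 samples,
	    // cast up to u16 as the final step.
	    ImageParam in(UInt(8), 1, "some_u8s");
	    Var x("x");

	    Func f("f");
	    f(x) = cast(UInt(16), lerp(in(x - 2), in(x - 1), in(x)));

	    f.vectorize(x, 16);
	    f.compile_to_assembly("lerp_cast.s", {in}, "lerp_cast");
	    return 0;
	}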

Before:

x86:

	vmovdqu	(%r15,%r13), %xmm4
	vpmovzxbw	-2(%r15,%r13), %ymm5
	vpxor	%xmm0, %xmm4, %xmm6
	vpmovzxbw	%xmm6, %ymm6
	vpmovzxbw	-1(%r15,%r13), %ymm7
	vpmullw	%ymm6, %ymm5, %ymm5
	vpmovzxbw	%xmm4, %ymm4
	vpmullw	%ymm4, %ymm7, %ymm4
	vpaddw	%ymm4, %ymm5, %ymm4
	vpaddw	%ymm1, %ymm4, %ymm4
	vpmulhuw	%ymm2, %ymm4, %ymm4
	vpsrlw	$7, %ymm4, %ymm4
	vpand	%ymm3, %ymm4, %ymm4
	vmovdqu	%ymm4, (%rbx,%r13,2)
	addq	$16, %r13
	decq	%r10
	jne	.LBB0_10

arm:

	ldr	q0, [x17]
	ldur	q2, [x17, #-1]
	ldur	q1, [x17, #-2]
	subs	x0, x0, #1                      // =1
	mvn	v3.16b, v0.16b
	umull	v4.8h, v2.8b, v0.8b
	umull2	v0.8h, v2.16b, v0.16b
	umlal	v4.8h, v1.8b, v3.8b
	umlal2	v0.8h, v1.16b, v3.16b
	urshr	v1.8h, v4.8h, #8
	urshr	v2.8h, v0.8h, #8
	raddhn	v1.8b, v1.8h, v4.8h
	raddhn	v0.8b, v2.8h, v0.8h
	ushll	v0.8h, v0.8b, #0
	ushll	v1.8h, v1.8b, #0
	add	x17, x17, #16                   // =16
	stp	q1, q0, [x18, #-16]
	add	x18, x18, #32                   // =32
	b.ne	.LBB0_10

After:

x86:

	vpmovzxbw	-2(%r15,%r13), %ymm3
	vmovdqu	(%r15,%r13), %xmm4
	vpxor	%xmm0, %xmm4, %xmm5
	vpmovzxbw	%xmm5, %ymm5
	vpmullw	%ymm5, %ymm3, %ymm3
	vpmovzxbw	-1(%r15,%r13), %ymm5
	vpmovzxbw	%xmm4, %ymm4
	vpmullw	%ymm4, %ymm5, %ymm4
	vpaddw	%ymm4, %ymm3, %ymm3
	vpaddw	%ymm1, %ymm3, %ymm3
	vpmulhuw	%ymm2, %ymm3, %ymm3
	vpsrlw	$7, %ymm3, %ymm3
	vmovdqu	%ymm3, (%rbp,%r13,2)
	addq	$16, %r13
	decq	%r10
	jne	.LBB0_10

arm:

	ldr	q0, [x17]
	ldur	q2, [x17, #-1]
	ldur	q1, [x17, #-2]
	subs	x0, x0, #1                      // =1
	mvn	v3.16b, v0.16b
	umull	v4.8h, v2.8b, v0.8b
	umull2	v0.8h, v2.16b, v0.16b
	umlal	v4.8h, v1.8b, v3.8b
	umlal2	v0.8h, v1.16b, v3.16b
	ursra	v4.8h, v4.8h, #8
	ursra	v0.8h, v0.8h, #8
	urshr	v1.8h, v4.8h, #8
	urshr	v0.8h, v0.8h, #8
	add	x17, x17, #16                   // =16
	stp	q1, q0, [x18, #-16]
	add	x18, x18, #32                   // =32
	b.ne	.LBB0_10

So on x86 we skip a pointless "and" instruction, and on ARM we get a
rounding add-and-shift-right instead of a rounding narrowing add-shift-right
followed by a widen.
abadams added a commit that referenced this issue Dec 10, 2021
* Let lerp lowering incorporate a final cast

* Add test

* Fix bug in test

* Don't produce out-of-range lerp values