Permalink
Browse files

Near optimal mat44_multiply

  • Loading branch information...
1 parent 9d57f46 commit af9ddad76c542a924f2b56dd00f72ba7338569b4 @jcayzac committed Dec 26, 2010
Showing with 88 additions and 3 deletions.
  1. +53 −0 .attic/memcpy_memset.txt
  2. +1 −1 common.h
  3. +34 −2 vectormath/mat44_multiply.h
@@ -0,0 +1,53 @@
+
+(Need to enable PLD, see
+http://infocenter.arm.com/help/topic/com.arm.doc.ddi0344b/Babjbfdb.html
+http://infocenter.arm.com/help/topic/com.arm.doc.ddi0344k/Cbbbdaed.html
+http://infocenter.arm.com/help/topic/com.arm.doc.ddi0344k/Bgbciiaf.html
+)
+
+memcpy_neon:
+ push {r4-r11}
+ mov r3, r0
+1: subs r2, r2, #128
+ pld [r1, #64]
+ pld [r1, #256]
+ pld [r1, #320]
+ ldm r1!, {r4-r11}
+ vld1.64 {d0-d3}, [r1,:128]!
+ vld1.64 {d4-d7}, [r1,:128]!
+ vld1.64 {d16-d19}, [r1,:128]!
+ stm r3!, {r4-r11}
+ vst1.64 {d0-d3}, [r3,:128]!
+ vst1.64 {d4-d7}, [r3,:128]!
+ vst1.64 {d16-d19}, [r3,:128]!
+ bgt 1b
+ pop {r4-r11}
+ bx lr
+
+memset_neon_1: http://gitorious.org/0xdroid/bionic/commit/780898e723d883e0ed13387f11066275121048b9
+memset_neon_2:
+ push {r4-r11}
+ mov r3, r0
+ vdup.8 q0, r1
+ vmov q1, q0
+ orr r4, r1, r1, lsl #8
+ orr r4, r4, r4, lsl #16
+ mov r5, r4
+ mov r6, r4
+ mov r7, r4
+ mov r8, r4
+ mov r9, r4
+ mov r10, r4
+ mov r11, r4
+ add r12, r3, r2, lsr #2
+1: subs r2, r2, #128
+ pld [r3, #64]
+ stm r3!, {r4-r11}
+ vst1.64 {d0-d3}, [r12,:128]!
+ vst1.64 {d0-d3}, [r12,:128]!
+ vst1.64 {d0-d3}, [r12,:128]!
+ bgt 1b
+ pop {r4-r11}
+ bx lr
+
+
View
@@ -6,7 +6,7 @@
#endif
#ifndef __ARM_NEON__
- #error "Please add -ffpu=neon to your compile flags"
+ #error "Please add -mfpu=neon to your compile flags"
#endif
// Most needed header
@@ -2,8 +2,39 @@
#include <armv7-functions/common.h>
ARMV7_FUNC_API void mat44_multiply(float32x4x4_t& result, const float32x4x4_t& a, const float32x4x4_t& b) {
- asm volatile (
- "\n\t# *static_cast<mat44*>(%m[result]) = *static_cast<mat44*>(%m[a]) * *static_cast<mat44*>(%m[b]);\n\t"
+ // result = first column of B x first row of A
+ result.val[0] = vmulq_lane_f32(b.val[0], vget_low_f32(a.val[0]), 0);
+ result.val[1] = vmulq_lane_f32(b.val[0], vget_low_f32(a.val[1]), 0);
+ result.val[2] = vmulq_lane_f32(b.val[0], vget_low_f32(a.val[2]), 0);
+ result.val[3] = vmulq_lane_f32(b.val[0], vget_low_f32(a.val[3]), 0);
+ // result += second column of B x second row of A
+ result.val[0] = vmlaq_lane_f32(result.val[0], b.val[1], vget_low_f32(a.val[0]), 1);
+ result.val[1] = vmlaq_lane_f32(result.val[1], b.val[1], vget_low_f32(a.val[1]), 1);
+ result.val[2] = vmlaq_lane_f32(result.val[2], b.val[1], vget_low_f32(a.val[2]), 1);
+ result.val[3] = vmlaq_lane_f32(result.val[3], b.val[1], vget_low_f32(a.val[3]), 1);
+ // result += third column of B x third row of A
+ result.val[0] = vmlaq_lane_f32(result.val[0], b.val[2], vget_high_f32(a.val[0]), 0);
+ result.val[1] = vmlaq_lane_f32(result.val[1], b.val[2], vget_high_f32(a.val[1]), 0);
+ result.val[2] = vmlaq_lane_f32(result.val[2], b.val[2], vget_high_f32(a.val[2]), 0);
+ result.val[3] = vmlaq_lane_f32(result.val[3], b.val[2], vget_high_f32(a.val[3]), 0);
+ // result += last column of B x last row of A
+ result.val[0] = vmlaq_lane_f32(result.val[0], b.val[3], vget_high_f32(a.val[0]), 1);
+ result.val[1] = vmlaq_lane_f32(result.val[1], b.val[3], vget_high_f32(a.val[1]), 1);
+ result.val[2] = vmlaq_lane_f32(result.val[2], b.val[3], vget_high_f32(a.val[2]), 1);
+ result.val[3] = vmlaq_lane_f32(result.val[3], b.val[3], vget_high_f32(a.val[3]), 1);
+
+#if 0
+ // Original, hand-written assembly:
+ // Pros:
+ // * used vldmia/vstmia, which gcc can't at this point
+ // * used Um constraints and %m operand, allowing gcc
+ // to use "sp" where it made sense
+ // Cons:
+ // * performed the full matrix multiplication, even
+ // when only a sub-expression was really used. The
+ // intrinsics code abode, OTOH, has parts optimized
+ // away by the compiler.
+ asm volatile(
"vldmia %m[a], {q4-q7}\n\t"
"vldmia %m[b], {q8-q11}\n\t"
"vmul.f32 q0, q8, d8[0]\n\t"
@@ -30,5 +61,6 @@ ARMV7_FUNC_API void mat44_multiply(float32x4x4_t& result, const float32x4x4_t& a
"q4", "q5", "q6", "q7",
"q8", "q9","q10", "q11"
);
+#endif
}

0 comments on commit af9ddad

Please sign in to comment.