Skip to content
This repository
Browse code

Near optimal mat44_multiply

  • Loading branch information...
commit af9ddad76c542a924f2b56dd00f72ba7338569b4 1 parent 9d57f46
Julien Cayzac authored December 26, 2010
53  .attic/memcpy_memset.txt
... ...
@@ -0,0 +1,53 @@
  1
+
  2
+(Need to enable PLD, see
  3
+http://infocenter.arm.com/help/topic/com.arm.doc.ddi0344b/Babjbfdb.html
  4
+http://infocenter.arm.com/help/topic/com.arm.doc.ddi0344k/Cbbbdaed.html
  5
+http://infocenter.arm.com/help/topic/com.arm.doc.ddi0344k/Bgbciiaf.html
  6
+)
  7
+
  8
@-----------------------------------------------------------------------
@ void *memcpy_neon(void *dst, const void *src, size_t n)
@
@ Copies n bytes from src to dst in 128-byte chunks (32 bytes through
@ r4-r11 with ldm/stm, 96 bytes through NEON d-registers), then copies
@ the remaining n % 128 bytes one at a time.
@
@ ABI:   AAPCS, ARMv7-A with NEON, GNU as syntax
@ In:    r0 = dst, r1 = src, r2 = n (bytes, unsigned)
@ Out:   r0 = dst (never written)
@ Clobb: r1-r3, r12, d0-d7, d16-d19, flags
@        r4-r11 are saved and restored when the chunk loop runs.
@ Pre:   when n >= 128, dst and src must be 16-byte aligned: the
@        vld1/vst1 accesses carry the :128 alignment qualifier.
@        Buffers must not overlap.
@-----------------------------------------------------------------------
        .equ            MEMCPY_CHUNK, 128       @ bytes moved per loop iteration

memcpy_neon:
        mov             r3, r0                  @ r3 = write cursor, r0 stays = dst
        subs            r2, r2, #MEMCPY_CHUNK   @ r2 = n - 128; carry clear if n < 128
        blo             2f                      @ no full chunk: byte tail only
        push            {r4-r11}                @ staging registers for ldm/stm

        @ Invariant at 1: at least one full chunk is left to copy.
        @ The subs is issued first so its flags are ready at the bhs;
        @ none of the pld/ldm/stm/vld1/vst1 in between write the flags.
1:      subs            r2, r2, #MEMCPY_CHUNK   @ pre-decrement for the next chunk
        pld             [r1, #64]               @ prefetch hints on the source
        pld             [r1, #256]
        pld             [r1, #320]
        ldm             r1!, {r4-r11}           @ chunk bytes   0..31
        vld1.64         {d0-d3},   [r1,:128]!   @ chunk bytes  32..63
        vld1.64         {d4-d7},   [r1,:128]!   @ chunk bytes  64..95
        vld1.64         {d16-d19}, [r1,:128]!   @ chunk bytes  96..127 (d8-d15 avoided: callee-saved)
        stm             r3!, {r4-r11}
        vst1.64         {d0-d3},   [r3,:128]!
        vst1.64         {d4-d7},   [r3,:128]!
        vst1.64         {d16-d19}, [r3,:128]!
        bhs             1b                      @ unsigned: another full chunk remains
        pop             {r4-r11}

        @ On both paths r2 == (bytes left) - 128 here.
2:      adds            r2, r2, #MEMCPY_CHUNK   @ r2 = n % 128
        beq             4f
3:      ldrb            r12, [r1], #1           @ byte tail, r12 is caller-saved scratch
        strb            r12, [r3], #1
        subs            r2, r2, #1
        bne             3b
4:      bx              lr
  26
+
  27
+memset_neon_1: http://gitorious.org/0xdroid/bionic/commit/780898e723d883e0ed13387f11066275121048b9
  28
@-----------------------------------------------------------------------
@ void *memset_neon_2(void *dst, int c, size_t n)
@
@ Fills n bytes at dst with (unsigned char)c using two store streams.
@ Let bulk = n rounded down to a multiple of 128. Each loop iteration
@ stores 32 bytes through r4-r11 at the head cursor and 96 bytes through
@ NEON at a second cursor that starts at dst + bulk/4. After bulk/128
@ iterations the head stream has covered [dst, dst + bulk/4) and the
@ NEON stream [dst + bulk/4, dst + bulk). The last n % 128 bytes are
@ stored one at a time.
@
@ ABI:   AAPCS, ARMv7-A with NEON, GNU as syntax
@ In:    r0 = dst, r1 = c, r2 = n (bytes, unsigned)
@ Out:   r0 = dst (never written)
@ Clobb: r1-r3, r12, q0, q1, flags
@        r4-r11 are saved and restored when the chunk loop runs.
@ Pre:   when n >= 128, dst must be 16-byte aligned: the vst1 stores
@        carry the :128 alignment qualifier (bulk/4 is a multiple of 32,
@        so the NEON cursor keeps dst's alignment).
@-----------------------------------------------------------------------
        .equ            MEMSET_CHUNK, 128       @ bytes stored per loop iteration

memset_neon_2:
        and             r1,  r1, #0xff          @ only the low byte of c is the fill value
        mov             r3,  r0                 @ r3 = head cursor, r0 stays = dst
        bic             r12, r2, #(MEMSET_CHUNK - 1)    @ r12 = bulk
        add             r12, r0, r12, lsr #2    @ r12 = NEON cursor = dst + bulk/4
        subs            r2,  r2, #MEMSET_CHUNK  @ r2 = n - 128; carry clear if n < 128
        blo             2f                      @ bulk == 0, so r12 == dst: byte tail only
        push            {r4-r11}

        vdup.8          q0,  r1                 @ d0-d3 = fill byte replicated
        vmov            q1,  q0
        orr             r4,  r1, r1, lsl #8     @ r4 = fill byte in all four lanes
        orr             r4,  r4, r4, lsl #16
        mov             r5,  r4
        mov             r6,  r4
        mov             r7,  r4
        mov             r8,  r4
        mov             r9,  r4
        mov             r10, r4
        mov             r11, r4

        @ Invariant at 1: at least one full chunk is left to fill.
        @ pld/stm/vst1 do not write the flags, so the subs result is
        @ still valid at the bhs.
1:      subs            r2,  r2, #MEMSET_CHUNK  @ pre-decrement for the next chunk
        pld             [r3, #64]               @ hint on the destination, kept from the original
        stm             r3!, {r4-r11}           @ 32 bytes, head stream
        vst1.64         {d0-d3},   [r12,:128]!  @ 96 bytes, NEON stream
        vst1.64         {d0-d3},   [r12,:128]!
        vst1.64         {d0-d3},   [r12,:128]!
        bhs             1b                      @ unsigned: another full chunk remains
        pop             {r4-r11}

        @ On both paths r12 == dst + bulk and r2 == (bytes left) - 128.
2:      adds            r2,  r2, #MEMSET_CHUNK  @ r2 = n % 128
        beq             4f
3:      strb            r1,  [r12], #1          @ byte tail
        subs            r2,  r2, #1
        bne             3b
4:      bx              lr
  52
+
  53
+
2  common.h
@@ -6,7 +6,7 @@
6 6
 #endif
7 7
 
8 8
 #ifndef __ARM_NEON__
9  
-	#error "Please add -ffpu=neon to your compile flags"
  9
+	#error "Please add -mfpu=neon to your compile flags"
10 10
 #endif
11 11
 
12 12
 // Most needed header
36  vectormath/mat44_multiply.h
@@ -2,8 +2,39 @@
2 2
 #include <armv7-functions/common.h>
3 3
 
4 4
 ARMV7_FUNC_API void mat44_multiply(float32x4x4_t& result, const float32x4x4_t& a, const float32x4x4_t& b) {
5  
-	asm volatile (
6  
-	"\n\t# *static_cast<mat44*>(%m[result]) = *static_cast<mat44*>(%m[a]) * *static_cast<mat44*>(%m[b]);\n\t"
  5
+	// result = first column of B x first row of A
  6
+	result.val[0] = vmulq_lane_f32(b.val[0], vget_low_f32(a.val[0]), 0);
  7
+	result.val[1] = vmulq_lane_f32(b.val[0], vget_low_f32(a.val[1]), 0);
  8
+	result.val[2] = vmulq_lane_f32(b.val[0], vget_low_f32(a.val[2]), 0);
  9
+	result.val[3] = vmulq_lane_f32(b.val[0], vget_low_f32(a.val[3]), 0);
  10
+	// result += second column of B x second row of A
  11
+	result.val[0] = vmlaq_lane_f32(result.val[0], b.val[1], vget_low_f32(a.val[0]), 1);
  12
+	result.val[1] = vmlaq_lane_f32(result.val[1], b.val[1], vget_low_f32(a.val[1]), 1);
  13
+	result.val[2] = vmlaq_lane_f32(result.val[2], b.val[1], vget_low_f32(a.val[2]), 1);
  14
+	result.val[3] = vmlaq_lane_f32(result.val[3], b.val[1], vget_low_f32(a.val[3]), 1);
  15
+	// result += third column of B x third row of A
  16
+	result.val[0] = vmlaq_lane_f32(result.val[0], b.val[2], vget_high_f32(a.val[0]), 0);
  17
+	result.val[1] = vmlaq_lane_f32(result.val[1], b.val[2], vget_high_f32(a.val[1]), 0);
  18
+	result.val[2] = vmlaq_lane_f32(result.val[2], b.val[2], vget_high_f32(a.val[2]), 0);
  19
+	result.val[3] = vmlaq_lane_f32(result.val[3], b.val[2], vget_high_f32(a.val[3]), 0);
  20
+	// result += last column of B x last row of A
  21
+	result.val[0] = vmlaq_lane_f32(result.val[0], b.val[3], vget_high_f32(a.val[0]), 1);
  22
+	result.val[1] = vmlaq_lane_f32(result.val[1], b.val[3], vget_high_f32(a.val[1]), 1);
  23
+	result.val[2] = vmlaq_lane_f32(result.val[2], b.val[3], vget_high_f32(a.val[2]), 1);
  24
+	result.val[3] = vmlaq_lane_f32(result.val[3], b.val[3], vget_high_f32(a.val[3]), 1);
  25
+
  26
+#if 0
  27
+	// Original, hand-written assembly:
  28
+	// Pros:
  29
+	//    * used vldmia/vstmia, which gcc can't at this point
  30
+	//    * used Um constraints and %m operand, allowing gcc
  31
+	//      to use "sp" where it made sense
  32
+	// Cons:
  33
+	//    * performed the full matrix multiplication, even
  34
+	//      when only a sub-expression was really used. The
  35
+	//      intrinsics code above, OTOH, has parts optimized
  36
+	//      away by the compiler.
  37
+	asm volatile(
7 38
 	"vldmia   %m[a], {q4-q7}\n\t"
8 39
 	"vldmia   %m[b], {q8-q11}\n\t"
9 40
 	"vmul.f32 q0,  q8,  d8[0]\n\t"
@@ -30,5 +61,6 @@ ARMV7_FUNC_API void mat44_multiply(float32x4x4_t& result, const float32x4x4_t& a
30 61
 	  "q4", "q5", "q6", "q7",
31 62
 	  "q8", "q9","q10", "q11"
32 63
 	);
  64
+#endif
33 65
 }
34 66
 

0 notes on commit af9ddad

Please sign in to comment.
Something went wrong with that request. Please try again.