Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

openssl: fix build

  • Loading branch information...
commit ee73a660540528774c9a5b9894d5953313435303 1 parent 3e5b404
Fedor Indutny authored
Showing with 22,671 additions and 10,797 deletions.
  1. +6 −0 deps/openssl/asm/Makefile
  2. +17 −8 deps/openssl/asm/x64-elf-gas/aes/aes-x86_64.s
  3. +1,257 −54 deps/openssl/asm/x64-elf-gas/bn/x86_64-mont.s
  4. +1,260 −0 deps/openssl/asm/x64-elf-gas/rc4/rc4-md5-x86_64.s
  5. +464 −270 deps/openssl/asm/x64-elf-gas/rc4/rc4-x86_64.s
  6. +2,370 −1,161 deps/openssl/asm/x64-elf-gas/sha/sha1-x86_64.s
  7. +979 −1,171 deps/openssl/asm/x64-elf-gas/sha/sha512-x86_64.s
  8. +56 −13 deps/openssl/asm/x64-elf-gas/x86_64cpuid.s
  9. +13 −4 deps/openssl/asm/x64-macosx-gas/aes/aes-x86_64.s
  10. +1,256 −53 deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont.s
  11. +0 −1  deps/openssl/asm/x64-macosx-gas/md5/md5-x86_64.s
  12. +1,259 −0 deps/openssl/asm/x64-macosx-gas/rc4/rc4-md5-x86_64.s
  13. +462 −269 deps/openssl/asm/x64-macosx-gas/rc4/rc4-x86_64.s
  14. +2,369 −1,160 deps/openssl/asm/x64-macosx-gas/sha/sha1-x86_64.s
  15. +979 −1,171 deps/openssl/asm/x64-macosx-gas/sha/sha512-x86_64.s
  16. +55 −13 deps/openssl/asm/x64-macosx-gas/x86_64cpuid.s
  17. +27 −18 deps/openssl/asm/x64-win32-masm/aes/aes-x86_64.asm
  18. +1,421 −119 deps/openssl/asm/x64-win32-masm/bn/x86_64-mont.asm
  19. +2 −2 deps/openssl/asm/x64-win32-masm/camellia/cmll-x86_64.asm
  20. +140 −140 deps/openssl/asm/x64-win32-masm/md5/md5-x86_64.asm
  21. +1,375 −0 deps/openssl/asm/x64-win32-masm/rc4/rc4-md5-x86_64.asm
  22. +469 −275 deps/openssl/asm/x64-win32-masm/rc4/rc4-x86_64.asm
  23. +2,457 −1,167 deps/openssl/asm/x64-win32-masm/sha/sha1-x86_64.asm
  24. +1,031 −1,223 deps/openssl/asm/x64-win32-masm/sha/sha512-x86_64.asm
  25. +102 −102 deps/openssl/asm/x64-win32-masm/whrlpool/wp-x86_64.asm
  26. +57 −13 deps/openssl/asm/x64-win32-masm/x86_64cpuid.asm
  27. +11 −11 deps/openssl/asm/x86-elf-gas/aes/aes-586.s
  28. +5 −5 deps/openssl/asm/x86-elf-gas/camellia/cmll-x86.s
  29. +180 −38 deps/openssl/asm/x86-elf-gas/rc4/rc4-586.s
  30. +600 −662 deps/openssl/asm/x86-elf-gas/sha/sha1-586.s
  31. +31 −34 deps/openssl/asm/x86-elf-gas/sha/sha256-586.s
  32. +88 −47 deps/openssl/asm/x86-elf-gas/x86cpuid.s
  33. +14 −10 deps/openssl/asm/x86-macosx-gas/aes/aes-586.s
  34. +3 −3 deps/openssl/asm/x86-macosx-gas/camellia/cmll-x86.s
  35. +10 −3 deps/openssl/asm/x86-macosx-gas/des/crypt586.s
  36. +193 −38 deps/openssl/asm/x86-macosx-gas/rc4/rc4-586.s
  37. +600 −662 deps/openssl/asm/x86-macosx-gas/sha/sha1-586.s
  38. +31 −34 deps/openssl/asm/x86-macosx-gas/sha/sha256-586.s
  39. +102 −47 deps/openssl/asm/x86-macosx-gas/x86cpuid.s
  40. +7 −7 deps/openssl/asm/x86-win32-masm/aes/aes-586.asm
  41. +3 −3 deps/openssl/asm/x86-win32-masm/camellia/cmll-x86.asm
  42. +187 −38 deps/openssl/asm/x86-win32-masm/rc4/rc4-586.asm
  43. +600 −662 deps/openssl/asm/x86-win32-masm/sha/sha1-586.asm
  44. +31 −34 deps/openssl/asm/x86-win32-masm/sha/sha256-586.asm
  45. +86 −44 deps/openssl/asm/x86-win32-masm/x86cpuid.asm
  46. +4 −5 deps/openssl/openssl.gyp
  47. +1 −1  deps/openssl/openssl/crypto/evp/e_aes.c
  48. +1 −1  deps/openssl/openssl/crypto/evp/e_aes_cbc_hmac_sha1.c
  49. +0 −1  deps/openssl/openssl/crypto/x86_64cpuid.pl
6 deps/openssl/asm/Makefile
View
@@ -24,6 +24,7 @@ OUTPUTS = \
x64-elf-gas/camellia/cmll-x86_64.s \
x64-elf-gas/md5/md5-x86_64.s \
x64-elf-gas/rc4/rc4-x86_64.s \
+ x64-elf-gas/rc4/rc4-md5-x86_64.s \
x64-elf-gas/sha/sha1-x86_64.s \
x64-elf-gas/sha/sha512-x86_64.s \
x64-elf-gas/whrlpool/wp-x86_64.s \
@@ -50,6 +51,7 @@ OUTPUTS = \
x64-macosx-gas/camellia/cmll-x86_64.s \
x64-macosx-gas/md5/md5-x86_64.s \
x64-macosx-gas/rc4/rc4-x86_64.s \
+ x64-macosx-gas/rc4/rc4-md5-x86_64.s \
x64-macosx-gas/sha/sha1-x86_64.s \
x64-macosx-gas/sha/sha512-x86_64.s \
x64-macosx-gas/whrlpool/wp-x86_64.s \
@@ -76,6 +78,7 @@ OUTPUTS = \
x64-win32-masm/camellia/cmll-x86_64.asm \
x64-win32-masm/md5/md5-x86_64.asm \
x64-win32-masm/rc4/rc4-x86_64.asm \
+ x64-win32-masm/rc4/rc4-md5-x86_64.asm \
x64-win32-masm/sha/sha1-x86_64.asm \
x64-win32-masm/sha/sha512-x86_64.asm \
x64-win32-masm/whrlpool/wp-x86_64.asm \
@@ -107,6 +110,7 @@ x64-elf-gas/bn/x86_64-mont.s: ../openssl/crypto/bn/asm/x86_64-mont.pl
x64-elf-gas/camellia/cmll-x86_64.s: ../openssl/crypto/camellia/asm/cmll-x86_64.pl
x64-elf-gas/md5/md5-x86_64.s: ../openssl/crypto/md5/asm/md5-x86_64.pl
x64-elf-gas/rc4/rc4-x86_64.s: ../openssl/crypto/rc4/asm/rc4-x86_64.pl
+x64-elf-gas/rc4/rc4-md5-x86_64.s: ../openssl/crypto/rc4/asm/rc4-md5-x86_64.pl
x64-elf-gas/sha/sha1-x86_64.s: ../openssl/crypto/sha/asm/sha1-x86_64.pl
x64-elf-gas/sha/sha512-x86_64.s: ../openssl/crypto/sha/asm/sha512-x86_64.pl
x64-elf-gas/whrlpool/wp-x86_64.s: ../openssl/crypto/whrlpool/asm/wp-x86_64.pl
@@ -116,6 +120,7 @@ x64-macosx-gas/bn/x86_64-mont.s: ../openssl/crypto/bn/asm/x86_64-mont.pl
x64-macosx-gas/camellia/cmll-x86_64.s: ../openssl/crypto/camellia/asm/cmll-x86_64.pl
x64-macosx-gas/md5/md5-x86_64.s: ../openssl/crypto/md5/asm/md5-x86_64.pl
x64-macosx-gas/rc4/rc4-x86_64.s: ../openssl/crypto/rc4/asm/rc4-x86_64.pl
+x64-macosx-gas/rc4/rc4-md5-x86_64.s: ../openssl/crypto/rc4/asm/rc4-md5-x86_64.pl
x64-macosx-gas/sha/sha1-x86_64.s: ../openssl/crypto/sha/asm/sha1-x86_64.pl
x64-macosx-gas/sha/sha512-x86_64.s: ../openssl/crypto/sha/asm/sha512-x86_64.pl
x64-macosx-gas/whrlpool/wp-x86_64.s: ../openssl/crypto/whrlpool/asm/wp-x86_64.pl
@@ -125,6 +130,7 @@ x64-win32-masm/bn/x86_64-mont.asm: ../openssl/crypto/bn/asm/x86_64-mont.pl
x64-win32-masm/camellia/cmll-x86_64.asm: ../openssl/crypto/camellia/asm/cmll-x86_64.pl
x64-win32-masm/md5/md5-x86_64.asm: ../openssl/crypto/md5/asm/md5-x86_64.pl
x64-win32-masm/rc4/rc4-x86_64.asm: ../openssl/crypto/rc4/asm/rc4-x86_64.pl
+x64-win32-masm/rc4/rc4-md5-x86_64.asm: ../openssl/crypto/rc4/asm/rc4-md5-x86_64.pl
x64-win32-masm/sha/sha1-x86_64.asm: ../openssl/crypto/sha/asm/sha1-x86_64.pl
x64-win32-masm/sha/sha512-x86_64.asm: ../openssl/crypto/sha/asm/sha512-x86_64.pl
x64-win32-masm/whrlpool/wp-x86_64.asm: ../openssl/crypto/whrlpool/asm/wp-x86_64.pl
25 deps/openssl/asm/x64-elf-gas/aes/aes-x86_64.s
View
@@ -333,6 +333,9 @@ _x86_64_AES_encrypt_compact:
.globl AES_encrypt
.type AES_encrypt,@function
.align 16
+.globl asm_AES_encrypt
+.hidden asm_AES_encrypt
+asm_AES_encrypt:
AES_encrypt:
pushq %rbx
pushq %rbp
@@ -780,6 +783,9 @@ _x86_64_AES_decrypt_compact:
.globl AES_decrypt
.type AES_decrypt,@function
.align 16
+.globl asm_AES_decrypt
+.hidden asm_AES_decrypt
+asm_AES_decrypt:
AES_decrypt:
pushq %rbx
pushq %rbp
@@ -843,10 +849,10 @@ AES_decrypt:
.Ldec_epilogue:
.byte 0xf3,0xc3
.size AES_decrypt,.-AES_decrypt
-.globl AES_set_encrypt_key
-.type AES_set_encrypt_key,@function
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,@function
.align 16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
pushq %rbx
pushq %rbp
pushq %r12
@@ -867,7 +873,7 @@ AES_set_encrypt_key:
addq $56,%rsp
.Lenc_key_epilogue:
.byte 0xf3,0xc3
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.type _x86_64_AES_set_encrypt_key,@function
.align 16
@@ -1109,10 +1115,10 @@ _x86_64_AES_set_encrypt_key:
.byte 0xf3,0xc3
.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
-.globl AES_set_decrypt_key
-.type AES_set_decrypt_key,@function
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,@function
.align 16
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
pushq %rbx
pushq %rbp
pushq %r12
@@ -1295,11 +1301,14 @@ AES_set_decrypt_key:
addq $56,%rsp
.Ldec_key_epilogue:
.byte 0xf3,0xc3
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.globl AES_cbc_encrypt
.type AES_cbc_encrypt,@function
.align 16
+.globl asm_AES_cbc_encrypt
+.hidden asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
AES_cbc_encrypt:
cmpq $0,%rdx
je .Lcbc_epilogue
1,311 deps/openssl/asm/x64-elf-gas/bn/x86_64-mont.s
View
@@ -5,6 +5,16 @@
.type bn_mul_mont,@function
.align 16
bn_mul_mont:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ cmpq %rsi,%rdx
+ jne .Lmul4x_enter
+ jmp .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
pushq %rbx
pushq %rbp
pushq %r12
@@ -20,48 +30,63 @@ bn_mul_mont:
andq $-1024,%rsp
movq %r11,8(%rsp,%r9,8)
-.Lprologue:
+.Lmul_body:
movq %rdx,%r12
-
movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
xorq %r14,%r14
xorq %r15,%r15
- movq (%r12),%rbx
- movq (%rsi),%rax
+ movq %r8,%rbp
mulq %rbx
movq %rax,%r10
- movq %rdx,%r11
+ movq (%rcx),%rax
- imulq %r8,%rax
- movq %rax,%rbp
+ imulq %r10,%rbp
+ movq %rdx,%r11
- mulq (%rcx)
- addq %r10,%rax
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
adcq $0,%rdx
movq %rdx,%r13
leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
.L1st:
+ addq %rax,%r13
movq (%rsi,%r15,8),%rax
- mulq %rbx
- addq %r11,%rax
adcq $0,%rdx
- movq %rax,%r10
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
movq (%rcx,%r15,8),%rax
- movq %rdx,%r11
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
mulq %rbp
- addq %r13,%rax
- leaq 1(%r15),%r15
+ cmpq %r9,%r15
+ jne .L1st
+
+ addq %rax,%r13
+ movq (%rsi),%rax
adcq $0,%rdx
- addq %r10,%rax
+ addq %r11,%r13
adcq $0,%rdx
- movq %rax,-16(%rsp,%r15,8)
- cmpq %r9,%r15
+ movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13
- jl .L1st
+ movq %r10,%r11
xorq %rdx,%rdx
addq %r11,%r13
@@ -70,50 +95,64 @@ bn_mul_mont:
movq %rdx,(%rsp,%r9,8)
leaq 1(%r14),%r14
-.align 4
+ jmp .Louter
+.align 16
.Louter:
- xorq %r15,%r15
-
movq (%r12,%r14,8),%rbx
- movq (%rsi),%rax
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
mulq %rbx
- addq (%rsp),%rax
+ addq %rax,%r10
+ movq (%rcx),%rax
adcq $0,%rdx
- movq %rax,%r10
- movq %rdx,%r11
- imulq %r8,%rax
- movq %rax,%rbp
+ imulq %r10,%rbp
+ movq %rdx,%r11
- mulq (%rcx,%r15,8)
- addq %r10,%rax
- movq 8(%rsp),%r10
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
adcq $0,%rdx
+ movq 8(%rsp),%r10
movq %rdx,%r13
leaq 1(%r15),%r15
-.align 4
+ jmp .Linner_enter
+
+.align 16
.Linner:
+ addq %rax,%r13
movq (%rsi,%r15,8),%rax
- mulq %rbx
- addq %r11,%rax
adcq $0,%rdx
- addq %rax,%r10
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
movq (%rcx,%r15,8),%rax
adcq $0,%rdx
+ addq %r11,%r10
movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
mulq %rbp
- addq %r13,%rax
- leaq 1(%r15),%r15
- adcq $0,%rdx
- addq %r10,%rax
+ cmpq %r9,%r15
+ jne .Linner
+
+ addq %rax,%r13
+ movq (%rsi),%rax
adcq $0,%rdx
+ addq %r10,%r13
movq (%rsp,%r15,8),%r10
- cmpq %r9,%r15
- movq %rax,-16(%rsp,%r15,8)
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13
- jl .Linner
xorq %rdx,%rdx
addq %r11,%r13
@@ -127,35 +166,434 @@ bn_mul_mont:
cmpq %r9,%r14
jl .Louter
- leaq (%rsp),%rsi
- leaq -1(%r9),%r15
-
- movq (%rsi),%rax
xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
jmp .Lsub
.align 16
.Lsub: sbbq (%rcx,%r14,8),%rax
movq %rax,(%rdi,%r14,8)
- decq %r15
movq 8(%rsi,%r14,8),%rax
leaq 1(%r14),%r14
- jge .Lsub
+ decq %r15
+ jnz .Lsub
sbbq $0,%rax
+ xorq %r14,%r14
andq %rax,%rsi
notq %rax
movq %rdi,%rcx
andq %rax,%rcx
- leaq -1(%r9),%r15
+ movq %r9,%r15
orq %rcx,%rsi
.align 16
.Lcopy:
+ movq (%rsi,%r14,8),%rax
+ movq %r14,(%rsp,%r14,8)
+ movq %rax,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul_mont,.-bn_mul_mont
+.type bn_mul4x_mont,@function
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 4(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
movq (%rsi,%r15,8),%rax
- movq %rax,(%rdi,%r15,8)
- movq %r14,(%rsp,%r15,8)
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jl .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -1(%r9),%r15
+ orq %rcx,%rsi
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
+ leaq 32(%r14),%r14
decq %r15
- jge .Lcopy
+ jnz .Lcopy4x
+ shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
movq (%rsi),%r15
@@ -165,8 +603,773 @@ bn_mul_mont:
movq 32(%rsi),%rbp
movq 40(%rsi),%rbx
leaq 48(%rsi),%rsp
-.Lepilogue:
+.Lmul4x_epilogue:
.byte 0xf3,0xc3
-.size bn_mul_mont,.-bn_mul_mont
+.size bn_mul4x_mont,.-bn_mul4x_mont
+.type bn_sqr4x_mont,@function
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ shll $3,%r9d
+ xorq %r10,%r10
+ movq %rsp,%r11
+ subq %r9,%r10
+ movq (%r8),%r8
+ leaq -72(%rsp,%r10,2),%rsp
+ andq $-1024,%rsp
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdi,32(%rsp)
+ movq %rcx,40(%rsp)
+ movq %r8,48(%rsp)
+ movq %r11,56(%rsp)
+.Lsqr4x_body:
+
+
+
+
+
+
+
+ leaq 32(%r10),%rbp
+ leaq (%rsi,%r9,1),%rsi
+
+ movq %r9,%rcx
+
+
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ mulq %r15
+ movq %rax,%r12
+ movq %rbx,%rax
+ movq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+ jmp .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,16(%rdi,%rcx,1)
+
+
+ movq 24(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 32(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_1st
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ leaq 16(%rbp),%rbp
+ movq %r12,8(%rdi)
+ jmp .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer:
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ movq -24(%rdi,%rbp,1),%r10
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ addq -16(%rdi,%rbp,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+ xorq %r12,%r12
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ leaq 16(%rcx),%rcx
+ jmp .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ addq (%rdi,%rcx,1),%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_inner
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ addq $16,%rbp
+ jnz .Lsqr4x_outer
+
+
+ movq -32(%rsi),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi),%rbx
+ movq %rax,%r15
+
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi)
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi)
+
+ movq -8(%rsi),%rbx
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq $0,%rdx
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ movq %rdx,%r13
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi)
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq -16(%rsi),%rax
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ mulq %rbx
+ addq $16,%rbp
+ xorq %r14,%r14
+ subq %r9,%rbp
+ xorq %r15,%r15
+
+ addq %r12,%rax
+ adcq $0,%rdx
+ movq %rax,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %r15,24(%rdi)
+
+ movq -16(%rsi,%rbp,1),%rax
+ leaq 64(%rsp,%r9,2),%rdi
+ xorq %r10,%r10
+ movq -24(%rdi,%rbp,2),%r11
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ leaq 16(%rbp),%rbp
+ movq %r8,-40(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ jmp .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+
+ leaq (%r14,%r10,2),%r12
+ movq %r8,-8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq 8(%rsi,%rbp,1),%rax
+ movq %r12,0(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 16(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ movq %r8,24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ addq $32,%rbp
+ jnz .Lsqr4x_shift_n_add
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ mulq %rax
+ negq %r15
+ adcq %rax,%rbx
+ adcq %rdx,%r8
+ movq %rbx,-16(%rdi)
+ movq %r8,-8(%rdi)
+ movq 40(%rsp),%rsi
+ movq 48(%rsp),%r8
+ xorq %rcx,%rcx
+ movq %r9,0(%rsp)
+ subq %r9,%rcx
+ movq 64(%rsp),%r10
+ movq %r8,%r14
+ leaq 64(%rsp,%r9,2),%rax
+ leaq 64(%rsp,%r9,1),%rdi
+ movq %rax,8(%rsp)
+ leaq (%rsi,%r9,1),%rsi
+ xorq %rbp,%rbp
+
+ movq 0(%rsi,%rcx,1),%rax
+ movq 8(%rsi,%rcx,1),%r9
+ imulq %r10,%r14
+ movq %rax,%rbx
+ jmp .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+ movq %r8,%r15
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+ imulq %r11,%r15
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ jmp .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq (%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 8(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ cmpq $0,%rcx
+ jne .Lsqr4x_mont_inner
+
+ subq 0(%rsp),%rcx
+ movq %r8,%r14
+
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi)
+
+ xorq %r11,%r11
+ addq (%rdi),%r10
+ adcq $0,%r11
+ movq 0(%rsi,%rcx,1),%rbx
+ addq %rbp,%r10
+ adcq $0,%r11
+
+ imulq 16(%rdi,%rcx,1),%r14
+ xorq %r12,%r12
+ movq 8(%rsi,%rcx,1),%r9
+ addq %r10,%r13
+ movq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi)
+
+ xorq %rbp,%rbp
+ addq 8(%rdi),%r12
+ adcq %rbp,%rbp
+ addq %r11,%r12
+ leaq 16(%rdi),%rdi
+ adcq $0,%rbp
+ movq %r12,-8(%rdi)
+ cmpq 8(%rsp),%rdi
+ jb .Lsqr4x_mont_outer
+
+ movq 0(%rsp),%r9
+ movq %rbp,(%rdi)
+ movq 64(%rsp,%r9,1),%rax
+ leaq 64(%rsp,%r9,1),%rbx
+ movq 40(%rsp),%rsi
+ shrq $5,%r9
+ movq 8(%rbx),%rdx
+ xorq %rbp,%rbp
+
+ movq 32(%rsp),%rdi
+ subq 0(%rsi),%rax
+ movq 16(%rbx),%r10
+ movq 24(%rbx),%r11
+ sbbq 8(%rsi),%rdx
+ leaq -1(%r9),%rcx
+ jmp .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+ movq %rax,0(%rdi,%rbp,8)
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq 32(%rbx,%rbp,8),%rax
+ movq 40(%rbx,%rbp,8),%rdx
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+ movq %r11,24(%rdi,%rbp,8)
+ sbbq 32(%rsi,%rbp,8),%rax
+ movq 48(%rbx,%rbp,8),%r10
+ movq 56(%rbx,%rbp,8),%r11
+ sbbq 40(%rsi,%rbp,8),%rdx
+ leaq 4(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_sub
+
+ movq %rax,0(%rdi,%rbp,8)
+ movq 32(%rbx,%rbp,8),%rax
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+
+ sbbq $0,%rax
+ movq %r11,24(%rdi,%rbp,8)
+ xorq %rbp,%rbp
+ andq %rax,%rbx
+ notq %rax
+ movq %rdi,%rsi
+ andq %rax,%rsi
+ leaq -1(%r9),%rcx
+ orq %rsi,%rbx
+
+ pxor %xmm0,%xmm0
+ leaq 64(%rsp,%r9,8),%rsi
+ movdqu (%rbx),%xmm1
+ leaq (%rsi,%r9,8),%rsi
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm0,(%rsi)
+ movdqu %xmm1,(%rdi)
+ jmp .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy:
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqu 32(%rbx,%rbp,1),%xmm1
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,96(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqa %xmm0,32(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movdqu %xmm1,32(%rdi,%rbp,1)
+ leaq 32(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_copy
+
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movq 56(%rsp),%rsi
+ movq $1,%rax
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lsqr4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_sqr4x_mont,.-bn_sqr4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
1,260 deps/openssl/asm/x64-elf-gas/rc4/rc4-md5-x86_64.s
View
@@ -0,0 +1,1260 @@
+.text
+
+.align 16
+
+.globl rc4_md5_enc
+.type rc4_md5_enc,@function
+rc4_md5_enc:
+ cmpq $0,%r9
+ je .Labort
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $40,%rsp
+.Lbody:
+ movq %rcx,%r11
+ movq %r9,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %r8,%r15
+ xorq %rbp,%rbp
+ xorq %rcx,%rcx
+
+ leaq 8(%rdi),%rdi
+ movb -8(%rdi),%bpl
+ movb -4(%rdi),%cl
+
+ incb %bpl
+ subq %r13,%r14
+ movl (%rdi,%rbp,4),%eax
+ addb %al,%cl
+ leaq (%rdi,%rbp,4),%rsi
+ shlq $6,%r12
+ addq %r15,%r12
+ movq %r12,16(%rsp)
+
+ movq %r11,24(%rsp)
+ movl 0(%r11),%r8d
+ movl 4(%r11),%r9d
+ movl 8(%r11),%r10d
+ movl 12(%r11),%r11d
+ jmp .Loop
+
+.align 16
+.Loop:
+ movl %r8d,0(%rsp)
+ movl %r9d,4(%rsp)
+ movl %r10d,8(%rsp)
+ movl %r11d,%r12d
+ movl %r11d,12(%rsp)
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 0(%r15),%r8d
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ addl $3614090360,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,0(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 4(%r15),%r11d
+ addb %dl,%bl
+ movl 8(%rsi),%eax
+ addl $3905402710,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,4(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 8(%r15),%r10d
+ addb %dl,%al
+ movl 12(%rsi),%ebx
+ addl $606105819,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,8(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 12(%r15),%r9d
+ addb %dl,%bl
+ movl 16(%rsi),%eax
+ addl $3250441966,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,12(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r11d,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 16(%r15),%r8d
+ addb %dl,%al
+ movl 20(%rsi),%ebx
+ addl $4118548399,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,16(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 20(%r15),%r11d
+ addb %dl,%bl
+ movl 24(%rsi),%eax
+ addl $1200080426,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,20(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 24(%r15),%r10d
+ addb %dl,%al
+ movl 28(%rsi),%ebx
+ addl $2821735955,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,24(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 28(%r15),%r9d
+ addb %dl,%bl
+ movl 32(%rsi),%eax
+ addl $4249261313,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,28(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r11d,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 32(%r15),%r8d
+ addb %dl,%al
+ movl 36(%rsi),%ebx
+ addl $1770035416,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,32(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 36(%r15),%r11d
+ addb %dl,%bl
+ movl 40(%rsi),%eax
+ addl $2336552879,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,36(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 40(%r15),%r10d
+ addb %dl,%al
+ movl 44(%rsi),%ebx
+ addl $4294925233,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,40(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 44(%r15),%r9d
+ addb %dl,%bl
+ movl 48(%rsi),%eax
+ addl $2304563134,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,44(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r11d,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 48(%r15),%r8d
+ addb %dl,%al
+ movl 52(%rsi),%ebx
+ addl $1804603682,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,48(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 52(%r15),%r11d
+ addb %dl,%bl
+ movl 56(%rsi),%eax
+ addl $4254626195,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,52(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 56(%r15),%r10d
+ addb %dl,%al
+ movl 60(%rsi),%ebx
+ addl $2792965006,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,56(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu (%r13),%xmm2
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 60(%r15),%r9d
+ addb %dl,%bl
+ movl 64(%rsi),%eax
+ addl $1236535329,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,60(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r10d,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm1,%xmm2
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 4(%r15),%r8d
+ addb %dl,%al
+ movl 68(%rsi),%ebx
+ addl $4129170786,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,64(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 24(%r15),%r11d
+ addb %dl,%bl
+ movl 72(%rsi),%eax
+ addl $3225465664,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,68(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 44(%r15),%r10d
+ addb %dl,%al
+ movl 76(%rsi),%ebx
+ addl $643717713,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,72(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 0(%r15),%r9d
+ addb %dl,%bl
+ movl 80(%rsi),%eax
+ addl $3921069994,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,76(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r10d,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 20(%r15),%r8d
+ addb %dl,%al
+ movl 84(%rsi),%ebx
+ addl $3593408605,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,80(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 40(%r15),%r11d
+ addb %dl,%bl
+ movl 88(%rsi),%eax
+ addl $38016083,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,84(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 60(%r15),%r10d
+ addb %dl,%al
+ movl 92(%rsi),%ebx
+ addl $3634488961,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,88(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 16(%r15),%r9d
+ addb %dl,%bl
+ movl 96(%rsi),%eax
+ addl $3889429448,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,92(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r10d,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 36(%r15),%r8d
+ addb %dl,%al
+ movl 100(%rsi),%ebx
+ addl $568446438,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,96(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 56(%r15),%r11d
+ addb %dl,%bl
+ movl 104(%rsi),%eax
+ addl $3275163606,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,100(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 12(%r15),%r10d
+ addb %dl,%al
+ movl 108(%rsi),%ebx
+ addl $4107603335,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,104(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 32(%r15),%r9d
+ addb %dl,%bl
+ movl 112(%rsi),%eax
+ addl $1163531501,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,108(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r10d,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 52(%r15),%r8d
+ addb %dl,%al
+ movl 116(%rsi),%ebx
+ addl $2850285829,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,112(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 8(%r15),%r11d
+ addb %dl,%bl
+ movl 120(%rsi),%eax
+ addl $4243563512,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,116(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 28(%r15),%r10d
+ addb %dl,%al
+ movl 124(%rsi),%ebx
+ addl $1735328473,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,120(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu 16(%r13),%xmm3
+ addb $32,%bpl
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 48(%r15),%r9d
+ addb %dl,%bl
+ movl 0(%rdi,%rbp,4),%eax
+ addl $2368359562,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,124(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r11d,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movq %rcx,%rsi
+ xorq %rcx,%rcx
+ movb %sil,%cl
+ leaq (%rdi,%rbp,4),%rsi
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 20(%r15),%r8d
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ addl $4294588738,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,0(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 32(%r15),%r11d
+ addb %dl,%bl
+ movl 8(%rsi),%eax
+ addl $2272392833,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,4(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 44(%r15),%r10d
+ addb %dl,%al
+ movl 12(%rsi),%ebx
+ addl $1839030562,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,8(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 56(%r15),%r9d
+ addb %dl,%bl
+ movl 16(%rsi),%eax
+ addl $4259657740,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,12(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl %r11d,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 4(%r15),%r8d
+ addb %dl,%al
+ movl 20(%rsi),%ebx
+ addl $2763975236,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,16(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 16(%r15),%r11d
+ addb %dl,%bl
+ movl 24(%rsi),%eax
+ addl $1272893353,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,20(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 28(%r15),%r10d
+ addb %dl,%al
+ movl 28(%rsi),%ebx
+ addl $4139469664,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,24(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 40(%r15),%r9d
+ addb %dl,%bl
+ movl 32(%rsi),%eax
+ addl $3200236656,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,28(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl %r11d,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 52(%r15),%r8d
+ addb %dl,%al
+ movl 36(%rsi),%ebx
+ addl $681279174,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,32(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 0(%r15),%r11d
+ addb %dl,%bl
+ movl 40(%rsi),%eax
+ addl $3936430074,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,36(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 12(%r15),%r10d
+ addb %dl,%al
+ movl 44(%rsi),%ebx
+ addl $3572445317,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,40(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 24(%r15),%r9d
+ addb %dl,%bl
+ movl 48(%rsi),%eax
+ addl $76029189,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,44(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl %r11d,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 36(%r15),%r8d
+ addb %dl,%al
+ movl 52(%rsi),%ebx
+ addl $3654602809,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,48(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 48(%r15),%r11d
+ addb %dl,%bl
+ movl 56(%rsi),%eax
+ addl $3873151461,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,52(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 60(%r15),%r10d
+ addb %dl,%al
+ movl 60(%rsi),%ebx
+ addl $530742520,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,56(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu 32(%r13),%xmm4
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 8(%r15),%r9d
+ addb %dl,%bl
+ movl 64(%rsi),%eax
+ addl $3299628645,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,60(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl $-1,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm4
+ pxor %xmm1,%xmm4
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 0(%r15),%r8d
+ addb %dl,%al
+ movl 68(%rsi),%ebx
+ addl $4096336452,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,64(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 28(%r15),%r11d
+ addb %dl,%bl
+ movl 72(%rsi),%eax
+ addl $1126891415,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,68(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 56(%r15),%r10d
+ addb %dl,%al
+ movl 76(%rsi),%ebx
+ addl $2878612391,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,72(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 20(%r15),%r9d
+ addb %dl,%bl
+ movl 80(%rsi),%eax
+ addl $4237533241,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,76(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 48(%r15),%r8d
+ addb %dl,%al
+ movl 84(%rsi),%ebx
+ addl $1700485571,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,80(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 12(%r15),%r11d
+ addb %dl,%bl
+ movl 88(%rsi),%eax
+ addl $2399980690,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,84(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 40(%r15),%r10d
+ addb %dl,%al
+ movl 92(%rsi),%ebx
+ addl $4293915773,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,88(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 4(%r15),%r9d
+ addb %dl,%bl
+ movl 96(%rsi),%eax
+ addl $2240044497,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,92(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 32(%r15),%r8d
+ addb %dl,%al
+ movl 100(%rsi),%ebx
+ addl $1873313359,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,96(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 60(%r15),%r11d
+ addb %dl,%bl
+ movl 104(%rsi),%eax
+ addl $4264355552,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,100(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 24(%r15),%r10d
+ addb %dl,%al
+ movl 108(%rsi),%ebx
+ addl $2734768916,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,104(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 52(%r15),%r9d
+ addb %dl,%bl
+ movl 112(%rsi),%eax
+ addl $1309151649,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,108(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 16(%r15),%r8d
+ addb %dl,%al
+ movl 116(%rsi),%ebx
+ addl $4149444226,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,112(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 44(%r15),%r11d
+ addb %dl,%bl
+ movl 120(%rsi),%eax
+ addl $3174756917,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,116(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 8(%r15),%r10d
+ addb %dl,%al
+ movl 124(%rsi),%ebx
+ addl $718787259,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,120(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu 48(%r13),%xmm5
+ addb $32,%bpl
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 36(%r15),%r9d
+ addb %dl,%bl
+ movl 0(%rdi,%rbp,4),%eax
+ addl $3951481745,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,124(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movq %rbp,%rsi
+ xorq %rbp,%rbp
+ movb %sil,%bpl
+ movq %rcx,%rsi
+ xorq %rcx,%rcx
+ movb %sil,%cl
+ leaq (%rdi,%rbp,4),%rsi
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm5
+ pxor %xmm1,%xmm5
+ addl 0(%rsp),%r8d
+ addl 4(%rsp),%r9d