Permalink
Browse files

added libtommath-0.13

  • Loading branch information...
1 parent 57354e1 commit b66471f74f1f16ebd823f71c5602cc6b68e69574 Tom St Denis committed with sjaeckel Feb 28, 2003
Showing with 750 additions and 517 deletions.
  1. BIN bn.pdf
  2. +16 −17 bn.tex
  3. +14 −6 bn_fast_mp_invmod.c
  4. +51 −26 bn_fast_mp_montgomery_reduce.c
  5. +35 −27 bn_fast_s_mp_mul_digs.c
  6. +7 −6 bn_fast_s_mp_mul_high_digs.c
  7. +39 −18 bn_fast_s_mp_sqr.c
  8. +6 −2 bn_mp_2expt.c
  9. +5 −2 bn_mp_abs.c
  10. +2 −2 bn_mp_add.c
  11. +2 −3 bn_mp_add_d.c
  12. +2 −3 bn_mp_addmod.c
  13. +2 −2 bn_mp_and.c
  14. +7 −1 bn_mp_clamp.c
  15. +1 −3 bn_mp_cmp.c
  16. +1 −2 bn_mp_cmp_mag.c
  17. +15 −9 bn_mp_copy.c
  18. +2 −2 bn_mp_count_bits.c
  19. +7 −9 bn_mp_div.c
  20. +23 −10 bn_mp_div_2.c
  21. +3 −3 bn_mp_div_2d.c
  22. +2 −2 bn_mp_div_d.c
  23. +1 −1 bn_mp_exch.c
  24. +2 −2 bn_mp_expt_d.c
  25. +19 −10 bn_mp_exptmod.c
  26. +10 −22 bn_mp_exptmod_fast.c
  27. +5 −5 bn_mp_gcd.c
  28. +1 −2 bn_mp_grow.c
  29. +2 −3 bn_mp_init_copy.c
  30. +3 −5 bn_mp_invmod.c
  31. +3 −3 bn_mp_jacobi.c
  32. +2 −2 bn_mp_karatsuba_mul.c
  33. +2 −2 bn_mp_karatsuba_sqr.c
  34. +2 −2 bn_mp_lcm.c
  35. +1 −1 bn_mp_lshd.c
  36. +2 −2 bn_mp_mod.c
  37. +2 −3 bn_mp_mod_2d.c
  38. +2 −2 bn_mp_mod_d.c
  39. +53 −0 bn_mp_montgomery_calc_normalization.c
  40. +5 −10 bn_mp_montgomery_reduce.c
  41. +2 −2 bn_mp_montgomery_setup.c
  42. +16 −2 bn_mp_mul.c
  43. +36 −15 bn_mp_mul_2.c
  44. +2 −2 bn_mp_mul_2d.c
  45. +29 −18 bn_mp_mul_d.c
  46. +2 −2 bn_mp_mulmod.c
  47. +6 −2 bn_mp_n_root.c
  48. +1 −1 bn_mp_neg.c
  49. +2 −2 bn_mp_or.c
  50. +8 −7 bn_mp_rand.c
  51. +1 −1 bn_mp_read_signed_bin.c
  52. +1 −1 bn_mp_read_unsigned_bin.c
  53. +3 −3 bn_mp_reduce.c
  54. +1 −1 bn_mp_rshd.c
  55. +1 −1 bn_mp_set_int.c
  56. +9 −2 bn_mp_sqr.c
  57. +2 −2 bn_mp_sqrmod.c
  58. +1 −1 bn_mp_sub.c
  59. +2 −2 bn_mp_sub_d.c
  60. +2 −2 bn_mp_submod.c
  61. +1 −1 bn_mp_to_signed_bin.c
  62. +2 −2 bn_mp_to_unsigned_bin.c
  63. +1 −1 bn_mp_unsigned_bin_size.c
  64. +2 −2 bn_mp_xor.c
  65. +10 −11 bn_radix.c
  66. +1 −1 bn_reverse.c
  67. +35 −27 bn_s_mp_add.c
  68. +6 −20 bn_s_mp_mul_digs.c
  69. +8 −13 bn_s_mp_mul_high_digs.c
  70. +5 −15 bn_s_mp_sqr.c
  71. +32 −22 bn_s_mp_sub.c
  72. +3 −3 bncore.c
  73. +6 −0 changes.txt
  74. +20 −40 demo/demo.c
  75. +12 −7 etc/makefile
  76. +1 −1 etc/pprime.c
  77. +5 −5 etc/tune.c
  78. +27 −0 gen.pl
  79. BIN ltmtest.exe
  80. +4 −4 makefile
  81. BIN mpitest.exe
  82. +1 −1 mtest/mtest.c
  83. +36 −39 timings.txt
  84. +36 −0 timings2.txt
  85. +5 −0 timings3.txt
  86. +5 −1 tommath.h
View
Binary file not shown.
View
@@ -1,7 +1,7 @@
\documentclass{article}
\begin{document}
-\title{LibTomMath v0.12 \\ A Free Multiple Precision Integer Library}
+\title{LibTomMath v0.13 \\ A Free Multiple Precision Integer Library}
\author{Tom St Denis \\ tomstdenis@iahu.ca}
\maketitle
\newpage
@@ -409,7 +409,6 @@ \subsubsection{mp\_and(mp\_int *a, mp\_int *b, mp\_int *c)}
Computes $c = a \land b$, pseudo-extends with zeroes whichever of $a$ or $b$ is shorter such that the length
of $c$ is the maximum length of the two inputs.
-
\subsection{Basic Arithmetic}
\subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)}
@@ -440,19 +439,18 @@ \subsubsection{mp\_mul(mp\_int *a, mp\_int *b, mp\_int *c)}
Computes $c = a \cdot b$ using signed arithmetic. Handles the sign of the numbers correctly which means it will
correct the sign of the product as required, e.g. $a \cdot -b$ turns into $-ab$.
-For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used. It
-requires no additional memory and $O(N^2)$ time. The comba-baseline multiplier is only used if it can safely be used
-without losing carry digits. The comba method is faster than the baseline method but cannot always be used which is why
-both are provided. The code will automatically determine when it can be used. If the digit count is higher
-than 80 for the inputs than a Karatsuba multiplier is used which requires approximately $O(6 \cdot N)$ memory and
-$O(N^{lg(3)})$ time.
+This function requires $O(N^2)$ time for small inputs and $O(N^{1.584})$ time for relatively large
+inputs (\textit{above the }KARATSUBA\_MUL\_CUTOFF \textit{value defined in bncore.c.}). There is
+considerable overhead in the Karatsuba method which only pays off when the digit count is fairly high
+(\textit{typically around 80}). For small inputs the function requires $O(2N)$ memory, otherwise it
+requires $O(6 \cdot \mbox{lg}(N) \cdot N)$ memory.
+
\subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)}
-Computes $b = a^2$.
-For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used. It
-requires no additional memory and $O((N^2 + N)/2)$ time. The comba-squaring method is used only if it can be safely used
-without losing carry digits. After 80 digits a Karatsuba squaring algorithm is used whcih requires approximately
-$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time.
+Computes $b = a^2$ and fixes the sign of $b$ to be positive.
+
+This function has a running time and memory requirement profile very similar to that of the
+mp\_mul function. It is always faster and uses less memory for the larger inputs.
\subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
Computes $c = \lfloor a/b \rfloor$ and $d \equiv a \mbox{ (mod }b\mbox{)}$. The division is signed which means the sign
@@ -482,7 +480,8 @@ \subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod}
\subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)}
This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$. When
-$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.
+$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast. If no inverse is found (e.g. $(a, b) \ne 1$) then
+the function returns \textbf{MP\_VAL} and the result in $c$ is undefined.
\subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)}
Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$. Will work with either positive
@@ -497,13 +496,13 @@ \subsubsection{mp\_lcm(mp\_int *a, mp\_int *b, mp\_int *c)}
Functions requires no additional memory and approximately $O(4 \cdot N^2)$ time.
-\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int c)}
+\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int *c)}
Finds the $b$'th root of $a$ and stores it in $b$. The roots are found such that $\vert c \vert^b \le \vert a \vert$.
Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result. Each iteration
requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$.
-If the input $a$ is negative and $b$ is even the function returns an error. Otherwise the function will return a root
-that has a sign that agrees with the sign of $a$.
+If the input $a$ is negative and $b$ is even the function returns \textbf{MP\_VAL}. Otherwise the function will
+return a root that has a sign that agrees with the sign of $a$.
\subsubsection{mp\_jacobi(mp\_int *a, mp\_int *n, int *c)}
Computes $c = \left ( {a \over n} \right )$ or the Jacobi function of $(a, n)$ and stores the result in an integer addressed
View
@@ -14,13 +14,17 @@
*/
#include <tommath.h>
-/* computes the modular inverse via binary extended euclidean algorithm, that is c = 1/a mod b */
+/* computes the modular inverse via binary extended euclidean algorithm,
+ * that is c = 1/a mod b
+ *
+ * Based on mp_invmod except this is optimized for the case where b is
+ * odd as per HAC Note 14.64 on pp. 610
+ */
int
fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
{
- mp_int x, y, u, v, B, D;
- int res, neg;
-
+ mp_int x, y, u, v, B, D;
+ int res, neg;
if ((res = mp_init (&x)) != MP_OKAY) {
goto __ERR;
@@ -58,7 +62,10 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
goto __D;
}
- /* 2. [modified] if x,y are both even then return an error! */
+ /* 2. [modified] if x,y are both even then return an error!
+ *
+ * That is if gcd(x,y) = 2 * k then obviously there is no inverse.
+ */
if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
res = MP_VAL;
goto __D;
@@ -135,8 +142,9 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
}
/* if not zero goto step 4 */
- if (mp_iszero (&u) == 0)
+ if (mp_iszero (&u) == 0) {
goto top;
+ }
/* now a = C, b = D, gcd == g*v */
@@ -14,12 +14,19 @@
*/
#include <tommath.h>
-/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
+/* computes xR^-1 == x (mod N) via Montgomery Reduction
+ *
+ * This is an optimized implementation of mp_montgomery_reduce
+ * which uses the comba method to quickly calculate the columns of the
+ * reduction.
+ *
+ * Based on Algorithm 14.32 on pp.601 of HAC.
+*/
int
fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
{
- int ix, res, olduse;
- mp_word W[512];
+ int ix, res, olduse;
+ mp_word W[512];
/* get old used count */
olduse = a->used;
@@ -31,14 +38,22 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
}
}
- /* copy the digits of a */
- for (ix = 0; ix < a->used; ix++) {
- W[ix] = a->dp[ix];
- }
+ {
+ register mp_word *_W;
+ register mp_digit *tmpa;
+
+ _W = W;
+ tmpa = a->dp;
+
+ /* copy the digits of a */
+ for (ix = 0; ix < a->used; ix++) {
+ *_W++ = *tmpa++;
+ }
- /* zero the high words */
- for (; ix < m->used * 2 + 1; ix++) {
- W[ix] = 0;
+ /* zero the high words */
+ for (; ix < m->used * 2 + 1; ix++) {
+ *_W++ = 0;
+ }
}
for (ix = 0; ix < m->used; ix++) {
@@ -69,8 +84,10 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
register mp_digit *tmpx;
register mp_word *_W;
- /* aliases */
+ /* alias for the digits of the modulus */
tmpx = m->dp;
+
+ /* Alias for the columns set by an offset of ix */
_W = W + ix;
/* inner loop */
@@ -88,24 +105,32 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
}
- /* copy out, A = A/b^n
- *
- * The result is A/b^n but instead of converting from an array of mp_word
- * to mp_digit than calling mp_rshd we just copy them in the right
- * order
- */
- for (ix = 0; ix < m->used + 1; ix++) {
- a->dp[ix] = W[ix + m->used] & ((mp_word) MP_MASK);
- }
+ {
+ register mp_digit *tmpa;
+ register mp_word *_W;
- /* set the max used */
- a->used = m->used + 1;
+ /* copy out, A = A/b^n
+ *
+ * The result is A/b^n but instead of converting from an array of mp_word
+ * to mp_digit than calling mp_rshd we just copy them in the right
+ * order
+ */
+ tmpa = a->dp;
+ _W = W + m->used;
+
+ for (ix = 0; ix < m->used + 1; ix++) {
+ *tmpa++ = *_W++ & ((mp_word) MP_MASK);
+ }
- /* zero oldused digits, if the input a was larger than
- * m->used+1 we'll have to clear the digits */
- for (; ix < olduse; ix++) {
- a->dp[ix] = 0;
+ /* zero oldused digits, if the input a was larger than
+ * m->used+1 we'll have to clear the digits */
+ for (; ix < olduse; ix++) {
+ *tmpa++ = 0;
+ }
}
+
+ /* set the max used and clamp */
+ a->used = m->used + 1;
mp_clamp (a);
/* if A >= m then A = A - m */
@@ -21,13 +21,20 @@
* has the effect of making the nested loops that compute the columns very
* simple and schedulable on super-scalar processors.
*
+ * This has been modified to produce a variable number of digits of output so
+ * if say only a half-product is required you don't have to compute the upper half
+ * (a feature required for fast Barrett reduction).
+ *
+ * Based on Algorithm 14.12 on pp.595 of HAC.
+ *
*/
int
fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
{
- int olduse, res, pa, ix;
- mp_word W[512];
+ int olduse, res, pa, ix;
+ mp_word W[512];
+ /* grow the destination as required */
if (c->alloc < digs) {
if ((res = mp_grow (c, digs)) != MP_OKAY) {
return res;
@@ -43,11 +50,9 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
/* this multiplier has been modified to allow you to control how many digits
* of output are produced. So at most we want to make upto "digs" digits
- * of output
- */
-
-
- /* this adds products to distinct columns (at ix+iy) of W
+ * of output.
+ *
+ * this adds products to distinct columns (at ix+iy) of W
* note that each step through the loop is not dependent on
* the previous which means the compiler can easily unroll
* the loop without scheduling problems
@@ -85,27 +90,30 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
olduse = c->used;
c->used = digs;
+ {
+ register mp_digit *tmpc;
+
+ /* At this point W[] contains the sums of each column. To get the
+ * correct result we must take the extra bits from each column and
+ * carry them down
+ *
+ * Note that while this adds extra code to the multiplier it saves time
+ * since the carry propagation is removed from the above nested loop.
+ * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
+ * N^2 + N*c where c is the cost of the shifting. On very small numbers
+ * this is slower but on most cryptographic size numbers it is faster.
+ */
+ tmpc = c->dp;
+ for (ix = 1; ix < digs; ix++) {
+ W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+ *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+ }
+ *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
- /* At this point W[] contains the sums of each column. To get the
- * correct result we must take the extra bits from each column and
- * carry them down
- *
- * Note that while this adds extra code to the multiplier it saves time
- * since the carry propagation is removed from the above nested loop.
- * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
- * N^2 + N*c where c is the cost of the shifting. On very small numbers
- * this is slower but on most cryptographic size numbers it is faster.
- */
-
- for (ix = 1; ix < digs; ix++) {
- W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
- c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
- }
- c->dp[digs - 1] = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
-
- /* clear unused */
- for (; ix < olduse; ix++) {
- c->dp[ix] = 0;
+ /* clear unused */
+ for (; ix < olduse; ix++) {
+ *tmpc++ = 0;
+ }
}
mp_clamp (c);
@@ -20,14 +20,16 @@
*
* This is used in the Barrett reduction since for one of the multiplications
* only the higher digits were needed. This essentially halves the work.
+ *
+ * Based on Algorithm 14.12 on pp.595 of HAC.
*/
int
fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
{
- int oldused, newused, res, pa, pb, ix;
- mp_word W[512];
-
+ int oldused, newused, res, pa, pb, ix;
+ mp_word W[512];
+ /* calculate size of product and allocate more space if required */
newused = a->used + b->used + 1;
if (c->alloc < newused) {
if ((res = mp_grow (c, newused)) != MP_OKAY) {
@@ -38,7 +40,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
/* like the other comba method we compute the columns first */
pa = a->used;
pb = b->used;
- memset (&W[digs], 0, (pa + pb + 1 - digs) * sizeof (mp_word));
+ memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word));
for (ix = 0; ix < pa; ix++) {
{
register mp_digit tmpx, *tmpy;
@@ -75,8 +77,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
}
- c->dp[(pa + pb + 1) - 1] =
- (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
+ c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
for (; ix < oldused; ix++) {
c->dp[ix] = 0;
Oops, something went wrong.

0 comments on commit b66471f

Please sign in to comment.