Skip to content

Commit

Permalink
added libtommath-0.18
Browse files Browse the repository at this point in the history
  • Loading branch information
Tom St Denis authored and sjaeckel committed Jul 15, 2010
1 parent fd181cc commit 0ef44ce
Show file tree
Hide file tree
Showing 108 changed files with 10,043 additions and 2,693 deletions.
Binary file modified bn.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion bn.tex
@@ -1,7 +1,7 @@
\documentclass[]{article} \documentclass[]{article}
\begin{document} \begin{document}


\title{LibTomMath v0.17 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org } \title{LibTomMath v0.18 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
\author{Tom St Denis \\ tomstdenis@iahu.ca} \author{Tom St Denis \\ tomstdenis@iahu.ca}
\maketitle \maketitle
\newpage \newpage
Expand Down
87 changes: 44 additions & 43 deletions bn_fast_mp_montgomery_reduce.c
Expand Up @@ -14,7 +14,7 @@
*/ */
#include <tommath.h> #include <tommath.h>


/* computes xR^-1 == x (mod N) via Montgomery Reduction /* computes xR**-1 == x (mod N) via Montgomery Reduction
* *
* This is an optimized implementation of mp_montgomery_reduce * This is an optimized implementation of mp_montgomery_reduce
* which uses the comba method to quickly calculate the columns of the * which uses the comba method to quickly calculate the columns of the
Expand All @@ -23,76 +23,77 @@
* Based on Algorithm 14.32 on pp.601 of HAC. * Based on Algorithm 14.32 on pp.601 of HAC.
*/ */
int int
fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp) fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
{ {
int ix, res, olduse; int ix, res, olduse;
mp_word W[MP_WARRAY]; mp_word W[MP_WARRAY];


/* get old used count */ /* get old used count */
olduse = a->used; olduse = x->used;


/* grow a as required */ /* grow a as required */
if (a->alloc < m->used + 1) { if (x->alloc < n->used + 1) {
if ((res = mp_grow (a, m->used + 1)) != MP_OKAY) { if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
return res; return res;
} }
} }


{ {
register mp_word *_W; register mp_word *_W;
register mp_digit *tmpa; register mp_digit *tmpx;


_W = W; _W = W;
tmpa = a->dp; tmpx = x->dp;


/* copy the digits of a into W[0..a->used-1] */ /* copy the digits of a into W[0..a->used-1] */
for (ix = 0; ix < a->used; ix++) { for (ix = 0; ix < x->used; ix++) {
*_W++ = *tmpa++; *_W++ = *tmpx++;
} }


/* zero the high words of W[a->used..m->used*2] */ /* zero the high words of W[a->used..m->used*2] */
for (; ix < m->used * 2 + 1; ix++) { for (; ix < n->used * 2 + 1; ix++) {
*_W++ = 0; *_W++ = 0;
} }
} }


for (ix = 0; ix < m->used; ix++) { for (ix = 0; ix < n->used; ix++) {
/* ui = ai * m' mod b /* mu = ai * m' mod b
* *
* We avoid a double precision multiplication (which isn't required) * We avoid a double precision multiplication (which isn't required)
* by casting the value down to a mp_digit. Note this requires that W[ix-1] have * by casting the value down to a mp_digit. Note this requires
* the carry cleared (see after the inner loop) * that W[ix-1] have the carry cleared (see after the inner loop)
*/ */
register mp_digit ui; register mp_digit mu;
ui = (((mp_digit) (W[ix] & MP_MASK)) * mp) & MP_MASK; mu = (((mp_digit) (W[ix] & MP_MASK)) * rho) & MP_MASK;


/* a = a + ui * m * b^i /* a = a + mu * m * b**i
* *
* This is computed in place and on the fly. The multiplication * This is computed in place and on the fly. The multiplication
* by b^i is handled by offseting which columns the results * by b**i is handled by offseting which columns the results
* are added to. * are added to.
* *
* Note the comba method normally doesn't handle carries in the inner loop * Note the comba method normally doesn't handle carries in the
* In this case we fix the carry from the previous column since the Montgomery * inner loop In this case we fix the carry from the previous
* reduction requires digits of the result (so far) [see above] to work. This is * column since the Montgomery reduction requires digits of the
* handled by fixing up one carry after the inner loop. The carry fixups are done * result (so far) [see above] to work. This is
* in order so after these loops the first m->used words of W[] have the carries * handled by fixing up one carry after the inner loop. The
* fixed * carry fixups are done in order so after these loops the
* first m->used words of W[] have the carries fixed
*/ */
{ {
register int iy; register int iy;
register mp_digit *tmpx; register mp_digit *tmpn;
register mp_word *_W; register mp_word *_W;


/* alias for the digits of the modulus */ /* alias for the digits of the modulus */
tmpx = m->dp; tmpn = n->dp;


/* Alias for the columns set by an offset of ix */ /* Alias for the columns set by an offset of ix */
_W = W + ix; _W = W + ix;


/* inner loop */ /* inner loop */
for (iy = 0; iy < m->used; iy++) { for (iy = 0; iy < n->used; iy++) {
*_W++ += ((mp_word) ui) * ((mp_word) * tmpx++); *_W++ += ((mp_word) mu) * ((mp_word) * tmpn++);
} }
} }


Expand All @@ -102,44 +103,44 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)




{ {
register mp_digit *tmpa; register mp_digit *tmpx;
register mp_word *_W, *_W1; register mp_word *_W, *_W1;


/* nox fix rest of carries */ /* nox fix rest of carries */
_W1 = W + ix; _W1 = W + ix;
_W = W + ++ix; _W = W + ++ix;


for (; ix <= m->used * 2 + 1; ix++) { for (; ix <= n->used * 2 + 1; ix++) {
*_W++ += *_W1++ >> ((mp_word) DIGIT_BIT); *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
} }


/* copy out, A = A/b^n /* copy out, A = A/b**n
* *
* The result is A/b^n but instead of converting from an array of mp_word * The result is A/b**n but instead of converting from an
* to mp_digit than calling mp_rshd we just copy them in the right * array of mp_word to mp_digit than calling mp_rshd
* order * we just copy them in the right order
*/ */
tmpa = a->dp; tmpx = x->dp;
_W = W + m->used; _W = W + n->used;


for (ix = 0; ix < m->used + 1; ix++) { for (ix = 0; ix < n->used + 1; ix++) {
*tmpa++ = *_W++ & ((mp_word) MP_MASK); *tmpx++ = *_W++ & ((mp_word) MP_MASK);
} }


/* zero oldused digits, if the input a was larger than /* zero oldused digits, if the input a was larger than
* m->used+1 we'll have to clear the digits */ * m->used+1 we'll have to clear the digits */
for (; ix < olduse; ix++) { for (; ix < olduse; ix++) {
*tmpa++ = 0; *tmpx++ = 0;
} }
} }


/* set the max used and clamp */ /* set the max used and clamp */
a->used = m->used + 1; x->used = n->used + 1;
mp_clamp (a); mp_clamp (x);


/* if A >= m then A = A - m */ /* if A >= m then A = A - m */
if (mp_cmp_mag (a, m) != MP_LT) { if (mp_cmp_mag (x, n) != MP_LT) {
return s_mp_sub (a, m, a); return s_mp_sub (x, n, x);
} }
return MP_OKAY; return MP_OKAY;
} }
48 changes: 22 additions & 26 deletions bn_fast_s_mp_sqr.c
Expand Up @@ -16,15 +16,17 @@


/* fast squaring /* fast squaring
* *
* This is the comba method where the columns of the product are computed first * This is the comba method where the columns of the product
* then the carries are computed. This has the effect of making a very simple * are computed first then the carries are computed. This
* inner loop that is executed the most * has the effect of making a very simple inner loop that
* is executed the most
* *
* W2 represents the outer products and W the inner. * W2 represents the outer products and W the inner.
* *
* A further optimizations is made because the inner products are of the form * A further optimizations is made because the inner
* "A * B * 2". The *2 part does not need to be computed until the end which is * products are of the form "A * B * 2". The *2 part does
* good because 64-bit shifts are slow! * not need to be computed until the end which is good
* because 64-bit shifts are slow!
* *
* Based on Algorithm 14.16 on pp.597 of HAC. * Based on Algorithm 14.16 on pp.597 of HAC.
* *
Expand All @@ -48,26 +50,15 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
* Note that there are two buffers. Since squaring requires * Note that there are two buffers. Since squaring requires
* a outter and inner product and the inner product requires * a outter and inner product and the inner product requires
* computing a product and doubling it (a relatively expensive * computing a product and doubling it (a relatively expensive
* op to perform n^2 times if you don't have to) the inner and * op to perform n**2 times if you don't have to) the inner and
* outer products are computed in different buffers. This way * outer products are computed in different buffers. This way
* the inner product can be doubled using n doublings instead of * the inner product can be doubled using n doublings instead of
* n^2 * n**2
*/ */
memset (W, 0, newused * sizeof (mp_word)); memset (W, 0, newused * sizeof (mp_word));
memset (W2, 0, newused * sizeof (mp_word)); memset (W2, 0, newused * sizeof (mp_word));


/* note optimization /* This computes the inner product. To simplify the inner N**2 loop
* values in W2 are only written in even locations which means
* we can collapse the array to 256 words [and fixup the memset above]
* provided we also fix up the summations below. Ideally
* the fixup loop should be unrolled twice to handle the even/odd
* cases, and then a final step to handle odd cases [e.g. newused == odd]
*
* This will not only save ~8*256 = 2KB of stack but lower the number of
* operations required to finally fix up the columns
*/

/* This computes the inner product. To simplify the inner N^2 loop
* the multiplication by two is done afterwards in the N loop. * the multiplication by two is done afterwards in the N loop.
*/ */
for (ix = 0; ix < pa; ix++) { for (ix = 0; ix < pa; ix++) {
Expand Down Expand Up @@ -101,28 +92,33 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
} }


/* setup dest */ /* setup dest */
olduse = b->used; olduse = b->used;
b->used = newused; b->used = newused;


/* double first value, since the inner products are half of what they should be */
W[0] += W[0] + W2[0];

/* now compute digits */ /* now compute digits */
{ {
register mp_digit *tmpb; register mp_digit *tmpb;


tmpb = b->dp; /* double first value, since the inner products are
* half of what they should be
*/
W[0] += W[0] + W2[0];


tmpb = b->dp;
for (ix = 1; ix < newused; ix++) { for (ix = 1; ix < newused; ix++) {
/* double/add next digit */ /* double/add next digit */
W[ix] += W[ix] + W2[ix]; W[ix] += W[ix] + W2[ix];


W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT)); W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
*tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
} }
/* set the last value. Note even if the carry is zero
* this is required since the next step will not zero
* it if b originally had a value at b->dp[2*a.used]
*/
*tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK)); *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));


/* clear high */ /* clear high digits */
for (; ix < olduse; ix++) { for (; ix < olduse; ix++) {
*tmpb++ = 0; *tmpb++ = 0;
} }
Expand Down
2 changes: 1 addition & 1 deletion bn_mp_2expt.c
Expand Up @@ -14,7 +14,7 @@
*/ */
#include <tommath.h> #include <tommath.h>


/* computes a = 2^b /* computes a = 2**b
* *
* Simple algorithm which zeroes the int, grows it then just sets one bit * Simple algorithm which zeroes the int, grows it then just sets one bit
* as required. * as required.
Expand Down
2 changes: 1 addition & 1 deletion bn_mp_copy.c
Expand Up @@ -21,7 +21,7 @@ mp_copy (mp_int * a, mp_int * b)
int res, n; int res, n;


/* if dst == src do nothing */ /* if dst == src do nothing */
if (a == b || a->dp == b->dp) { if (a == b) {
return MP_OKAY; return MP_OKAY;
} }


Expand Down
4 changes: 4 additions & 0 deletions bn_mp_count_bits.c
Expand Up @@ -21,11 +21,15 @@ mp_count_bits (mp_int * a)
int r; int r;
mp_digit q; mp_digit q;


/* shortcut */
if (a->used == 0) { if (a->used == 0) {
return 0; return 0;
} }


/* get number of digits and add that */
r = (a->used - 1) * DIGIT_BIT; r = (a->used - 1) * DIGIT_BIT;

/* take the last digit and count the bits in it */
q = a->dp[a->used - 1]; q = a->dp[a->used - 1];
while (q > ((mp_digit) 0)) { while (q > ((mp_digit) 0)) {
++r; ++r;
Expand Down
3 changes: 1 addition & 2 deletions bn_mp_div_2d.c
Expand Up @@ -14,7 +14,7 @@
*/ */
#include <tommath.h> #include <tommath.h>


/* shift right by a certain bit count (store quotient in c, remainder in d) */ /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
int int
mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d) mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
{ {
Expand Down Expand Up @@ -81,7 +81,6 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
} }
} }
mp_clamp (c); mp_clamp (c);
res = MP_OKAY;
if (d != NULL) { if (d != NULL) {
mp_exch (&t, d); mp_exch (&t, d);
} }
Expand Down
64 changes: 64 additions & 0 deletions bn_mp_div_3.c
@@ -0,0 +1,64 @@
/* LibTomMath, multiple-precision integer library -- Tom St Denis
*
* LibTomMath is library that provides for multiple-precision
* integer arithmetic as well as number theoretic functionality.
*
* The library is designed directly after the MPI library by
* Michael Fromberger but has been written from scratch with
* additional optimizations in place.
*
* The library is free for all purposes without any express
* guarantee it works.
*
* Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
*/
#include <tommath.h>

/* divide by three (based on routine from MPI and the GMP manual) */
int
mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
{
mp_int q;
mp_word w, t;
mp_digit b;
int res, ix;

/* b = 2**DIGIT_BIT / 3 */
b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);

if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
return res;
}

q.used = a->used;
q.sign = a->sign;
w = 0;
for (ix = a->used - 1; ix >= 0; ix--) {
w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);

if (w >= 3) {
t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
w -= (t << ((mp_word)1)) + t;
while (w >= 3) {
t += 1;
w -= 3;
}
} else {
t = 0;
}
q.dp[ix] = t;
}

if (d != NULL) {
*d = w;
}

if (c != NULL) {
mp_clamp(&q);
mp_exch(&q, c);
}
mp_clear(&q);

return res;
}

0 comments on commit 0ef44ce

Please sign in to comment.