Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

added libtommath-0.01

  • Loading branch information...
commit 390fa39dc519551eb3c6e93fe3fa52a9ec9d9833 0 parents
Tom St Denis authored sjaeckel committed
3  b.bat
@@ -0,0 +1,3 @@
+nasm -f coff timer.asm
+gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER demo.c bn.c timer.o -o demo
+gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER demo.c mtest/mpi.c timer.o -o mpidemo
1,983 bn.c
@@ -0,0 +1,1983 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include "bn.h"
+
+/* chars used in radix conversions */
+static const char *s_rmap =
+ "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+
+#undef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#undef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+
+/* init a new bigint */
+int mp_init(mp_int *a)
+{
+ a->dp = calloc(sizeof(mp_digit), 16);
+ if (a->dp == NULL) {
+ return MP_MEM;
+ }
+ a->used = 0;
+ a->alloc = 16;
+ a->sign = MP_ZPOS;
+ return MP_OKAY;
+}
+
+/* clear one (frees) */
+void mp_clear(mp_int *a)
+{
+ if (a->dp != NULL) {
+ memset(a->dp, 0, sizeof(mp_digit) * a->alloc);
+ free(a->dp);
+ a->dp = NULL;
+ }
+}
+
+/* grow as required */
+static int mp_grow(mp_int *a, int size)
+{
+ int i;
+
+ /* if the alloc size is smaller alloc more ram */
+ if (a->alloc < size) {
+ size += 16 - (size & 15); /* ensure its to the next multiple of 16 words */
+ a->dp = realloc(a->dp, sizeof(mp_digit) * size);
+ if (a->dp == NULL) {
+ return MP_MEM;
+ }
+ i = a->alloc;
+ a->alloc = size;
+
+ /* zero top words */
+ for (; i < size; i++) {
+ a->dp[i] = 0;
+ }
+ }
+ return MP_OKAY;
+}
+
+/* shrink a bignum */
+int mp_shrink(mp_int *a)
+{
+ if (a->alloc != a->used) {
+ if ((a->dp = realloc(a->dp, sizeof(mp_digit) * a->used)) == NULL) {
+ return MP_MEM;
+ }
+ a->alloc = a->used;
+ }
+ return MP_OKAY;
+}
+
+/* trim unused digits */
+static void mp_clamp(mp_int *a)
+{
+ while (a->used > 0 && a->dp[a->used-1] == 0) --(a->used);
+ if (a->used == 0) {
+ a->sign = MP_ZPOS;
+ }
+}
+
+/* set to zero */
+void mp_zero(mp_int *a)
+{
+ a->sign = MP_ZPOS;
+ a->used = 0;
+ memset(a->dp, 0, sizeof(mp_digit) * a->alloc);
+}
+
+/* set to a digit */
+void mp_set(mp_int *a, mp_digit b)
+{
+ mp_zero(a);
+ a->dp[0] = b & MP_MASK;
+ a->used = 1;
+}
+
+/* set a 32-bit const */
+int mp_set_int(mp_int *a, unsigned long b)
+{
+ int res, x;
+ if ((res = mp_grow(a, 32/DIGIT_BIT + 1)) != MP_OKAY) {
+ return res;
+ }
+ /* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */
+ for (x = 0; x < 8; x++) {
+ mp_mul_2d(a, 4, a);
+ a->dp[0] |= (b>>28)&15;
+ b <<= 4;
+ }
+ mp_clamp(a);
+ return MP_OKAY;
+}
+
+/* init a mp_init and grow it to a given size */
+int mp_init_size(mp_int *a, int size)
+{
+ int res;
+
+ if ((res = mp_init(a)) != MP_OKAY) {
+ return res;
+ }
+ return mp_grow(a, size);
+}
+
+/* copy, b = a */
+int mp_copy(mp_int *a, mp_int *b)
+{
+ int res, n;
+
+ /* if dst == src do nothing */
+ if (a->dp == b->dp)
+ return MP_OKAY;
+
+ /* grow dest */
+ if ((res = mp_grow(b, a->used)) != MP_OKAY) {
+ return res;
+ }
+
+ mp_zero(b);
+ b->used = a->used;
+ b->sign = a->sign;
+ for (n = 0; n < a->used; n++) {
+ b->dp[n] = a->dp[n];
+ }
+ return MP_OKAY;
+}
+
+/* creates "a" then copies b into it */
+int mp_init_copy(mp_int *a, mp_int *b)
+{
+ int res;
+
+ if ((res = mp_init(a)) != MP_OKAY) {
+ return res;
+ }
+ return mp_copy(b, a);
+}
+
+/* compare maginitude of two ints (unsigned) */
+static int s_mp_cmp(mp_int *a, mp_int *b)
+{
+ int n;
+
+ /* compare based on # of non-zero digits */
+ if (a->used > b->used) {
+ return MP_GT;
+ } else if (a->used < b->used) {
+ return MP_LT;
+ }
+
+ /* compare based on digits */
+ for (n = a->used - 1; n >= 0; n--) {
+ if (a->dp[n] > b->dp[n]) {
+ return MP_GT;
+ } else if (a->dp[n] < b->dp[n]) {
+ return MP_LT;
+ }
+ }
+ return MP_EQ;
+}
+
+/* compare two ints (signed)*/
+int mp_cmp(mp_int *a, mp_int *b)
+{
+ /* compare based on sign */
+ if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
+ return MP_LT;
+ } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
+ return MP_GT;
+ }
+ return s_mp_cmp(a, b);
+}
+
+/* compare a digit */
+int mp_cmp_d(mp_int *a, mp_digit b)
+{
+ if (a->sign == MP_NEG) {
+ return MP_LT;
+ }
+
+ if (a->used > 1) {
+ return MP_GT;
+ }
+
+ if (a->dp[0] > b) {
+ return MP_GT;
+ } else if (a->dp[0] < b) {
+ return MP_LT;
+ } else {
+ return MP_EQ;
+ }
+}
+
+/* shift right a certain amount of digits */
+void mp_rshd(mp_int *a, int b)
+{
+ int x;
+
+ /* if b <= 0 then ignore it */
+ if (b <= 0) {
+ return;
+ }
+
+ /* if b > used then simply zero it and return */
+ if (a->used < b) {
+ mp_zero(a);
+ return;
+ }
+
+ /* shift the digits down */
+ for (x = 0; x < (a->used - b); x++) {
+ a->dp[x] = a->dp[x + b];
+ }
+
+ /* zero the top digits */
+ for (; x < a->used; x++) {
+ a->dp[x] = 0;
+ }
+ mp_clamp(a);
+}
+
+/* shift left a certain amount of digits */
+int mp_lshd(mp_int *a, int b)
+{
+ int x, res;
+
+ /* if its less than zero return */
+ if (b <= 0)
+ return MP_OKAY;
+
+ /* grow to fit the new digits */
+ if ((res = mp_grow(a, a->used + b)) != MP_OKAY) {
+ return res;
+ }
+
+ /* increment the used by the shift amount than copy upwards */
+ a->used += b;
+ for (x = a->used-1; x >= b; x--) {
+ a->dp[x] = a->dp[x - b];
+ }
+
+ /* zero the lower digits */
+ for (x = 0; x < b; x++) {
+ a->dp[x] = 0;
+ }
+ mp_clamp(a);
+ return MP_OKAY;
+}
+
+/* calc a value mod 2^b */
+int mp_mod_2d(mp_int *a, int b, mp_int *c)
+{
+ int x, res;
+
+ /* if b is <= 0 then zero the int */
+ if (b <= 0) {
+ mp_zero(c);
+ return MP_OKAY;
+ }
+
+ /* if the modulus is larger than the value than return */
+ if (b > (int)(a->used * DIGIT_BIT)) {
+ return mp_copy(a, c);
+ }
+
+ /* copy */
+ if ((res = mp_copy(a, c)) != MP_OKAY) {
+ return res;
+ }
+
+ /* zero digits above */
+ for (x = (b/DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
+ c->dp[x] = 0;
+ }
+ /* clear the digit that is not completely outside/inside the modulus */
+ c->dp[b/DIGIT_BIT] &= (mp_digit)((((mp_digit)1)<<(b % DIGIT_BIT)) - ((mp_digit)1));
+ mp_clamp(c);
+ return MP_OKAY;
+}
+
+/* shift right by a certain bit count (store quotient in c, remainder in d) */
+int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d)
+{
+ mp_digit D, r, rr;
+ int x, res;
+
+ if (d != NULL) {
+ if ((res = mp_mod_2d(a, b, d)) != MP_OKAY) {
+ return res;
+ }
+ }
+
+ /* copy */
+ if ((res = mp_copy(a, c)) != MP_OKAY) {
+ return res;
+ }
+
+ /* shift by as many digits in the bit count */
+ mp_rshd(c, b/DIGIT_BIT);
+
+ /* shift any bit count < DIGIT_BIT */
+ D = (mp_digit)(b % DIGIT_BIT);
+ if (D != 0) {
+ r = 0;
+ for (x = c->used - 1; x >= 0; x--) {
+ rr = c->dp[x] & ((mp_digit)((1U<<D)-1U));
+ c->dp[x] = (c->dp[x] >> D) | (r << (DIGIT_BIT-D));
+ r = rr;
+ }
+ }
+ mp_clamp(c);
+ return MP_OKAY;
+}
+
+/* shift left by a certain bit count */
+int mp_mul_2d(mp_int *a, int b, mp_int *c)
+{
+ mp_digit d, r, rr;
+ int x, res;
+
+ /* copy */
+ if ((res = mp_copy(a, c)) != MP_OKAY) {
+ return res;
+ }
+
+ if ((res = mp_grow(c, c->used + b/DIGIT_BIT + 1)) != MP_OKAY) {
+ return res;
+ }
+
+ /* shift by as many digits in the bit count */
+ if ((res = mp_lshd(c, b/DIGIT_BIT)) != MP_OKAY) {
+ return res;
+ }
+ c->used = c->alloc;
+
+ /* shift any bit count < DIGIT_BIT */
+ d = (mp_digit)(b % DIGIT_BIT);
+ if (d != 0) {
+ r = 0;
+ for (x = 0; x < a->used; x++) {
+ rr = (c->dp[x] >> (DIGIT_BIT - d)) & ((mp_digit)((1U<<d)-1U));
+ c->dp[x] = ((c->dp[x] << d) | r) & MP_MASK;
+ r = rr;
+ }
+ }
+ mp_clamp(c);
+ return MP_OKAY;
+}
+
+/* b = a/2 */
+int mp_div_2(mp_int *a, mp_int *b)
+{
+ return mp_div_2d(a, 1, b, NULL);
+}
+
+/* b = a*2 */
+int mp_mul_2(mp_int *a, mp_int *b)
+{
+ return mp_mul_2d(a, 1, b);
+}
+
+/* low level addition */
+static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
+{
+ mp_int t, *x;
+ int res, min, max, i;
+ mp_digit u;
+
+ /* find sizes */
+ if (a->used > b->used) {
+ min = b->used;
+ max = a->used;
+ x = a;
+ } else if (a->used < b->used) {
+ min = a->used;
+ max = b->used;
+ x = b;
+ } else {
+ min = max = a->used;
+ x = NULL;
+ }
+
+ /* init result */
+ if ((res = mp_init_size(&t, max+1)) != MP_OKAY) {
+ return res;
+ }
+ t.used = max+1;
+
+ /* add digits from lower part */
+ u = 0;
+ for (i = 0; i < min; i++) {
+ t.dp[i] = a->dp[i] + b->dp[i] + u;
+ u = (t.dp[i] >> DIGIT_BIT) & 1;
+ t.dp[i] &= MP_MASK;
+ }
+
+ /* now copy higher words if any */
+ if (min != max) {
+ for (; i < max; i++) {
+ t.dp[i] = x->dp[i] + u;
+ u = (t.dp[i] >> DIGIT_BIT) & 1;
+ t.dp[i] &= MP_MASK;
+ }
+ }
+
+ /* add carry */
+ t.dp[i] = u;
+
+ mp_clamp(&t);
+ if ((res = mp_copy(&t, c)) != MP_OKAY) {
+ mp_clear(&t);
+ return res;
+ }
+ mp_clear(&t);
+ return MP_OKAY;
+}
+
+/* low level subtraction (assumes a > b) */
+static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
+{
+ mp_int t;
+ int res, min, max, i;
+ mp_digit u;
+
+ /* find sizes */
+ min = b->used;
+ max = a->used;
+
+ /* init result */
+ if ((res = mp_init_size(&t, max+1)) != MP_OKAY) {
+ return res;
+ }
+ t.used = max+1;
+
+ /* sub digits from lower part */
+ u = 0;
+ for (i = 0; i < min; i++) {
+ t.dp[i] = a->dp[i] - (b->dp[i] + u);
+ u = (t.dp[i] >> DIGIT_BIT) & 1;
+ t.dp[i] &= MP_MASK;
+ }
+
+ /* now copy higher words if any */
+ if (min != max) {
+ for (; i < max; i++) {
+ t.dp[i] = a->dp[i] - u;
+ u = (t.dp[i] >> DIGIT_BIT) & 1;
+ t.dp[i] &= MP_MASK;
+ }
+ }
+
+ mp_clamp(&t);
+ if ((res = mp_copy(&t, c)) != MP_OKAY) {
+ mp_clear(&t);
+ return res;
+ }
+
+ mp_clear(&t);
+ return MP_OKAY;
+}
+
+/* low level multiplication */
+#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
+
+/* fast multiplier */
+/* multiplies |a| * |b| and only computes upto digs digits of result */
+static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
+{
+ mp_int t;
+ int res, pa, pb, ix, iy;
+ mp_word W[512];
+ mp_digit tmpx, *tmpt, *tmpy;
+
+// printf("\nHOLA\n");
+
+ if ((res = mp_init_size(&t, digs)) != MP_OKAY) {
+ return res;
+ }
+ t.used = digs;
+
+ /* clear temp buf */
+ memset(W, 0, digs*sizeof(mp_word));
+
+ pa = a->used;
+ for (ix = 0; ix < pa; ix++) {
+ pb = MIN(b->used, digs - ix);
+ tmpx = a->dp[ix];
+ tmpt = &(t.dp[ix]);
+ tmpy = b->dp;
+ for (iy = 0; iy < pb; iy++) {
+ W[iy+ix] += ((mp_word)tmpx) * ((mp_word)*tmpy++);
+ }
+ }
+
+ /* now convert the array W downto what we need */
+ for (ix = 1; ix < digs; ix++) {
+ W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
+ t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+ }
+ t.dp[digs-1] = W[digs-1] & ((mp_word)MP_MASK);
+
+ mp_clamp(&t);
+ if ((res = mp_copy(&t, c)) != MP_OKAY) {
+ mp_clear(&t);
+ return res;
+ }
+ mp_clear(&t);
+
+ return MP_OKAY;
+}
+
+/* multiplies |a| * |b| and only computes upto digs digits of result */
+static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
+{
+ mp_int t;
+ int res, pa, pb, ix, iy;
+ mp_digit u;
+ mp_word r;
+ mp_digit tmpx, *tmpt, *tmpy;
+
+ /* can we use the fast multiplier? */
+ if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
+ return fast_s_mp_mul_digs(a,b,c,digs);
+ }
+
+ if ((res = mp_init_size(&t, digs)) != MP_OKAY) {
+ return res;
+ }
+ t.used = digs;
+
+ pa = a->used;
+ for (ix = 0; ix < pa; ix++) {
+ u = 0;
+ pb = MIN(b->used, digs - ix);
+ tmpx = a->dp[ix];
+ tmpt = &(t.dp[ix]);
+ tmpy = b->dp;
+ for (iy = 0; iy < pb; iy++) {
+ r = ((mp_word)*tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word)u);
+ *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK));
+ u = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
+ }
+ if (ix+iy<digs)
+ *tmpt = u;
+ }
+
+ mp_clamp(&t);
+ if ((res = mp_copy(&t, c)) != MP_OKAY) {
+ mp_clear(&t);
+ return res;
+ }
+ mp_clear(&t);
+
+ return MP_OKAY;
+}
+
+static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
+{
+ mp_int t;
+ int res, pa, pb, ix, iy;
+ mp_word W[512];
+ mp_digit tmpx, *tmpt, *tmpy;
+
+ if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) {
+ return res;
+ }
+ t.used = a->used + b->used + 1;
+
+ pa = a->used;
+ pb = b->used;
+ memset(W, 0, (pa + pb + 1) * sizeof(mp_word));
+ for (ix = 0; ix < pa; ix++) {
+ tmpx = a->dp[ix];
+ tmpt = &(t.dp[digs]);
+ tmpy = b->dp + (digs - ix);
+ for (iy = digs - ix; iy < pb; iy++) {
+ W[ix+iy] += ((mp_word)tmpx) * ((mp_word)*tmpy++);
+ }
+ }
+
+ /* now convert the array W downto what we need */
+ for (ix = 1; ix < (pa+pb+1); ix++) {
+ W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
+ t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+ }
+ t.dp[(pa+pb+1)-1] = W[(pa+pb+1)-1] & ((mp_word)MP_MASK);
+
+
+ mp_clamp(&t);
+ if ((res = mp_copy(&t, c)) != MP_OKAY) {
+ mp_clear(&t);
+ return res;
+ }
+ mp_clear(&t);
+
+ return MP_OKAY;
+}
+
+/* multiplies |a| * |b| and does not compute the lower digs digits
+ * [meant to get the higher part of the product]
+ */
+static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
+{
+ mp_int t;
+ int res, pa, pb, ix, iy;
+ mp_digit u;
+ mp_word r;
+ mp_digit tmpx, *tmpt, *tmpy;
+
+ /* can we use the fast multiplier? */
+ if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
+ return fast_s_mp_mul_high_digs(a,b,c,digs);
+ }
+
+ if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) {
+ return res;
+ }
+ t.used = a->used + b->used + 1;
+
+ pa = a->used;
+ pb = b->used;
+ for (ix = 0; ix < pa; ix++) {
+ u = 0;
+ tmpx = a->dp[ix];
+ tmpt = &(t.dp[digs]);
+ tmpy = b->dp + (digs - ix);
+ for (iy = digs - ix; iy < pb; iy++) {
+ r = ((mp_word)*tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word)u);
+ *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK));
+ u = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
+ }
+ *tmpt = u;
+ }
+ mp_clamp(&t);
+ if ((res = mp_copy(&t, c)) != MP_OKAY) {
+ mp_clear(&t);
+ return res;
+ }
+ mp_clear(&t);
+
+ return MP_OKAY;
+}
+
+/* fast squaring */
+static int fast_s_mp_sqr(mp_int *a, mp_int *b)
+{
+ mp_int t;
+ int res, ix, iy, pa;
+ mp_word r, W[512];
+ mp_digit tmpx, *tmpy;
+
+ pa = a->used;
+ if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) {
+ return res;
+ }
+ t.used = pa + pa + 1;
+
+ /* zero temp buffer */
+ memset(W, 0, (pa+pa+1)*sizeof(mp_word));
+
+ for (ix = 0; ix < pa; ix++) {
+ W[ix+ix] += ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
+ tmpx = a->dp[ix];
+ tmpy = &(a->dp[ix+1]);
+ for (iy = ix + 1; iy < pa; iy++) {
+ r = ((mp_word)tmpx) * ((mp_word)*tmpy++);
+ W[ix+iy] += r + r;
+ }
+ }
+ for (ix = 1; ix < (pa+pa+1); ix++) {
+ W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
+ t.dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+ }
+ t.dp[(pa+pa+1)-1] = W[(pa+pa+1)-1] & ((mp_word)MP_MASK);
+
+ mp_clamp(&t);
+ if ((res = mp_copy(&t, b)) != MP_OKAY) {
+ mp_clear(&t);
+ return res;
+ }
+ mp_clear(&t);
+ return MP_OKAY;
+}
+
+/* low level squaring, b = a*a */
+static int s_mp_sqr(mp_int *a, mp_int *b)
+{
+ mp_int t;
+ int res, ix, iy, pa;
+ mp_word r, u;
+ mp_digit tmpx, *tmpt;
+
+ /* can we use the fast multiplier? */
+ if (((a->used * 2 + 1) < 512) && a->used < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT) - 1))) {
+ return fast_s_mp_sqr(a,b);
+ }
+
+ pa = a->used;
+ if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) {
+ return res;
+ }
+ t.used = pa + pa + 1;
+
+ for (ix = 0; ix < pa; ix++) {
+ r = ((mp_word)t.dp[ix+ix]) + ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
+ t.dp[ix+ix] = (mp_digit)(r & ((mp_word)MP_MASK));
+ u = (r >> ((mp_word)DIGIT_BIT));
+ tmpx = a->dp[ix];
+ tmpt = &(t.dp[ix+ix+1]);
+ for (iy = ix + 1; iy < pa; iy++) {
+ r = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
+ r = ((mp_word)*tmpt) + r + r + ((mp_word)u);
+ *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK));
+ u = (r >> ((mp_word)DIGIT_BIT));
+ }
+ r = ((mp_word)*tmpt) + u;
+ *tmpt = (mp_digit)(r & ((mp_word)MP_MASK));
+ u = (r >> ((mp_word)DIGIT_BIT));
+ /* propagate upwards */
+ ++tmpt;
+ while (u != ((mp_word)0)) {
+ r = ((mp_word)*tmpt) + ((mp_word)1);
+ *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK));
+ u = (r >> ((mp_word)DIGIT_BIT));
+ }
+ }
+
+ mp_clamp(&t);
+ if ((res = mp_copy(&t, b)) != MP_OKAY) {
+ mp_clear(&t);
+ return res;
+ }
+ mp_clear(&t);
+ return MP_OKAY;
+}
+
+/* high level addition (handles signs) */
+int mp_add(mp_int *a, mp_int *b, mp_int *c)
+{
+ int sa, sb, res;
+
+ sa = a->sign;
+ sb = b->sign;
+
+ /* handle four cases */
+ if (sa == MP_ZPOS && sb == MP_ZPOS) {
+ /* both positive */
+ res = s_mp_add(a, b, c);
+ c->sign = MP_ZPOS;
+ } else if (sa == MP_ZPOS && sb == MP_NEG) {
+ /* a + -b == a - b, but if b>a then we do it as -(b-a) */
+ if (s_mp_cmp(a, b) == MP_LT) {
+ res = s_mp_sub(b, a, c);
+ c->sign = MP_NEG;
+ } else {
+ res = s_mp_sub(a, b, c);
+ c->sign = MP_ZPOS;
+ }
+ } else if (sa == MP_NEG && sb == MP_ZPOS) {
+ /* -a + b == b - a, but if a>b then we do it as -(a-b) */
+ if (s_mp_cmp(a, b) == MP_GT) {
+ res = s_mp_sub(a, b, c);
+ c->sign = MP_NEG;
+ } else {
+ res = s_mp_sub(b, a, c);
+ c->sign = MP_ZPOS;
+ }
+ } else {
+ /* -a + -b == -(a + b) */
+ res = s_mp_add(a, b, c);
+ c->sign = MP_NEG;
+ }
+ return res;
+}
+
+/* high level subtraction (handles signs) */
+int mp_sub(mp_int *a, mp_int *b, mp_int *c)
+{
+ int sa, sb, res;
+
+ sa = a->sign;
+ sb = b->sign;
+
+ /* handle four cases */
+ if (sa == MP_ZPOS && sb == MP_ZPOS) {
+ /* both positive, a - b, but if b>a then we do -(b - a) */
+ if (s_mp_cmp(a, b) == MP_LT) {
+ /* b>a */
+ res = s_mp_sub(b, a, c);
+ c->sign = MP_NEG;
+ } else {
+ res = s_mp_sub(a, b, c);
+ c->sign = MP_ZPOS;
+ }
+ } else if (sa == MP_ZPOS && sb == MP_NEG) {
+ /* a - -b == a + b */
+ res = s_mp_add(a, b, c);
+ c->sign = MP_ZPOS;
+ } else if (sa == MP_NEG && sb == MP_ZPOS) {
+ /* -a - b == -(a + b) */
+ res = s_mp_add(a, b, c);
+ c->sign = MP_NEG;
+ } else {
+ /* -a - -b == b - a, but if a>b == -(a - b) */
+ if (s_mp_cmp(a, b) == MP_GT) {
+ res = s_mp_sub(a, b, c);
+ c->sign = MP_NEG;
+ } else {
+ res = s_mp_sub(b, a, c);
+ c->sign = MP_ZPOS;
+ }
+ }
+ return res;
+}
+
+/* c = |a| * |b| using Karatsuba */
+static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
+{
+ mp_int x0, x1, y0, y1, t1, t2, x0y0, x1y1;
+ int B, err, neg, x;
+
+ err = MP_MEM;
+
+ /* min # of digits */
+ B = MIN(a->used, b->used);
+
+ /* now divide in two */
+ B = B/2;
+
+ /* init copy all the temps */
+ if (mp_init_size(&x0, B) != MP_OKAY) goto ERR;
+ if (mp_init_size(&x1, a->used - B) != MP_OKAY) goto X0;
+ if (mp_init_size(&y0, B) != MP_OKAY) goto X1;
+ if (mp_init_size(&y1, b->used - B) != MP_OKAY) goto Y0;
+
+ /* init temps */
+ if (mp_init(&t1) != MP_OKAY) goto Y1;
+ if (mp_init(&t2) != MP_OKAY) goto T1;
+ if (mp_init(&x0y0) != MP_OKAY) goto T2;
+ if (mp_init(&x1y1) != MP_OKAY) goto X0Y0;
+
+ /* now shift the digits */
+ x0.sign = x1.sign = a->sign;
+ y0.sign = y1.sign = b->sign;
+
+ x0.used = y0.used = B;
+ x1.used = a->used - B;
+ y1.used = b->used - B;
+
+ for (x = 0; x < B; x++) {
+ x0.dp[x] = a->dp[x];
+ y0.dp[x] = b->dp[x];
+ }
+ for (x = B; x < a->used; x++) {
+ x1.dp[x-B] = a->dp[x];
+ }
+ for (x = B; x < b->used; x++) {
+ y1.dp[x-B] = b->dp[x];
+ }
+
+ mp_clamp(&x0);
+ mp_clamp(&x1);
+ mp_clamp(&y0);
+ mp_clamp(&y1);
+
+ /* now calc the products x0y0 and x1y1 */
+ if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1; /* x0y0 = x0*y0 */
+ if (mp_mul(&x1, &y1, &x1y1) != MP_OKAY) goto X1Y1; /* x1y1 = x1*y1 */
+
+ /* now calc x1-x0 and y1-y0 */
+ if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1Y1; /* t1 = x1 - x0 */
+ if (mp_sub(&y1, &y0, &t2) != MP_OKAY) goto X1Y1; /* t2 = y1 - y0 */
+ neg = (t1.sign == t2.sign) ? MP_ZPOS : MP_NEG;
+ if (mp_mul(&t1, &t2, &t1) != MP_OKAY) goto X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */
+ t1.sign = neg;
+
+ /* add x0y0 */
+ if (mp_add(&x0y0, &x1y1, &t2) != MP_OKAY) goto X1Y1; /* t2 = x0y0 + x1y1 */
+ if (mp_sub(&t2, &t1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+
+ /* shift by B */
+ if (mp_lshd(&t1, B) != MP_OKAY) goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+ if (mp_lshd(&x1y1, B*2) != MP_OKAY) goto X1Y1; /* x1y1 = x1y1 << 2*B */
+
+ if (mp_add(&x0y0, &t1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 */
+ if (mp_add(&t1, &x1y1, &t1) != MP_OKAY) goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */
+
+ err = mp_copy(&t1, c);
+
+X1Y1: mp_clear(&x1y1);
+X0Y0: mp_clear(&x0y0);
+T2 : mp_clear(&t2);
+T1 : mp_clear(&t1);
+Y1 : mp_clear(&y1);
+Y0 : mp_clear(&y0);
+X1 : mp_clear(&x1);
+X0 : mp_clear(&x0);
+ERR :
+ return err;
+}
+
+/* high level multiplication (handles sign) */
+int mp_mul(mp_int *a, mp_int *b, mp_int *c)
+{
+ int res;
+ if (MIN(a->used, b->used) > KARATSUBA_MUL_CUTOFF) {
+ res = mp_karatsuba_mul(a, b, c);
+ } else {
+ res = s_mp_mul(a, b, c);
+ }
+ c->sign = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+ return res;
+}
+
+/* Karatsuba squaring, computes b = a*a */
+static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
+{
+ mp_int x0, x1, t1, t2, x0x0, x1x1;
+ int B, err;
+
+ err = MP_MEM;
+
+ /* min # of digits */
+ B = a->used;
+
+ /* now divide in two */
+ B = B/2;
+
+ /* init copy all the temps */
+ if (mp_init_copy(&x0, a) != MP_OKAY) goto ERR;
+ if (mp_init_copy(&x1, a) != MP_OKAY) goto X0;
+
+ /* init temps */
+ if (mp_init(&t1) != MP_OKAY) goto X1;
+ if (mp_init(&t2) != MP_OKAY) goto T1;
+ if (mp_init(&x0x0) != MP_OKAY) goto T2;
+ if (mp_init(&x1x1) != MP_OKAY) goto X0X0;
+
+ /* now shift the digits */
+ mp_mod_2d(&x0, B*DIGIT_BIT, &x0);
+ mp_rshd(&x1, B);
+
+ /* now calc the products x0*x0 and x1*x1 */
+ if (s_mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */
+ if (s_mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */
+
+ /* now calc x1-x0 and y1-y0 */
+ if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */
+ if (s_mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */
+
+ /* add x0y0 */
+ if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1; /* t2 = x0y0 + x1y1 */
+ if (mp_sub(&t2, &t1, &t1) != MP_OKAY) goto X1X1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+
+ /* shift by B */
+ if (mp_lshd(&t1, B) != MP_OKAY) goto X1X1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+ if (mp_lshd(&x1x1, B*2) != MP_OKAY) goto X1X1; /* x1y1 = x1y1 << 2*B */
+
+ if (mp_add(&x0x0, &t1, &t1) != MP_OKAY) goto X1X1; /* t1 = x0y0 + t1 */
+ if (mp_add(&t1, &x1x1, &t1) != MP_OKAY) goto X1X1; /* t1 = x0y0 + t1 + x1y1 */
+
+ err = mp_copy(&t1, b);
+X1X1: mp_clear(&x1x1);
+X0X0: mp_clear(&x0x0);
+T2 : mp_clear(&t2);
+T1 : mp_clear(&t1);
+X1 : mp_clear(&x1);
+X0 : mp_clear(&x0);
+ERR :
+ return err;
+}
+
+/* computes b = a*a */
+int mp_sqr(mp_int *a, mp_int *b)
+{
+ int res;
+ if (a->used > KARATSUBA_SQR_CUTOFF) {
+ res = mp_karatsuba_sqr(a, b);
+ } else {
+ res = s_mp_sqr(a, b);
+ }
+ b->sign = MP_ZPOS;
+ return res;
+}
+
+
+/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder] */
+int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
+{
+ mp_int q, x, y, t1, t2;
+ int res, n, t, i, norm, neg;
+
+ /* if a < b then q=0, r = a */
+ if (s_mp_cmp(a, b) == MP_LT) {
+ if (d != NULL) {
+ res = mp_copy(a, d);
+ d->sign = a->sign;
+ } else {
+ res = MP_OKAY;
+ }
+ if (c != NULL) {
+ mp_zero(c);
+ }
+ return res;
+ }
+
+
+ if ((res = mp_init_size(&q, a->used + 2)) != MP_OKAY) {
+ return res;
+ }
+ q.used = a->used + 2;
+
+ if ((res = mp_init(&t1)) != MP_OKAY) {
+ goto __Q;
+ }
+
+ if ((res = mp_init(&t2)) != MP_OKAY) {
+ goto __T1;
+ }
+
+ if ((res = mp_init_copy(&x, a)) != MP_OKAY) {
+ goto __T2;
+ }
+
+ if ((res = mp_init_copy(&y, b)) != MP_OKAY) {
+ goto __X;
+ }
+
+ /* fix the sign */
+ neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+ x.sign = y.sign = MP_ZPOS;
+
+ /* normalize */
+ norm = 0;
+ while ((y.dp[y.used-1] & (((mp_digit)1)<<(DIGIT_BIT-1))) == ((mp_digit)0)) {
+ ++norm;
+ if ((res = mp_mul_2d(&x, 1, &x)) != MP_OKAY) {
+ goto __Y;
+ }
+ if ((res = mp_mul_2d(&y, 1, &y)) != MP_OKAY) {
+ goto __Y;
+ }
+ }
+
+ /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+ n = x.used - 1;
+ t = y.used - 1;
+
+ /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */
+ if ((res = mp_lshd(&y, n - t)) != MP_OKAY) { /* y = y*b^{n-t} */
+ goto __Y;
+ }
+
+ while (mp_cmp(&x, &y) != MP_LT) {
+ ++(q.dp[n - t]);
+ if ((res = mp_sub(&x, &y, &x)) != MP_OKAY) {
+ goto __Y;
+ }
+ }
+
+ /* reset y by shifting it back down */
+ mp_rshd(&y, n - t);
+
+ /* step 3. for i from n down to (t + 1) */
+ for (i = n; i >= (t + 1); i--) {
+ /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
+ if (x.dp[i] == y.dp[t]) {
+ q.dp[i - t - 1] = ((1UL<<DIGIT_BIT)-1UL);
+ } else {
+ mp_word tmp;
+ tmp = ((mp_word)x.dp[i]) << ((mp_word)DIGIT_BIT);
+ tmp |= ((mp_word)x.dp[i-1]);
+ tmp /= ((mp_word)y.dp[t]);
+ if (tmp > (mp_word)MP_MASK) tmp = MP_MASK;
+ q.dp[i - t - 1] = (mp_digit)(tmp & (mp_word)(MP_MASK));
+ }
+
+ /* step 3.2 while (q{i-t-1} * (yt * b + y{t-1})) > xi * b^2 + xi-1 * b + xi-2 do q{i-t-1} -= 1; */
+ q.dp[i-t-1] = (q.dp[i-t-1] + 1) & MP_MASK;
+ do {
+ q.dp[i-t-1] = (q.dp[i-t-1] - 1) & MP_MASK;
+
+ /* find left hand */
+ t1.dp[0] = (t-1 < 0) ? 0 : y.dp[t-1];
+ t1.dp[1] = y.dp[t];
+ t1.used = 2;
+ if ((res = mp_mul_d(&t1, q.dp[i-t-1], &t1)) != MP_OKAY) {
+ goto __Y;
+ }
+
+ /* find right hand */
+ t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i-2];
+ t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i-1];
+ t2.dp[2] = x.dp[i];
+ t2.used = 3;
+ } while (mp_cmp(&t1, &t2) == MP_GT);
+
+ /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
+ if ((res = mp_mul_d(&y, q.dp[i-t-1], &t1)) != MP_OKAY) {
+ goto __Y;
+ }
+
+ if ((res = mp_lshd(&t1, i - t - 1)) != MP_OKAY) {
+ goto __Y;
+ }
+
+ if ((res = mp_sub(&x, &t1, &x)) != MP_OKAY) {
+ goto __Y;
+ }
+
+ /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */
+ if (x.sign == MP_NEG && x.used != 0) {
+ if ((res = mp_copy(&y, &t1)) != MP_OKAY) {
+ goto __Y;
+ }
+ if ((res = mp_lshd(&t1, i-t-1)) != MP_OKAY) {
+ goto __Y;
+ }
+ if ((res = mp_add(&x, &t1, &x)) != MP_OKAY) {
+ goto __Y;
+ }
+
+ q.dp[i-t-1] = (q.dp[i-t-1] - 1UL) & MP_MASK;
+ }
+ }
+
+ /* now q is the quotient and x is the remainder [which we have to normalize] */
+ if (c != NULL) {
+ mp_clamp(&q);
+ mp_copy(&q, c);
+ c->sign = neg;
+ }
+
+ if (d != NULL) {
+ x.sign = a->sign;
+ mp_div_2d(&x, norm, &x, NULL);
+ mp_clamp(&x);
+ mp_copy(&x, d);
+ }
+
+ res = MP_OKAY;
+
+__Y: mp_clear(&y);
+__X: mp_clear(&x);
+__T2: mp_clear(&t2);
+__T1: mp_clear(&t1);
+__Q: mp_clear(&q);
+ return res;
+}
+
+/* single digit addition */
+int mp_add_d(mp_int *a, mp_digit b, mp_int *c)
+{
+ mp_int t;
+ int res;
+
+ if ((res = mp_init(&t)) != MP_OKAY) {
+ return res;
+ }
+ mp_set(&t, b);
+ res = mp_add(a, &t, c);
+
+ mp_clear(&t);
+ return res;
+}
+
+/* single digit subtraction */
+int mp_sub_d(mp_int *a, mp_digit b, mp_int *c)
+{
+ mp_int t;
+ int res;
+
+ if ((res = mp_init(&t)) != MP_OKAY) {
+ return res;
+ }
+ mp_set(&t, b);
+ res = mp_sub(a, &t, c);
+
+ mp_clear(&t);
+ return res;
+}
+
+/* multiply by a digit */
+int mp_mul_d(mp_int *a, mp_digit b, mp_int *c)
+{
+ int res, pa, ix;
+ mp_word r;
+ mp_digit u;
+ mp_int t;
+
+ pa = a->used;
+ if ((res = mp_init_size(&t, pa + 2)) != MP_OKAY) {
+ return res;
+ }
+ t.used = pa + 2;
+
+ u = 0;
+ for (ix = 0; ix < pa; ix++) {
+ r = ((mp_word)u) + ((mp_word)a->dp[ix]) * ((mp_word)b);
+ t.dp[ix] = (mp_digit)(r & ((mp_word)MP_MASK));
+ u = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
+ }
+ t.dp[ix] = u;
+
+ mp_clamp(&t);
+ if ((res = mp_copy(&t, c)) != MP_OKAY) {
+ mp_clear(&t);
+ return res;
+ }
+ mp_clear(&t);
+ return MP_OKAY;
+}
+
+/* single digit division */
+int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d)
+{
+ mp_int t, t2;
+ int res;
+
+ if ((res = mp_init(&t)) != MP_OKAY) {
+ return res;
+ }
+
+ if ((res = mp_init(&t2)) != MP_OKAY) {
+ mp_clear(&t);
+ return res;
+ }
+
+ mp_set(&t, b);
+ res = mp_div(a, &t, c, &t2);
+
+ if (d != NULL) {
+ *d = t2.dp[0];
+ }
+
+ mp_clear(&t);
+ mp_clear(&t2);
+ return res;
+}
+
+/* simple modular functions */
+
+/* d = a + b (mod c) */
+int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
+{
+ int res;
+
+ if ((res = mp_add(a, b, d)) != MP_OKAY) {
+ return res;
+ }
+ return mp_mod(d, c, d);
+}
+
+/* d = a - b (mod c) */
+int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
+{
+ int res;
+
+ if ((res = mp_sub(a, b, d)) != MP_OKAY) {
+ return res;
+ }
+ return mp_mod(d, c, d);
+}
+
+/* d = a * b (mod c) */
+int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
+{
+ int res;
+
+ if ((res = mp_mul(a, b, d)) != MP_OKAY) {
+ return res;
+ }
+ return mp_mod(d, c, d);
+}
+
+/* c = a * a (mod b) */
+int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c)
+{
+ int res;
+
+ if ((res = mp_sqr(a, c)) != MP_OKAY) {
+ return res;
+ }
+ return mp_mod(c, b, c);
+}
+
+/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP]
+ */
+int mp_gcd(mp_int *a, mp_int *b, mp_int *c)
+{
+ mp_int u, v, t;
+ int k, res, neg;
+
+ /* either zero than gcd is the largest */
+ if (mp_iszero(a) == 1 && mp_iszero(b) == 0) {
+ return mp_copy(b, c);
+ }
+ if (mp_iszero(a) == 0 && mp_iszero(b) == 1) {
+ return mp_copy(a, c);
+ }
+ if (mp_iszero(a) == 1 && mp_iszero(b) == 1) {
+ mp_set(c, 1);
+ return MP_OKAY;
+ }
+
+ /* if both are negative they share (-1) as a common divisor */
+ neg = (a->sign == b->sign) ? a->sign : MP_ZPOS;
+
+ if ((res = mp_init_copy(&u, a)) != MP_OKAY) {
+ return res;
+ }
+
+ if ((res = mp_init_copy(&v, b)) != MP_OKAY) {
+ goto __U;
+ }
+
+ /* must be positive for the remainder of the algorithm */
+ u.sign = v.sign = MP_ZPOS;
+
+ if ((res = mp_init(&t)) != MP_OKAY) {
+ goto __V;
+ }
+
+ /* B1. Find power of two */
+ k = 0;
+ while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) {
+ ++k;
+ mp_div_2d(&u, 1, &u, NULL);
+ mp_div_2d(&v, 1, &v, NULL);
+ }
+
+ /* B2. Initialize */
+ if ((u.dp[0] & 1) == 1) {
+ if ((res = mp_copy(&v, &t)) != MP_OKAY) {
+ goto __T;
+ }
+ t.sign = MP_NEG;
+ } else {
+ if ((res = mp_copy(&u, &t)) != MP_OKAY) {
+ goto __T;
+ }
+ }
+
+ do {
+ /* B3 (and B4). Halve t, if even */
+ while (t.used != 0 && (t.dp[0] & 1) == 0) {
+ mp_div_2d(&t, 1, &t, NULL);
+ }
+
+ /* B5. if t>0 then u=t otherwise v=-t */
+ if (t.used != 0 && t.sign != MP_NEG) {
+ if ((res = mp_copy(&t, &u)) != MP_OKAY) {
+ goto __T;
+ }
+ } else {
+ if ((res = mp_copy(&t, &v)) != MP_OKAY) {
+ goto __T;
+ }
+ v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+ }
+
+ /* B6. t = u - v, if t != 0 loop otherwise terminate */
+ if ((res = mp_sub(&u, &v, &t)) != MP_OKAY) {
+ goto __T;
+ }
+ } while (t.used != 0);
+
+ if ((res = mp_mul_2d(&u, k, &u)) != MP_OKAY) {
+ goto __T;
+ }
+
+ if ((res = mp_copy(&u, c)) != MP_OKAY) {
+ goto __T;
+ }
+ c->sign = neg;
+__T: mp_clear(&t);
+__V: mp_clear(&u);
+__U: mp_clear(&v);
+
+ return res;
+}
+
+/* computes least common multipble as a*b/(a, b) */
+int mp_lcm(mp_int *a, mp_int *b, mp_int *c)
+{
+ int res;
+ mp_int t;
+
+ if ((res = mp_init(&t)) != MP_OKAY) {
+ return res;
+ }
+
+ if ((res = mp_mul(a, b, &t)) != MP_OKAY) {
+ mp_clear(&t);
+ return res;
+ }
+
+ if ((res = mp_gcd(a, b, c)) != MP_OKAY) {
+ mp_clear(&t);
+ return res;
+ }
+
+ res = mp_div(&t, c, c, NULL);
+ mp_clear(&t);
+ return res;
+}
+
+/* computes the modular inverse via extended euclidean algorithm, that is c = 1/a mod b */
+int mp_invmod(mp_int *a, mp_int *b, mp_int *c)
+{
+ int res;
+ mp_int t1, t2, t3, u1, u2, u3, v1, v2, v3, q;
+
+ if ((res = mp_init(&t1)) != MP_OKAY) {
+ return res;
+ }
+
+ if ((res = mp_init(&t2)) != MP_OKAY) {
+ goto __T1;
+ }
+
+ if ((res = mp_init(&t3)) != MP_OKAY) {
+ goto __T2;
+ }
+
+ if ((res = mp_init(&u1)) != MP_OKAY) {
+ goto __T3;
+ }
+
+ if ((res = mp_init(&u2)) != MP_OKAY) {
+ goto __U1;
+ }
+
+ if ((res = mp_init(&u3)) != MP_OKAY) {
+ goto __U2;
+ }
+
+ if ((res = mp_init(&v1)) != MP_OKAY) {
+ goto __U3;
+ }
+
+ if ((res = mp_init(&v2)) != MP_OKAY) {
+ goto __V1;
+ }
+
+ if ((res = mp_init(&v3)) != MP_OKAY) {
+ goto __V2;
+ }
+
+ if ((res = mp_init(&q)) != MP_OKAY) {
+ goto __V3;
+ }
+
+ /* (u1, u2, u3) = (1, 0, a) */
+ mp_set(&u1, 1);
+ if ((res = mp_copy(a, &u3)) != MP_OKAY) {
+ goto __Q;
+ }
+
+ /* (v1, v2, v3) = (0, 1, b) */
+ mp_set(&u2, 1);
+ if ((res = mp_copy(b, &v3)) != MP_OKAY) {
+ goto __Q;
+ }
+
+ while (mp_iszero(&v3) == 0) {
+ if ((res = mp_div(&u3, &v3, &q, NULL)) != MP_OKAY) {
+ goto __Q;
+ }
+
+ /* (t1, t2, t3) = (u1, u2, u3) - q*(v1, v2, v3) */
+ if ((res = mp_mul(&q, &v1, &t1)) != MP_OKAY) { goto __Q; }
+ if ((res = mp_sub(&u1, &t1, &t1)) != MP_OKAY) { goto __Q; }
+ if ((res = mp_mul(&q, &v2, &t2)) != MP_OKAY) { goto __Q; }
+ if ((res = mp_sub(&u2, &t2, &t2)) != MP_OKAY) { goto __Q; }
+ if ((res = mp_mul(&q, &v3, &t3)) != MP_OKAY) { goto __Q; }
+ if ((res = mp_sub(&u3, &t3, &t3)) != MP_OKAY) { goto __Q; }
+
+ /* u = v */
+ if ((res = mp_copy(&v1, &u1)) != MP_OKAY) { goto __Q; }
+ if ((res = mp_copy(&v2, &u2)) != MP_OKAY) { goto __Q; }
+ if ((res = mp_copy(&v3, &u3)) != MP_OKAY) { goto __Q; }
+
+ /* v = t */
+ if ((res = mp_copy(&t1, &v1)) != MP_OKAY) { goto __Q; }
+ if ((res = mp_copy(&t2, &v2)) != MP_OKAY) { goto __Q; }
+ if ((res = mp_copy(&t3, &v3)) != MP_OKAY) { goto __Q; }
+ }
+
+ /* if u3 != 1, then there is no inverse */
+ if (mp_cmp_d(&u3, 1) != MP_EQ) {
+ res = MP_VAL;
+ goto __Q;
+ }
+
+ /* u1 is the inverse */
+ res = mp_copy(&u1, c);
+__Q : mp_clear(&q);
+__V3: mp_clear(&v3);
+__V2: mp_clear(&v1);
+__V1: mp_clear(&v1);
+__U3: mp_clear(&u3);
+__U2: mp_clear(&u2);
+__U1: mp_clear(&u1);
+__T3: mp_clear(&t3);
+__T2: mp_clear(&t2);
+__T1: mp_clear(&t1);
+ return res;
+}
+
+/* pre-calculate the value required for Barrett reduction
+ * For a given modulus "b" it calulates the value required in "a"
+ */
+int mp_reduce_setup(mp_int *a, mp_int *b)
+{
+ int res;
+
+ mp_set(a, 1);
+ if ((res = mp_lshd(a, b->used * 2)) != MP_OKAY) {
+ return res;
+ }
+ return mp_div(a, b, a, NULL);
+}
+
+/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup */
+int mp_reduce(mp_int *x, mp_int *m, mp_int *mu)
+{
+ mp_int q;
+ int res, um = m->used;
+
+ if((res = mp_init_copy(&q, x)) != MP_OKAY)
+ return res;
+
+ mp_rshd(&q, um - 1); /* q1 = x / b^(k-1) */
+
+ /* according to HAC this is optimization is ok */
+ if (((unsigned long)m->used) > (1UL<<(unsigned long)(DIGIT_BIT-1UL))) {
+ if ((res = mp_mul(&q, mu, &q)) != MP_OKAY) {
+ goto CLEANUP;
+ }
+ } else {
+ if ((res = s_mp_mul_high_digs(&q, mu, &q, um-1)) != MP_OKAY) {
+ goto CLEANUP;
+ }
+ }
+
+ mp_rshd(&q, um + 1); /* q3 = q2 / b^(k+1) */
+
+ /* x = x mod b^(k+1), quick (no division) */
+ mp_mod_2d(x, DIGIT_BIT * (um + 1), x);
+
+ /* q = q * m mod b^(k+1), quick (no division) */
+ s_mp_mul_digs(&q, m, &q, um + 1);
+
+ /* x = x - q */
+ if((res = mp_sub(x, &q, x)) != MP_OKAY)
+ goto CLEANUP;
+
+ /* If x < 0, add b^(k+1) to it */
+ if(mp_cmp_d(x, 0) == MP_LT) {
+ mp_set(&q, 1);
+ if((res = mp_lshd(&q, um + 1)) != MP_OKAY)
+ goto CLEANUP;
+ if((res = mp_add(x, &q, x)) != MP_OKAY)
+ goto CLEANUP;
+ }
+
+ /* Back off if it's too big */
+ while(mp_cmp(x, m) != MP_LT) {
+ if((res = s_mp_sub(x, m, x)) != MP_OKAY)
+ break;
+ }
+
+ CLEANUP:
+ mp_clear(&q);
+
+ return res;
+}
+
+int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
+{
+ mp_int M[64], res, mu;
+ mp_digit buf;
+ int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, z, winsize, tab[64];
+
+ /* find window size */
+ x = mp_count_bits(X);
+ if (x <= 18) { winsize = 2; }
+ else if (x <= 84) { winsize = 3; }
+ else if (x <= 300) { winsize = 4; }
+ else if (x <= 930) { winsize = 5; }
+ else { winsize = 6; }
+
+ /* init G array */
+ for (x = 0; x < (1<<winsize); x++) {
+ if ((err = mp_init(&M[x])) != MP_OKAY) {
+ for (y = 0; y < x; y++) {
+ mp_clear(&M[y]);
+ }
+ return err;
+ }
+ }
+
+ /* create mu, used for Barrett reduction */
+ if ((err = mp_init(&mu)) != MP_OKAY) {
+ goto __M;
+ }
+ if ((err = mp_reduce_setup(&mu, P)) != MP_OKAY) {
+ goto __MU;
+ }
+
+ /* create M table */
+ mp_set(&M[0], 1);
+ if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
+ goto __MU;
+ }
+
+ memset(tab, 0, sizeof(tab));
+ tab[0] = tab[1] = 1;
+
+ for (x = 2; x < (1 << winsize); x++) {
+ if (tab[x] == 0) {
+ if ((err = mp_mul(&M[x-1], &M[1], &M[x])) != MP_OKAY) {
+ goto __MU;
+ }
+ if ((err = mp_reduce(&M[x], P, &mu)) != MP_OKAY) {
+ goto __MU;
+ }
+ tab[x] = 1;
+
+ y = x+x;
+ z = x;
+ while (y < (1 << winsize)) {
+ tab[y] = 1;
+ if ((err = mp_sqr(&M[z], &M[y])) != MP_OKAY) {
+ goto __MU;
+ }
+ if ((err = mp_reduce(&M[y], P, &mu)) != MP_OKAY) {
+ goto __MU;
+ }
+ z = y;
+ y = y + y;
+ }
+ }
+ }
+ /* init result */
+ if ((err = mp_init(&res)) != MP_OKAY) {
+ goto __MU;
+ }
+ mp_set(&res, 1);
+
+ /* set initial mode and bit cnt */
+ mode = 0;
+ bitcnt = 0;
+ buf = 0;
+ digidx = X->used - 1;
+ bitcpy = bitbuf = 0;
+
+ bitcnt = 1;
+ for (;;) {
+ /* grab next digit as required */
+ if (--bitcnt == 0) {
+ if (digidx == -1) {
+ break;
+ }
+ buf = X->dp[digidx--];
+ bitcnt = DIGIT_BIT;
+ }
+
+ /* grab the next msb from the exponent */
+ y = (buf >> (DIGIT_BIT - 1)) & 1;
+ buf <<= 1;
+
+ /* if the bit is zero and mode == 0 then we ignore it
+ * These represent the leading zero bits before the first 1 bit
+ * in the exponent. Technically this opt is not required but it
+ * does lower the # of trivial squaring/reductions used
+ */
+ if (y == 0 && mode == 0) continue;
+
+ /* if the bit is zero and mode == 1 then we square */
+ if (y == 0 && mode == 1) {
+ if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
+ goto __RES;
+ }
+ if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
+ goto __RES;
+ }
+ continue;
+ }
+
+ /* else we add it to the window */
+ bitbuf |= (y<<(winsize-++bitcpy));
+ mode = 2;
+
+ if (bitcpy == winsize) {
+ /* ok window is filled so square as required and multiply multiply */
+ /* square first */
+ for (x = 0; x < winsize; x++) {
+ if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
+ goto __RES;
+ }
+ if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
+ goto __RES;
+ }
+ }
+
+ /* then multiply */
+ if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) {
+ goto __MU;
+ }
+ if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
+ goto __MU;
+ }
+
+ /* empty window and reset */
+ bitcpy = bitbuf = 0;
+ mode = 1;
+ }
+ }
+
+ /* if bits remain then square/multiply */
+ if (mode == 2 && bitcpy > 0) {
+ bitbuf >>= (winsize - bitcpy);
+ /* square first */
+ for (x = 0; x < bitcpy; x++) {
+ if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
+ goto __RES;
+ }
+ if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
+ goto __RES;
+ }
+ }
+
+ /* then multiply */
+ if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) {
+ goto __MU;
+ }
+ if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
+ goto __MU;
+ }
+ }
+
+ err = mp_copy(&res, Y);
+__RES: mp_clear(&res);
+__MU : mp_clear(&mu);
+__M :
+ for (x = 0; x < (1<<winsize); x++) {
+ mp_clear(&M[x]);
+ }
+ return err;
+}
+
+/* --> radix conversion <-- */
+/* reverse an array, used for radix code */
+static void reverse(unsigned char *s, int len)
+{
+ int ix, iy;
+ unsigned char t;
+
+ ix = 0;
+ iy = len - 1;
+ while (ix < iy) {
+ t = s[ix]; s[ix] = s[iy]; s[iy] = t;
+ ++ix;
+ --iy;
+ }
+}
+
+/* returns the number of bits in an int */
+int mp_count_bits(mp_int *a)
+{
+ int r;
+ mp_digit q;
+
+ if (a->used == 0) {
+ return 0;
+ }
+
+ r = (a->used - 1) * DIGIT_BIT;
+ q = a->dp[a->used - 1];
+ while (q) {
+ ++r;
+ q >>= 1UL;
+ }
+ return r;
+}
+
+/* reads a unsigned char array, assumes the msb is stored first [big endian] */
+int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c)
+{
+ int res;
+
+ mp_zero(a);
+ a->used = (c/DIGIT_BIT) + ((c % DIGIT_BIT) != 0 ? 1: 0);
+ if ((res = mp_grow(a, a->used)) != MP_OKAY) {
+ return res;
+ }
+ while (c-- > 0) {
+ if ((res = mp_mul_2d(a, 8, a)) != MP_OKAY) {
+ return res;
+ }
+
+ if (DIGIT_BIT != 7) {
+ a->dp[0] |= *b++;
+ a->used += 1;
+ } else {
+ a->dp[0] = (*b & MP_MASK);
+ a->dp[1] |= ((*b++ >> 7U) & 1);
+ a->used += 2;
+ }
+ }
+ mp_clamp(a);
+ return MP_OKAY;
+}
+
+/* read signed bin, big endian, first byte is 0==positive or 1==negative */
+int mp_read_signed_bin(mp_int *a, unsigned char *b, int c)
+{
+ int res;
+
+ if ((res = mp_read_unsigned_bin(a, b + 1, c - 1)) != MP_OKAY) {
+ return res;
+ }
+ a->sign = ((b[0] == (unsigned char)0) ? MP_ZPOS : MP_NEG);
+ return MP_OKAY;
+}
+
+/* store in unsigned [big endian] format */
+int mp_to_unsigned_bin(mp_int *a, unsigned char *b)
+{
+ int x, res;
+ mp_int t;
+
+ if ((res = mp_init_copy(&t, a)) != MP_OKAY) {
+ return res;
+ }
+
+ x = 0;
+ while (mp_iszero(&t) == 0) {
+ if (DIGIT_BIT != 7) {
+ b[x++] = (unsigned char)(t.dp[0] & 255);
+ } else {
+ b[x++] = (unsigned char)(t.dp[0] | ((t.dp[1] & 0x01) << 7));
+ }
+ mp_div_2d(&t, 8, &t, NULL);
+ }
+ reverse(b, x);
+ return MP_OKAY;
+}
+
+/* store in signed [big endian] format */
+int mp_to_signed_bin(mp_int *a, unsigned char *b)
+{
+ int res;
+
+ if ((res = mp_to_unsigned_bin(a, b+1)) != MP_OKAY) {
+ return res;
+ }
+ b[0] = (unsigned char)((a->sign == MP_ZPOS) ? 0 : 1);
+ return MP_OKAY;
+}
+
+/* get the size for an unsigned equivalent */
+int mp_unsigned_bin_size(mp_int *a)
+{
+ return (mp_count_bits(a)/8 + ((mp_count_bits(a)&7) != 0 ? 1 : 0));
+}
+
+/* get the size for an signed equivalent */
+int mp_signed_bin_size(mp_int *a)
+{
+ return 1 + (mp_count_bits(a)/8 + ((mp_count_bits(a)&7) != 0 ? 1 : 0));
+}
+
+/* read a string [ASCII] in a given radix */
+int mp_read_radix(mp_int *a, unsigned char *str, int radix)
+{
+ int y, res, neg;
+
+ if (radix < 2 || radix > 64) {
+ return MP_VAL;
+ }
+
+ if (*str == (unsigned char)'-') {
+ ++str;
+ neg = MP_NEG;
+ } else {
+ neg = MP_ZPOS;
+ }
+
+ mp_zero(a);
+ while (*str) {
+ for (y = 0; y < 64; y++) {
+ if (*str == (unsigned char)s_rmap[y]) {
+ break;
+ }
+ }
+
+ if (y < radix) {
+ if ((res = mp_mul_d(a, (mp_digit)radix, a)) != MP_OKAY) {
+ return res;
+ }
+ if ((res = mp_add_d(a, (mp_digit)y, a)) != MP_OKAY) {
+ return res;
+ }
+ } else {
+ break;
+ }
+ ++str;
+ }
+ a->sign = neg;
+ return MP_OKAY;
+}
+
+/* stores a bignum as a ASCII string in a given radix (2..64) */
+int mp_toradix(mp_int *a, unsigned char *str, int radix)
+{
+ int res, digs;
+ mp_int t;
+ mp_digit d;
+ unsigned char *_s = str;
+
+ if (radix < 2 || radix > 64) {
+ return MP_VAL;
+ }
+
+ if ((res = mp_init_copy(&t, a)) != MP_OKAY) {
+ return res;
+ }
+
+ if (t.sign == MP_NEG) {
+ ++_s;
+ *str++ = '-';
+ t.sign = MP_ZPOS;
+ }
+
+ digs = 0;
+ while (mp_iszero(&t) == 0) {
+ if ((res = mp_div_d(&t, (mp_digit)radix, &t, &d)) != MP_OKAY) {
+ return res;
+ }
+ *str++ = (unsigned char)s_rmap[d];
+ ++digs;
+ }
+ reverse(_s, digs);
+ *str++ = (unsigned char)'\0';
+ mp_clear(&t);
+ return MP_OKAY;
+}
+
225 bn.h
@@ -0,0 +1,225 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+
+#ifndef BN_H_
+#define BN_H_
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <limits.h>
+
+/* some default configurations.
+ *
+ * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits
+ * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits
+ *
+ * At the very least a mp_digit must be able to hold 7 bits
+ * [any size beyond that is ok provided it overflow the data type]
+ */
+#ifdef MP_8BIT
+ typedef unsigned char mp_digit;
+ typedef unsigned short mp_word;
+#elif defined(MP_16BIT)
+ typedef unsigned short mp_digit;
+ typedef unsigned long mp_word;
+#else
+ typedef unsigned long mp_digit;
+ typedef unsigned long long mp_word;
+ #define DIGIT_BIT 28U
+#endif
+
+#ifndef DIGIT_BIT
+ #define DIGIT_BIT ((CHAR_BIT * sizeof(mp_digit) - 1)) /* bits per digit */
+#endif
+
+#define MP_MASK ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)1))
+
+/* equalities */
+#define MP_LT -1 /* less than */
+#define MP_EQ 0 /* equal to */
+#define MP_GT 1 /* greater than */
+
+#define MP_ZPOS 0 /* positive integer */
+#define MP_NEG 1 /* negative */
+
+#define MP_OKAY 0 /* ok result */
+#define MP_MEM 1 /* out of mem */
+#define MP_VAL 2 /* invalid input */
+
+#define KARATSUBA_MUL_CUTOFF 80 /* Min. number of digits before Karatsuba multiplication is used. */
+#define KARATSUBA_SQR_CUTOFF 80 /* Min. number of digits before Karatsuba squaring is used. */
+
+typedef struct {
+ int used, alloc, sign;
+ mp_digit *dp;
+} mp_int;
+
+/* ---> init and deinit bignum functions <--- */
+
+/* init a bignum */
+int mp_init(mp_int *a);
+
+/* free a bignum */
+void mp_clear(mp_int *a);
+
+/* shrink ram required for a bignum */
+int mp_shrink(mp_int *a);
+
+/* ---> Basic Manipulations <--- */
+
+#define mp_iszero(a) (((a)->used == 0) ? 1 : 0)
+#define mp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? 1 : 0)
+#define mp_isodd(a) (((a)->used > 0 || (((a)->dp[0] & 1) == 1)) ? 1 : 0)
+
+/* set to zero */
+void mp_zero(mp_int *a);
+
+/* set to a digit */
+void mp_set(mp_int *a, mp_digit b);
+
+/* set a 32-bit const */
+int mp_set_int(mp_int *a, unsigned long b);
+
+/* init to a given number of digits */
+int mp_init_size(mp_int *a, int size);
+
+/* copy, b = a */
+int mp_copy(mp_int *a, mp_int *b);
+
+/* inits and copies, a = b */
+int mp_init_copy(mp_int *a, mp_int *b);
+
+/* ---> digit manipulation <--- */
+
+/* right shift by "b" digits */
+void mp_rshd(mp_int *a, int b);
+
+/* left shift by "b" digits */
+int mp_lshd(mp_int *a, int b);
+
+/* c = a / 2^b */
+int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d);
+
+/* b = a/2 */
+int mp_div_2(mp_int *a, mp_int *b);
+
+/* c = a * 2^b */
+int mp_mul_2d(mp_int *a, int b, mp_int *c);
+
+/* b = a*2 */
+int mp_mul_2(mp_int *a, mp_int *b);
+
+/* c = a mod 2^d */
+int mp_mod_2d(mp_int *a, int b, mp_int *c);
+
+/* ---> Basic arithmetic <--- */
+
+/* compare a to b */
+int mp_cmp(mp_int *a, mp_int *b);
+
+/* c = a + b */
+int mp_add(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a - b */
+int mp_sub(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a * b */
+int mp_mul(mp_int *a, mp_int *b, mp_int *c);
+
+/* b = a^2 */
+int mp_sqr(mp_int *a, mp_int *b);
+
+/* a/b => cb + d == a */
+int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* c == a mod b */
+#define mp_mod(a, b, c) mp_div(a, b, NULL, c)
+
+/* ---> single digit functions <--- */
+
+/* compare against a single digit */
+int mp_cmp_d(mp_int *a, mp_digit b);
+
+/* c = a + b */
+int mp_add_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* c = a - b */
+int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* c = a * b */
+int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* a/b => cb + d == a */
+int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
+
+/* c = a mod b */
+#define mp_mod_d(a,b,c) mp_div_d(a, b, NULL, c)
+
+/* ---> number theory <--- */
+
+/* d = a + b (mod c) */
+int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* d = a - b (mod c) */
+int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* d = a * b (mod c) */
+int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* c = a * a (mod b) */
+int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = 1/a (mod b) */
+int mp_invmod(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = (a, b) */
+int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = [a, b] or (a*b)/(a, b) */
+int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
+
+/* used to setup the Barrett reduction for a given modulus b */
+int mp_reduce_setup(mp_int *a, mp_int *b);
+
+/* Barrett Reduction, computes a (mod b) with a precomputed value c */
+int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
+
+/* d = a^b (mod c) */
+int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* ---> radix conversion <--- */
+
+int mp_count_bits(mp_int *a);
+
+int mp_unsigned_bin_size(mp_int *a);
+int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
+int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
+
+int mp_signed_bin_size(mp_int *a);
+int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
+int mp_to_signed_bin(mp_int *a, unsigned char *b);
+
+int mp_read_radix(mp_int *a, unsigned char *str, int radix);
+int mp_toradix(mp_int *a, unsigned char *str, int radix);
+
+#define mp_tobinary(M, S) mp_toradix((M), (S), 2)
+#define mp_tooctal(M, S) mp_toradix((M), (S), 8)
+#define mp_todecimal(M, S) mp_toradix((M), (S), 10)
+#define mp_tohex(M, S) mp_toradix((M), (S), 16)
+
+#endif
+
BIN  bn.pdf
Binary file not shown
411 bn.tex
@@ -0,0 +1,411 @@
+\documentclass{article}
+\begin{document}
+
+\title{LibTomMath v0.01 \\ A Free Multiple Precision Integer Library}
+\author{Tom St Denis \\ tomstdenis@iahu.ca}
+\maketitle
+\newpage
+
+\section{Introduction}
+``LibTomMath'' is a free and open source library that provides multiple-precision integer functions required to form a basis
+of a public key cryptosystem. LibTomMath is written entire in portable ISO C source code and designed to have an application
+interface much like that of MPI from Michael Fromberger.
+
+LibTomMath was written from scratch by Tom St Denis but designed to be drop in replacement for the MPI package. The
+algorithms within the library are derived from descriptions as provided in the Handbook of Applied Cryptography and Knuth's
+``The Art of Computer Programming''. The library has been extensively optimized and should provide quite comparable
+timings as compared to many free and commercial libraries.
+
+LibTomMath was designed with the following goals in mind:
+\begin{enumerate}
+\item Be a drop in replacement for MPI.
+\item Be much faster than MPI.
+\item Be written entirely in portable C.
+\end{enumerate}
+
+All three goals have been achieved. Particularly the speed increase goal. For example, a 512-bit modular exponentiation is
+four times faster\footnote{On an Athlon XP with GCC 3.2} with LibTomMath compared to MPI.
+
+Being compatible with MPI means that applications that already use it can be ported fairly quickly. Currently there are
+a few differences but there are many similarities. In fact the average MPI based application can be ported in under 15
+minutes.
+
+Thanks goes to Michael Fromberger for answering a couple questions and Colin Percival for having the patience and courtesy to
+help debug and suggest optimizations. They were both of great help!
+
+\section{Building Against LibTomMath}
+
+Building against LibTomMath is very simple because there is only one source file. Simply add ``bn.c'' to your project and
+copy both ``bn.c'' and ``bn.h'' into your project directory. There is no configuration nor building required before hand.
+
+If you are porting an MPI application to LibTomMath the first step will be to remove all references to MPI and replace them
+with references to LibTomMath. For example, substitute
+
+\begin{verbatim}
+#include "mpi.h"
+\end{verbatim}
+
+with
+
+\begin{verbatim}
+#include "bn.h"
+\end{verbatim}
+
+Remove ``mpi.c'' from your project and replace it with ``bn.c''. Note that currently MPI has a few more functions than
+LibTomMath has (e.g. no square-root code and a few others). Those are planned for future releases. In the interim work
+arounds can be sought. Note that LibTomMath doesn't lack any functions required to build a cryptosystem.
+
+\section{Programming with LibTomMath}
+
+\subsection{The mp\_int Structure}
+All multiple precision integers are stored in a structure called \textbf{mp\_int}. A multiple precision integer is
+essentially an array of \textbf{mp\_digit}. mp\_digit is defined at the top of bn.h. Its type can be changed to suit
+a particular platform.
+
+For example, when \textbf{MP\_8BIT} is defined\footnote{When building bn.c.} a mp\_digit is a unsigned char and holds
+seven bits. Similarly when \textbf{MP\_16BIT} is defined a mp\_digit is a unsigned short and holds 15 bits.
+By default a mp\_digit is a unsigned long and holds 28 bits.
+
+The choice of digit is particular to the platform at hand and what available multipliers are provided. For
+MP\_8BIT either a $8 \times 8 \Rightarrow 16$ or $16 \times 16 \Rightarrow 16$ multiplier is optimal. When
+MP\_16BIT is defined either a $16 \times 16 \Rightarrow 32$ or $32 \times 32 \Rightarrow 32$ multiplier is optimal. By
+default a $32 \times 32 \Rightarrow 64$ or $64 \times 64 \Rightarrow 64$ multiplier is optimal.
+
+This gives the library some flexibility. For example, a i8051 has a $8 \times 8 \Rightarrow 16$ multiplier. The
+16-bit x86 instruction set has a $16 \times 16 \Rightarrow 32$ multiplier. In practice this library is not particularly
+designed for small devices like an i8051 due to the size. It is possible to strip out functions which are not required
+to drop the code size. More realistically the library is well suited to 32 and 64-bit processors that have decent
+integer multipliers. The AMD Athlon XP and Intel Pentium 4 processors are examples of well suited processors.
+
+Throughout the discussions there will be references to a \textbf{used} and \textbf{alloc} members of an integer. The
+used member refers to how many digits are actually used in the representation of the integer. The alloc member refers
+to how many digits have been allocated off the heap. There is also the $\beta$ quantity which is equal to $2^W$ where
+$W$ is the number of bits in a digit (default is 28).
+
+\subsection{Basic Functionality}
+Essentially all LibTomMath functions return one of three values to indicate if the function worked as desired. A
+function will return \textbf{MP\_OKAY} if the function was successful. A function will return \textbf{MP\_MEM} if
+it ran out of memory and \textbf{MP\_VAL} if the input was invalid.
+
+Before an mp\_int can be used it must be initialized with
+
+\begin{verbatim}
+int mp_init(mp_int *a);
+\end{verbatim}
+
+For example, consider the following.
+
+\begin{verbatim}
+#include "bn.h"
+int main(void)
+{
+ mp_int num;
+ if (mp_init(&num) != MP_OKAY) {
+ printf("Error initializing a mp_int.\n");
+ }
+ return 0;
+}
+\end{verbatim}
+
+A mp\_int can be freed from memory with
+
+\begin{verbatim}
+void mp_clear(mp_int *a);
+\end{verbatim}
+
+This will zero the memory and free the allocated data. There are a set of trivial functions to manipulate the
+value of an mp\_int.
+
+\begin{verbatim}
+/* set to zero */
+void mp_zero(mp_int *a);
+
+/* set to a digit */
+void mp_set(mp_int *a, mp_digit b);
+
+/* set a 32-bit const */
+int mp_set_int(mp_int *a, unsigned long b);
+
+/* init to a given number of digits */
+int mp_init_size(mp_int *a, int size);
+
+/* copy, b = a */
+int mp_copy(mp_int *a, mp_int *b);
+
+/* inits and copies, a = b */
+int mp_init_copy(mp_int *a, mp_int *b);
+\end{verbatim}
+
+The \textbf{mp\_zero} function will clear the contents of a mp\_int and set it to positive. The \textbf{mp\_set} function
+will zero the integer and set the first digit to a value specified. The \textbf{mp\_set\_int} function will zero the
+integer and set the first 32-bits to a given value. It is important to note that using mp\_set can have unintended
+side effects when either the MP\_8BIT or MP\_16BIT defines are enabled. By default the library will accept the
+ranges of values MPI will (and more).
+
+The \textbf{mp\_init\_size} function will initialize the integer and set the allocated size to a given value. The
+allocated digits are zero'ed by default but not marked as used. The \textbf{mp\_copy} function will copy the digits
+(and sign) of the first parameter into the integer specified by the second parameter. The \textbf{mp\_init\_copy} will
+initialize the first integer specified and copy the second one into it. Note that the order is reversed from that of
+mp\_copy. This odd ``bug'' was kept to maintain compatibility with MPI.
+
+\subsection{Digit Manipulations}
+
+There are a class of functions that provide simple digit manipulations such as shifting and modulo reduction of powers
+of two.
+
+\begin{verbatim}
+/* right shift by "b" digits */
+void mp_rshd(mp_int *a, int b);
+
+/* left shift by "b" digits */
+int mp_lshd(mp_int *a, int b);
+
+/* c = a / 2^b */
+int mp_div_2d(mp_int *a, int b, mp_int *c);
+
+/* b = a/2 */
+int mp_div_2(mp_int *a, mp_int *b);
+
+/* c = a * 2^b */
+int mp_mul_2d(mp_int *a, int b, mp_int *c);
+
+/* b = a*2 */
+int mp_mul_2(mp_int *a, mp_int *b);
+
+/* c = a mod 2^d */
+int mp_mod_2d(mp_int *a, int b, mp_int *c);
+\end{verbatim}
+
+Both the \textbf{mp\_rshd} and \textbf{mp\_lshd} functions provide shifting by whole digits. For example,
+mp\_rshd($x$, $n$) is the same as $x \leftarrow \lfloor x / \beta^n \rfloor$ while mp\_lshd($x$, $n$) is equivalent
+to $x \leftarrow x \cdot \beta^n$. Both functions are extremely fast as they merely copy digits within the array.
+
+Similarly the \textbf{mp\_div\_2d} and \textbf{mp\_mul\_2d} functions provide shifting but allow any bit count to
+be specified. For example, mp\_div\_2d($x$, $n$, $y$) is the same as $y =\lfloor x / 2^n \rfloor$ while
+mp\_mul\_2d($x$, $n$, $y$) is the same as $y = x \cdot 2^n$. The \textbf{mp\_div\_2} and \textbf{mp\_mul\_2}
+functions are legacy functions that merely shift right or left one bit respectively. The \textbf{mp\_mod\_2d} function
+reduces an integer mod a power of two. For example, mp\_mod\_2d($x$, $n$, $y$) is the same as
+$y \equiv x \mbox{ (mod }2^n\mbox{)}$.
+
+\subsection{Basic Arithmetic}
+
+Next are the class of functions which provide basic arithmetic.
+
+\begin{verbatim}
+/* compare a to b */
+int mp_cmp(mp_int *a, mp_int *b);
+
+/* c = a + b */
+int mp_add(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a - b */
+int mp_sub(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a * b */
+int mp_mul(mp_int *a, mp_int *b, mp_int *c);
+
+/* b = a^2 */
+int mp_sqr(mp_int *a, mp_int *b);
+
+/* a/b => cb + d == a */
+int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* c == a mod b */
+#define mp_mod(a, b, c) mp_div(a, b, NULL, c)
+\end{verbatim}
+
+The \textbf{mp\_cmp} will compare two integers. It will return \textbf{MP\_LT} if the first parameter is less than
+the second, \textbf{MP\_GT} if it is greater or \textbf{MP\_EQ} if they are equal. These constants are the same as from
+MPI.
+
+The \textbf{mp\_add}, \textbf{mp\_sub}, \textbf{mp\_mul}, \textbf{mp\_div}, \textbf{mp\_sqr} and \textbf{mp\_mod} are all
+fairly straight forward to understand. Note that in mp\_div either $c$ (the quotient) or $d$ (the remainder) can be
+passed as NULL to ignore it. For example, if you only want the quotient $z = \lfloor x/y \rfloor$ then a call such as
+mp\_div(\&x, \&y, \&z, NULL) is acceptable.
+
+There is a related class of ``single digit'' functions that are like the above except they use a digit as the second
+operand.
+
+\begin{verbatim}
+/* compare against a single digit */
+int mp_cmp_d(mp_int *a, mp_digit b);
+
+/* c = a + b */
+int mp_add_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* c = a - b */
+int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* c = a * b */
+int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* a/b => cb + d == a */
+int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
+
+/* c = a mod b */
+#define mp_mod_d(a,b,c) mp_div_d(a, b, NULL, c)
+\end{verbatim}
+
+Note that care should be taken for the value of the digit passed. By default, any 28-bit integer is a valid digit that can
+be passed into the function. However, if MP\_8BIT or MP\_16BIT is defined only 7 or 15-bit (respectively) integers
+can be passed into it.
+
+\subsection{Modular Arithmetic}
+
+There are some trivial modular arithmetic functions.
+
+\begin{verbatim}
+/* d = a + b (mod c) */
+int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* d = a - b (mod c) */
+int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* d = a * b (mod c) */
+int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* c = a * a (mod b) */
+int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = 1/a (mod b) */
+int mp_invmod(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = (a, b) */
+int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = [a, b] or (a*b)/(a, b) */
+int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
+
+/* d = a^b (mod c) */
+int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+\end{verbatim}
+
+These are all fairly simple to understand. The \textbf{mp\_invmod} is a modular multiplicative inverse. That is it
+stores in the third parameter an integer such that $ac \equiv 1 \mbox{ (mod }b\mbox{)}$ provided such integer exists. If
+there is no such integer the function returns \textbf{MP\_VAL}.
+
+\subsection{Radix Conversions}
+To read or store integers in other formats there are the following functions.
+
+\begin{verbatim}
+int mp_unsigned_bin_size(mp_int *a);
+int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
+int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
+
+int mp_signed_bin_size(mp_int *a);
+int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
+int mp_to_signed_bin(mp_int *a, unsigned char *b);
+
+int mp_read_radix(mp_int *a, unsigned char *str, int radix);
+int mp_toradix(mp_int *a, unsigned char *str, int radix);
+\end{verbatim}
+
+The integers are stored in big endian format as most libraries (and MPI) expect. The \textbf{mp\_read\_radix} and
+\textbf{mp\_toradix} functions read and write (respectively) null terminated ASCII strings in a given radix. Valid values
+for the radix are between 2 and 64 (inclusively).
+
+\section{Timing Analysis}
+\subsection{Observed Timings}
+A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The
+test was conducted on an AMD Athlon XP processor with 266Mhz DDR memory and the GCC 3.2 compiler\footnote{With build
+options ``-O3 -fomit-frame-pointer -funroll-loops''}. The multiplications and squarings were repeated 10,000 times
+each while the modular exponentiation (exptmod) were performed 10 times each. The RDTSC (Read Time Stamp Counter) instruction
+was used to measure the time the entire iterations took and was divided by the number of iterations to get an
+average. The following results were observed.
+
+\begin{small}
+\begin{center}
+\begin{tabular}{c|c|c|c}
+\hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\
+\hline
+Multiply & 128 & 1,394 & 915 \\
+Multiply & 256 & 2,559 & 1,893 \\
+Multiply & 512 & 7,919 & 3,770 \\
+Multiply & 1024 & 28,460 & 9,970 \\
+Multiply & 2048 & 109,637 & 32,264 \\
+Multiply & 4096 & 467,226 & 129,645 \\
+\hline
+Square & 128 & 1,288 & 1,147 \\
+Square & 256 & 1,705 & 2,129 \\
+Square & 512 & 5,365 & 3,755 \\
+Square & 1024 & 18,836 & 9,267 \\
+Square & 2048 & 72,334 & 28,387 \\
+Square & 4096 & 306,252 & 112,391 \\
+\hline
+Exptmod & 512 & 30,497,732 & 7,222,872 \\
+Exptmod & 768 & 98,943,020 & 16,474,567 \\
+Exptmod & 1024 & 221,123,749 & 30,070,883 \\
+Exptmod & 2048 & 1,694,796,907 & 154,697,320 \\
+Exptmod & 2560 & 3,262,360,107 & 318,998,183 \\
+Exptmod & 3072 & 5,647,243,373 & 494,313,122 \\
+Exptmod & 4096 & 13,345,194,048 & 1,036,254,558
+
+\end{tabular}
+\end{center}
+\end{small}
+
+\subsection{Digit Size}
+The first major constribution to the time savings is the fact that 28 bits are stored per digit instead of the MPI
+defualt of 16. This means in many of the algorithms the savings can be considerable. Consider a baseline multiplier
+with a 1024-bit input. With MPI the input would be 64 16-bit digits whereas in LibTomMath it would be 37 28-bit digits.
+A savings of $64^2 - 37^2 = 2727$ single precision multiplications.
+
+\subsection{Multiplication Algorithms}
+For most inputs a typical baseline $O(n^2)$ multiplier is used which is similar to that of MPI. There are two variants
+of the baseline multiplier. The normal and the fast variants. The normal baseline multiplier is the exact same as the
+algorithm from MPI. The fast baseline multiplier is optimized for cases where the number of input digits $N$ is less
+than or equal to $2^{w}/\beta^2$. Where $w$ is the number of bits in a \textbf{mp\_word}. By default a mp\_word is
+64-bits which means $N \le 256$ is allowed which represents numbers upto $7168$ bits.
+
+The fast baseline multiplier is optimized by removing the carry operations from the inner loop. This is often referred
+to as the ``comba'' method since it computes the products a columns first then figures out the carries. This has the
+effect of making a very simple and paralizable inner loop.
+
+For large inputs, typically 80 digits\footnote{By default that is 2240-bits or more.} or more the Karatsuba method is
+used. This method has significant overhead but an asymptotic running time of $O(n^{1.584})$ which means for fairly large
+inputs this method is faster. The Karatsuba implementation is recursive which means for extremely large inputs they
+will benefit from the algorithm.
+
+MPI only implements the slower baseline multiplier where carries are dealt with in the inner loop. As a result even at
+smaller numbers (below the Karatsuba cutoff) the LibTomCrypt multipliers are faster.
+
+\subsection{Squaring Algorithms}
+
+Similar to the multiplication algorithms there are two baseline squaring algorithms. Both have an asymptotic running
+time of $O((t^2 + t)/2)$. The normal baseline squaring is the same from MPI and the fast is a ``comba'' squaring
+algorithm. The comba method is used if the number of digits $N$ is less than $2^{w-1}/\beta^2$ which by default
+covers numbers upto $3584$ bits.
+
+There is also a Karatsuba squaring method which achieves a running time of $O(n^{1.584})$ after considerably large
+inputs.
+
+MPI only implements the slower baseline squaring algorithm. As a result LibTomMath is considerably faster at squaring
+than MPI is.
+
+\subsection{Exponentiation Algorithms}
+
+LibTomMath implements a sliding window $k$-ary left to right exponentiation algorithm. For a given exponent size $L$ an
+appropriate window size $k$ is chosen. There are always at most $L$ modular squarings and $\lfloor L/k \rfloor$ modular
+multiplications. The $k$-ary method works by precomputing values $g(x) = b^x$ for $0 \le x < 2^k$ and a given base
+$b$. Then the multiplications are grouped in windows of $k$ bits. The sliding window technique has the benefit
+that it can skip multiplications if there are zero bits following or preceding a window. Consider the exponent
+$e = 11110001_2$ if $k = 2$ then there will be a two squarings, a multiplication of $g(3)$, two squarings, a multiplication
+of $g(3)$, four squarings and and a multiplication by $g(1)$. In total there are 8 squarings and 3 multiplications.
+
+MPI uses a binary square-multiply method. For the same exponent $e$ it would have had 8 squarings and 5 multiplications.
+There is a precomputation phase for the method LibTomCrypt uses but it generally cuts down considerably on the number
+of multiplications. Consider a 512-bit exponent. The worst case for the LibTomMath method results in 512 squarings and
+124 multiplications. The MPI method would have 512 squarings and 512 multiplications. Randomly every $2k$ bits another
+multiplication is saved via the sliding-window technique on top of the savings the $k$-ary method provides.
+
+Both LibTomMath and MPI use Barrett reduction instead of division to reduce the numbers modulo the modulus given.
+However, LibTomMath can take advantage of the fact that the multiplications required within the Barrett reduction
+do not to give full precision. As a result the reduction step is much faster and just as accurate. The LibTomMath code
+will automatically determine at run-time (e.g. when its called) whether the faster multiplier can be used. The
+faster multipliers have also been optimized into the two variants (baseline and fast baseline).
+
+As a result of all these changes exponentiation in LibTomMath is much faster than compared to MPI.
+
+
+
+\end{document}
6 changes.txt
@@ -0,0 +1,6 @@
+Dec 25th,2002
+v0.01 -- Initial release. Gimme a break.
+ -- Todo list,
+ add details to manual [e.g. algorithms]
+ more comments in code
+ example programs
238 demo.c
@@ -0,0 +1,238 @@
+#include <time.h>
+
+#ifdef U_MPI
+#include <stdio.h>
+#include <string.h>