# libtom/libtommath

### Subversion checkout URL

You can clone with
or
.

1 parent 3d0fcaa commit fdfa2f4f5098801dba17eb7e0e9e38fc4f97e6ae Tom St Denis committed with sjaeckel
BIN  bn.pdf
Binary file not shown
2  bn.tex
 @@ -49,7 +49,7 @@ \begin{document} \frontmatter \pagestyle{empty} -\title{LibTomMath User Manual \\ v0.34} +\title{LibTomMath User Manual \\ v0.35} \author{Tom St Denis \\ tomstdenis@iahu.ca} \maketitle This text, the library and the accompanying textbook are all hereby placed in the public domain. This book has been
2  bn_fast_mp_invmod.c
 @@ -42,7 +42,7 @@ int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c) } /* we need y = |a| */ - if ((res = mp_abs (a, &y)) != MP_OKAY) { + if ((res = mp_mod (a, b, &y)) != MP_OKAY) { goto LBL_ERR; }
8 bn_fast_s_mp_mul_digs.c
 @@ -62,7 +62,7 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) tmpx = a->dp + tx; tmpy = b->dp + ty; - /* this is the number of times the loop will iterrate, essentially its + /* this is the number of times the loop will iterrate, essentially while (tx++ < a->used && ty-- >= 0) { ... } */ iy = MIN(a->used-tx, ty+1); @@ -80,16 +80,16 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) } /* store final carry */ - W[ix] = _W & MP_MASK; + W[ix] = (mp_digit)(_W & MP_MASK); /* setup dest */ olduse = c->used; - c->used = digs; + c->used = pa; { register mp_digit *tmpc; tmpc = c->dp; - for (ix = 0; ix < digs; ix++) { + for (ix = 0; ix < pa+1; ix++) { /* now extract the previous digit [below the carry] */ *tmpc++ = W[ix]; }
2  bn_fast_s_mp_mul_high_digs.c
 @@ -71,7 +71,7 @@ int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs) } /* store final carry */ - W[ix] = _W & MP_MASK; + W[ix] = (mp_digit)(_W & MP_MASK); /* setup dest */ olduse = c->used;
33 bn_fast_s_mp_sqr.c
 @@ -15,33 +15,14 @@ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org */ -/* fast squaring - * - * This is the comba method where the columns of the product - * are computed first then the carries are computed. This - * has the effect of making a very simple inner loop that - * is executed the most - * - * W2 represents the outer products and W the inner. - * - * A further optimizations is made because the inner - * products are of the form "A * B * 2". The *2 part does - * not need to be computed until the end which is good - * because 64-bit shifts are slow! - * - * Based on Algorithm 14.16 on pp.597 of HAC. - * - */ /* the jist of squaring... - -you do like mult except the offset of the tmpx [one that starts closer to zero] -can't equal the offset of tmpy. So basically you set up iy like before then you min it with -(ty-tx) so that it never happens. You double all those you add in the inner loop + * you do like mult except the offset of the tmpx [one that + * starts closer to zero] can't equal the offset of tmpy. + * So basically you set up iy like before then you min it with + * (ty-tx) so that it never happens. You double all those + * you add in the inner loop After that loop you do the squares and add them in. - -Remove W2 and don't memset W - */ int fast_s_mp_sqr (mp_int * a, mp_int * b) @@ -76,7 +57,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b) tmpx = a->dp + tx; tmpy = a->dp + ty; - /* this is the number of times the loop will iterrate, essentially its + /* this is the number of times the loop will iterrate, essentially while (tx++ < a->used && ty-- >= 0) { ... } */ iy = MIN(a->used-tx, ty+1); @@ -101,7 +82,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b) } /* store it */ - W[ix] = _W & MP_MASK; + W[ix] = (mp_digit)(_W & MP_MASK); /* make next carry */ W1 = _W >> ((mp_word)DIGIT_BIT);
7 bn_mp_exteuclid.c
 @@ -59,6 +59,13 @@ int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3) if ((err = mp_copy(&t3, &v3)) != MP_OKAY) { goto _ERR; } } + /* make sure U3 >= 0 */ + if (u3.sign == MP_NEG) { + mp_neg(&u1, &u1); + mp_neg(&u2, &u2); + mp_neg(&u3, &u3); + } + /* copy result out */ if (U1 != NULL) { mp_exch(U1, &u1); } if (U2 != NULL) { mp_exch(U2, &u2); }
4 bn_mp_invmod_slow.c
 @@ -33,8 +33,8 @@ int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c) } /* x = a, y = b */ - if ((res = mp_copy (a, &x)) != MP_OKAY) { - goto LBL_ERR; + if ((res = mp_mod(a, b, &x)) != MP_OKAY) { + goto LBL_ERR; } if ((res = mp_copy (b, &y)) != MP_OKAY) { goto LBL_ERR;
1  bn_mp_montgomery_calc_normalization.c
 @@ -28,7 +28,6 @@ int mp_montgomery_calc_normalization (mp_int * a, mp_int * b) /* how many bits of last digit does b use */ bits = mp_count_bits (b) % DIGIT_BIT; - if (b->used > 1) { if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) { return res;
10 bn_mp_neg.c
 @@ -19,12 +19,18 @@ int mp_neg (mp_int * a, mp_int * b) { int res; - if ((res = mp_copy (a, b)) != MP_OKAY) { - return res; + if (a != b) { + if ((res = mp_copy (a, b)) != MP_OKAY) { + return res; + } } + if (mp_iszero(b) != MP_YES) { b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS; + } else { + b->sign = MP_ZPOS; } + return MP_OKAY; } #endif
 @@ -35,22 +35,29 @@ int mp_radix_size (mp_int * a, int radix, int *size) return MP_VAL; } - /* init a copy of the input */ - if ((res = mp_init_copy (&t, a)) != MP_OKAY) { - return res; + if (mp_iszero(a) == MP_YES) { + *size = 2; + return MP_OKAY; } /* digs is the digit count */ digs = 0; /* if it's negative add one for the sign */ - if (t.sign == MP_NEG) { + if (a->sign == MP_NEG) { ++digs; - t.sign = MP_ZPOS; } + /* init a copy of the input */ + if ((res = mp_init_copy (&t, a)) != MP_OKAY) { + return res; + } + + /* force temp to positive */ + t.sign = MP_ZPOS; + /* fetch out all of the digits */ - while (mp_iszero (&t) == 0) { + while (mp_iszero (&t) == MP_NO) { if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) { mp_clear (&t); return res;
4 bn_mp_rand.c
 @@ -29,14 +29,14 @@ mp_rand (mp_int * a, int digits) /* first place a random non-zero digit */ do { - d = ((mp_digit) abs (rand ())); + d = ((mp_digit) abs (rand ())) & MP_MASK; } while (d == 0); if ((res = mp_add_d (a, d, a)) != MP_OKAY) { return res; } - while (digits-- > 0) { + while (--digits > 0) { if ((res = mp_lshd (a, 1)) != MP_OKAY) { return res; }
4 bn_mp_reduce.c
 @@ -39,11 +39,11 @@ int mp_reduce (mp_int * x, mp_int * m, mp_int * mu) } } else { #ifdef BN_S_MP_MUL_HIGH_DIGS_C - if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) { + if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) { goto CLEANUP; } #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C) - if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) { + if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) { goto CLEANUP; } #else
7 bn_mp_toom_mul.c
 @@ -17,9 +17,10 @@ /* multiplication using the Toom-Cook 3-way algorithm * - * Much more complicated than Karatsuba but has a lower asymptotic running time of - * O(N**1.464). This algorithm is only particularly useful on VERY large - * inputs (we're talking 1000s of digits here...). + * Much more complicated than Karatsuba but has a lower + * asymptotic running time of O(N**1.464). This algorithm is + * only particularly useful on VERY large inputs + * (we're talking 1000s of digits here...). */ int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c) {
2  bn_mp_xor.c
 @@ -37,7 +37,7 @@ mp_xor (mp_int * a, mp_int * b, mp_int * c) } for (ix = 0; ix < px; ix++) { - + t.dp[ix] ^= x->dp[ix]; } mp_clamp (&t); mp_exch (c, &t);
12 bn_mp_zero.c
 @@ -16,11 +16,17 @@ */ /* set to zero */ -void -mp_zero (mp_int * a) +void mp_zero (mp_int * a) { + int n; + mp_digit *tmp; + a->sign = MP_ZPOS; a->used = 0; - memset (a->dp, 0, sizeof (mp_digit) * a->alloc); + + tmp = a->dp; + for (n = 0; n < a->alloc; n++) { + *tmp++ = 0; + } } #endif
3  bn_s_mp_mul_digs.c
 @@ -19,8 +19,7 @@ * HAC pp. 595, Algorithm 14.12 Modified so you can control how * many digits of output are created. */ -int -s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) +int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) { mp_int t; int res, pa, pb, ix, iy;
3  bn_s_mp_sqr.c
 @@ -16,8 +16,7 @@ */ /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */ -int -s_mp_sqr (mp_int * a, mp_int * b) +int s_mp_sqr (mp_int * a, mp_int * b) { mp_int t; int res, ix, iy, pa;
891 callgraph.txt
11 changes.txt
 @@ -1,3 +1,14 @@ +March 12th, 2005 +v0.35 -- Stupid XOR function missing line again... oops. + -- Fixed bug in invmod not handling negative inputs correctly [Wolfgang Ehrhardt] + -- Made exteuclid always give positive u3 output...[ Wolfgang Ehrhardt ] + -- [Wolfgang Ehrhardt] Suggested a fix for mp_reduce() which avoided underruns. ;-) + -- mp_rand() would emit one too many digits and it was possible to get a 0 out of it ... oops + -- Added montgomery to the testing to make sure it handles 1..10 digit moduli correctly + -- Fixed bug in comba that would lead to possible erroneous outputs when "pa < digs" + -- Fixed bug in mp_toradix_size for "0" [Kevin Kenny] + -- Updated chapters 1-5 of the textbook ;-) It now talks about the new comba code! + February 12th, 2005 v0.34 -- Fixed two more small errors in mp_prime_random_ex() -- Fixed overflow in mp_mul_d() [Kevin Kenny]
41 demo/demo.c
 @@ -56,6 +56,7 @@ int main(void) gcd_n, lcm_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n, t; unsigned rr; int i, n, err, cnt, ix, old_kara_m, old_kara_s; + mp_digit mp; mp_init(&a); @@ -68,6 +69,40 @@ int main(void) srand(time(NULL)); #if 0 + // test montgomery + printf("Testing montgomery...\n"); + for (i = 1; i < 10; i++) { + printf("Testing digit size: %d\n", i); + for (n = 0; n < 1000; n++) { + mp_rand(&a, i); + a.dp[0] |= 1; + + // let's see if R is right + mp_montgomery_calc_normalization(&b, &a); + mp_montgomery_setup(&a, &mp); + + // now test a random reduction + for (ix = 0; ix < 100; ix++) { + mp_rand(&c, 1 + abs(rand()) % (2*i)); + mp_copy(&c, &d); + mp_copy(&c, &e); + + mp_mod(&d, &a, &d); + mp_montgomery_reduce(&c, &a, mp); + mp_mulmod(&c, &b, &a, &c); + + if (mp_cmp(&c, &d) != MP_EQ) { +printf("d = e mod a, c = e MOD a\n"); +mp_todecimal(&a, buf); printf("a = %s\n", buf); +mp_todecimal(&e, buf); printf("e = %s\n", buf); +mp_todecimal(&d, buf); printf("d = %s\n", buf); +mp_todecimal(&c, buf); printf("c = %s\n", buf); +printf("compare no compare!\n"); exit(EXIT_FAILURE); } + } + } + } + printf("done\n"); + // test mp_get_int printf("Testing: mp_get_int\n"); for (i = 0; i < 1000; ++i) { @@ -139,7 +174,7 @@ int main(void) printf("\n\n"); /* test for size */ - for (ix = 10; ix < 256; ix++) { + for (ix = 10; ix < 128; ix++) { printf("Testing (not safe-prime): %9d bits \r", ix); fflush(stdout); err = @@ -156,7 +191,7 @@ int main(void) } } - for (ix = 16; ix < 256; ix++) { + for (ix = 16; ix < 128; ix++) { printf("Testing ( safe-prime): %9d bits \r", ix); fflush(stdout); err = @@ -235,7 +270,7 @@ int main(void) mp_rand(&b, (cnt / DIGIT_BIT + 1) * 2); mp_copy(&c, &b); mp_mod(&c, &a, &c); - mp_reduce_2k(&b, &a, 1); + mp_reduce_2k(&b, &a, 2); if (mp_cmp(&c, &b)) { printf("FAILED\n"); exit(0);
2  makefile
 @@ -3,7 +3,7 @@ #Tom St Denis #version of library -VERSION=0.34 +VERSION=0.35 CFLAGS += -I./ -Wall -W -Wshadow -Wsign-compare
2  makefile.shared
 @@ -1,7 +1,7 @@ #Makefile for GCC # #Tom St Denis -VERSION=0:34 +VERSION=0:35 CC = libtool --mode=compile gcc CFLAGS += -I./ -Wall -W -Wshadow -Wsign-compare
BIN  poster.pdf
Binary file not shown
121 pre_gen/mpi.c
 @@ -49,7 +49,7 @@ \begin{document} \frontmatter \pagestyle{empty} -\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Draft Edition } +\title{Multi--Precision Math} \author{\mbox{ %\begin{small} \begin{tabular}{c} @@ -66,7 +66,7 @@ QUALCOMM Australia \\ } } \maketitle -This text has been placed in the public domain. This text corresponds to the v0.34 release of the +This text has been placed in the public domain. This text corresponds to the v0.35 release of the LibTomMath project. \begin{alltt} @@ -85,66 +85,32 @@ This text is formatted to the international B5 paper size of 176mm wide by 250mm \tableofcontents \listoffigures -\chapter*{Prefaces to the Draft Edition} -I started this text in April 2003 to complement my LibTomMath library. That is, explain how to implement the functions -contained in LibTomMath. The goal is to have a textbook that any Computer Science student can use when implementing their -own multiple precision arithmetic. The plan I wanted to follow was flesh out all the -ideas and concepts I had floating around in my head and then work on it afterwards refining a little bit at a time. Chance -would have it that I ended up with my summer off from Algonquin College and I was given four months solid to work on the -text. - -Choosing to not waste any time I dove right into the project even before my spring semester was finished. I wrote a bit -off and on at first. The moment my exams were finished I jumped into long 12 to 16 hour days. The result after only -a couple of months was a ten chapter, three hundred page draft that I quickly had distributed to anyone who wanted -to read it. I had Jean-Luc Cooke print copies for me and I brought them to Crypto'03 in Santa Barbara. So far I have -managed to grab a certain level of attention having people from around the world ask me for copies of the text was certain -rewarding. - -Now we are past December 2003. By this time I had pictured that I would have at least finished my second draft of the text. -Currently I am far off from this goal. I've done partial re-writes of chapters one, two and three but they are not even -finished yet. I haven't given up on the project, only had some setbacks. First O'Reilly declined to publish the text then -Addison-Wesley and Greg is tried another which I don't know the name of. However, at this point I want to focus my energy -onto finishing the book not securing a contract. - -So why am I writing this text? It seems like a lot of work right? Most certainly it is a lot of work writing a textbook. -Even the simplest introductory material has to be lined with references and figures. A lot of the text has to be re-written -from point form to prose form to ensure an easier read. Why am I doing all this work for free then? Simple. My philosophy -is quite simply Open Source. Open Academia. Open Minds'' which means that to achieve a goal of open minds, that is, -people willing to accept new ideas and explore the unknown you have to make available material they can access freely -without hinderance. - -I've been writing free software since I was about sixteen but only recently have I hit upon software that people have come -to depend upon. I started LibTomCrypt in December 2001 and now several major companies use it as integral portions of their -software. Several educational institutions use it as a matter of course and many freelance developers use it as -part of their projects. To further my contributions I started the LibTomMath project in December 2002 aimed at providing -multiple precision arithmetic routines that students could learn from. That is write routines that are not only easy -to understand and follow but provide quite impressive performance considering they are all in standard portable ISO C. - -The second leg of my philosophy is Open Academia'' which is where this textbook comes in. In the end, when all is -said and done the text will be useable by educational institutions as a reference on multiple precision arithmetic. - -At this time I feel I should share a little information about myself. The most common question I was asked at -Crypto'03, perhaps just out of professional courtesy, was which school I either taught at or attended. The unfortunate -truth is that I neither teach at or attend a school of academic reputation. I'm currently at Algonquin College which -is what I'd like to call somewhat academic but mostly vocational'' college. In otherwords, job training. - -I'm a 21 year old computer science student mostly self-taught in the areas I am aware of (which includes a half-dozen -computer science fields, a few fields of mathematics and some English). I look forward to teaching someday but I am -still far off from that goal. - -Now it would be improper for me to not introduce the rest of the texts co-authors. While they are only contributing -corrections and editorial feedback their support has been tremendously helpful in presenting the concepts laid out -in the text so far. Greg has always been there for me. He has tracked my LibTom projects since their inception and even -sent cheques to help pay tuition from time to time. His background has provided a wonderful source to bounce ideas off -of and improve the quality of my writing. Mads is another fellow who has just been there''. I don't even recall what -his interest in the LibTom projects is but I'm definitely glad he has been around. His ability to catch logical errors -in my written English have saved me on several occasions to say the least. - -What to expect next? Well this is still a rough draft. I've only had the chance to update a few chapters. However, I've -been getting the feeling that people are starting to use my text and I owe them some updated material. My current tenative -plan is to edit one chapter every two weeks starting January 4th. It seems insane but my lower course load at college -should provide ample time. By Crypto'04 I plan to have a 2nd draft of the text polished and ready to hand out to as many -people who will take it. +\chapter*{Prefaces} +When I tell people about my LibTom projects and that I release them as public domain they are often puzzled. +They ask why I did it and especially why I continue to work on them for free. The best I can explain it is Because I can.'' +Which seems odd and perhaps too terse for adult conversation. I often qualify it with I am able, I am willing.'' which +perhaps explains it better. I am the first to admit there is not anything that special with what I have done. Perhaps +others can see that too and then we would have a society to be proud of. My LibTom projects are what I am doing to give +back to society in the form of tools and knowledge that can help others in their endeavours. + +I started writing this book because it was the most logical task to further my goal of open academia. The LibTomMath source +code itself was written to be easy to follow and learn from. There are times, however, where pure C source code does not +explain the algorithms properly. Hence this book. The book literally starts with the foundation of the library and works +itself outwards to the more complicated algorithms. The use of both pseudo--code and verbatim source code provides a duality +of theory'' and practice'' that the computer science students of the world shall appreciate. I never deviate too far +from relatively straightforward algebra and I hope that this book can be a valuable learning asset. + +This book and indeed much of the LibTom projects would not exist in their current form if it was not for a plethora +of kind people donating their time, resources and kind words to help support my work. Writing a text of significant +length (along with the source code) is a tiresome and lengthy process. Currently the LibTom project is four years old, +comprises of literally thousands of users and over 100,000 lines of source code, TeX and other material. People like Mads and Greg +were there at the beginning to encourage me to work well. It is amazing how timely validation from others can boost morale to +continue the project. Definitely my parents were there for me by providing room and board during the many months of work in 2003. + +To my many friends whom I have met through the years I thank you for the good times and the words of encouragement. I hope I +honour your kind gestures with this project. + +Open Source. Open Academia. Open Minds. \begin{flushright} Tom St Denis \end{flushright} @@ -937,7 +903,7 @@ assumed to contain undefined values they are initially set to zero. EXAM,bn_mp_grow.c -A quick optimization is to first determine if a memory re-allocation is required at all. The if statement (line @23,if@) checks +A quick optimization is to first determine if a memory re-allocation is required at all. The if statement (line @24,alloc@) checks if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count. If the count is not larger than \textbf{alloc} the function skips the re-allocation part thus saving time. @@ -1310,7 +1276,7 @@ After the function is completed, all of the digits are zeroed, the \textbf{used} With the mp\_int representation of an integer, calculating the absolute value is trivial. The mp\_abs algorithm will compute the absolute value of an mp\_int. -\newpage\begin{figure}[here] +\begin{figure}[here] \begin{center} \begin{tabular}{l} \hline Algorithm \textbf{mp\_abs}. \\ @@ -1335,6 +1301,9 @@ logic to handle it. EXAM,bn_mp_abs.c +This fairly trivial algorithm first eliminates non--required duplications (line @27,a != b@) and then sets the +\textbf{sign} flag to \textbf{MP\_ZPOS}. + \subsection{Integer Negation} With the mp\_int representation of an integer, calculating the negation is also trivial. The mp\_neg algorithm will compute the negative of an mp\_int input. @@ -1368,11 +1337,15 @@ zero as negative. EXAM,bn_mp_neg.c +Like mp\_abs() this function avoids non--required duplications (line @21,a != b@) and then sets the sign. We +have to make sure that only non--zero values get a \textbf{sign} of \textbf{MP\_NEG}. If the mp\_int is zero +than the \textbf{sign} is hard--coded to \textbf{MP\_ZPOS}. + \section{Small Constants} \subsection{Setting Small Constants} Often a mp\_int must be set to a relatively small value such as $1$ or $2$. For these cases the mp\_set algorithm is useful. -\begin{figure}[here] +\newpage\begin{figure}[here] \begin{center} \begin{tabular}{l} \hline Algorithm \textbf{mp\_set}. \\ @@ -1397,11 +1370,14 @@ single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adj EXAM,bn_mp_set.c -Line @21,mp_zero@ calls mp\_zero() to clear the mp\_int and reset the sign. Line @22,MP_MASK@ copies the digit -into the least significant location. Note the usage of a new constant \textbf{MP\_MASK}. This constant is used to quickly -reduce an integer modulo $\beta$. Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with -$MP\_MASK = 2^k - 1$ to perform the reduction. Finally line @23,a->used@ will set the \textbf{used} member with respect to the -digit actually set. This function will always make the integer positive. +First we zero (line @21,mp_zero@) the mp\_int to make sure that the other members are initialized for a +small positive constant. mp\_zero() ensures that the \textbf{sign} is positive and the \textbf{used} count +is zero. Next we set the digit and reduce it modulo $\beta$ (line @22,MP_MASK@). After this step we have to +check if the resulting digit is zero or not. If it is not then we set the \textbf{used} count to one, otherwise +to zero. + +We can quickly reduce modulo $\beta$ since it is of the form $2^k$ and a quick binary AND operation with +$2^k - 1$ will perform the same operation. One important limitation of this function is that it will only set one digit. The size of a digit is not fixed, meaning source that uses this function should take that into account. Only trivially small constants can be set using this function. @@ -1503,10 +1479,12 @@ the zero'th digit. If after all of the digits have been compared, no difference EXAM,bn_mp_cmp_mag.c -The two if statements on lines @24,if@ and @28,if@ compare the number of digits in the two inputs. These two are performed before all of the digits -are compared since it is a very cheap test to perform and can potentially save considerable time. The implementation given is also not valid -without those two statements. $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the -array of digits. +The two if statements (lines @24,if@ and @28,if@) compare the number of digits in the two inputs. These two are +performed before all of the digits are compared since it is a very cheap test to perform and can potentially save +considerable time. The implementation given is also not valid without those two statements. $b.alloc$ may be +smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the array of digits. + + \subsection{Signed Comparisons} Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}). Based on an unsigned magnitude @@ -1539,9 +1517,9 @@ $\vert a \vert < \vert b \vert$. Step number four will compare the two when the EXAM,bn_mp_cmp.c -The two if statements on lines @22,if@ and @26,if@ perform the initial sign comparison. If the signs are not the equal then which ever -has the positive sign is larger. At line @30,if@, the inputs are compared based on magnitudes. If the signs were both negative then -the unsigned comparison is performed in the opposite direction (\textit{line @31,mp_cmp_mag@}). Otherwise, the signs are assumed to +The two if statements (lines @22,if@ and @26,if@) perform the initial sign comparison. If the signs are not the equal then which ever +has the positive sign is larger. The inputs are compared (line @30,if@) based on magnitudes. If the signs were both +negative then the unsigned comparison is performed in the opposite direction (line @31,mp_cmp_mag@). Otherwise, the signs are assumed to be both positive and a forward direction unsigned comparison is performed. \section*{Exercises} @@ -1664,19 +1642,21 @@ The final carry is stored in $c_{max}$ and digits above $max$ upto $oldused$ are EXAM,bn_s_mp_add.c -Lines @27,if@ to @35,}@ perform the initial sorting of the inputs and determine the $min$ and $max$ variables. Note that $x$ is a pointer to a -mp\_int assigned to the largest input, in effect it is a local alias. Lines @37,init@ to @42,}@ ensure that the destination is grown to -accomodate the result of the addition. +We first sort (lines @27,if@ to @35,}@) the inputs based on magnitude and determine the $min$ and $max$ variables. +Note that $x$ is a pointer to an mp\_int assigned to the largest input, in effect it is a local alias. Next we +grow the destination (@37,init@ to @42,}@) ensure that it can accomodate the result of the addition. Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style. The three aliases that are on lines @56,tmpa@, @59,tmpb@ and @62,tmpc@ represent the two inputs and destination variables respectively. These aliases are used to ensure the compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int. -The initial carry $u$ is cleared on line @65,u = 0@, note that $u$ is of type mp\_digit which ensures type compatibility within the -implementation. The initial addition loop begins on line @66,for@ and ends on line @75,}@. Similarly the conditional addition loop -begins on line @81,for@ and ends on line @90,}@. The addition is finished with the final carry being stored in $tmpc$ on line @94,tmpc++@. -Note the ++'' operator on the same line. After line @94,tmpc++@ $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$. This is useful -for the next loop on lines @97,for@ to @99,}@ which set any old upper digits to zero. +The initial carry $u$ will be cleared (line @65,u = 0@), note that $u$ is of type mp\_digit which ensures type +compatibility within the implementation. The initial addition (line @66,for@ to @75,}@) adds digits from +both inputs until the smallest input runs out of digits. Similarly the conditional addition loop +(line @81,for@ to @90,}@) adds the remaining digits from the larger of the two inputs. The addition is finished +with the final carry being stored in $tmpc$ (line @94,tmpc++@). Note the ++'' operator within the same expression. +After line @94,tmpc++@, $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$. This is useful +for the next loop (line @97,for@ to @99,}@) which set any old upper digits to zero. \subsection{Low Level Subtraction} The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm. The principle difference is that the @@ -1692,7 +1672,7 @@ this algorithm we will assume that the variable $\gamma$ represents the number o mp\_digit (\textit{this implies $2^{\gamma} > \beta$}). For example, the default for LibTomMath is to use a unsigned long'' for the mp\_digit type'' while $\beta = 2^{28}$. In ISO C an unsigned long'' -data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$. +data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma \ge 32$. \newpage\begin{figure}[!here] \begin{center} @@ -1759,20 +1739,23 @@ If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and cop EXAM,bn_s_mp_sub.c -Line @24,min@ and @25,max@ perform the initial hardcoded sorting of the inputs. In reality the $min$ and $max$ variables are only aliases and are only -used to make the source code easier to read. Again the pointer alias optimization is used within this algorithm. Lines @42,tmpa@, @43,tmpb@ and @44,tmpc@ initialize the aliases for -$a$, $b$ and $c$ respectively. +Like low level addition we sort'' the inputs. Except in this case the sorting is hardcoded +(lines @24,min@ and @25,max@). In reality the $min$ and $max$ variables are only aliases and are only +used to make the source code easier to read. Again the pointer alias optimization is used +within this algorithm. The aliases $tmpa$, $tmpb$ and $tmpc$ are initialized +(lines @42,tmpa@, @43,tmpb@ and @44,tmpc@) for $a$, $b$ and $c$ respectively. -The first subtraction loop occurs on lines @47,u = 0@ through @61,}@. The theory behind the subtraction loop is exactly the same as that for -the addition loop. As remarked earlier there is an implementation reason for using the awkward'' method of extracting the carry -(\textit{see line @57, >>@}). The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND -the least significant bit. The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry -occurs from subtraction. This carry extraction requires two relatively cheap operations to extract the carry. The other method is to simply -shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation. This optimization only works on -twos compliment machines which is a safe assumption to make. +The first subtraction loop (lines @47,u = 0@ through @61,}@) subtract digits from both inputs until the smaller of +the two inputs has been exhausted. As remarked earlier there is an implementation reason for using the awkward'' +method of extracting the carry (line @57, >>@). The traditional method for extracting the carry would be to shift +by $lg(\beta)$ positions and logically AND the least significant bit. The AND operation is required because all of +the bits above the $\lg(\beta)$'th bit will be set to one after a carry occurs from subtraction. This carry +extraction requires two relatively cheap operations to extract the carry. The other method is to simply shift the +most significant bit to the least significant bit thus extracting the carry with a single cheap operation. This +optimization only works on twos compliment machines which is a safe assumption to make. -If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines @64,for@ through @73,}@}) is required to propagate the carry through -$a$ and copy the result to $c$. +If $a$ has a larger magnitude than $b$ an additional loop (lines @64,for@ through @73,}@) is required to propagate +the carry through $a$ and copy the result to $c$. \subsection{High Level Addition} Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be @@ -2098,10 +2081,11 @@ FIGU,sliding_window,Sliding Window Movement EXAM,bn_mp_lshd.c -The if statement on line @24,if@ ensures that the $b$ variable is greater than zero. The \textbf{used} count is incremented by $b$ before -the copy loop begins. This elminates the need for an additional variable in the for loop. The variable $top$ on line @42,top@ is an alias -for the leading digit while $bottom$ on line @45,bottom@ is an alias for the trailing edge. The aliases form a window of exactly $b$ digits -over the input. +The if statement (line @24,if@) ensures that the $b$ variable is greater than zero since we do not interpret negative +shift counts properly. The \textbf{used} count is incremented by $b$ before the copy loop begins. This elminates +the need for an additional variable in the for loop. The variable $top$ (line @42,top@) is an alias +for the leading digit while $bottom$ (line @45,bottom@) is an alias for the trailing edge. The aliases form a +window of exactly $b$ digits over the input. \subsection{Division by $x$} @@ -2151,9 +2135,9 @@ Once the window copy is complete the upper digits must be zeroed and the \textbf EXAM,bn_mp_rshd.c -The only noteworthy element of this routine is the lack of a return type. - --- Will update later to give it a return type...Tom +The only noteworthy element of this routine is the lack of a return type since it cannot fail. Like mp\_lshd() we +form a sliding window except we copy in the other direction. After the window (line @59,for (;@) we then zero +the upper digits of the input to make sure the result is correct. \section{Powers of Two} @@ -2214,7 +2198,15 @@ complete. It is possible to optimize this algorithm down to a $O(n)$ algorithm EXAM,bn_mp_mul_2d.c -Notes to be revised when code is updated. -- Tom +The shifting is performed in--place which means the first step (line @24,a != c@) is to copy the input to the +destination. We avoid calling mp\_copy() by making sure the mp\_ints are different. The destination then +has to be grown (line @31,grow@) to accomodate the result. + +If the shift count $b$ is larger than $lg(\beta)$ then a call to mp\_lshd() is used to handle all of the multiples +of $lg(\beta)$. Leaving only a remaining shift of $lg(\beta) - 1$ or fewer bits left. Inside the actual shift +loop (lines @45,if@ to @76,}@) we make use of pre--computed values $shift$ and $mask$. These are used to +extract the carry bit(s) to pass into the next iteration of the loop. The $r$ and $rr$ variables form a +chain between consecutive iterations to propagate the carry. \subsection{Division by Power of Two} @@ -2263,7 +2255,8 @@ ignored by passing \textbf{NULL} as the pointer to the mp\_int variable. The result of the remainder operation until the end. This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before the quotient is obtained. -The remainder of the source code is essentially the same as the source code for mp\_mul\_2d. (-- Fix this paragraph up later, Tom). +The remainder of the source code is essentially the same as the source code for mp\_mul\_2d. The only significant difference is +the direction of the shifts. \subsection{Remainder of Division by Power of Two} @@ -2306,7 +2299,13 @@ is copied to $b$, leading digits are removed and the remaining leading digit is EXAM,bn_mp_mod_2d.c --- Add comments later, Tom. +We first avoid cases of $b \le 0$ by simply mp\_zero()'ing the destination in such cases. Next if $2^b$ is larger +than the input we just mp\_copy() the input and return right away. After this point we know we must actually +perform some work to produce the remainder. + +Recalling that reducing modulo $2^k$ and a binary and'' with $2^k - 1$ are numerically equivalent we can quickly reduce +the number. First we zero any digits above the last digit in $2^b$ (line @41,for@). Next we reduce the +leading digit of both (line @45,&=@) and then mp\_clamp(). \section*{Exercises} \begin{tabular}{cl} @@ -2464,33 +2463,46 @@ exceed the precision requested. EXAM,bn_s_mp_mul_digs.c -Lines @31,if@ to @35,}@ determine if the Comba method can be used first. The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and -the number of digits of output is less than \textbf{MP\_WARRAY}. This new constant is used to control -the stack usage in the Comba routines. By default it is set to $\delta$ but can be reduced when memory is at a premium. +First we determine (line @30,if@) if the Comba method can be used first since it's faster. The conditions for +sing the Comba routine are that min$(a.used, b.used) < \delta$ and the number of digits of output is less than +\textbf{MP\_WARRAY}. This new constant is used to control the stack usage in the Comba routines. By default it is +set to $\delta$ but can be reduced when memory is at a premium. + +If we cannot use the Comba method we proceed to setup the baseline routine. We allocate the the destination mp\_int +$t$ (line @36,init@) to the exact size of the output to avoid further re--allocations. At this point we now +begin the $O(n^2)$ loop. -Of particular importance is the calculation of the $ix+iy$'th column on lines @64,mp_word@, @65,mp_word@ and @66,mp_word@. Note how all of the -variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$. That is to ensure that double precision operations -are used instead of single precision. The multiplication on line @65,) * (@ makes use of a specific GCC optimizer behaviour. On the outset it looks like -the compiler will have to use a double precision multiplication to produce the result required. Such an operation would be horribly slow on most -processors and drag this to a crawl. However, GCC is smart enough to realize that double wide output single precision multipliers can be used. For -example, the instruction MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result. +This implementation of multiplication has the caveat that it can be trimmed to only produce a variable number of +digits as output. In each iteration of the outer loop the $pb$ variable is set (line @48,MIN@) to the maximum +number of inner loop iterations. + +Inside the inner loop we calculate $\hat r$ as the mp\_word product of the two mp\_digits and the addition of the +carry from the previous iteration. A particularly important observation is that most modern optimizing +C compilers (GCC for instance) can recognize that a $N \times N \rightarrow 2N$ multiplication is all that +is required for the product. In x86 terms for example, this means using the MUL instruction. + +Each digit of the product is stored in turn (line @68,tmpt@) and the carry propagated (line @71,>>@) to the +next iteration. \subsection{Faster Multiplication by the Comba'' Method} MARK,COMBA -One of the huge drawbacks of the baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards. This -makes the nested loop very sequential and hard to unroll and implement in parallel. The Comba'' \cite{COMBA} method is named after little known -(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested -carry fixup operations. As an interesting aside it seems that Paul Barrett describes a similar technique in -his 1986 paper \cite{BARRETT} written five years before. +One of the huge drawbacks of the baseline'' algorithms is that at the $O(n^2)$ level the carry must be +computed and propagated upwards. This makes the nested loop very sequential and hard to unroll and implement +in parallel. The Comba'' \cite{COMBA} method is named after little known (\textit{in cryptographic venues}) Paul G. +Comba who described a method of implementing fast multipliers that do not require nested carry fixup operations. As an +interesting aside it seems that Paul Barrett describes a similar technique in his 1986 paper \cite{BARRETT} written +five years before. -At the heart of the Comba technique is once again the long-hand algorithm. Except in this case a slight twist is placed on how -the columns of the result are produced. In the standard long-hand algorithm rows of products are produced then added together to form the -final result. In the baseline algorithm the columns are added together after each iteration to get the result instantaneously. +At the heart of the Comba technique is once again the long-hand algorithm. Except in this case a slight +twist is placed on how the columns of the result are produced. In the standard long-hand algorithm rows of products +are produced then added together to form the final result. In the baseline algorithm the columns are added together +after each iteration to get the result instantaneously. -In the Comba algorithm the columns of the result are produced entirely independently of each other. That is at the $O(n^2)$ level a -simple multiplication and addition step is performed. The carries of the columns are propagated after the nested loop to reduce the amount -of work requiored. Succintly the first step of the algorithm is to compute the product vector $\vec x$ as follows. +In the Comba algorithm the columns of the result are produced entirely independently of each other. That is at +the $O(n^2)$ level a simple multiplication and addition step is performed. The carries of the columns are propagated +after the nested loop to reduce the amount of work requiored. Succintly the first step of the algorithm is to compute +the product vector $\vec x$ as follows. \vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace @@ -2584,38 +2596,32 @@ $256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which, \textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\ \textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\ \hline \\ -Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\ +Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the stack. \\ 1. If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\ 2. If step 1 failed return(\textit{MP\_MEM}).\\ \\ -Zero the temporary array $\hat W$. \\ -3. for $n$ from $0$ to $digs - 1$ do \\ -\hspace{3mm}3.1 $\hat W_n \leftarrow 0$ \\ +3. $pa \leftarrow \mbox{MIN}(digs, a.used + b.used)$ \\ \\ -Compute the columns. \\ -4. for $ix$ from $0$ to $a.used - 1$ do \\ -\hspace{3mm}4.1 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\ -\hspace{3mm}4.2 If $pb < 1$ then goto step 5. \\ -\hspace{3mm}4.3 for $iy$ from $0$ to $pb - 1$ do \\ -\hspace{6mm}4.3.1 $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\ +4. $\_ \hat W \leftarrow 0$ \\ +5. for $ix$ from 0 to $pa - 1$ do \\ +\hspace{3mm}5.1 $ty \leftarrow \mbox{MIN}(b.used - 1, ix)$ \\ +\hspace{3mm}5.2 $tx \leftarrow ix - ty$ \\ +\hspace{3mm}5.3 $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\ +\hspace{3mm}5.4 for $iz$ from 0 to $iy - 1$ do \\ +\hspace{6mm}5.4.1 $\_ \hat W \leftarrow \_ \hat W + a_{tx+iy}b_{ty-iy}$ \\ +\hspace{3mm}5.5 $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\ +\hspace{3mm}5.6 $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\ +6. $W_{pa} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\ \\ -Propagate the carries upwards. \\ -5. $oldused \leftarrow c.used$ \\ -6. $c.used \leftarrow digs$ \\ -7. If $digs > 1$ then do \\ -\hspace{3mm}7.1. for $ix$ from $1$ to $digs - 1$ do \\ -\hspace{6mm}7.1.1 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\ -\hspace{6mm}7.1.2 $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\ -8. else do \\ -\hspace{3mm}8.1 $ix \leftarrow 0$ \\ -9. $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\ +7. $oldused \leftarrow c.used$ \\ +8. $c.used \leftarrow digs$ \\ +9. for $ix$ from $0$ to $pa$ do \\ +\hspace{3mm}9.1 $c_{ix} \leftarrow W_{ix}$ \\ +10. for $ix$ from $pa + 1$ to $oldused - 1$ do \\ +\hspace{3mm}10.1 $c_{ix} \leftarrow 0$ \\ \\ -Zero excess digits. \\ -10. If $digs < oldused$ then do \\ -\hspace{3mm}10.1 for $n$ from $digs$ to $oldused - 1$ do \\ -\hspace{6mm}10.1.1 $c_n \leftarrow 0$ \\ -11. Clamp excessive digits of $c$. (\textit{mp\_clamp}) \\ -12. Return(\textit{MP\_OKAY}). \\ +11. Clamp $c$. \\ +12. Return MP\_OKAY. \\ \hline \end{tabular} \end{center} @@ -2625,15 +2631,24 @@ Zero excess digits. \\ \end{figure} \textbf{Algorithm fast\_s\_mp\_mul\_digs.} -This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision. The algorithm -essentially peforms the same calculation as algorithm s\_mp\_mul\_digs, just much faster. +This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision. + +The outer loop of this algorithm is more complicated than that of the baseline multiplier. This is because on the inside of the +loop we want to produce one column per pass. This allows the accumulator $\_ \hat W$ to be placed in CPU registers and +reduce the memory bandwidth to two \textbf{mp\_digit} reads per iteration. + +The $ty$ variable is set to the minimum count of $ix$ or the number of digits in $b$. That way if $a$ has more digits than +$b$ this will be limited to $b.used - 1$. The $tx$ variable is set to the to the distance past $b.used$ the variable +$ix$ is. This is used for the immediately subsequent statement where we find $iy$. -The array $\hat W$ is meant to be on the stack when the algorithm is used. The size of the array does not change which is ideal. Note also that -unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$. +The variable $iy$ is the minimum digits we can read from either $a$ or $b$ before running out. Computing one column at a time +means we have to scan one integer upwards and the other downwards. $a$ starts at $tx$ and $b$ starts at $ty$. In each +pass we are producing the $ix$'th output column and we note that $tx + ty = ix$. As we move $tx$ upwards we have to +move $ty$ downards so the equality remains valid. The $iy$ variable is the number of iterations until +$tx \ge a.used$ or $ty < 0$ occurs. -The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm. The lack of -a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions. Now that each -iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism.