diff --git a/bn_mp_div_d.c b/bn_mp_div_d.c
index 7fff4664d..579602155 100644
--- a/bn_mp_div_d.c
+++ b/bn_mp_div_d.c
@@ -56,12 +56,10 @@ int mp_div_d(const mp_int *a, mp_digit b, mp_int *c, mp_digit *d)
       return MP_OKAY;
    }
 
-#ifdef BN_MP_DIV_3_C
    /* three? */
-   if (b == 3u) {
+   if (MP_ENABLED(MP_DIV_3) && b == 3u) {
       return mp_div_3(a, c, d);
    }
-#endif
 
    /* no easy answer [c'est la vie].  Just division */
    if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
diff --git a/bn_mp_exptmod.c b/bn_mp_exptmod.c
index 63c3b5670..d03a85c2f 100644
--- a/bn_mp_exptmod.c
+++ b/bn_mp_exptmod.c
@@ -18,8 +18,7 @@ int mp_exptmod(const mp_int *G, const mp_int *X, const mp_int *P, mp_int *Y)
    }
 
    /* if exponent X is negative we have to recurse */
-   if (X->sign == MP_NEG) {
-#ifdef BN_MP_INVMOD_C
+   if (MP_ENABLED(MP_INVMOD) && X->sign == MP_NEG) {
       mp_int tmpG, tmpX;
       int err;
 
@@ -46,50 +45,35 @@ int mp_exptmod(const mp_int *G, const mp_int *X, const mp_int *P, mp_int *Y)
       err = mp_exptmod(&tmpG, &tmpX, P, Y);
       mp_clear_multi(&tmpG, &tmpX, NULL);
       return err;
-#else
-      /* no invmod */
+   } else if (X->sign == MP_NEG) {
+      /* negative exponent, but mp_invmod is compiled out: G cannot be inverted */
       return MP_VAL;
-#endif
    }
 
    /* modified diminished radix reduction */
-#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C) && defined(BN_S_MP_EXPTMOD_C)
-   if (mp_reduce_is_2k_l(P) == MP_YES) {
+   if (MP_ENABLED(MP_REDUCE_IS_2K_L) && MP_ENABLED(MP_REDUCE_2K_L) && MP_ENABLED(S_MP_EXPTMOD) &&
+       mp_reduce_is_2k_l(P) == MP_YES) {
       return s_mp_exptmod(G, X, P, Y, 1);
    }
-#endif
 
-#ifdef BN_MP_DR_IS_MODULUS_C
-   /* is it a DR modulus? */
-   dr = mp_dr_is_modulus(P);
-#else
-   /* default to no */
-   dr = 0;
-#endif
+   /* is it a DR modulus? default to no */
+   dr = MP_ENABLED(MP_DR_IS_MODULUS) ? mp_dr_is_modulus(P) : 0;
 
-#ifdef BN_MP_REDUCE_IS_2K_C
    /* if not, is it a unrestricted DR modulus? */
-   if (dr == 0) {
+   if (MP_ENABLED(MP_REDUCE_IS_2K) && dr == 0) {
       dr = mp_reduce_is_2k(P) << 1;
    }
-#endif
 
    /* if the modulus is odd or dr != 0 use the montgomery method */
-#ifdef BN_MP_EXPTMOD_FAST_C
-   if (IS_ODD(P) || (dr !=  0)) {
+   if (MP_ENABLED(MP_EXPTMOD_FAST) && (IS_ODD(P) || (dr !=  0))) {
       return mp_exptmod_fast(G, X, P, Y, dr);
-   } else {
-#endif
-#ifdef BN_S_MP_EXPTMOD_C
+   } else if (MP_ENABLED(S_MP_EXPTMOD)) {
       /* otherwise use the generic Barrett reduction technique */
       return s_mp_exptmod(G, X, P, Y, 0);
-#else
+   } else {
       /* no exptmod for evens */
       return MP_VAL;
-#endif
-#ifdef BN_MP_EXPTMOD_FAST_C
    }
-#endif
 }
 
 #endif
diff --git a/bn_mp_invmod.c b/bn_mp_invmod.c
index f4dc65dde..e976f4fa8 100644
--- a/bn_mp_invmod.c
+++ b/bn_mp_invmod.c
@@ -11,17 +11,13 @@ int mp_invmod(const mp_int *a, const mp_int *b, mp_int *c)
       return MP_VAL;
    }
 
-#ifdef BN_FAST_MP_INVMOD_C
    /* if the modulus is odd we can use a faster routine instead */
-   if (IS_ODD(b)) {
+   if (MP_ENABLED(FAST_MP_INVMOD) && IS_ODD(b)) {
       return fast_mp_invmod(a, b, c);
    }
-#endif
 
-#ifdef BN_MP_INVMOD_SLOW_C
-   return mp_invmod_slow(a, b, c);
-#else
-   return MP_VAL;
-#endif
+   return MP_ENABLED(MP_INVMOD_SLOW)
+          ? mp_invmod_slow(a, b, c)
+          : MP_VAL;
 }
 #endif
diff --git a/bn_mp_mul.c b/bn_mp_mul.c
index 585f93cbf..289d648f6 100644
--- a/bn_mp_mul.c
+++ b/bn_mp_mul.c
@@ -6,82 +6,48 @@
 /* high level multiplication (handles sign) */
 int mp_mul(const mp_int *a, const mp_int *b, mp_int *c)
 {
-   int     res, neg;
-#ifdef BN_MP_BALANCE_MUL_C
-   int len_b, len_a;
-#endif
+   int res, neg, min_len, max_len, digs;
+   min_len = MIN(a->used, b->used);
+   max_len = MAX(a->used, b->used);
+   digs = a->used + b->used + 1;
    neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-#ifdef BN_MP_BALANCE_MUL_C
-   len_a = a->used;
-   len_b = b->used;
-
-   if (len_a == len_b) {
-      goto GO_ON;
-   }
-   /*
-    * Check sizes. The smaller one needs to be larger than the Karatsuba cut-off.
-    * The bigger one needs to be at least about one KARATSUBA_MUL_CUTOFF bigger
-    * to make some sense, but it depends on architecture, OS, position of the
-    * stars... so YMMV.
-    * Using it to cut the input into slices small enough for fast_s_mp_mul_digs
-    * was actually slower on the author's machine, but YMMV.
-    */
-   if ((MIN(len_a, len_b) < KARATSUBA_MUL_CUTOFF)
-       || ((MAX(len_a, len_b)) / 2 < KARATSUBA_MUL_CUTOFF)) {
-      goto GO_ON;
-   }
-   /*
-    * Not much effect was observed below a ratio of 1:2, but again: YMMV.
-    */
-   if ((MAX(len_a, len_b) /  MIN(len_a, len_b)) < 2) {
-      goto GO_ON;
-   }
-
-   res = mp_balance_mul(a,b,c);
-   goto END;
-
-GO_ON:
-#endif
 
-   /* use Toom-Cook? */
-#ifdef BN_MP_TOOM_MUL_C
-   if (MIN(a->used, b->used) >= TOOM_MUL_CUTOFF) {
+   if (MP_ENABLED(MP_BALANCE_MUL) &&
+       /* Use the balancing multiplication only if the sizes make it worthwhile:
+        * the shorter operand needs at least KARATSUBA_MUL_CUTOFF digits, and so
+        * does half of the longer operand. How much this helps depends on
+        * architecture, OS, position of the stars... so YMMV.
+        * Using it to cut the input into slices small enough for fast_s_mp_mul_digs
+        * was actually slower on the author's machine, but YMMV.
+        */
+       (min_len >= KARATSUBA_MUL_CUTOFF) &&
+       (max_len / 2 >= KARATSUBA_MUL_CUTOFF) &&
+       /* The longer operand must be at least twice as long as the shorter; not much effect was observed below a ratio of 1:2. */
+       (max_len >= (2 * min_len))) {
+      res = mp_balance_mul(a,b,c);
+   } else if (MP_ENABLED(MP_TOOM_MUL) && /* use Toom-Cook? */
+              min_len >= TOOM_MUL_CUTOFF) {
       res = mp_toom_mul(a, b, c);
-   } else
-#endif
-#ifdef BN_MP_KARATSUBA_MUL_C
-      /* use Karatsuba? */
-      if (MIN(a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
-         res = mp_karatsuba_mul(a, b, c);
-      } else
-#endif
-      {
-         /* can we use the fast multiplier?
-          *
-          * The fast multiplier can be used if the output will
-          * have less than MP_WARRAY digits and the number of
-          * digits won't affect carry propagation
-          */
-         int     digs = a->used + b->used + 1;
-
-#ifdef BN_FAST_S_MP_MUL_DIGS_C
-         if ((digs < (int)MP_WARRAY) &&
-             (MIN(a->used, b->used) <=
-              (int)(1u << ((CHAR_BIT * sizeof(mp_word)) - (2u * (size_t)DIGIT_BIT))))) {
-            res = fast_s_mp_mul_digs(a, b, c, digs);
-         } else
-#endif
-         {
-#ifdef BN_S_MP_MUL_DIGS_C
-            res = s_mp_mul(a, b, c); /* uses s_mp_mul_digs */
-#else
-            res = MP_VAL;
-#endif
-         }
-      }
-END:
+   } else if (MP_ENABLED(MP_KARATSUBA_MUL) &&
+              min_len >= KARATSUBA_MUL_CUTOFF) {
+      res = mp_karatsuba_mul(a, b, c);
+   } else if (MP_ENABLED(FAST_S_MP_MUL_DIGS) &&
+              /* can we use the fast multiplier?
+               *
+               * The fast multiplier can be used if the output will
+               * have less than MP_WARRAY digits and the number of
+               * digits won't affect carry propagation
+               */
+              (digs < (int)MP_WARRAY) &&
+              (min_len <=
+               (int)(1u << ((CHAR_BIT * sizeof(mp_word)) - (2u * (size_t)DIGIT_BIT))))) {
+      res = fast_s_mp_mul_digs(a, b, c, digs);
+   } else if (MP_ENABLED(S_MP_MUL_DIGS)) {
+      res = s_mp_mul(a, b, c); /* uses s_mp_mul_digs */
+   } else {
+      res = MP_VAL;
+   }
    c->sign = (c->used > 0) ? neg : MP_ZPOS;
    return res;
 }
 #endif
-
diff --git a/bn_mp_reduce.c b/bn_mp_reduce.c
index 7a17b033a..3990895a3 100644
--- a/bn_mp_reduce.c
+++ b/bn_mp_reduce.c
@@ -25,21 +25,21 @@ int mp_reduce(mp_int *x, const mp_int *m, const mp_int *mu)
       if ((res = mp_mul(&q, mu, &q)) != MP_OKAY) {
          goto CLEANUP;
       }
+   } else if (MP_ENABLED(S_MP_MUL_HIGH_DIGS)) {
+      /* The feature test alone selects the branch. The error check is nested
+       * so that a successful multiplication leaves the chain instead of
+       * falling through to the next alternative and finally to MP_VAL. */
+      if ((res = s_mp_mul_high_digs(&q, mu, &q, um)) != MP_OKAY) {
+         goto CLEANUP;
+      }
+   } else if (MP_ENABLED(FAST_S_MP_MUL_HIGH_DIGS)) {
+      if ((res = fast_s_mp_mul_high_digs(&q, mu, &q, um)) != MP_OKAY) {
+         goto CLEANUP;
+      }
    } else {
-#ifdef BN_S_MP_MUL_HIGH_DIGS_C
-      if ((res = s_mp_mul_high_digs(&q, mu, &q, um)) != MP_OKAY) {
-         goto CLEANUP;
-      }
-#elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-      if ((res = fast_s_mp_mul_high_digs(&q, mu, &q, um)) != MP_OKAY) {
-         goto CLEANUP;
-      }
-#else
-      {
-         res = MP_VAL;
-         goto CLEANUP;
-      }
-#endif
+      /* neither high-digit multiplier is compiled in */
+      res = MP_VAL;
+      goto CLEANUP;
    }
 
    /* q3 = q2 / b**(k+1) */
diff --git a/bn_mp_sqr.c b/bn_mp_sqr.c
index 438677f5d..b521d235f 100644
--- a/bn_mp_sqr.c
+++ b/bn_mp_sqr.c
@@ -8,35 +8,22 @@ int mp_sqr(const mp_int *a, mp_int *b)
 {
    int     res;
 
-#ifdef BN_MP_TOOM_SQR_C
-   /* use Toom-Cook? */
-   if (a->used >= TOOM_SQR_CUTOFF) {
+   if (MP_ENABLED(MP_TOOM_SQR) && /* use Toom-Cook? */
+       a->used >= TOOM_SQR_CUTOFF) {
       res = mp_toom_sqr(a, b);
-      /* Karatsuba? */
-   } else
-#endif
-#ifdef BN_MP_KARATSUBA_SQR_C
-      if (a->used >= KARATSUBA_SQR_CUTOFF) {
-         res = mp_karatsuba_sqr(a, b);
-      } else
-#endif
-      {
-#ifdef BN_FAST_S_MP_SQR_C
-         /* can we use the fast comba multiplier? */
-         if ((((a->used * 2) + 1) < (int)MP_WARRAY) &&
-             (a->used <
-              (int)(1u << (((CHAR_BIT * sizeof(mp_word)) - (2u * (size_t)DIGIT_BIT)) - 1u)))) {
-            res = fast_s_mp_sqr(a, b);
-         } else
-#endif
-         {
-#ifdef BN_S_MP_SQR_C
-            res = s_mp_sqr(a, b);
-#else
-            res = MP_VAL;
-#endif
-         }
-      }
+   } else if (MP_ENABLED(MP_KARATSUBA_SQR) &&  /* Karatsuba? */
+              a->used >= KARATSUBA_SQR_CUTOFF) {
+      res = mp_karatsuba_sqr(a, b);
+   } else if (MP_ENABLED(FAST_S_MP_SQR) && /* can we use the fast comba multiplier? */
+              (((a->used * 2) + 1) < (int)MP_WARRAY) &&
+              (a->used <
+               (int)(1u << (((CHAR_BIT * sizeof(mp_word)) - (2u * (size_t)DIGIT_BIT)) - 1u)))) {
+      res = fast_s_mp_sqr(a, b);
+   } else if (MP_ENABLED(S_MP_SQR)) {
+      res = s_mp_sqr(a, b);
+   } else {
+      res = MP_VAL;
+   }
    b->sign = MP_ZPOS;
    return res;
 }
diff --git a/demo/test.c b/demo/test.c
index 3c61ff512..4a146258a 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -1,5 +1,8 @@
 #include "shared.h"
 
+/* We can also test the private API here */
+#include "tommath_private.h"
+
 static int test_trivial_stuff(void)
 {
    mp_int a, b, c, d;
@@ -1624,7 +1627,7 @@ static int test_mp_balance_mul(void)
       goto LTM_ERR;
    }
 
-   if ((e = mp_mul(&a, &b, &c)) != MP_OKAY) {
+   if ((e = mp_balance_mul(&a, &b, &c)) != MP_OKAY) {
       goto LTM_ERR;
    }
 
diff --git a/tommath.h b/tommath.h
index 41c9ec0b2..786db58f0 100644
--- a/tommath.h
+++ b/tommath.h
@@ -335,7 +335,6 @@ int mp_sub(const mp_int *a, const mp_int *b, mp_int *c);
 
 /* c = a * b */
 int mp_mul(const mp_int *a, const mp_int *b, mp_int *c);
-int mp_balance_mul(const mp_int *a, const mp_int *b, mp_int *c);
 
 /* b = a*a  */
 int mp_sqr(const mp_int *a, mp_int *b);
diff --git a/tommath_private.h b/tommath_private.h
index aeea59103..43aa4ac3e 100644
--- a/tommath_private.h
+++ b/tommath_private.h
@@ -33,6 +33,25 @@ extern void *XCALLOC(size_t nmemb, size_t size);
 extern void XFREE(void *mem, size_t size);
 #endif
 
+/* Feature detection macro.
+ *
+ * MP_ENABLED(FOO) is 1 if BN_FOO_C is defined as an empty macro and 0 if it
+ * is not defined, so it can be tested in an ordinary if instead of #ifdef.
+ *
+ * _MP_ENABLED0 only forces BN_FOO_C to be macro-expanded: a parameter that
+ * is an operand of ## is pasted unexpanded, so the name has to pass through
+ * a plain parameter first. An empty expansion makes the paste in
+ * _MP_ENABLED1 produce _MP_ENABLED_TEST (a comma), which moves the 1 into
+ * the second argument of _MP_ENABLED3; an undefined name leaves the 0 there.
+ * The trailing 0 keeps the variadic part of _MP_ENABLED3 non-empty.
+ */
+#define _MP_ENABLED_TEST        ,
+#define MP_ENABLED(x)           _MP_ENABLED0(BN_##x##_C)
+#define _MP_ENABLED0(x)         _MP_ENABLED1(x)
+#define _MP_ENABLED1(x)         _MP_ENABLED2(_MP_ENABLED_TEST##x)
+#define _MP_ENABLED2(x)         _MP_ENABLED3(x 1, 0, 0)
+#define _MP_ENABLED3(x, y, ...) y
+
 /* ---> Basic Manipulations <--- */
 #define IS_ZERO(a) ((a)->used == 0)
 #define IS_EVEN(a) (((a)->used == 0) || (((a)->dp[0] & 1u) == 0u))
@@ -48,6 +67,7 @@ int fast_s_mp_mul_high_digs(const mp_int *a, const mp_int *b, mp_int *c, int dig
 int s_mp_mul_high_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs);
 int fast_s_mp_sqr(const mp_int *a, mp_int *b);
 int s_mp_sqr(const mp_int *a, mp_int *b);
+int mp_balance_mul(const mp_int *a, const mp_int *b, mp_int *c);
 int mp_karatsuba_mul(const mp_int *a, const mp_int *b, mp_int *c);
 int mp_toom_mul(const mp_int *a, const mp_int *b, mp_int *c);
 int mp_karatsuba_sqr(const mp_int *a, mp_int *b);