diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index 3e115c84580b..decd72531a5a 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -4987,48 +4987,58 @@ def luf(lamdaexpr, *args, **kwargs):
 
 add_newdoc('numpy.lib._compiled_base', 'bincount',
     """
-    bincount(x, weights=None, minlength=None)
+    bincount(list, weights=None, minlength=None, axis=None)
 
-    Count number of occurrences of each value in array of non-negative ints.
+    Count number of occurrences of each value in an array of non-negative ints
+    along the selected axes.
 
-    The number of bins (of size 1) is one larger than the largest value in
-    `x`. If `minlength` is specified, there will be at least this number
-    of bins in the output array (though it will be longer if necessary,
-    depending on the contents of `x`).
-    Each bin gives the number of occurrences of its index value in `x`.
-    If `weights` is specified the input array is weighted by it, i.e. if a
-    value ``n`` is found at position ``i``, ``out[n] += weight[i]`` instead
-    of ``out[n] += 1``.
+    Each bin gives the number of occurrences of its index value in `list`
+    along the axes in `axis`. If `weights` is specified, 'list' is weighted
+    by it, i.e. if ``list[i] == n``, then ``out[n] += weight[i]`` instead of
+    ``out[n] += 1``.
 
     Parameters
     ----------
-    x : array_like, 1 dimension, nonnegative ints
+    list : array_like, nonnegative ints
         Input array.
     weights : array_like, optional
-        Weights, array of the same shape as `x`.
+        Weights, array broadcastable with `list`.
     minlength : int, optional
         .. versionadded:: 1.6.0
-
         A minimum number of bins for the output array.
+    axis : int or tuple of ints, optional
+        .. versionadded:: 1.9.0
+       The axes of `x` over which to count, defaults to counting over
+        the flattened array.
 
     Returns
     -------
-    out : ndarray of ints
+    out : ndarray of ints (or doubles if weights is specified)
         The result of binning the input array.
-        The length of `out` is equal to ``np.amax(x)+1``.
 
     Raises
     ------
     ValueError
-        If the input is not 1-dimensional, or contains elements with negative
-        values, or if `minlength` is non-positive.
+        If the input contains elements with negative values, if there are
+        entries in axis repeated or out of bounds, or if the input array
+        and `weights` cannot be broadcasted together.
     TypeError
-        If the type of the input is float or complex.
+        If the type of the input, `minlength` or `axis` is not an integer, or
+        if `weights` cannot be safely cast to a double.
 
     See Also
     --------
     histogram, digitize, unique
 
+    Notes
+    -----
+    To determine the output shape, `list` and `weights` are broadcasted
+    together, the axes in `axis` are removed from the broadcasted shape, and
+    a dimension of size `max(max(list) + 1, minlength)` is appended to the
+    end. Note that the values in `axis` refer to the shape of `list` prior to
+    broadcasting, not after. Then entry at position `out[..., n]` will be the
+    (weighted) count of all instances of `n` in `list` along the selected axes.
+
     Examples
     --------
     >>> np.bincount(np.arange(5))
@@ -5037,7 +5047,7 @@ def luf(lamdaexpr, *args, **kwargs):
     array([1, 3, 1, 1, 0, 0, 0, 1])
 
     >>> x = np.array([0, 1, 1, 3, 2, 1, 7, 23])
-    >>> np.bincount(x).size == np.amax(x)+1
+    >>> np.bincount(x).size == np.max(x)+1
     True
 
     The input array needs to be of integer dtype, otherwise a
@@ -5046,16 +5056,42 @@ def luf(lamdaexpr, *args, **kwargs):
     >>> np.bincount(np.arange(5, dtype=np.float))
     Traceback (most recent call last):
       File "<stdin>", line 1, in <module>
-    TypeError: array cannot be safely cast to required type
+    TypeError: input must be an array of integers
 
-    A possible use of ``bincount`` is to perform sums over
-    variable-size chunks of an array, using the ``weights`` keyword.
+    A possible use of `bincount` is to perform sums over
+    variable-size chunks of an array, using the `weights` keyword.
 
     >>> w = np.array([0.3, 0.5, 0.2, 0.7, 1., -0.6]) # weights
     >>> x = np.array([0, 1, 1, 2, 2, 2])
     >>> np.bincount(x,  weights=w)
     array([ 0.3,  0.7,  1.1])
 
+    Using the `axis` argument allows (weighted) counting over multiple axes:
+
+    >>> x = np.arange(2*3).reshape(3, 2, 1)
+    >>> np.bincount(x, axis=(1, 2))
+    array([[1, 1, 0, 0, 0, 0],
+           [0, 0, 1, 1, 0, 0],
+           [0, 0, 0, 0, 1, 1]])
+    >>> w = x * x
+    >>> np.bincount(x, w, axis=(1, 2))
+    array([[  0.,   1.,   0.,   0.,   0.,   0.],
+           [  0.,   0.,   4.,   9.,   0.,   0.],
+           [  0.,   0.,   0.,   0.,  16.,  25.]])
+
+    With broadcasting, multidimensional `weights` can be summed:
+
+    >>> x = [0, 1, 1, 2]
+    >>> w = np.arange(8).reshape(4, 2)
+    >>> np.bincount(x, w.T).T
+    array([[ 0.,  1.],
+           [ 6.,  8.],
+           [ 6.,  7.]])
+
+    Note that in this last example, the broadcasted shape of `x` is
+    `(1, 1, 4)`, but counting is only done over the last axis, not all three,
+    because the `axis` value (which defaults to all axes) refers to the shape
+    prior to broadcasting.
     """)
 
 add_newdoc('numpy.lib._compiled_base', 'ravel_multi_index',
diff --git a/numpy/lib/bento.info b/numpy/lib/bento.info
index 9f4fa6f0f579..9c939aab9f86 100644
--- a/numpy/lib/bento.info
+++ b/numpy/lib/bento.info
@@ -3,4 +3,4 @@ HookFile: bscript
 Library:
     Extension: _compiled_base
         Sources:
-             src/_compiled_base.c
+             src/_compiled_base.c.src
diff --git a/numpy/lib/setup.py b/numpy/lib/setup.py
index 153af314c7d2..e60bffdfa246 100644
--- a/numpy/lib/setup.py
+++ b/numpy/lib/setup.py
@@ -11,7 +11,7 @@ def configuration(parent_package='',top_path=None):
 
 
     config.add_extension('_compiled_base',
-                         sources=[join('src', '_compiled_base.c')]
+                         sources=[join('src', '_compiled_base.c.src')]
                          )
 
     config.add_data_dir('benchmarks')
diff --git a/numpy/lib/src/_compiled_base.c b/numpy/lib/src/_compiled_base.c.src
similarity index 66%
rename from numpy/lib/src/_compiled_base.c
rename to numpy/lib/src/_compiled_base.c.src
index 0f238a12a7b2..5fa232cb3b78 100644
--- a/numpy/lib/src/_compiled_base.c
+++ b/numpy/lib/src/_compiled_base.c.src
@@ -7,7 +7,6 @@
 #include "numpy/ufuncobject.h"
 #include "string.h"
 
-
 static npy_intp
 incr_slot_(double x, double *bins, npy_intp lbins)
 {
@@ -105,146 +104,667 @@ check_array_monotonic(const double *a, npy_int lena)
     }
 }
 
-/* Find the minimum and maximum of an integer array */
+/*
+ * Having multiple inner loops was a must to properly handle input integer
+ * arrays for types larger than NPY_INTP. And once started, there is no
+ * good reason not to specialize the inner loops for all integer types.
+ */
+
+typedef npy_intp (max_inner_loop_func)(const char*, npy_intp, npy_intp);
+typedef void (count_inner_loop_func)(const char *, npy_intp *, npy_intp,
+                                     npy_intp);
+typedef void (weights_inner_loop_func)(const char *, const char *,
+                                       npy_double *, npy_intp, npy_intp,
+                                       npy_intp);
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ *         LONGLONG, ULONGLONG#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ *         longlong, ulonglong#
+ * #type = npy_bool, npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int,
+ *         npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #sgnd = 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 #
+ */
+
+/*
+ * Returns the maximum value in an array of non-negative integers.
+ * Returns -1 if it finds any negative values, or -2 if the maximum
+ * will not fit in a NPY_INTP
+ */
+
+static npy_intp
+max_inner_loop_@suff@(const char *list, npy_intp str, npy_intp len)
+{
+    @type@ max = 0;
+
+    while (len--) {
+        const @type@ val = *(const @type@ *)list;
+        list += str;
+        #if @sgnd@
+            if (val < 0) {
+                max = -1;
+                break;
+            }
+        #endif
+        if (val > max) {
+            max = val;
+        }
+    }
+    if (max > NPY_MAX_INTP) {
+        max = -2;
+    }
+
+    return (npy_intp)max;
+}
+
+/*
+ * Writes to 'out[i]' the number of times that 'i' comes up in 'list'. The
+ * `out` array must be long enough to hold the largest value in `list`,
+ * typically calculated with a prior call to 'max_inner_loop_@suff@'
+ */
+
+static void
+count_inner_loop_@suff@(const char *list, npy_intp *out, npy_intp str,
+                        npy_intp len)
+{
+    while (len--) {
+        @type@ val = *(@type@ *)list;
+        list += str;
+        out[val]++;
+    }
+}
+
+/*
+ * Writes to 'out[i]' the sum of all the items in 'weights' at positions
+ * where `list` holds the value `i`. The `out` array must be long enough
+ * to hold the largest value in `list`, typically calculated with a prior
+ * call to 'max_inner_loop_@suff@'
+ */
+
 static void
-minmax(const npy_intp *data, npy_intp data_len, npy_intp *mn, npy_intp *mx)
+weights_inner_loop_@suff@(const char *list, const char *weights,
+                          npy_double *out, npy_intp str_lst,
+                          npy_intp str_wts, npy_intp len)
 {
-    npy_intp min = *data;
-    npy_intp max = *data;
+    while (len--) {
+        const @type@ val_lst = *(const @type@ *)list;
+        const npy_double val_wts = *(const npy_double *)weights;
+        list += str_lst;
+        weights += str_wts;
+        out[val_lst] += val_wts;
+    }
+}
+
+/**end repeat**/
+
+static max_inner_loop_func *max_func_map[] = {
+    /**begin repeat
+     *
+     * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+     *         LONGLONG, ULONGLONG#
+     * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+     *         longlong, ulonglong#
+     */
+        &max_inner_loop_@suff@,
+    /**end repeat**/
+};
+
+static count_inner_loop_func *count_func_map[] = {
+    /**begin repeat
+     *
+     * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+     *         LONGLONG, ULONGLONG#
+     * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+     *         longlong, ulonglong#
+     */
+        &count_inner_loop_@suff@,
+    /**end repeat**/
+};
+
+static weights_inner_loop_func *weights_func_map[] = {
+    /**begin repeat
+     *
+     * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+     *         LONGLONG, ULONGLONG#
+     * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+     *         longlong, ulonglong#
+     */
+        &weights_inner_loop_@suff@,
+    /**end repeat**/
+};
 
-    while (--data_len) {
-        const npy_intp val = *(++data);
-        if (val < min) {
-            min = val;
+
+/*
+ * On succesful ending, returns 0 and fills axes with the bounds-checked
+ * axes found in axes_in, and axes_len with the number of axes found. On
+ * failure sets an exception and returns -1;
+ */
+
+static int
+parse_axes(PyArrayObject *arr, PyObject *axes_in, int *axes, int *axes_len)
+{
+    int ndim = PyArray_NDIM(arr);
+    npy_intp naxes;
+    npy_intp j;
+
+    // if () {
+        // /* User left argument blank, default to last axis */
+        // naxes = 1;
+        // /* Set axis to 0 as default for scalar arrays */
+        // axes[0] = (ndim == 0) ? 0 : ndim - 1;
+    // }
+    // else
+    if (!axes_in || axes_in == Py_None) {
+        /* Convert blank argument or 'None' into all the axes */
+        naxes = ndim;
+        for (j = 0; j < naxes; j++) {
+            axes[j] = j;
         }
-        else if (val > max) {
-            max = val;
+    }
+    else if (PyTuple_Check(axes_in)) {
+        naxes = PyTuple_Size(axes_in);
+        if (naxes < 0 || naxes > NPY_MAXDIMS) {
+            PyErr_SetString(PyExc_ValueError, "too many values for 'axis'");
+            return -1;
+        }
+        for (j = 0; j < naxes; j++) {
+            PyObject *axis_obj = PyTuple_GET_ITEM(axes_in, j);
+            long axis;
+
+            if (!PyInt_Check(axis_obj)) {
+                /* Don't allow floats or the like to slip through as ints */
+                PyErr_SetString(PyExc_TypeError,
+                        "'axis' must be an integer or tuple of integers");
+                return -1;
+            }
+            axis = PyInt_AsLong(axis_obj);
+            if (axis == -1 && PyErr_Occurred()) {
+                return -1;
+            }
+            if (axis < 0) {
+                axis += ndim;
+            }
+            if (axis < 0 || axis >= ndim) {
+                PyErr_SetString(PyExc_ValueError,
+                                "'axis' entry is out of bounds");
+                return -1;
+            }
+            axes[j] = (int)axis;
+        }
+    }
+    else if (PyInt_Check(axes_in)){
+        /* `axis` can also be a single integer */
+        long axis = PyInt_AsLong(axes_in);
+        if (axis == -1 && PyErr_Occurred()) {
+            return -1;
+        }
+        if (axis < 0) {
+            axis += ndim;
+        }
+        /* Special case letting axis={0 or -1} slip through for scalars */
+        if (ndim == 0 && (axis == 0 || axis == -1)) {
+            axis = 0;
+        }
+        else if (axis < 0 || axis >= ndim) {
+            PyErr_SetString(PyExc_ValueError,
+                            "'axis' entry is out of bounds");
+            return -1;
         }
+        axes[0] = (int)axis;
+        naxes = 1;
+    } else {
+        /* Don't allow floats or the like to slip through as ints */
+        PyErr_SetString(PyExc_TypeError,
+                        "'axis' must be an integer or tuple of integers");
+        return -1;
     }
 
-    *mn = min;
-    *mx = max;
+    *axes_len = naxes;
+    return 0;
 }
+
+
 /*
- * arr_bincount is registered as bincount.
+ * arr_ndbincount is registered as bincount.
  *
  * bincount accepts one, two or three arguments. The first is an array of
- * non-negative integers The second, if present, is an array of weights,
- * which must be promotable to double. Call these arguments list and
- * weight. Both must be one-dimensional with len(weight) == len(list). If
- * weight is not present then bincount(list)[i] is the number of occurrences
- * of i in list.  If weight is present then bincount(self,list, weight)[i]
- * is the sum of all weight[j] where list [j] == i.  Self is not used.
- * The third argument, if present, is a minimum length desired for the
- * output array.
+ * non-negative integers. The second, if present, is an array of weights,
+ * which must be promotable to double. Call these arguments 'list' and
+ * 'weights'. Their shapes must be broadcastable. The fourth argument, call
+ * it `axis', holds a list of axes, and defaults to the last axis only. The
+ * return will have the shape of the broadcasted arrays, with the axes in
+ * `axis` removed, and an extra dimension added at the end
+ * If 'weights' is not present then 'bincount(list)[..., i]` is the number of
+ * occurrences of 'i' in 'list' over the removed 'axis' at each of the
+ * non-removed positions.
+ * If 'weights' is present then 'bincount(list, weights)[..., i]' is the sum
+ * of all 'weights[j]' where 'list[j] == i' over the removed 'axis' at each
+ * of the non-removed positions.
+ * The third argument, call it 'minlength', if present, is a minimum length
+ * for the last dimension of the output array.
+ * The entries in the fourth argument, 'axis', refer to the original shape of
+ * the 'list' argument before broadcasting.
  */
-static PyObject *
-arr_bincount(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
+
+static PyObject*
+arr_ndbincount(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwargs)
 {
-    PyArray_Descr *type;
-    PyObject *list = NULL, *weight=Py_None, *mlength=Py_None;
-    PyArrayObject *lst=NULL, *ans=NULL, *wts=NULL;
-    npy_intp *numbers, *ians, len , mx, mn, ans_size, minlength;
-    int i;
-    double *weights , *dans;
-    static char *kwlist[] = {"list", "weights", "minlength", NULL};
+    /* Raw input arguments  */
+    PyObject *list = NULL;
+    PyObject *weights = NULL;
+    PyObject *axis = NULL;
+    PyObject *minlength = Py_None;
+    static char *kwlist[] = {"list", "weights", "minlength", "axis", NULL};
+    /* Array version of the inputs and output */
+    PyArrayObject *arr_lst = NULL;
+    PyArrayObject *arr_wts = NULL;
+    PyArrayObject *arr_ret = NULL;
+    /* Metadata of the input arguments */
+    int axes[NPY_MAXDIMS];
+    int naxes;
+    npy_bool axis_flags[NPY_MAXDIMS];
+    int ndim_lst, ndim_wts, ndim_ret;
+    npy_intp shape_ret[NPY_MAXDIMS], *shape_lst;
+    npy_intp max_lst = 0;
+    npy_intp min_len = 0;
+    npy_intp j, k;
+    /* We have to take ownership of this type reference */
+    PyArray_Descr *type_double = PyArray_DescrFromType(NPY_DOUBLE);
+    /* Iterator related declarations */
+    NpyIter *iter = NULL, *iter_outer = NULL, *iter_inner = NULL;
+    int ndim_outer, ndim_inner;
+    int op_axes_outer_arrays[3][NPY_MAXDIMS];
+    int op_axes_inner_arrays[2][NPY_MAXDIMS];
+    npy_bool no_iter_max = 0, no_iter_count = 0, no_iter_weights = 0;
+    npy_bool is_empty_list;
+    NPY_BEGIN_THREADS_DEF;
+
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOO", kwlist,
+                                     &list, &weights, &minlength, &axis)) {
+        goto fail;
+    }
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO",
-                kwlist, &list, &weight, &mlength)) {
-            goto fail;
+    /*
+     * Have to handle conversion of minlength to a npy_intp manually to
+     * allow None passed in without throwing an error. Ideally we would
+     * simply set its type to 'n' in the parser with a default of 0.
+     */
+    if (minlength == Py_None) {
+        min_len = 0;
+    }
+    else if (PyInt_Check(minlength)) {
+        min_len = (npy_intp)PyInt_AsSsize_t(minlength);
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError,
+                        "minlength must be an integer.");
+        goto fail;
     }
 
-    lst = (PyArrayObject *)PyArray_ContiguousFromAny(list, NPY_INTP, 1, 1);
-    if (lst == NULL) {
+    /* Convert list to array */
+    arr_lst = (PyArrayObject *)PyArray_FromAny(list, NULL, 0, 0, 0, NULL);
+    if (!arr_lst) {
+        PyErr_SetString(PyExc_TypeError, "could not parse input array");
+        goto fail;
+    }
+    is_empty_list = !(npy_bool)PyArray_SIZE(arr_lst);
+    if(!(PyArray_ISINTEGER(arr_lst) || PyArray_ISBOOL(arr_lst) ||
+         is_empty_list)) {
+        /* Without the is_empty check, empty lists raise this error */
+        PyErr_SetString(PyExc_TypeError,
+                        "input must be an array of integers");
         goto fail;
     }
-    len = PyArray_SIZE(lst);
-    type = PyArray_DescrFromType(NPY_INTP);
+    ndim_lst = PyArray_NDIM(arr_lst);
+    /*
+     * We don't need an iterator to find the max if the input array is behaved
+     * and either 1D or contiguous.
+     */
+    no_iter_max = PyArray_ISNOTSWAPPED(arr_lst) &&
+                  PyArray_ISALIGNED(arr_lst) &&
+                  ((ndim_lst == 1) || PyArray_ISCONTIGUOUS(arr_lst));
 
-    /* handle empty list */
-    if (len < 1) {
-        if (mlength == Py_None) {
-            minlength = 0;
+    /* Check axis in bounds */
+    if (parse_axes(arr_lst, axis, axes, &naxes) < 0) {
+        goto fail;
+    }
+    memset(axis_flags, 0, ndim_lst);
+    for (j = 0; j < naxes; j++) {
+        int axis = axes[j];
+        if (axis_flags[axis]) {
+            PyErr_SetString(PyExc_ValueError, "duplicate value in 'axis'");
+            goto fail;;
         }
-        else if (!(minlength = PyArray_PyIntAsIntp(mlength))) {
+        axis_flags[axis] = 1;
+    }
+
+    /*
+     * Convert weights to an array. Don't force a cast to NPY_DOUBLE, the
+     * iterator will handle that.
+     */
+    if (weights && weights != Py_None) {
+        arr_wts = (PyArrayObject *)PyArray_FromAny(weights, NULL,
+                                                   0, 0, 0, NULL);
+        if (!arr_wts) {
+            PyErr_SetString(PyExc_TypeError,
+                            "could not parse weights array");
             goto fail;
         }
-        if (!(ans = (PyArrayObject *)PyArray_Zeros(1, &minlength, type, 0))){
+        if(!PyArray_CanCastArrayTo(arr_wts, type_double, NPY_SAFE_CASTING)) {
+            PyErr_SetString(PyExc_TypeError,
+                            "cannot convert weights array to double");
             goto fail;
         }
-        Py_DECREF(lst);
-        return (PyObject *)ans;
+        ndim_wts = PyArray_NDIM(arr_wts);
+        if (ndim_wts == 0) {
+            /*
+             * The iterator has trouble with scalars, because no axes get
+             * selected, and casting and buffering doesn't happen, so we
+             * force the conversion.
+             */
+            PyArrayObject *temp = (PyArrayObject *)PyArray_Cast(arr_wts,
+                                                                NPY_DOUBLE);
+            Py_DECREF(arr_wts);
+            arr_wts = temp;
+        }
     }
 
-    numbers = (npy_intp *) PyArray_DATA(lst);
-    minmax(numbers, len, &mn, &mx);
-    if (mn < 0) {
-        PyErr_SetString(PyExc_ValueError,
-                "The first argument of bincount must be non-negative");
-        goto fail;
-    }
-    ans_size = mx + 1;
-    if (mlength != Py_None) {
-        if (!(minlength = PyArray_PyIntAsIntp(mlength))) {
-            goto fail;
+    /* Broadacst the arrays to create the return array shape */
+    shape_lst = PyArray_DIMS(arr_lst);
+
+    if (arr_wts) {
+        npy_intp *shape_wts = PyArray_DIMS(arr_wts);
+
+        if (ndim_wts > ndim_lst) {
+            j = ndim_lst - ndim_wts;
+            k = 0;
         }
-        if (minlength <= 0) {
-            /* superfluous, but may catch incorrect usage */
-            PyErr_SetString(PyExc_ValueError,
-                    "minlength must be positive");
-            goto fail;
+        else {
+            j = 0;
+            k = ndim_wts - ndim_lst;
         }
-        if (ans_size < minlength) {
-            ans_size = minlength;
+        for (ndim_inner = 0, ndim_outer = 0; j < ndim_lst; j++, k++) {
+
+            if (j < 0 || !axis_flags[j]) {
+                op_axes_outer_arrays[0][ndim_outer] = j < 0 ? -1 : j;
+                op_axes_outer_arrays[1][ndim_outer] = k < 0 ? -1 : k;
+                op_axes_outer_arrays[2][ndim_outer] = ndim_outer;
+                /*
+                 * We are not validating whether the arrays are
+                 * broadcastable, the iterator will handle that.
+                 * But if they are broadcastable, then this is the
+                 * broadcasted shape.
+                 */
+                if (j < 0 || (k >= 0 && shape_lst[j] == 1)) {
+                    shape_ret[ndim_outer++] = shape_wts[k];
+                }
+                else  {
+                    shape_ret[ndim_outer++] = shape_lst[j];
+                }
+            }
+            else {
+                op_axes_inner_arrays[0][ndim_inner] = j;
+                op_axes_inner_arrays[1][ndim_inner++] = k < 0 ? -1 : k;
+            }
         }
     }
-    if (weight == Py_None) {
-        ans = (PyArrayObject *)PyArray_Zeros(1, &ans_size, type, 0);
-        if (ans == NULL) {
-            goto fail;
+    else {
+        for(j = 0, ndim_inner = 0, ndim_outer = 0; j < ndim_lst; j++) {
+            if (!axis_flags[j]) {
+                op_axes_outer_arrays[0][ndim_outer] = j;
+                op_axes_outer_arrays[2][ndim_outer] = ndim_outer;
+                shape_ret[ndim_outer++] = shape_lst[j];
+            }
+            else {
+                op_axes_inner_arrays[0][ndim_inner++] = j;
+            }
         }
-        ians = (npy_intp *)(PyArray_DATA(ans));
-        NPY_BEGIN_ALLOW_THREADS;
-        for (i = 0; i < len; i++)
-            ians [numbers [i]] += 1;
-        NPY_END_ALLOW_THREADS;
-        Py_DECREF(lst);
+    }
+    ndim_ret = ndim_outer;
+
+    /* Find maximum entry in arr_lst and check for negative entries. */
+    if (is_empty_list) {
+        max_lst = -1;
+    }
+    else if (no_iter_max) {
+        /* Don't use an iterator if we do not need one */
+        int typenum_lst= PyArray_DESCR(arr_lst)->type_num;
+        max_inner_loop_func *max_func = max_func_map[typenum_lst];
+
+        NPY_BEGIN_THREADS;
+        max_lst = max_func((const char *)PyArray_DATA(arr_lst),
+                           ndim_lst ? PyArray_STRIDES(arr_lst)[ndim_lst-1] : 0,
+                           PyArray_SIZE(arr_lst));
+        NPY_END_THREADS;
     }
     else {
-        wts = (PyArrayObject *)PyArray_ContiguousFromAny(
-                                                weight, NPY_DOUBLE, 1, 1);
-        if (wts == NULL) {
+        NpyIter_IterNextFunc *iternext = NULL;
+        char **dataptr;
+        npy_intp *strideptr, *innersizeptr;
+        int typenum_lst = PyArray_DESCR(arr_lst)->type_num;
+        max_inner_loop_func *max_func = max_func_map[typenum_lst];
+
+        iter = NpyIter_New(arr_lst,
+                           NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP |
+                           NPY_ITER_BUFFERED | NPY_ITER_GROWINNER |
+                           NPY_ITER_NBO | NPY_ITER_ALIGNED,
+                           NPY_KEEPORDER, NPY_UNSAFE_CASTING, NULL);
+        if (!(iter && (iternext = NpyIter_GetIterNext(iter, NULL)))) {
             goto fail;
         }
-        weights = (double *)PyArray_DATA (wts);
-        if (PyArray_SIZE(wts) != len) {
-            PyErr_SetString(PyExc_ValueError,
-                    "The weights and list don't have the same length.");
-            goto fail;
+        dataptr = NpyIter_GetDataPtrArray(iter);
+        strideptr = NpyIter_GetInnerStrideArray(iter);
+        innersizeptr = NpyIter_GetInnerLoopSizePtr(iter);
+        do {
+            NPY_BEGIN_THREADS;
+            max_lst = max_func((const char *)dataptr[0],
+                               *strideptr,
+                               *innersizeptr);
+            NPY_END_THREADS;
+        } while (max_lst >= 0 && iternext(iter));
+
+        NpyIter_Deallocate(iter);
+        iter = NULL;
+    }
+
+    if (max_lst == -1 && !is_empty_list) {
+        /*
+         * If list is empty, we want max_lst = -1 not to raise an error,
+         * as it will set the return length to the right value.
+         */
+        PyErr_SetString(PyExc_ValueError, "input array must be non-negative");
+        goto fail;
+    }
+    if (max_lst == -2) {
+        PyErr_SetString(PyExc_ValueError, "value too large in input array");
+        goto fail;
+    }
+
+    /* Finish creating the return array shape */
+    shape_ret[ndim_ret++] = (min_len <= max_lst) ? max_lst + 1 : min_len;
+
+    /* Create the return array */
+    arr_ret = (PyArrayObject *)PyArray_ZEROS(ndim_ret, shape_ret,
+                                             arr_wts ? NPY_DOUBLE : NPY_INTP,
+                                             0);
+
+    /* Figure out whether we need an iterator for counting */
+    if (arr_wts) {
+        /*
+         * We do not need an iterator for weighted counting if both arrays
+         * are behaved, of the same size, are 1D, weights has the right type,
+         * and weighted counting is done over all axes.
+         */
+        no_iter_count = 0;
+        no_iter_weights = (ndim_lst == ndim_wts) &&
+                          (ndim_lst == ndim_inner) &&
+                          (ndim_lst <= 1) &&
+                          PyArray_SIZE(arr_lst) == PyArray_SIZE(arr_wts) &&
+                          PyArray_ISNOTSWAPPED(arr_wts) &&
+                          PyArray_ISALIGNED(arr_wts) &&
+                          PyArray_DESCR(arr_wts)->type_num == NPY_DOUBLE &&
+                          PyArray_ISNOTSWAPPED(arr_lst) &&
+                          PyArray_ISALIGNED(arr_lst);
+    }
+    else {
+        /*
+         * We do not need an iterator for counting if the input array is
+         * behaved, 1D or contiguous, and counting is done over all axes.
+         */
+        no_iter_weights = 0;
+        no_iter_count = (ndim_lst == ndim_inner) && no_iter_max;
+    }
+
+    /* Finally, lets count */
+    if (is_empty_list) {
+        goto finish;
+    }
+    else if (no_iter_count) {
+        int typenum_lst = PyArray_DESCR(arr_lst)->type_num;
+        count_inner_loop_func *cnt_func = count_func_map[typenum_lst];
+
+        NPY_BEGIN_THREADS;
+        cnt_func((const char *)PyArray_DATA(arr_lst),
+                 (npy_intp *)PyArray_DATA(arr_ret),
+                 ndim_lst ? PyArray_STRIDES(arr_lst)[ndim_lst-1] : 0,
+                 PyArray_SIZE(arr_lst));
+        NPY_END_THREADS;
+    }
+    else if (no_iter_weights) {
+        int typenum_lst = PyArray_DESCR(arr_lst)->type_num;
+        weights_inner_loop_func *w_func = weights_func_map[typenum_lst];
+
+        NPY_BEGIN_THREADS;
+        w_func((const char *)PyArray_DATA(arr_lst),
+               (const char *)PyArray_DATA(arr_wts),
+               (npy_double *)PyArray_DATA(arr_ret),
+               ndim_lst ? PyArray_STRIDES(arr_lst)[ndim_lst-1] : 0,
+               ndim_wts ? PyArray_STRIDES(arr_wts)[ndim_wts-1] : 0,
+               PyArray_SIZE(arr_lst));
+        NPY_END_THREADS;
+
+    }
+    else {
+        PyArrayObject *op[3];
+        int *op_axes_outer[3], *op_axes_inner[2];
+        npy_uint32 op_flags_outer[3], op_flags_inner[2];
+        npy_uint32 flags_inner;
+        PyArray_Descr *op_dtypes[2];
+        NpyIter_IterNextFunc *iternext_outer, *iternext_inner;
+        char **dataptrs_outer, **dataptrs_inner;
+        npy_intp *strideptr_inner;
+        npy_intp *innersizeptr_inner;
+        int nops = arr_wts ? 3 : 2;
+        int nop_ret = nops - 1;
+        PyArray_Descr *type_lst = PyArray_DESCR(arr_lst);
+        int typenum_lst = type_lst->type_num;
+
+        /* We need nested iterators, set them up... */
+        op[0] = arr_lst;
+        op[1] = arr_wts;
+        /* nop_ret will overwrite arr_wts if arr_wts == NULL */
+        op[nop_ret] = arr_ret;
+
+        /* ...the outer... */
+        op_axes_outer[0] = op_axes_outer_arrays[0];
+        op_axes_outer[1] = op_axes_outer_arrays[1];
+        op_axes_outer[nop_ret] = op_axes_outer_arrays[2];
+        op_flags_outer[0] = NPY_ITER_READONLY;
+        op_flags_outer[1] = NPY_ITER_READONLY;
+        op_flags_outer[nop_ret] = NPY_ITER_READWRITE;
+        iter_outer = NpyIter_AdvancedNew(nops, op, 0, NPY_KEEPORDER,
+                                         NPY_NO_CASTING, op_flags_outer,
+                                         NULL, ndim_outer, op_axes_outer,
+                                         NULL, -1);
+        if (!iter_outer ||
+            !(iternext_outer = NpyIter_GetIterNext(iter_outer, NULL))) {
+                goto fail;
         }
-        type = PyArray_DescrFromType(NPY_DOUBLE);
-        ans = (PyArrayObject *)PyArray_Zeros(1, &ans_size, type, 0);
-        if (ans == NULL) {
-            goto fail;
+        dataptrs_outer = NpyIter_GetDataPtrArray(iter_outer);
+
+        /* ...and the inner */
+        op_axes_inner[0] = op_axes_inner_arrays[0];
+        op_axes_inner[1] = op_axes_inner_arrays[1];
+        flags_inner = NPY_ITER_EXTERNAL_LOOP | NPY_ITER_BUFFERED |
+                      NPY_ITER_GROWINNER;
+        op_flags_inner[0] = NPY_ITER_READONLY | NPY_ITER_NBO |
+                            NPY_ITER_ALIGNED;
+        op_flags_inner[1] = NPY_ITER_READONLY | NPY_ITER_NBO |
+                            NPY_ITER_ALIGNED;
+        op_dtypes[0] = PyArray_DESCR(arr_lst);
+        op_dtypes[1] = type_double;
+        iter_inner = NpyIter_AdvancedNew(nops - 1, op, flags_inner,
+                                         NPY_KEEPORDER, NPY_UNSAFE_CASTING,
+                                         op_flags_inner, op_dtypes,
+                                         ndim_inner, op_axes_inner,
+                                         NULL, -1);
+        if (!iter_inner ||
+            !(iternext_inner = NpyIter_GetIterNext(iter_inner, NULL))) {
+                goto fail;
         }
-        dans = (double *)PyArray_DATA(ans);
-        NPY_BEGIN_ALLOW_THREADS;
-        for (i = 0; i < len; i++) {
-            dans[numbers[i]] += weights[i];
+        dataptrs_inner = NpyIter_GetDataPtrArray(iter_inner);
+        strideptr_inner = NpyIter_GetInnerStrideArray(iter_inner);
+        innersizeptr_inner = NpyIter_GetInnerLoopSizePtr(iter_inner);
+
+        if (arr_wts) {
+            weights_inner_loop_func *wts_func = weights_func_map[typenum_lst];
+            do {
+                npy_double *data_ret = (npy_double *)dataptrs_outer[2];
+                NpyIter_ResetBasePointers(iter_inner, dataptrs_outer, NULL);
+                do {
+                    NPY_BEGIN_THREADS;
+                    wts_func((const char *)dataptrs_inner[0],
+                             (const char *)dataptrs_inner[1],
+                             data_ret, strideptr_inner[0], strideptr_inner[1],
+                             *innersizeptr_inner);
+                    NPY_END_THREADS;
+                } while (iternext_inner(iter_inner));
+            } while (iternext_outer(iter_outer));
         }
-        NPY_END_ALLOW_THREADS;
-        Py_DECREF(lst);
-        Py_DECREF(wts);
+        else {
+            count_inner_loop_func *cnt_func = count_func_map[typenum_lst];
+            do {
+                npy_intp *data_ret = (npy_intp *)dataptrs_outer[1];
+                NpyIter_ResetBasePointers(iter_inner, dataptrs_outer, NULL);
+                do {
+                    NPY_BEGIN_THREADS;
+                    cnt_func((const char *)dataptrs_inner[0],
+                             data_ret, strideptr_inner[0],
+                             *innersizeptr_inner);
+                    NPY_END_THREADS;
+                } while (iternext_inner(iter_inner));
+            } while (iternext_outer(iter_outer));
+        }
+
+        NpyIter_Deallocate(iter_inner);
+        NpyIter_Deallocate(iter_outer);
     }
-    return (PyObject *)ans;
 
-fail:
-    Py_XDECREF(lst);
-    Py_XDECREF(wts);
-    Py_XDECREF(ans);
-    return NULL;
-}
+    finish:
+        Py_DECREF(type_double);
+        Py_XDECREF(arr_lst);
+        Py_XDECREF(arr_wts);
+        return (PyObject *)arr_ret;
 
+    fail:
+        if (iter) {
+            NpyIter_Deallocate(iter);
+        }
+        if (iter_inner) {
+            NpyIter_Deallocate(iter_inner);
+        }
+        if (iter_outer) {
+            NpyIter_Deallocate(iter_outer);
+        }
+        Py_DECREF(type_double);
+        Py_XDECREF(arr_lst);
+        Py_XDECREF(arr_wts);
+        Py_XDECREF(arr_ret);
+        return NULL;
+}
 
 /*
  * digitize (x, bins, right=False) returns an array of python integers the same
@@ -1635,7 +2155,7 @@ io_unpack(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
 static struct PyMethodDef methods[] = {
     {"_insert", (PyCFunction)arr_insert,
         METH_VARARGS | METH_KEYWORDS, arr_insert__doc__},
-    {"bincount", (PyCFunction)arr_bincount,
+    {"bincount", (PyCFunction)arr_ndbincount,
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"digitize", (PyCFunction)arr_digitize,
         METH_VARARGS | METH_KEYWORDS, NULL},
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 003d3e541844..1efea3a0faba 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -1461,6 +1461,193 @@ def test_empty_with_minlength(self):
         y = np.bincount(x, minlength=5)
         assert_array_equal(y, np.zeros(5, dtype=int))
 
+class TestNDBincount(TestCase):
+    def test_no_weights_1D(self):
+        int_types = ''.join(('?', np.typecodes['AllInteger']))
+        for type in int_types:
+            if type == '?':
+                z = [0, 5]
+            else:
+                z = [0, 2, 1, 0, 1, 1]
+            x = np.array([1, 5, 2, 4, 1], dtype=type)
+            y = np.bincount(x)
+            assert_array_equal(y, z)
+
+    def test_weights_1D(self):
+        int_types = ''.join(('?', np.typecodes['AllInteger']))
+        all_types = ''.join((np.typecodes['AllInteger'], 'efd'))
+        for type_x in int_types:
+            if type_x == '?':
+                z = [0, 13]
+            else:
+                z = [0, 2, 5, 0, 5, 1]
+            x = np.array([1, 2, 4, 5, 2], dtype=type_x)
+            for type_w in all_types:
+                w = np.array([2, 3, 5, 1, 2], dtype=type_w)
+                y = np.bincount(x, weights=w)
+                assert_array_equal(y, z)
+
+    def test_no_weights_nd(self):
+        int_types = np.typecodes['AllInteger']
+        for type in int_types:
+            x = np.arange(24, dtype=type).reshape(2, 3, 4) % 5
+            y = np.bincount(x, axis=None)
+            z = [5, 5, 5, 5, 4]
+            assert_array_equal(y, z)
+            y = np.bincount(x, axis=(0, 1, 2))
+            assert_array_equal(y, z)
+            y = np.bincount(x, axis=0)
+            z = [[[1, 0, 1, 0, 0],
+                  [0, 1, 0, 1, 0],
+                  [0, 0, 1, 0, 1],
+                  [1, 0, 0, 1, 0]],
+                 [[0, 1, 0, 0, 1],
+                  [1, 0, 1, 0, 0],
+                  [0, 1, 0, 1, 0],
+                  [0, 0, 1, 0, 1]],
+                 [[1, 0, 0, 1, 0],
+                  [0, 1, 0, 0, 1],
+                  [1, 0, 1, 0, 0],
+                  [0, 1, 0, 1, 0]]]
+            assert_array_equal(y, z)
+            y = np.bincount(x, axis=1)
+            z = [[[1, 0, 0, 1, 1],
+                  [1, 1, 0, 0, 1],
+                  [1, 1, 1, 0, 0],
+                  [0, 1, 1, 1, 0]],
+                 [[1, 1, 1, 0, 0],
+                  [0, 1, 1, 1, 0],
+                  [0, 0, 1, 1, 1],
+                  [1, 0, 0, 1, 1]]]
+            assert_array_equal(y, z)
+            y = np.bincount(x, axis=2)
+            z = [[[1, 1, 1, 1, 0],
+                  [1, 1, 1, 0, 1],
+                  [1, 1, 0, 1, 1]],
+                 [[1, 0, 1, 1, 1],
+                  [0, 1, 1, 1, 1],
+                  [1, 1, 1, 1, 0]]]
+            assert_array_equal(y, z)
+            y = np.bincount(x, axis=(0, 1))
+            z = [[2, 1, 1, 1, 1],
+                 [1, 2, 1, 1, 1],
+                 [1, 1, 2, 1, 1],
+                 [1, 1, 1, 2, 1]]
+            assert_array_equal(y, z)
+            y = np.bincount(x, axis=(0, 2))
+            z = [[2, 1, 2, 2, 1],
+                 [1, 2, 2, 1, 2],
+                 [2, 2, 1, 2, 1]]
+            assert_array_equal(y, z)
+            y = np.bincount(x, axis=(1, 2))
+            z = [[3, 3, 2, 2, 2],
+                 [2, 2, 3, 3, 2]]
+            assert_array_equal(y, z)
+            y = np.bincount(x, axis=())
+            z = [[[[1, 0, 0, 0, 0],
+                   [0, 1, 0, 0, 0],
+                   [0, 0, 1, 0, 0],
+                   [0, 0, 0, 1, 0]],
+                  [[0, 0, 0, 0, 1],
+                   [1, 0, 0, 0, 0],
+                   [0, 1, 0, 0, 0],
+                   [0, 0, 1, 0, 0]],
+                  [[0, 0, 0, 1, 0],
+                   [0, 0, 0, 0, 1],
+                   [1, 0, 0, 0, 0],
+                   [0, 1, 0, 0, 0]]],
+                 [[[0, 0, 1, 0, 0],
+                   [0, 0, 0, 1, 0],
+                   [0, 0, 0, 0, 1],
+                   [1, 0, 0, 0, 0]],
+                  [[0, 1, 0, 0, 0],
+                   [0, 0, 1, 0, 0],
+                   [0, 0, 0, 1, 0],
+                   [0, 0, 0, 0, 1]],
+                  [[1, 0, 0, 0, 0],
+                   [0, 1, 0, 0, 0],
+                   [0, 0, 1, 0, 0],
+                   [0, 0, 0, 1, 0]]]]
+            assert_array_equal(y, z)
+
+    def test_weights_nd(self):
+        int_types = np.typecodes['AllInteger']
+        all_types = ''.join((np.typecodes['AllInteger'], 'efd'))
+        for type_x in int_types:
+            x = np.arange(6, dtype=type_x).reshape(2,3) % 4
+            for type_w in all_types:
+                w = np.arange(6, 0, -1, dtype=type_w).reshape(2, 3)
+                y = np.bincount(x, weights=w, axis=None)
+                z = [8, 6, 4, 3]
+                assert_array_equal(y, z)
+                y = np.bincount(x, weights=w, axis=(0, 1))
+                assert_array_equal(y, z)
+                y = np.bincount(x, weights=w, axis=0)
+                z = [[6, 0, 0, 3],
+                     [2, 5, 0, 0],
+                     [0, 1, 4, 0]]
+                assert_array_equal(y, z)
+                y = np.bincount(x, weights=w, axis=1)
+                z = [[6, 5, 4, 0],
+                     [2, 1, 0, 3]]
+                assert_array_equal(y, z)
+
+    def test_scalars(self):
+        assert_array_equal(np.bincount(3), [0, 0, 0, 1])
+        assert_array_equal(np.bincount(3, 4), [0, 0, 0, 4])
+
+    def test_broadcasting(self):
+        x = [0, 1, 2, 0, 2]
+        w = np.arange(30).reshape(2, 3, 5)
+        y = np.bincount(x, weights=w)
+        z = [[[ 3,  1,  6],
+              [13,  6, 16],
+              [23, 11, 26]],
+             [[33, 16, 36],
+              [43, 21, 46],
+              [53, 26, 56]]]
+        assert_array_equal(y, z)
+
+    def test_non_aligned_non_nbo(self):
+        x = np.array([0, 1, 0, 2, 0, 3, 0], dtype='u1')
+        y = np.bincount(x[1:].view('<u2'))
+        z = [0, 1, 1, 1]
+        assert_array_equal(y, z)
+        y = np.bincount(x[:-1].view('>u2'))
+        assert_array_equal(y, z)
+
+    def test_strided(self):
+        x = np.arange(25).reshape(5, 5) % 3
+        y = np.bincount(x[:, 0])
+        z = [2, 1, 2]
+        assert_array_equal(y, z)
+        w = np.arange(25, 0, -1).reshape(5, 5)
+        y = np.bincount(x[:, 0], weights=w[:, 2])
+        z = [31, 13, 21]
+        assert_array_equal(y, z)
+
+    def test_errors(self):
+        # input array must be of integers
+        assert_raises(TypeError, np.bincount, [1.5, 2.3])
+        # minlength must be an integer
+        assert_raises(TypeError, np.bincount, range(5), minlength=1.3)
+        # axis must be in bounds
+        assert_raises(ValueError, np.bincount, range(5), axis=2)
+        # multiple axes must be in bounds
+        assert_raises(ValueError, np.bincount, range(5), axis=(0, 2))
+        # no duplicate entries in axis
+        assert_raises(ValueError, np.bincount, range(5), axis=(0, 0))
+        # weights must be castable to double
+        assert_raises(TypeError, np.bincount, [1, 2], [5.j, 2.j])
+        # no negative entries in input array
+        assert_raises(ValueError, np.bincount, [-1, 3])
+        # Make sure the iterator to find the max isn't deallocated twice,
+        # which crashed the system
+        assert_raises(ValueError, np.bincount,
+                      np.arange(-75, 50).reshape(5, 5, 5)[..., 0])
+        # no entry in input array that won't fit in a np.intp
+        assert_raises(ValueError, np.bincount, [np.iinfo(np.intp).max + 1])
+
 
 class TestInterp(TestCase):
     def test_exceptions(self):