In [67]:
import numpy as np
from scipy import stats
from sklearn.utils.extmath import weighted_mode
from sklearn.neighbors.base import _check_weights, _get_weights, NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin, SupervisedIntegerMixin
from sklearn.base import ClassifierMixin
from sklearn.utils import check_array

In [177]:
class RadiusKNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin,
                                SupervisedIntegerMixin, ClassifierMixin):

    def __init__(self, radius=1.0, weights='uniform',
                 algorithm='auto', leaf_size=30, p=2, metric='minkowski',
                 outlier_label=None, metric_params=None, **kwargs):
        self._init_params(radius=radius,
                          algorithm=algorithm,
                          leaf_size=leaf_size,
                          metric=metric, p=p, metric_params=metric_params,
                          **kwargs)
        self.weights = _check_weights(weights)
        self.outlier_label = outlier_label

    def predict_proba(self, X):
        X = check_array(X, accept_sparse='csr')
        n_samples = X.shape[0]

        neigh_dist, neigh_ind = self.radius_neighbors(X)
        print('neigh_dist',neigh_dist)
        print('neigh_ind',neigh_ind)
        print('type',neigh_dist)
        print('type',neigh_ind)
        print('shape',neigh_dist.shape)
        print('shape',neigh_ind.shape)
        neigh_ind = neigh_ind.item()
        neigh_ind = neigh_ind.reshape(1, -1)
        neigh_dist = neigh_dist.item()
        neigh_dist = neigh_dist.reshape(1, -1)
        
        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        n_samples = X.shape[0]

        weights = _get_weights(neigh_dist, self.weights)
        if weights is None:
            weights = np.ones_like(neigh_ind)

        all_rows = np.arange(X.shape[0])
        probabilities = []
        for k, classes_k in enumerate(classes_):
            pred_labels = _y[:, k][neigh_ind]
            
            proba_k = np.zeros((n_samples, classes_k.size))
            
            # a simple ':' index doesn't work right
            for i, idx in enumerate(pred_labels.T):  # loop is O(n_neighbors)
                proba_k[all_rows, idx] += weights[:, i]

            # normalize 'votes' into real [0,1] probabilities
            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
            normalizer[normalizer == 0.0] = 1.0
            proba_k /= normalizer

            probabilities.append(proba_k)

        if not self.outputs_2d_:
            probabilities = probabilities[0]

        return probabilities

In [178]:
X = [[1,1], [1,2], [1,3], [1,4], [2,1], [2,2], [2,3], [2,4], [3,1], [3,2], [3,3], [3,4], [4,1], [4,2], [4,3], [4,4]]
y = [1, 2, 4, 4, 2, 2, 4, 4, 3, 3, 5, 5, 3, 3, 5, 5]
#X = [[1], [2], [3], [4], [5]]
#y = [3, 2, 1, 2, 3]
clf = RadiusKNeighborsClassifier(radius=2, weights='distance', metric='manhattan')
clf.fit(X, y) 
print(clf.classes_)
print(clf.predict_proba([[1.1,1.2],[1.2,1.2]]))

[1 2 3 4 5]
neigh_dist [array([ 0.3,  0.9,  1.9,  1.1,  1.7])
 array([ 0.4,  1. ,  2. ,  1. ,  1.6,  2. ])]
neigh_ind [array([0, 1, 2, 4, 5], dtype=int64) array([0, 1, 2, 4, 5, 8], dtype=int64)]
type [array([ 0.3,  0.9,  1.9,  1.1,  1.7])
 array([ 0.4,  1. ,  2. ,  1. ,  1.6,  2. ])]
type [array([0, 1, 2, 4, 5], dtype=int64) array([0, 1, 2, 4, 5, 8], dtype=int64)]
shape (2,)
shape (2,)


ValueError: can only convert an array of size 1 to a Python scalar

In [184]:
arrs = [np.array([1,2,3]), np.array([4,5]), np.array([7,8,9])]

In [185]:
arrs

[array([1, 2, 3]), array([4, 5]), array([7, 8, 9])]

In [186]:
arr2d = np.array(arrs)

In [187]:
arr2d

array([array([1, 2, 3]), array([4, 5]), array([7, 8, 9])], dtype=object)

In [127]:
class KTNeighborsClassifier(NeighborsBase, KNeighborsMixin,
                           SupervisedIntegerMixin, ClassifierMixin):

    def __init__(self, n_neighbors=5,
                 weights='uniform', algorithm='auto', leaf_size=30,
                 p=2, metric='minkowski', metric_params=None, n_jobs=1,
                 **kwargs):

        self._init_params(n_neighbors=n_neighbors,
                          algorithm=algorithm,
                          leaf_size=leaf_size, metric=metric, p=p,
                          metric_params=metric_params, n_jobs=n_jobs, **kwargs)
        self.weights = _check_weights(weights)

    def predict_proba(self, X):
        X = check_array(X, accept_sparse='csr')

        neigh_dist, neigh_ind = self.kneighbors(X)
        print("neigh_dist", neigh_dist)
        print("neigh_ind", neigh_ind)
        print("type", type(neigh_dist))
        print("type", type(neigh_ind))
        print("shape", (neigh_dist.shape))
        print("shape", (neigh_ind.shape))
        
        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        n_samples = X.shape[0]

        weights = _get_weights(neigh_dist, self.weights)
        if weights is None:
            weights = np.ones_like(neigh_ind)

        all_rows = np.arange(X.shape[0])
        probabilities = []
        for k, classes_k in enumerate(classes_):
            pred_labels = _y[:, k][neigh_ind]
            proba_k = np.zeros((n_samples, classes_k.size))

            # a simple ':' index doesn't work right
            for i, idx in enumerate(pred_labels.T):  # loop is O(n_neighbors)
                proba_k[all_rows, idx] += weights[:, i]

            # normalize 'votes' into real [0,1] probabilities
            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
            normalizer[normalizer == 0.0] = 1.0
            proba_k /= normalizer

            probabilities.append(proba_k)

        if not self.outputs_2d_:
            probabilities = probabilities[0]

        return probabilities


In [128]:
X = [[1,1], [1,2], [1,3], [1,4], [2,1], [2,2], [2,3], [2,4], [3,1], [3,2], [3,3], [3,4], [4,1], [4,2], [4,3], [4,4]]
y = [1, 2, 4, 4, 2, 2, 4, 4, 3, 3, 5, 5, 3, 3, 5, 5]
#X = [1,2,3,4,5]
#y = [3, 2, 1, 2, 3]
clf = KTNeighborsClassifier(weights='distance', metric='manhattan')
clf.fit(X, y) 
print(clf.predict_proba([[1, 1]]))

neigh_dist [[ 0.  1.  1.  2.  2.]]
neigh_ind [[0 4 1 5 2]]
type <class 'numpy.ndarray'>
type <class 'numpy.ndarray'>
shape (1, 5)
shape (1, 5)
[[ 1.  0.  0.  0.  0.]]


In [None]:
clf = RadiusKNeighborsClassifier(radius=Radius, weights='distance', 
                               metric='manhattan')
clf.fit(df_cell_train[feature_list], df_cell_train.place_id)

In [None]:
def weighted_mode3(a, w, axis=0):
    """Returns an array of the weighted modal (most common) value in a
    If there is more than one such value, only the first is returned.
    The bin-count for the modal bins is also returned.
    This is an extension of the algorithm in scipy.stats.mode.
    Parameters
    ----------
    a : array_like
        n-dimensional array of which to find mode(s).
    w : array_like
        n-dimensional array of weights for each value
    axis : int, optional
        Axis along which to operate. Default is 0, i.e. the first axis.
    Returns
    -------
    vals : ndarray
        Array of modal values.
    score : ndarray
        Array of weighted counts for each mode.
    Examples
    --------
    >>> from sklearn.utils.extmath import weighted_mode
    >>> x = [4, 1, 4, 2, 4, 2]
    >>> weights = [1, 1, 1, 1, 1, 1]
    >>> weighted_mode(x, weights)
    (array([ 4.]), array([ 3.]))
    The value 4 appears three times: with uniform weights, the result is
    simply the mode of the distribution.
    >>> weights = [1, 3, 0.5, 1.5, 1, 2] # deweight the 4's
    >>> weighted_mode(x, weights)
    (array([ 2.]), array([ 3.5]))
    The value 2 has the highest score: it appears twice with weights of
    1.5 and 2: the sum of these is 3.
    See Also
    --------
    scipy.stats.mode
    """
    if axis is None:
        a = np.ravel(a)
        w = np.ravel(w)
        axis = 0
    else:
        a = np.asarray(a)
        w = np.asarray(w)
        axis = axis

    if a.shape != w.shape:
        w = np.zeros(a.shape, dtype=w.dtype) + w

    scores = np.unique(np.ravel(a))       # get ALL unique values
    testshape = list(a.shape)
    testshape[axis] = 1
    oldmostfreq = np.zeros(testshape)
    oldcounts = np.zeros(testshape)
    for score in scores:
        template = np.zeros(a.shape)
        ind = (a == score)
        template[ind] = w[ind]
        counts = np.expand_dims(np.sum(template, axis), axis)
        mostfrequent = np.where(counts > oldcounts, score, oldmostfreq)
        oldcounts = np.maximum(counts, oldcounts)
        oldmostfreq = mostfrequent
    return mostfrequent, oldcounts

In [None]:
class Radius3NeighborsClassifier(NeighborsBase, RadiusNeighborsMixin,
                                SupervisedIntegerMixin, ClassifierMixin):

    def __init__(self, radius=1.0, weights='uniform',
                 algorithm='auto', leaf_size=30, p=2, metric='minkowski',
                 outlier_label=None, metric_params=None, **kwargs):
        self._init_params(radius=radius,
                          algorithm=algorithm,
                          leaf_size=leaf_size,
                          metric=metric, p=p, metric_params=metric_params,
                          **kwargs)
        self.weights = _check_weights(weights)
        self.outlier_label = outlier_label

    def predict(self, X, n):
        """Predict the class labels for the provided data
        Parameters
        ----------
        X : array-like, shape (n_query, n_features), \
                or (n_query, n_indexed) if metric == 'precomputed'
            Test samples.
        Returns
        -------
        y : array of shape [n_samples] or [n_samples, n_outputs]
            Class labels for each data sample.
        """
        X = check_array(X, accept_sparse='csr')
        n_samples = X.shape[0]

        neigh_dist, neigh_ind = self.radius_neighbors(X)
        inliers = [i for i, nind in enumerate(neigh_ind) if len(nind) != 0]
        outliers = [i for i, nind in enumerate(neigh_ind) if len(nind) == 0]

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]
        n_outputs = len(classes_)

        if self.outlier_label is not None:
            neigh_dist[outliers] = 1e-6
        elif outliers:
            raise ValueError('No neighbors found for test samples %r, '
                             'you can try using larger radius, '
                             'give a label for outliers, '
                             'or consider removing them from your dataset.'
                             % outliers)

        weights = _get_weights(neigh_dist, self.weights)

        y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
        for k, classes_k in enumerate(classes_):
            pred_labels = np.array([_y[ind, k] for ind in neigh_ind],
                                   dtype=object)
            if weights is None:
                mode = np.array([stats.mode(pl)[0]
                                 for pl in pred_labels[inliers]], dtype=np.int)
            else:
                mode = np.array([weighted_mode3(pl, w)[0]
                                 for (pl, w)
                                 in zip(pred_labels[inliers], weights)],
                                dtype=np.int)

            mode = mode.ravel()

            y_pred[inliers, k] = classes_k.take(mode)

        if outliers:
            y_pred[outliers, :] = self.outlier_label

        if not self.outputs_2d_:
            y_pred = y_pred.ravel()

        return y_pred

In [None]:
X = [[1,1], [1,2], [1,3], [1,4], [2,1], [2,2], [2,3], [2,4], [3,1], [3,2], [3,3], [3,4], [4,1], [4,2], [4,3], [4,4]]
y = [1, 2, 4, 4, 2, 2, 4, 4, 3, 3, 5, 5, 3, 3, 5, 5]
#X = [1,2,3,4,5]
#y = [3, 2, 1, 2, 3]
clf = Radius3NeighborsClassifier(weights='distance', metric='manhattan')
clf.fit(X, y) 
print(clf.predict_proba([[1, 1]]))