In [39]:
from typing import Union,Sequence
import numpy as np
from scipy import stats
def discrete(x: np.ndarray,n:int=5)->np.ndarray:
    '''
    Map each value of ``x`` to the percentile segment it falls in.

    The 0-100 percentile range is cut into ``n`` equal-width segments and
    every value receives the 1-based index of its segment, so a larger label
    means the value sits closer to the 100th percentile.  A value lying
    exactly on the edge shared by two segments receives the higher label.

    Parameters
    ----------
    x : np.ndarray
        Data to discretise.  Normally 1-D; percentiles are taken over all
        elements of the array.
    n : int, default 5
        Number of segments.  ``n <= 0`` switches binning off.

    Returns
    -------
    np.ndarray
        ``x`` itself, untouched, when ``n <= 0`` or when ``x`` holds text
        (unicode/bytes dtype, or an object array whose first element is a
        string).  Otherwise an integer array shaped like ``x`` with labels
        in ``1..n``; an empty input yields an empty integer array.
    '''
    if n <= 0:
        return x

    values = np.asarray(x)
    kind = values.dtype.kind
    # Text cannot be ranked by percentile, so it is handed back unchanged.
    # Checking the dtype (instead of indexing x[0]) also keeps empty arrays
    # from raising IndexError.
    if kind in "US" or (
        kind == "O" and values.size > 0 and isinstance(values.flat[0], (str, bytes))
    ):
        return x

    labels = np.zeros(values.shape, dtype=int)
    if values.size == 0:
        return labels

    # np.linspace pins the outer percentiles to exactly 0 and 100; building
    # the last edge as n * (100 / n) can drift off 100 by a rounding error.
    # Passing all percentiles at once also sorts the data a single time.
    edges = stats.scoreatpercentile(values, np.linspace(0, 100, n + 1))

    # Segments are closed on both sides and visited in ascending order, so
    # a value sitting on a shared edge ends up with the higher label.
    for i in range(n):
        labels[(values >= edges[i]) & (values <= edges[i + 1])] = i + 1
    return labels

def discrete_features(X: np.ndarray,N:Union[int,Sequence[int]]= 5)->np.ndarray:
    '''
    Discretise every feature (column) of a matrix into percentile segments.

    Parameters
    ----------
    X : np.ndarray
        Two-dimensional array with one sample per row and one feature per
        column.
    N : int or sequence of int, default 5
        Number of segments for each feature.  A single integer (Python or
        NumPy) is applied to every feature; a sequence must provide one
        entry per feature.  An entry of 0 leaves that feature as it is,
        i.e. the feature is treated as already discrete.

    Returns
    -------
    np.ndarray
        Array with the same layout as ``X`` (samples as rows, features as
        columns) holding the result of ``discrete`` for each column.

    Raises
    ------
    AttributeError
        If ``N`` is a sequence whose length differs from the number of
        features.
    '''
    features = X.T  # one feature per row makes per-feature slicing simple
    n_features = features.shape[0]

    # NumPy integer scalars are not instances of ``int``; accept them too so
    # that a value such as np.int64(5) is broadcast instead of hitting len().
    if isinstance(N, (int, np.integer)):
        N = [int(N)] * n_features
    if len(N) != n_features:
        raise AttributeError("N must have the same len with the rows' len")

    binned = [discrete(features[i, :], n=N[i]) for i in range(n_features)]
    # Transpose back so that features are columns again.
    return np.array(binned).T

In [22]:
# One-dimensional float sample used by the demo cells.
x = np.asarray([1, 4, 5, 6, 3, 6, 7, 8, 3, 6, 7, 8, 9, 11], dtype=float)

In [23]:
# Sample feature matrix: 14 rows (samples) by 2 identical columns (features).
X = np.tile([1, 4, 5, 6, 3, 6, 7, 8, 3, 6, 7, 8, 9, 11], (2, 1)).T

In [33]:
# Two-element string array used to exercise the text pass-through.
y = np.array(list("ab"))

In [34]:
# Default n=5: label every value of x with its percentile segment (1..5).
discrete(x)

20.0
40.0
60.0
80.0
100.0


array([1, 2, 2, 3, 1, 3, 4, 5, 1, 3, 4, 5, 5, 5])

In [35]:
# A string array is returned unchanged.
discrete(y)

array(['a', 'b'], 
      dtype='<U1')

In [31]:
# n=0 disables binning, so x comes back as-is.
discrete(x,0)

array([  1.,   4.,   5.,   6.,   3.,   6.,   7.,   8.,   3.,   6.,   7.,
         8.,   9.,  11.])

In [29]:
# Nine percentile segments (labels 1..9).
discrete(x,9)

11.11111111111111
22.22222222222222
33.33333333333333
44.44444444444444
55.55555555555556
66.66666666666666
77.77777777777777
88.88888888888889
100.0


array([1, 3, 3, 5, 2, 5, 7, 8, 2, 5, 7, 8, 9, 9])

In [45]:
# Split each column of X into 6 percentile segments.
discrete_features(X,6)

array([[1, 1],
       [2, 2],
       [2, 2],
       [4, 4],
       [1, 1],
       [4, 4],
       [5, 5],
       [6, 6],
       [1, 1],
       [4, 4],
       [5, 5],
       [6, 6],
       [6, 6],
       [6, 6]])

In [44]:
from sklearn.preprocessing import OneHotEncoder

In [45]:
# Create the encoder with its default settings.
enc = OneHotEncoder()

In [46]:
# OneHotEncoder.fit expects a 2-D (n_samples, n_features) input, so the single
# string feature is passed as a column of one-element rows.
# NOTE(review): string categories need scikit-learn >= 0.20; older releases
# only accept integer-coded features and will still raise here.
enc.fit([["a"], ["b"]])

ValueError: could not convert string to float: 'a'