### 数据标准化
The standard score of a sample `x` is calculated as:

    z = (x - u) / s
here `u` is the mean of the training samples or zero if `with_mean=False`,
and `s` is the standard deviation of the training samples or one if
`with_std=False`.

In [1]:
import numpy as np
import pandas as pd

In [2]:
x = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])
x

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [32]:
# Z-Score，或者去除均值和方差缩放
mean_x = np.mean(x, axis=0)
std_x = np.std(x, axis=0)

In [37]:
np.var(x,axis=0)

array([0.66666667, 0.66666667, 1.55555556])

In [33]:
mean_x

array([1.        , 0.        , 0.33333333])

In [35]:
def get_standard_data(a):
    return (a-mean_x)/std_x

In [36]:
np.apply_along_axis(get_standard_data,0,x)

array([[ 0.        , -2.44948974,  1.22474487],
       [ 2.44948974,  0.        ,  0.        ],
       [-0.26726124,  0.53452248, -1.06904497]])

In [11]:
from sklearn import preprocessing

In [12]:
x_scaled = preprocessing.scale(x)
x_scaled.mean(axis=0)
x_scaled.std(axis=0)

array([1., 1., 1.])

In [14]:
scaler = preprocessing.StandardScaler().fit(x)
scaler.mean_

array([1.        , 0.        , 0.33333333])

In [24]:
scaler.var_

array([0.66666667, 0.66666667, 1.55555556])

In [15]:
# 标准化数据
scaler.transform(x)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [45]:
#第二种是将属性缩放到一个指定范围,也是就是(x-min)/(max-min)
#依赖于preprocessing中的MinMaxScaler类
x_train = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])

min_max_scaler = preprocessing.MinMaxScaler()
x_train_minmax = min_max_scaler.fit_transform(x_train)
x_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [None]:
# 当然，在构造类对象的时候也可以直接指定最大最小值的范围：feature_range = (min, max)，此时应用的公式变为：
x_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
x_scaled = X_std / (max - min) + min

### 正则化数据

In [44]:

#第三种是正则化Normalization

x = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])
x_normalized = preprocessing.normalize(x, norm='l2')
x_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [43]:
# 可以使用processing.Normalizer()类实现对训练集和测试集的拟合和转换
normalizer = preprocessing.Normalizer().fit(x)
normalizer.transform(x)

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [46]:
import numpy as np
import collections


class KNN(object):
    def __init__(self, n_neighbors=3, p=2):
        """
        parameter: n_neighbors 临近点个数
        parameter: p 距离度量
        """
        self.n = n_neighbors
        self.p = p

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        # 取出n个点
        knn_list = []
        for i in range(self.n):
            dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
            knn_list.append((dist, self.y_train[i]))

        for i in range(self.n, len(self.X_train)):
            max_index = knn_list.index(max(knn_list, key=lambda x: x[0]))
            dist = np.linalg.norm(X - self.X_train[i], ord=self.p)
            if knn_list[max_index][0] > dist:
                knn_list[max_index] = (dist, self.y_train[i])

        # 统计
        knn = [k[-1] for k in knn_list]
        return collections.Counter(knn).most_common()[0][0]

    # 统计准确度
    def score(self, X_test, y_test):
        right_count = 0
        for X, y in zip(X_test, y_test):
            label = self.predict(X)
            if label == y:
                right_count += 1
        return right_count / len(X_test)

socre = 1.0


In [None]:

def model_test(model):
    """
    模型测试
    """
    knn = model()
    knn.fit(X_train, y_train)
    return knn.score(X_test, y_test)

In [49]:
def get_test_data():
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    #     X = iris.data[:100, [0, 2]]
    #     y = iris.target[:100]
    #     y = np.where(y == 1, 1, -1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    return X_train, X_test, y_train, y_test


get_test_data()

(array([[5.7, 3. , 4.2, 1.2],
        [5.4, 3.4, 1.7, 0.2],
        [6.2, 2.2, 4.5, 1.5],
        [4.6, 3.6, 1. , 0.2],
        [5.8, 2.7, 5.1, 1.9],
        [6.8, 2.8, 4.8, 1.4],
        [5.5, 2.5, 4. , 1.3],
        [6.9, 3.2, 5.7, 2.3],
        [5.8, 2.7, 3.9, 1.2],
        [7.1, 3. , 5.9, 2.1],
        [6.4, 3.2, 4.5, 1.5],
        [5. , 2.3, 3.3, 1. ],
        [7.7, 2.6, 6.9, 2.3],
        [7.9, 3.8, 6.4, 2. ],
        [5. , 3.3, 1.4, 0.2],
        [5.4, 3.9, 1.3, 0.4],
        [6.3, 2.8, 5.1, 1.5],
        [6.4, 2.7, 5.3, 1.9],
        [5.7, 3.8, 1.7, 0.3],
        [5.6, 2.7, 4.2, 1.3],
        [6.3, 2.5, 4.9, 1.5],
        [6. , 2.9, 4.5, 1.5],
        [4.7, 3.2, 1.3, 0.2],
        [4.9, 2.5, 4.5, 1.7],
        [5.3, 3.7, 1.5, 0.2],
        [5.9, 3. , 5.1, 1.8],
        [7.2, 3. , 5.8, 1.6],
        [6.2, 2.8, 4.8, 1.8],
        [5.5, 3.5, 1.3, 0.2],
        [6. , 2.2, 4. , 1. ],
        [6.1, 2.6, 5.6, 1.4],
        [5.1, 2.5, 3. , 1.1],
        [5. , 3.4, 1.5, 0.2],
        [5

In [48]:
from sklearn import datasets
iris = datasets.load_iris()
iris.target[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])