In [8]:
import numpy as np
from sklearn.datasets import load_iris

dataset = load_iris()
x = dataset.data
y_true = dataset.target
n_samples, n_features = x.shape
# Mean of every feature (column) across all samples.
attribute_means = x.mean(axis=0)
# Discretise the continuous features: 1 where a value is >= its feature's
# mean, 0 otherwise, so each feature becomes a two-valued category.
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly
# (the resulting dtype is the same).
x_d = np.array(x >= attribute_means, dtype=int)

In [9]:
from sklearn.model_selection import train_test_split

# Split the binarised features and their labels into training and test sets.
# random_state=14 pins the shuffle so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x_d, y_true, random_state=14)

In [18]:
from operator import itemgetter
from collections import  defaultdict

def train_feature_value(x, y_true, feature, value):
    '''Find the majority class among samples whose `feature` equals `value`.

    Parameters:
        x: iterable of samples, each indexable by `feature`.
        y_true: iterable of class labels, aligned with `x`.
        feature: index of the feature to inspect in every sample.
        value: feature value that selects which samples are counted.

    Returns:
        (most_frequent_class, error): the class seen most often among the
        selected samples, and the number of selected samples that belong
        to any other class.

    Raises:
        IndexError: if no sample has `value` for `feature`.
    '''
    tally = defaultdict(int)
    selected_labels = (label for row, label in zip(x, y_true)
                       if row[feature] == value)
    for label in selected_labels:
        tally[label] += 1

    # Stable descending sort: among equally frequent classes the one that
    # was encountered first stays in front.
    ranking = list(tally.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    winner = ranking[0][0]

    # Every selected sample outside the winning class counts as an error.
    mistakes = sum(tally.values()) - tally[winner]
    return winner, mistakes
# Quick check on feature 0: (majority class, error count) for the samples
# with value 1, then for the samples with value 0.
print(train_feature_value(x_train, y_train, 0, 1))
print(train_feature_value(x_train, y_train, 0, 0))

(2, 22)
(0, 19)


In [19]:
def train_on_feature(x, y_true, feature):
    '''Build the value -> class rule for one feature and count its errors.

    Parameters:
        x: 2-D array of samples; `x[:, feature]` must select a column.
        y_true: class labels aligned with the rows of `x`.
        feature: index of the feature (column) to build the rule for.

    Returns:
        (predictors, total_error): `predictors` maps every distinct value
        of the feature to the class most frequent among samples with that
        value; `total_error` is the number of samples the rule
        misclassifies, summed over all values.
    '''
    predictors = {}
    total_error = 0

    # set() reduces the column to its distinct values.
    for current_value in set(x[:, feature]):
        winner, mistakes = train_feature_value(x, y_true, feature, current_value)
        predictors[current_value] = winner
        total_error += mistakes
    return predictors, total_error
# Show the rule and the total training error obtained from feature 0 alone.
print(train_on_feature(x_train, y_train, 0))

({0: 0, 1: 2}, 41)


In [20]:
# Train a one-feature rule for every feature.
# Result layout, e.g. {0: ({0: 0, 1: 2}, 41)}:
#   outer key   -> feature index
#   outer value -> (predictors, total_error) tuple, where predictors maps
#                  feature value -> predicted class and total_error is the
#                  number of misclassified training samples.
all_predictors = dict(
    (feature, train_on_feature(x_train, y_train, feature))
    for feature in range(x_train.shape[1])
)

# Keep only the error count of each feature.
errors = {feature: outcome[1] for feature, outcome in all_predictors.items()}
print(all_predictors)
print(errors)

{0: ({0: 0, 1: 2}, 41), 1: ({0: 1, 1: 0}, 58), 2: ({0: 0, 1: 2}, 37), 3: ({0: 0, 1: 2}, 37)}
{0: 41, 1: 58, 2: 37, 3: 37}


In [21]:
# One Rule (OneR): rank the features by training error and keep the one
# with the fewest errors as the model. The sort is stable, so on a tie the
# feature that comes first in `errors` wins.
best_feature = sorted(errors, key=errors.get)[0]
best_error = errors[best_feature]
print('The best model is based on feature {0} and has error {1:.2f}'.format(best_feature, best_error))

The best model is based on feature 2 and has error 37.00


In [22]:
# Assemble the model: the chosen feature index plus its value -> class
# mapping (element 0 of that feature's (predictors, total_error) tuple).
model = dict(feature=best_feature,
             predictor=all_predictors[best_feature][0])
print(model)

{'feature': 2, 'predictor': {0: 0, 1: 2}}


In [34]:
# Testing: classify each sample from its value of the model's chosen feature.
def predict(x_test, model):
    '''Predict a class for every sample using a one-feature rule.

    Parameters:
        x_test: iterable of samples, each indexable by the model's feature.
        model: dict with 'feature' (index of the feature to read) and
            'predictor' (mapping from feature value to class).

    Returns:
        numpy array holding one predicted class per sample, in input order.

    Raises:
        KeyError: if a sample's feature value is missing from the predictor.
    '''
    column = model['feature']
    lookup = model['predictor']
    labels = []
    for row in x_test:
        # Cast to a plain int before using the value as the lookup key.
        labels.append(lookup[int(row[column])])
    return np.array(labels)

y_predicted = predict(x_test, model)
print(y_predicted)

# Accuracy: the share of test samples whose predicted class matches the
# true class, expressed as a percentage.
accuracy = 100 * np.mean(y_test == y_predicted)
print('The test accuracy is {0:.2f}%'.format(accuracy))

[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2
 2]
The test accuracy is 65.79%


In [38]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test set.
# NOTE: in the recorded run the rule maps the two feature values to classes
# 0 and 2 only, so class 1 is never predicted and its scores are 0.00.
print(classification_report(y_test, y_predicted))


             precision    recall  f1-score   support

          0       0.94      1.00      0.97        17
          1       0.00      0.00      0.00        13
          2       0.40      1.00      0.57         8

avg / total       0.51      0.66      0.55        38



  'precision', 'predicted', average, warn_for)
