##### 套路
-   1. 读取数据
-   2. 实现算法
-   3. 测试算法

### 一、亲和性分析
-    1. 如果客户买了商品X，那么他们可能愿意买商品Y

衡量方法：
-    2. 支持度support：同时买了X和Y的人数（即规则“买X → 买Y”在数据集中成立的次数）
-    3. 置信度confidence：$\frac{\text{同时买了X和Y的人数}}{\text{所有买X的人数}}$

In [163]:
import os
import warnings
from collections import Counter
from collections import defaultdict
from operator import itemgetter

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

In [77]:
# Simulate 100 shopping baskets over five products. X[i, j] == 1 means that
# customer i bought product j (column order: bread, milk, cheese, apples,
# bananas).
X = np.zeros((100, 5), dtype='int32')
for i in range(X.shape[0]):
    # Exactly five random draws per customer, always in column order, so the
    # random stream is consumed the same way whichever profile applies.
    buys_bread = np.random.random() < 0.3
    buys_milk = np.random.random() < 0.5
    if buys_bread or buys_milk:
        # Bread and/or milk buyers share one set of probabilities for the
        # remaining products.
        p_cheese, p_apples, p_bananas = 0.2, 0.25, 0.5
    else:
        # Customers buying neither bread nor milk favour the other products.
        p_cheese, p_apples, p_bananas = 0.8, 0.6, 0.7
    X[i] = [
        buys_bread,
        buys_milk,
        np.random.random() < p_cheese,
        np.random.random() < p_apples,
        np.random.random() < p_bananas,
    ]
    if X[i].sum() == 0:
        X[i][4] = 1  # Must buy something, so gets bananas

data = pd.DataFrame(X, columns=['Bread', 'Milk', 'Cheese', 'Apples', 'Bananas'])
output_path = './data/01_affinity_dataset.csv'
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
data.to_csv(output_path, index=False)

In [80]:
# Worked example for a single rule: "bought apples -> also bought bananas".
# Rows are selected with boolean masks; the counts feed the two rule metrics.
apples = data[data["Apples"] > 0]
apples_and_bananas = apples[apples["Bananas"] > 0]
apples_and_not_bananas = apples[apples["Bananas"] == 0]

print(f"买了苹果的有{len(apples)}人")
print(f"买了苹果并且买了香蕉的有{len(apples_and_bananas)}人")
print(f"买了苹果没有买香蕉的有{len(apples_and_not_bananas)}人")

# Support is the number of baskets where the rule holds; confidence is that
# count as a fraction of all apple buyers.
support = len(apples_and_bananas)
confidence = support / len(apples)
print("支持度support = {0} 置信度confidence = {1:.3f}.".format(support, confidence))

买了苹果的有39人
买了苹果并且买了香蕉的有27人
买了苹果没有买香蕉的有12人
支持度support = 27 置信度confidence = 0.692.


In [94]:
# Every ordered pair of distinct columns, i.e. every candidate rule
# "bought first -> also bought second", in column order.
features_range = [
    (premise, conclusion)
    for premise in data.columns
    for conclusion in data.columns
    if premise != conclusion
]

In [95]:
# Display the ordered (premise, conclusion) column pairs built above.
features_range

[('Bread', 'Milk'),
 ('Bread', 'Cheese'),
 ('Bread', 'Apples'),
 ('Bread', 'Bananas'),
 ('Milk', 'Bread'),
 ('Milk', 'Cheese'),
 ('Milk', 'Apples'),
 ('Milk', 'Bananas'),
 ('Cheese', 'Bread'),
 ('Cheese', 'Milk'),
 ('Cheese', 'Apples'),
 ('Cheese', 'Bananas'),
 ('Apples', 'Bread'),
 ('Apples', 'Milk'),
 ('Apples', 'Cheese'),
 ('Apples', 'Bananas'),
 ('Bananas', 'Bread'),
 ('Bananas', 'Milk'),
 ('Bananas', 'Cheese'),
 ('Bananas', 'Apples')]

In [101]:
# For every ordered rule "bought premise -> also bought conclusion" record:
#   supports[rule]    - number of customers who bought both products
#   confidences[rule] - that number divided by the number of customers who
#                       bought the premise
# Plain dicts are used: a defaultdict without a default factory behaves like
# a regular dict anyway.
supports = {}
confidences = {}

# Compute each column's "was bought" mask once instead of re-filtering the
# frame for every rule; this also avoids building query strings from column
# names.
bought = {column: data[column] > 0 for column in data.columns}

for rule in features_range:
    premise, conclusion = rule
    x = int(bought[premise].sum())
    x_and_y = int((bought[premise] & bought[conclusion]).sum())
    support = x_and_y
    # A premise nobody bought gives no evidence for the rule; report 0.0
    # rather than failing with ZeroDivisionError.
    confidence = x_and_y / x if x else 0.0
    supports[rule] = support
    confidences[rule] = confidence

In [102]:
# Rank the rules by confidence, highest first.
sorted(confidences.items(), key=itemgetter(1), reverse=True)

[(('Apples', 'Bananas'), 0.6923076923076923),
 (('Bread', 'Bananas'), 0.6896551724137931),
 (('Cheese', 'Bananas'), 0.6578947368421053),
 (('Milk', 'Bananas'), 0.5918367346938775),
 (('Bread', 'Milk'), 0.5517241379310345),
 (('Cheese', 'Apples'), 0.4473684210526316),
 (('Apples', 'Cheese'), 0.4358974358974359),
 (('Bananas', 'Milk'), 0.43283582089552236),
 (('Bananas', 'Apples'), 0.40298507462686567),
 (('Bread', 'Apples'), 0.3793103448275862),
 (('Bananas', 'Cheese'), 0.373134328358209),
 (('Apples', 'Milk'), 0.3333333333333333),
 (('Milk', 'Bread'), 0.32653061224489793),
 (('Bananas', 'Bread'), 0.29850746268656714),
 (('Cheese', 'Milk'), 0.2894736842105263),
 (('Apples', 'Bread'), 0.28205128205128205),
 (('Milk', 'Apples'), 0.2653061224489796),
 (('Milk', 'Cheese'), 0.22448979591836735),
 (('Bread', 'Cheese'), 0.20689655172413793),
 (('Cheese', 'Bread'), 0.15789473684210525)]

In [103]:
# Rank the rules by support, highest first.
sorted(supports.items(), key=itemgetter(1), reverse=True)

[(('Milk', 'Bananas'), 29),
 (('Bananas', 'Milk'), 29),
 (('Apples', 'Bananas'), 27),
 (('Bananas', 'Apples'), 27),
 (('Cheese', 'Bananas'), 25),
 (('Bananas', 'Cheese'), 25),
 (('Bread', 'Bananas'), 20),
 (('Bananas', 'Bread'), 20),
 (('Cheese', 'Apples'), 17),
 (('Apples', 'Cheese'), 17),
 (('Bread', 'Milk'), 16),
 (('Milk', 'Bread'), 16),
 (('Milk', 'Apples'), 13),
 (('Apples', 'Milk'), 13),
 (('Bread', 'Apples'), 11),
 (('Milk', 'Cheese'), 11),
 (('Cheese', 'Milk'), 11),
 (('Apples', 'Bread'), 11),
 (('Bread', 'Cheese'), 6),
 (('Cheese', 'Bread'), 6)]

### 二、Iris分类
##### 算法
* For 给定的每个特征
    * For 该特征的每个取值
        * 预测值：在该特征取该值的样本中出现次数最多的类，例如取该值的 10 个样本里有 6 个属于 A 类，那么凡是该特征取该值的样本都预测为 A 类
        * 计算误差：该特征取该值、但真实类别不是预测类的样本数
    * 对上面计算的误差求和
* 使用误差最小的特征作为最终模型

In [160]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [108]:
# Load the iris dataset and split it into the feature matrix and the class
# labels; the matrix shape gives the number of examples and of features.
data = load_iris()
X, y = data.data, data.target
n_examples, n_features = X.shape

In [136]:
# Turn every feature into a binary one: 1 where the value is at least the
# feature's mean, 0 otherwise. The means are taken over all rows of X.
attribute_mean = np.mean(X, axis=0)
assert attribute_mean.shape == (n_features, )
X_d = (X >= attribute_mean).astype('int')

In [137]:
# Hold out part of the discretised data for testing. No test_size is given,
# so scikit-learn's default proportion applies; random_state pins the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=2021)

In [147]:
def train(X, y_true, feature):
    """Build a one-feature rule table and measure its training error.

    For each distinct value found in column ``feature`` of ``X``, the class
    chosen by ``train_feature_value`` is stored as the prediction for that
    value, and the per-value errors are added up.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        y_true: Class labels aligned with the rows of ``X``.
        feature: Column index of the feature to build the rule on.

    Returns:
        A ``(predictors, total_error)`` tuple: ``predictors`` maps each
        observed feature value to its predicted class, ``total_error`` is the
        sum of the errors over all values.
    """
    _, n_features = X.shape
    assert 0 <= feature < n_features

    predictors = {}
    total_error = 0
    for observed_value in set(X[:, feature]):
        majority_class, value_error = train_feature_value(
            X, y_true, feature, observed_value
        )
        predictors[observed_value] = majority_class
        total_error += value_error
    return predictors, total_error
    
def train_feature_value(X, y_true, feature, value):
    """Find the majority class among samples whose ``feature`` equals ``value``.

    Args:
        X: Iterable of samples; each sample is indexed with ``feature``.
            Both 2-D arrays and plain nested sequences work.
        y_true: Class labels aligned with the samples in ``X``.
        feature: Index of the feature to inspect.
        value: Feature value that selects the samples to count.

    Returns:
        A ``(most_frequent_class, error)`` tuple, where ``error`` is the
        number of selected samples whose class is not
        ``most_frequent_class``.

    Raises:
        ValueError: If no sample has ``value`` for ``feature``.
    """
    class_counts = Counter(
        label for sample, label in zip(X, y_true) if sample[feature] == value
    )
    if not class_counts:
        raise ValueError(
            "no sample has value {!r} for feature {!r}".format(value, feature)
        )
    # Among equally frequent classes most_common keeps the one seen first,
    # which matches the tie-breaking of a stable descending sort.
    most_frequent_class, top_count = class_counts.most_common(1)[0]
    # Every selected sample outside the majority class is a misprediction.
    error = sum(class_counts.values()) - top_count
    return most_frequent_class, error

In [151]:
# Train one single-feature rule table per column of the training data and
# keep the one with the smallest total training error.
all_predictors = {
    variable: train(X_train, y_train, variable)
    for variable in range(X_train.shape[1])
}
errors = {variable: result[1] for variable, result in all_predictors.items()}
# min returns the first entry among equal errors, like taking element 0 of
# an ascending stable sort.
best_variable, best_error = min(errors.items(), key=itemgetter(1))
print("最佳模型基于第 {0} 个变量，误差为 {1:.2f}".format(best_variable, best_error))
# The model is just the chosen feature index plus its value -> class table.
model = dict(variable=best_variable, predictor=all_predictors[best_variable][0])

最佳模型基于第 2 个变量，误差为 36.00


In [155]:
def predict(model, X_test):
    """Predict a class for every sample using a single-feature rule table.

    Args:
        model: Dict with key ``'variable'`` (index of the feature to read)
            and key ``'predictor'`` (mapping from feature value to class).
        X_test: Iterable of samples; each sample is indexed with the
            feature index.

    Returns:
        NumPy array holding one predicted class per sample, in input order.
    """
    feature_index = model['variable']
    class_for_value = model['predictor']
    # The table is looked up with plain ints, so convert each feature value.
    feature_values = [int(row[feature_index]) for row in X_test]
    return np.array([class_for_value[v] for v in feature_values])

In [166]:
# Score the selected rule on the held-out samples: the accuracy is the
# percentage of predictions that equal the true label.
y_pred = predict(model, X_test)
acc = (y_pred == y_test).mean() * 100
print("在测试集上的准确率 {:.1f}%".format(acc))

在测试集上的准确率 63.2%


In [159]:
# Confusion matrix of the test predictions (true labels first, predictions second).
confusion_matrix(y_test, y_pred)

array([[16,  0,  0],
       [ 1,  0, 13],
       [ 0,  0,  8]])

In [164]:
# Per-class precision, recall and F1 for the test predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       0.00      0.00      0.00        14
           2       0.38      1.00      0.55         8

    accuracy                           0.63        38
   macro avg       0.44      0.67      0.51        38
weighted avg       0.48      0.63      0.52        38

