In [1]:
import numpy as np
# Read the purchase matrix from a plain-text file using numpy's text loader.
dataset_filename = "affinity_dataset.txt"
X = np.loadtxt(dataset_filename)

# The first axis indexes samples, the second indexes features.
(n_samples, n_features) = X.shape
print("This dataset has {0} samples and {1} features".format(*X.shape))

This dataset has 100 samples and 5 features


In [2]:
# Preview the first five rows of the dataset.
print(X[:5])

[[ 0.  0.  1.  1.  1.]
 [ 1.  1.  0.  1.  0.]
 [ 1.  0.  1.  1.  0.]
 [ 0.  0.  1.  1.  1.]
 [ 0.  1.  0.  0.  1.]]


In [4]:
# Product names; position i in this list names column i of X.
features = ["bread", "milk", "cheese", "apples", "bananas"]

In [5]:
# Start by counting how many people bought apples (column 3 of the dataset).
num_apple_purchases = 0
for sample in X:
    # The comparison is True (adds 1) only when this person bought apples.
    num_apple_purchases += int(sample[3] == 1)
print("{0} people bought Apples".format(num_apple_purchases))

36 people bought Apples


In [6]:
# Evaluate the rule "a person who buys apples also buys bananas".
# A sample with both apples and bananas counts as a valid case; a sample with
# apples but no bananas counts as an invalid case. Samples without apples are
# not counted either way.
rule_valid = 0
rule_invalid = 0
for sample in X:
    if sample[3] != 1:  # no apples: the rule does not apply to this person
        continue
    if sample[4] == 1:  # bought apples and bananas
        rule_valid += 1
    else:  # bought apples but no bananas
        rule_invalid += 1
print("{0} cases of the rule being valid were discovered".format(rule_valid))
print("{0} cases of the rule being invalid were discovered".format(rule_invalid))

21 cases of the rule being valid were discovered
15 cases of the rule being invalid were discovered


In [7]:
# With both counts in hand, the rule's support and confidence follow directly.
# Support: the number of samples in which the rule held.
support = rule_valid
# Confidence: the share of apple purchases in which the rule held.
confidence = support / num_apple_purchases
print("The support is {0} and the confidence is {1:.3f}.".format(support, confidence))
print("As a percentage, that is {0:.1f}%.".format(confidence * 100))

The support is 21 and the confidence is 0.583.
As a percentage, that is 58.3%.


In [15]:
from collections import defaultdict
# Tally every "premise -> conclusion" rule over all ordered pairs of distinct features.
valid_rules = defaultdict(int)     # rule -> samples with the premise where the conclusion was bought
invalid_rules = defaultdict(int)   # rule -> samples with the premise where the conclusion was not bought
num_occurences = defaultdict(int)  # premise -> samples in which the premise was bought

for sample in X:
    for premise in range(n_features):
        if sample[premise] != 0:  # the premise only applies when this item was bought
            num_occurences[premise] += 1
            for conclusion in range(n_features):
                if conclusion != premise:  # skip rules from an item to itself
                    if sample[conclusion] == 1:
                        # Premise and conclusion were both bought: the rule held.
                        valid_rules[(premise, conclusion)] += 1
                    else:
                        # Premise bought without the conclusion: the rule failed.
                        invalid_rules[(premise, conclusion)] += 1

# The support of a rule is its number of valid cases (same dict object, not a copy).
support = valid_rules

# The confidence of a rule is its valid cases divided by how often its premise occurred.
confidence = defaultdict(float)
for (premise, conclusion), valid_count in valid_rules.items():
    confidence[(premise, conclusion)] = valid_count / num_occurences[premise]

In [16]:
# Report every rule that has a confidence entry, with its confidence and support.
for rule in confidence:
    premise, conclusion = rule
    print("Rule: If a person buys {0} they will also buy {1}".format(features[premise], features[conclusion]))
    print(" - Confidence: {0:.3f}".format(confidence[rule]))
    print(" - Support: {0}".format(support[rule]))
    print("")

Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.610
 - Support: 25

Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.659
 - Support: 27

Rule: If a person buys apples they will also buy cheese
 - Confidence: 0.694
 - Support: 25

Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.583
 - Support: 21

Rule: If a person buys bananas they will also buy cheese
 - Confidence: 0.458
 - Support: 27

Rule: If a person buys bananas they will also buy apples
 - Confidence: 0.356
 - Support: 21

Rule: If a person buys bread they will also buy milk
 - Confidence: 0.519
 - Support: 14

Rule: If a person buys bread they will also buy apples
 - Confidence: 0.185
 - Support: 5

Rule: If a person buys milk they will also buy bread
 - Confidence: 0.304
 - Support: 14

Rule: If a person buys milk they will also buy apples
 - Confidence: 0.196
 - Support: 9

Rule: If a person buys apples they will also buy bread
 - Confidence: 0.139
 

In [17]:
# The reporting loop above, wrapped in a function so it can be reused.
def print_rule(premise, conclusion, support, confidence, features):
    """Print one "premise -> conclusion" rule with its confidence and support.

    Args:
        premise: Index into ``features`` of the item the rule starts from.
        conclusion: Index into ``features`` of the item the rule predicts.
        support: Mapping from ``(premise, conclusion)`` to the rule's support.
        confidence: Mapping from ``(premise, conclusion)`` to the rule's
            confidence; the value is formatted to three decimal places.
        features: Sequence of item names, indexed by ``premise`` and ``conclusion``.
    """
    rule = (premise, conclusion)
    print("Rule: If a person buys {0} they will also buy {1}".format(features[premise], features[conclusion]))
    print(" - Confidence: {0:.3f}".format(confidence[rule]))
    print(" - Support: {0}".format(support[rule]))
    print("")

In [18]:
# Spot-check a single rule: milk (index 1) -> apples (index 3).
premise = 1
conclusion = 3
print_rule(premise, conclusion, support, confidence, features)

Rule: If a person buys milk they will also buy apples
 - Confidence: 0.196
 - Support: 9



In [19]:
# Spot-check a single rule: cheese (index 2) -> bananas (index 4).
premise = 2
conclusion = 4
print_rule(premise, conclusion, support, confidence, features)

Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.659
 - Support: 27



In [20]:
# Sorting by support: first look at the (rule, support) pairs before any sorting is applied.
from pprint import pprint
pprint(list(support.items()))

[((2, 3), 25),
 ((2, 4), 27),
 ((3, 2), 25),
 ((3, 4), 21),
 ((4, 2), 27),
 ((4, 3), 21),
 ((0, 1), 14),
 ((0, 3), 5),
 ((1, 0), 14),
 ((1, 3), 9),
 ((3, 0), 5),
 ((3, 1), 9),
 ((0, 2), 4),
 ((2, 0), 4),
 ((1, 4), 19),
 ((4, 1), 19),
 ((0, 4), 17),
 ((4, 0), 17),
 ((1, 2), 7),
 ((2, 1), 7)]


In [25]:
from operator import itemgetter
# Order the (rule, support) pairs by the support value (element 1 of each pair), largest first.
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)
pprint(sorted_support)

[((2, 4), 27),
 ((4, 2), 27),
 ((2, 3), 25),
 ((3, 2), 25),
 ((3, 4), 21),
 ((4, 3), 21),
 ((1, 4), 19),
 ((4, 1), 19),
 ((0, 4), 17),
 ((4, 0), 17),
 ((0, 1), 14),
 ((1, 0), 14),
 ((1, 3), 9),
 ((3, 1), 9),
 ((1, 2), 7),
 ((2, 1), 7),
 ((0, 3), 5),
 ((3, 0), 5),
 ((0, 2), 4),
 ((2, 0), 4)]


In [26]:
# Show the five rules with the highest support, numbered from 1.
for rank in range(1, 6):
    print("Rule #{0}".format(rank))
    premise, conclusion = sorted_support[rank - 1][0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.659
 - Support: 27

Rule #2
Rule: If a person buys bananas they will also buy cheese
 - Confidence: 0.458
 - Support: 27

Rule #3
Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.610
 - Support: 25

Rule #4
Rule: If a person buys apples they will also buy cheese
 - Confidence: 0.694
 - Support: 25

Rule #5
Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.583
 - Support: 21



In [29]:
# Order the (rule, confidence) pairs by the confidence value (element 1 of each pair), largest first.
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)

In [30]:
# Print the five rules with the highest confidence, in descending order of confidence.
for rank in range(1, 6):
    print("Rule #{0}".format(rank))
    premise, conclusion = sorted_confidence[rank - 1][0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys apples they will also buy cheese
 - Confidence: 0.694
 - Support: 25

Rule #2
Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.659
 - Support: 27

Rule #3
Rule: If a person buys bread they will also buy bananas
 - Confidence: 0.630
 - Support: 17

Rule #4
Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.610
 - Support: 25

Rule #5
Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.583
 - Support: 21

