In [2]:
import numpy as np

# Item name for each column of the dataset, in column order.
features = ['bread', 'milk', 'cheese', 'apple', 'banana']

# Read the numeric matrix from disk; its shape is unpacked as
# (number of samples, number of features).
filename = 'affinity_dataset.txt'
f = np.loadtxt(filename)
n_samples, n_features = f.shape

# Quick sanity check: the dimensions and the first five rows.
print(n_samples, n_features)
print(f[:5])

100 5
[[0. 0. 1. 1. 1.]
 [1. 1. 0. 1. 0.]
 [1. 0. 1. 1. 0.]
 [0. 0. 1. 1. 1.]
 [0. 1. 0. 0. 1.]]


In [2]:
# Index 3 is 'apple' in the features list; count the rows whose value there is 1.
apple_purchase = sum(1 for sample in f if sample[3] == 1)
print('%s people bought apples' % apple_purchase)

36 people bought apples


In [3]:
# Create counters for the two outcomes of a rule: it held (valid) or it did not (invalid).
# Rule keys are (premise, conclusion) tuples of feature indices.
# For example, (3, 4) means "if a customer bought apples, they also bought bananas".
from collections import defaultdict

# Three independent zero-defaulting counters, one per name.
valid_rules, invalid_rules, num_occurances = (defaultdict(int) for _ in range(3))

In [4]:
# Count every possible "premise -> conclusion" rule across all samples.
# The counters are (re)created here so that re-running this cell starts from
# zero instead of adding to the totals left behind by a previous run.
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurances = defaultdict(int)

for sample in f:
    # Every feature with a non-zero value in this sample acts as a premise.
    for premise in range(n_features):
        if sample[premise] == 0:
            continue
        num_occurances[premise] += 1
        for conclusion in range(n_features):
            # Skip rules whose premise and conclusion are the same feature.
            if premise == conclusion:
                continue
            if sample[conclusion] == 1:
                valid_rules[(premise, conclusion)] += 1
            else:
                invalid_rules[(premise, conclusion)] += 1

# Support of a rule is the number of samples in which it held (valid_rules).
# Confidence is that count divided by the number of samples containing the premise.
support = valid_rules
confidence = defaultdict(float)
for premise, conclusion in valid_rules.keys():
    confidence[(premise, conclusion)] = valid_rules[(premise, conclusion)] / num_occurances[premise]

In [5]:
def print_rule(premise, conclusion, support, confidence, features):
    """Print one rule together with its confidence and support.

    premise, conclusion: indices into `features` for the two items of the rule.
    support, confidence: mappings looked up with the (premise, conclusion) tuple.
    features: sequence of item names indexed by `premise` / `conclusion`.
    """
    rule = (premise, conclusion)
    names = (features[premise], features[conclusion])
    print('Rule: If a person buy %s he will also by %s' % names)
    print(' - confidence: %.3f' % confidence[rule])
    print(' - support: %d' % support[rule])
    print(' ')

In [6]:
# Show one rule: feature index 1 is the premise, feature index 3 the conclusion.
premise, conclusion = 1, 3
print_rule(premise, conclusion, support, confidence, features)

Rule: If a person buy milk he will also by apple
 - confidence: 0.196
 - support: 9
 


In [7]:
from pprint import pprint
# Dump every ((premise, conclusion), support count) pair.
pprint([*support.items()])

[((2, 3), 25),
 ((2, 4), 27),
 ((3, 2), 25),
 ((3, 4), 21),
 ((4, 2), 27),
 ((4, 3), 21),
 ((0, 1), 14),
 ((0, 3), 5),
 ((1, 0), 14),
 ((1, 3), 9),
 ((3, 0), 5),
 ((3, 1), 9),
 ((0, 2), 4),
 ((2, 0), 4),
 ((1, 4), 19),
 ((4, 1), 19),
 ((0, 4), 17),
 ((4, 0), 17),
 ((1, 2), 7),
 ((2, 1), 7)]


In [10]:
from operator import itemgetter
# Rank the rules by support count, highest first.
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)

# Show at most the top five rules. Iterating over a slice (rather than
# indexing with range(5)) avoids an IndexError when fewer than five exist.
for rank, item in enumerate(sorted_support[:5], start=1):
    print('Rule #%d' % rank)
    (premise, conclusion) = item[0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buy cheese he will also by banana
 - confidence: 0.659
 - support: 27
 
Rule #2
Rule: If a person buy banana he will also by cheese
 - confidence: 0.458
 - support: 27
 
Rule #3
Rule: If a person buy cheese he will also by apple
 - confidence: 0.610
 - support: 25
 
Rule #4
Rule: If a person buy apple he will also by cheese
 - confidence: 0.694
 - support: 25
 
Rule #5
Rule: If a person buy apple he will also by banana
 - confidence: 0.583
 - support: 21
 


In [9]:
# Rank the rules by confidence, highest first.
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)

# Show at most the top five rules. Iterating over a slice (rather than
# indexing with range(5)) avoids an IndexError when fewer than five exist.
for rank, item in enumerate(sorted_confidence[:5], start=1):
    print('Rule #%d' % rank)
    (premise, conclusion) = item[0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buy apple he will also by cheese
 - confidence: 0.694
 - support: 25
 
Rule #2
Rule: If a person buy cheese he will also by banana
 - confidence: 0.659
 - support: 27
 
Rule #3
Rule: If a person buy bread he will also by banana
 - confidence: 0.630
 - support: 17
 
Rule #4
Rule: If a person buy cheese he will also by apple
 - confidence: 0.610
 - support: 25
 
Rule #5
Rule: If a person buy apple he will also by banana
 - confidence: 0.583
 - support: 21
 
