# Making the dataset

In [1]:
import numpy as np

In [2]:
# Empty purchase matrix: 100 transactions (rows) by 5 products (columns), all False.
X = np.zeros(shape=(100, 5), dtype="bool")
# Product names, listed in the same order as the columns of X.
features = [
    "bread",
    "milk",
    "cheese",
    "apples",
    "bananas",
]

In [3]:
# Fill X one transaction at a time. Each np.random.random() call is a fresh
# uniform draw that is compared against a purchase probability. No seed is set
# in the cells shown here, so the generated matrix differs from run to run.
for row in X:  # iterating a 2-D array yields row views, so writes land in X
    buys_bread = np.random.random() < 0.3
    if buys_bread:
        # Bread buyer: milk, cheese, apples and bananas are each drawn
        # independently of one another.
        row[0] = True
        if np.random.random() < 0.5:
            row[1] = True  # milk
        if np.random.random() < 0.2:
            row[2] = True  # cheese
        if np.random.random() < 0.25:
            row[3] = True  # apples
        if np.random.random() < 0.5:
            row[4] = True  # bananas
    elif np.random.random() < 0.5:
        # No bread, but milk: the remaining items use the same odds as the
        # bread-buyer branch, and are only drawn once milk has been chosen.
        row[1] = True
        if np.random.random() < 0.2:
            row[2] = True  # cheese
        if np.random.random() < 0.25:
            row[3] = True  # apples
        if np.random.random() < 0.5:
            row[4] = True  # bananas
    else:
        # Neither bread nor milk: cheese, apples and bananas become much more likely.
        if np.random.random() < 0.8:
            row[2] = True  # cheese
        if np.random.random() < 0.6:
            row[3] = True  # apples
        if np.random.random() < 0.7:
            row[4] = True  # bananas
    # Every transaction must contain at least one item; an empty basket gets bananas.
    if row.sum() == 0:
        row[4] = True


In [4]:
# Preview the first five generated transactions.
print(X[0:5])

[[False  True False False False]
 [ True False  True  True False]
 [ True False False False  True]
 [False  True False False  True]
 [False  True  True False  True]]


In [5]:
# Persist the matrix as text; the '%d' format writes each cell as an integer.
np.savetxt(fname="./data/affinity_dataset.txt", X=X, fmt="%d")

# Extracting rules from the dataset created

In [6]:
import numpy as np
# Read the saved transactions back from disk. The loaded values are floats
# (0.0 / 1.0) rather than the booleans that were originally generated.
dataset_filename = "./data/affinity_dataset.txt"
X = np.loadtxt(fname=dataset_filename)
# Rows are transactions (samples); columns are products (features).
n_samples, n_features = np.shape(X)
print("This dataset has {0} samples and {1} features".format(n_samples, n_features))

This dataset has 100 samples and 5 features


In [7]:
# Preview the first five transactions as loaded from the text file.
print(X[0:5])

[[0. 1. 0. 0. 0.]
 [1. 0. 1. 1. 0.]
 [1. 0. 0. 0. 1.]
 [0. 1. 0. 0. 1.]
 [0. 1. 1. 0. 1.]]


In [8]:
# Product name for each column of the dataset, in column order.
features = [
    "bread",
    "milk",
    "cheese",
    "apples",
    "bananas",
]

### Computing support and confidence for all possible rules

In [9]:
from collections import defaultdict

# Counters default to 0 for keys that have not been seen yet.
# valid_rules / invalid_rules are keyed by (premise, conclusion) column indices;
# num_occurences is keyed by the premise column index alone.
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)

for sample in X:
    for premise in range(n_features):
        # Only items that were actually bought can act as a premise.
        if sample[premise] != 0:
            num_occurences[premise] += 1
            for conclusion in range(n_features):
                # A rule from an item to itself (X -> X) tells us nothing, so skip it.
                if conclusion != premise:
                    # Conclusion bought too -> the rule held for this transaction;
                    # otherwise the transaction counts against the rule.
                    tally = valid_rules if sample[conclusion] == 1 else invalid_rules
                    tally[(premise, conclusion)] += 1

# Support is the number of transactions in which the rule held. It is the same
# dict object as valid_rules, not a copy.
support = valid_rules
# Confidence is the share of transactions containing the premise in which the
# conclusion was bought as well.
confidence = defaultdict(float)
for (premise, conclusion), times_valid in valid_rules.items():
    confidence[(premise, conclusion)] = times_valid / num_occurences[premise]

#### All the extracted rules

In [10]:
# Report every rule that has a confidence entry, in the dictionary's own order.
for rule in confidence:
    premise, conclusion = rule
    premise_name, conclusion_name = features[premise], features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[rule]))
    print(" - Support: {0}".format(support[rule]))
    print("")

Rule: If a person buys bread they will also buy cheese
 - Confidence: 0.167
 - Support: 5

Rule: If a person buys bread they will also buy apples
 - Confidence: 0.167
 - Support: 5

Rule: If a person buys cheese they will also buy bread
 - Confidence: 0.122
 - Support: 5

Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.537
 - Support: 22

Rule: If a person buys apples they will also buy bread
 - Confidence: 0.132
 - Support: 5

Rule: If a person buys apples they will also buy cheese
 - Confidence: 0.579
 - Support: 22

Rule: If a person buys bread they will also buy bananas
 - Confidence: 0.600
 - Support: 18

Rule: If a person buys bananas they will also buy bread
 - Confidence: 0.310
 - Support: 18

Rule: If a person buys milk they will also buy bananas
 - Confidence: 0.521
 - Support: 25

Rule: If a person buys bananas they will also buy milk
 - Confidence: 0.431
 - Support: 25

Rule: If a person buys milk they will also buy cheese
 - Confidence: 0.146
 - Su

In [11]:
def print_rule(premise, conclusion, support, confidence, features):
    """Print one association rule together with its confidence and support.

    Args:
        premise: Index into ``features`` of the item on the "if" side.
        conclusion: Index into ``features`` of the item on the "then" side.
        support: Mapping keyed by ``(premise, conclusion)`` holding the rule's
            support value.
        confidence: Mapping keyed by ``(premise, conclusion)`` holding the
            rule's confidence; it is shown with three decimal places.
        features: Sequence of item names, indexed by ``premise`` and
            ``conclusion``.

    Returns:
        None. Three report lines followed by a blank line go to stdout.
    """
    rule = (premise, conclusion)
    report = [
        "Rule: If a person buys {0} they will also buy {1}".format(features[premise], features[conclusion]),
        " - Confidence: {0:.3f}".format(confidence[rule]),
        " - Support: {0}".format(support[rule]),
        "",  # the blank separator line after each rule
    ]
    print("\n".join(report))

### Rules sorted by support

In [12]:
from operator import itemgetter
# (rule, support) pairs ordered from the highest support to the lowest;
# rules with equal support keep their relative order from the dictionary.
sorted_support = sorted(support.items(), key=lambda entry: entry[1], reverse=True)

In [13]:
# Show the (up to) five rules with the highest support. Slicing, rather than
# indexing positions 0-4 directly, means the cell does not raise IndexError
# when fewer than five rules were found.
for index, entry in enumerate(sorted_support[:5]):
    print("Rule #{0}".format(index + 1))
    # Each entry is a ((premise, conclusion), support) pair; only the key is needed.
    (premise, conclusion) = entry[0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys milk they will also buy bananas
 - Confidence: 0.521
 - Support: 25

Rule #2
Rule: If a person buys bananas they will also buy milk
 - Confidence: 0.431
 - Support: 25

Rule #3
Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.585
 - Support: 24

Rule #4
Rule: If a person buys bananas they will also buy cheese
 - Confidence: 0.414
 - Support: 24

Rule #5
Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.537
 - Support: 22



### Rules sorted by confidence

In [14]:
# (rule, confidence) pairs ordered from the highest confidence to the lowest;
# rules with equal confidence keep their relative order from the dictionary.
sorted_confidence = sorted(confidence.items(), key=lambda entry: entry[1], reverse=True)

In [15]:
# Show the (up to) five rules with the highest confidence. Slicing, rather than
# indexing positions 0-4 directly, means the cell does not raise IndexError
# when fewer than five rules were found.
for index, entry in enumerate(sorted_confidence[:5]):
    print("Rule #{0}".format(index + 1))
    # Each entry is a ((premise, conclusion), confidence) pair; only the key is needed.
    (premise, conclusion) = entry[0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys bread they will also buy bananas
 - Confidence: 0.600
 - Support: 18

Rule #2
Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.585
 - Support: 24

Rule #3
Rule: If a person buys apples they will also buy cheese
 - Confidence: 0.579
 - Support: 22

Rule #4
Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.537
 - Support: 22

Rule #5
Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.526
 - Support: 20

