# Introduction

The Belgium retail market dataset is ideal for practicing on generating association rules.

Let's see how support and confidence influences the number of rules that are return in Apriori.

## Loading the data

We first load the file that contains the data.  Even though the data is not a CSV file, each line represents a receipt of items.

Additionally, since the purchases are represented as a string, we convert the string to an array of integers.

In [1]:
import pandas as pd
import numpy as np

d = pd.read_csv("retail.dat", header=None, names=['Purchases'])

# Convert the strings to a list of integers
d['Purchases'] = [[int(x) for x in row] for row in d['Purchases'].str.split()]

print("Number of transactions = {}".format(len(d)))
d.head()

Number of transactions = 88162


Unnamed: 0,Purchases
0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,"[30, 31, 32]"
2,"[33, 34, 35]"
3,"[36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]"
4,"[38, 39, 47, 48]"


Since we don't know the items or if the items lie on a continuous interval, we will check whether it's the case.

In [2]:
# Grab all possible transactions
numbers = set()
maxLen = 0
for index, row in d.iterrows():
    if maxLen < len(row['Purchases']):
        maxLen = len(row['Purchases'])
    numbers = numbers.union(set(row['Purchases']))

# Check whether the number set is continuous
isCont = True
nextNum = 0
for i in numbers:
    if i != nextNum:
        isCont = False
        break
    nextNum += 1

if isCont:
    nextNum -= 1
    print("Is continuous in interval [0,{}]".format(nextNum))
print(maxLen)

Is continuous in interval [0,16469]
76


## Observing the behavior of Apriori

In [3]:
import re, itertools

# A customized version on itertools.combinations algorithm
def combinations(iterable, r):
    pool = tuple(iterable)
    n = len(pool)
    if r > n:
        return
    result = tuple(int(i) for i in 
                   list(set(re.findall(r'[0-9]+', str(pool)))))
    for comb in itertools.combinations(sorted(result),r):
        yield comb, [i for i in itertools.combinations(comb, r - 1)]

def apriori(ds, interval, support, confidence):
    # Generate the list of indexes that each item occurs in.
    def genInitList(dataset, myInterval):
        res = {num:[] for num in myInterval}
        for index, row in dataset.iterrows():
            for num in row['Purchases']:
                res[num] += [index]
        return res
    # Determine the probability that the list is in
    def getProb(occur):
        singleProb = []
        origLen = len(ds)
        for key, value in occur.items():
            singleProb.append((len(value), key, value))
        singleProb = tuple(sorted(singleProb))
        singleProb = singleProb[::-1]
        indx = 0
        while indx < len(singleProb) and \
              singleProb[indx][0]/origLen >= support:
            indx += 1
        return singleProb[:indx]
    def genSubsets(data, level):
        res = {}
        temp = {str(key):value for prob, key, value in data}
        for comb in combinations(temp,level):
            tmp2 = str(comb)
            result = tuple(int(i) for i in sorted(set(re.findall(r'[0-9]+', tmp2))))
            newList = []
            if level == 2:
                newList = [str(newItem[0]) for newItem in sorted(comb[1])]
            else:
                newList = [str(newItem) for newItem in comb[1] \
                           if str(newItem) in temp]
            # It's possible that the sets are completely disjointed
            if len(newList) == 0:
                continue
            result = tuple(i for i in sorted(result))
            res[result] = set.intersection(*[set(temp[i]) for i in newList ] )
        return res
    # Return the number of instances where the combination of numbers
    # are in the same set
    def retOccurCnt(myTuple, myIndx):
        mySet = set.intersection(*[set(myIndx[num]) for num in myTuple])
        return len(mySet)
    # Beginning of Apriori algorithm
    curInt = range(0,interval + 1)
    origLen = len(ds)
    # Generate the indexes of each 
    numOccur = genInitList(ds, curInt)
    indxList = numOccur
    # Get the support of each subset
    layer = 1
    curRules = {} # Holds the rules we'll be working with
    while len(numOccur) != 0:
        probs = getProb(numOccur)
        for prob, key, value in probs:
            curRules[key] = prob
        layer += 1
        numOccur = genSubsets(probs,layer)
    rulesList = list(curRules.items())
    rules = [] # The array we're suppose to return of each rule
    # Now determine which rules are most likely to be associated
    for i in range(0,len(rulesList)):
        for j in range(i+1,len(rulesList)):
            if type(rulesList[i][0]) == int:
                set1 = set([rulesList[i][0]])
            else:
                set1 = set([x for x in rulesList[i][0]])
            if type(rulesList[j][0]) == int:
                set2 = set([rulesList[j][0]])
            else:
                set2 = set([x for x in rulesList[j][0]])
            if not set1.isdisjoint(set2):
                continue
            combined = tuple(i for i in sorted(set.union(*[set1, set2])))
            combAmnt = retOccurCnt(combined,indxList)
            confid1 = (combAmnt/rulesList[i][1])
            confid2 = (combAmnt/rulesList[j][1])
            if confid1 >= confidence:
                rules.append("{} -> {}".format(set1, set2))
            if confid2 >= confidence:
                rules.append("{} -> {}".format(set2, set1))
    return rules

Now, we're going to observe how support and confidence values influences the number of rules made.

In [4]:
for i in [0.5, 0.25, 0.1, 0.05, 0.03]:
    for j in [0.8, 0.5, 0.25, 0.15, 0.1]:
        print("Number of rules with support:{} and confidence:{} - {}"\
              .format(i, j, len(apriori(d, nextNum, i, j))))

Number of rules with support:0.5 and confidence:0.8 - 0
Number of rules with support:0.5 and confidence:0.5 - 0
Number of rules with support:0.5 and confidence:0.25 - 0
Number of rules with support:0.5 and confidence:0.15 - 0
Number of rules with support:0.5 and confidence:0.1 - 0
Number of rules with support:0.25 and confidence:0.8 - 0
Number of rules with support:0.25 and confidence:0.5 - 2
Number of rules with support:0.25 and confidence:0.25 - 2
Number of rules with support:0.25 and confidence:0.15 - 2
Number of rules with support:0.25 and confidence:0.1 - 2
Number of rules with support:0.1 and confidence:0.8 - 1
Number of rules with support:0.1 and confidence:0.5 - 11
Number of rules with support:0.1 and confidence:0.25 - 19
Number of rules with support:0.1 and confidence:0.15 - 43
Number of rules with support:0.1 and confidence:0.1 - 47
Number of rules with support:0.05 and confidence:0.8 - 1
Number of rules with support:0.05 and confidence:0.5 - 16
Number of rules with support:0