In [None]:
'''
Association Rules Learning

Strength of a Rule
    -Support: Likelihood of all itens in the rule being bought in a single transaction
    -Confidence: Likelihood of a second item being bought once the first is bought
    -Lift: Change in likelihood of second item being bought once the first is bought
    
    =>  Support(A) = P(A)
        Confidence(A/B) = P(A/B) = P(A,B)/P(B) = Support(A,B)/Support(B)
        Lift(A/B) = P(A/B)/P(B) = Confidence(A/B)/Support(B)

Workflow
    1. Set up the data
    2. Compute the support
    3. Implement the Apriori Algorithm

Apriori Algorithm
    For 1,2,...,N-item sets by continuously pruning the number of items in each stage
    Ex:
        Start with list of orders
        Find 1-item sets and remove items below a minimum support threshold
        Find 2-item sets from remaining items and remove items below a minimum support threshold
        Generate 2-item rules
            Potentially 2 rules in each 2-item set (Confidence(A/B) and Confidence(B/A))
            Keep only 2-item sets with a minimum confidence
        Find 3-item sets from remaining items and remove items below a minimum support threshold
        Generate 3-item rules: keep only 3-item sets with a minimum confidence
'''

In [1]:
itemsPath = "data/EB-build-goods.sql"
receiptsPath = "data/75000-out1.csv"

# Read receipts data from file into an array of strings
# Receipts data takes the following form:
#   ReceiptID, ItemID_1, ItemID_2, ..., ItemID_N
with open(receiptsPath,'r') as receiptsFile:
    receiptsData = receiptsFile.read().split('\n')
    
# Get all lines but the last one, then split each line into lists.
# Take all items in each list, less the first item (the receiptID)
# Put lists into an encompassing list
baskets = [line.split(", ")[1:] for line in receiptsData[0:-1]]
baskets

[['11', '21'],
 ['7', '11', '37', '45'],
 ['3', '33', '42'],
 ['5', '12', '17', '47'],
 ['6', '18', '42'],
 ['2', '4', '34'],
 ['15', '16', '23', '40'],
 ['2', '3', '29', '34'],
 ['18', '23', '26', '35', '36'],
 ['44', '45'],
 ['17', '38', '48', '49'],
 ['2', '3', '11', '21', '37', '41', '49'],
 ['3', '17', '43', '48'],
 ['17', '35', '43', '45'],
 ['15', '37', '43'],
 ['0', '2', '20', '46', '48'],
 ['17', '47'],
 ['14'],
 ['16', '39'],
 ['13', '42'],
 ['7', '11', '37', '45'],
 ['7', '15', '49'],
 ['23', '24', '40', '41', '43'],
 ['9', '15', '28', '47'],
 ['32', '33', '37'],
 ['5', '8', '16', '19', '20', '25', '39', '45'],
 ['13', '22', '24', '32', '33'],
 ['14', '44'],
 ['6', '13', '20', '39', '40', '44', '49'],
 ['13', '46'],
 ['8', '27', '28'],
 ['1', '19'],
 ['6', '36'],
 ['7', '15', '49'],
 ['12', '31', '36', '48'],
 ['17', '29', '47'],
 ['5', '10', '21', '34', '37', '48'],
 ['1', '19'],
 ['4', '9'],
 ['7', '15', '40', '49'],
 ['18', '19', '21', '35'],
 ['3', '17', '19'],
 ['13', '

In [2]:
# Read receipts data from file into an array of strings
# Receipts data takes the following form:
#   ReceiptID, ItemID_1, ItemID_2, ..., ItemID_N
with open(itemsPath,'r') as itemsFile:
    lines = itemsFile.read().split('\n')
    
# For each line in lines less the last line,
#   Get data after the '(' and split by ','
#   Drop the last 2 items (price, category)
items = [line.split('(')[1].split(',')[0:-2] for line in lines[0:-1]]

# Map items list into a dictionary with concatenated item description
# Key:Value => ItemID:ItemDescription
itemMap = {item[0]: "{0} {1}".format(item[1],item[2]).replace("'","") for item in items}

In [3]:
numItems = len(items)
numBaskets = len(baskets)

In [4]:
def support(itemSet, baskets):
    '''
    Calculate support of items list given a list of orders
    
    param itemSet: list of items whose support is being calculated
    param baskets: list of item orders to be searched through for items in itemSet
    '''
    basketSubset = baskets
    for item in itemSet:
        basketSubset = [basket for basket in basketSubset if item in basket]
    
    return float(len(basketSubset)/float(len(baskets)))

In [5]:
support(['2','24'], baskets)

0.00288

In [54]:
# Apriori Algorithm

import itertools
def aprioriIteration(i, baskets, supportItems, assocRules, minSupport, minConfidence):
    '''
    Executes specified iteration of Apriori Algorithm.
    Items that do not reach a minimum support are pruned.
    Rules above the minimum confidence are saved.
    
    
    param i: iteration number
    param baskets: list of orders containing items
    param supportItems: items remaining from last iteration
    param assocRules: association rules from last iteration
    param minSupport: min support required for each item to remain
    param minConfidence: min confidence required to make a new rule
    param newSupportItems: items remaining after this iteration
    '''
    newSupportItems = []
    # For first iteration, we are just pruning the number of items
    if i == 1:
        for item in range(len(supportItems)):
            itemSet = [str(item)]
            if support(itemSet,baskets) >= minSupport:
                newSupportItems.append(str(item))

    # Loop through all combinations of size i from supportItems
    #   Loop through each item within the combination
    #     Calculate confidence (likelihood of buying item j, given other i-1 items are already bought)
    #     If confidence above threshold
    #       Save rule and add each item in combination to newSupportItems
    else:
        for itemSet in itertools.combinations(supportItems,i):
            itemSet = list(itemSet) # len(itemSet)==i
            if support(itemSet,baskets) > minSupport:
                for j in range(i):
                    rule_to = itemSet[j]
                    rule_from = [x for x in itemSet if x != itemSet[j]]
                    confidence = support(itemSet,baskets) / support(rule_from,baskets)
                    #set_trace()
                    if confidence > minConfidence:
                        assocRules.append([rule_from, rule_to])
                        for x in itemSet:
                            if x not in newSupportItems:
                                newSupportItems.append(x)

    return assocRules, newSupportItems

In [43]:
supportItems = items
minSupport = 0.01
minConfidence = 0.5
numIterations = 2
assocRules = []
for i in [num+1 for num in range(numIterations)]:
    assocRules, supportItems1 = aprioriIteration(i, baskets,supportItems,assocRules,minSupport,minConfidence)
    print('Iteration {0}'.format(i))
    print('assocRules:\n{0}'.format(assocRules))
    print('supportItems:\n{0}'.format(supportItems))
    print()

Iteration 1
assocRules:
[]
supportItems:
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49']

Iteration 2
assocRules:
[[['46'], '0'], [['0'], '46'], [['3'], '18'], [['3'], '35'], [['9'], '4'], [['5'], '22'], [['44'], '14'], [['32'], '16'], [['16'], '32'], [['35'], '18'], [['18'], '35'], [['28'], '27'], [['27'], '28'], [['33'], '42']]
supportItems:
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49']



In [45]:
assocRules, len(assocRules)

([[['46'], '0'],
  [['0'], '46'],
  [['3'], '18'],
  [['3'], '35'],
  [['9'], '4'],
  [['5'], '22'],
  [['44'], '14'],
  [['32'], '16'],
  [['16'], '32'],
  [['35'], '18'],
  [['18'], '35'],
  [['28'], '27'],
  [['27'], '28'],
  [['33'], '42']],
 14)

In [62]:
def ruleMeta(rule,itemMap):
    '''
    Transforms ID data in a rule to a readable format
    
    param rule: rule to be transformed
    param itemMap: mapping of ItemIDs to descriptive names
    '''
    rule_from = [itemMap[x] for x in rule[0]]
    return rule_from, itemMap[rule[1]]

In [63]:
[ruleMeta(rule,itemMap) for rule in assocRules]

[(['Chocolate Coffee'], 'Chocolate Cake'),
 (['Chocolate Cake'], 'Chocolate Coffee'),
 (['Opera Cake'], 'Cherry Tart'),
 (['Opera Cake'], 'Apricot Danish'),
 (['Napoleon Cake'], 'Strawberry Cake'),
 (['Truffle Cake'], 'Gongolais Cookie'),
 (['Bottled Water'], 'Berry Tart'),
 (['Apricot Croissant'], 'Blueberry Tart'),
 (['Blueberry Tart'], 'Apricot Croissant'),
 (['Apricot Danish'], 'Cherry Tart'),
 (['Cherry Tart'], 'Apricot Danish'),
 (['Tuile Cookie'], 'Marzipan Cookie'),
 (['Marzipan Cookie'], 'Tuile Cookie'),
 (['Cheese Croissant'], 'Orange Juice'),
 (['Cherry Tart', 'Apricot Danish'], 'Opera Cake'),
 (['Opera Cake', 'Apricot Danish'], 'Cherry Tart'),
 (['Opera Cake', 'Cherry Tart'], 'Apricot Danish')]