# Association Rule Mining

Apriori Algorithm  Implementation

In [1]:
# parameters for pruning
min_support = 2
min_conf    = 0.6 # 60% of confidance score

In [2]:
# transactions - array of array
# items - list
# itemset - dict
items = [1,2,3,4,5]

# dataset
transactions = [
    [1,3,4],    # transaction data R1
    [2,3,5],    # transaction data R2 ...
    [1,2,3,5],
    [2,5],
    [1,3,5]
]

# list required for processing the data
frequent_itemsets   = []
supports            = []
discarded           = []

### Helper Functions


In [3]:
def print_table(col1, col2):
    for i in range(len(col1)):
        print(f"{str(col1[i]).ljust(7)} {col2[i]}")

In [15]:
def get_superset(_set):
    subsets = [set([])]
    for nth in _set:
        subsets.extend([s | {nth} for s in subsets])
    return subsets

In [5]:
# calculate support
def calculate_support(item):
    _support = 0
    # print(f"calculate_support(item) = {item}")

    # iterate over dataset and count occurance `item`
    for i in range(len(transactions)):
        # if set(transactions[i]).issubset(set(item)):
        if set(item).issubset(set(transactions[i])):
            # print(f"Found {set(item)} in {set(transactions[i])}")
            _support += 1
    return _support

In [6]:
# find items in itemset
def get_items(_itemset):
    items_in_itemset = []
    for _items in _itemset:
        for _item in list(_items):
            if _item not in items_in_itemset:
                items_in_itemset.append(_item)
    return items_in_itemset


### generate the frequent itemsets

In [16]:
def get_itemsetset(n):
    frequent_itemsets.append([])
    supports.append([])
    items_in_itemset = get_items(frequent_itemsets[n-1])

    print(f"Items in n-1={n-1} FrequentItemSet {items_in_itemset}")

    # get the superset
    sets = get_superset(items_in_itemset)

    # filter the itemset less than sn'th size
    _itemsets = []
    # purning
    for s in sets:
        if len(s) == n and s not in discarded:
            _itemsets.append(s)

    # without purning
    # _itemsets = [s for s in sets if len(s) == n]

    # calculate support and check the threshold
    # update the exclude list
    for _item in _itemsets:
        support_count = calculate_support(_item)
        if support_count >= min_support:
            frequent_itemsets[n].append(_item)
            supports[n].append(support_count)
        else: #exclude
            discarded.append(_item)
    print_table(frequent_itemsets[n], supports[n])

    return frequent_itemsets[n]

In [12]:
def generate_frequent_itemset():
    n = 1
    while (True):
        print(f" ======== n = {n} =========")
        _itemsets = get_itemsetset(n)
        if(len(_itemsets) > 1): n += 1
        else: break
    # return max frequent itemset
    return n-1

### Init the Datastructures

In [13]:
# initialize the itemset and support
#    we used 0th index to reduce the condition
#     check due to 0-based index
def init():
    # get 1 itemset and calculate support
    frequent_itemsets.append([])
    supports.append([])
    # dummy init
    for item in items:
        frequent_itemsets[0].append({item})
        supports[0].append(0)


### Step 1: Most Frequent Item set generation

In [17]:
# init the lists
init()
# Step 1: most frequent items
max_itemset_number = generate_frequent_itemset()

Items in n-1=0 FrequentItemSet [1, 2, 3, 4, 5]
{1}     3
{2}     3
{3}     4
{5}     4
Items in n-1=1 FrequentItemSet [1, 2, 3, 5]
{1, 3}  3
{2, 3}  2
{1, 5}  2
{2, 5}  3
{3, 5}  3
Items in n-1=2 FrequentItemSet [1, 3, 2, 5]
{1, 3, 5} 2
{2, 3, 5} 2
Items in n-1=3 FrequentItemSet [1, 3, 5, 2]


### Step 2: formulation of rules

In [18]:
#       get all the item sets of max frequent set
#       get the superset of frequent itemset
#           S -> (I - S) i.e. => S recommends (I-S)
#       if support(I)/support(S) >= min_conf value
for I in frequent_itemsets[max_itemset_number]:
    print('='*30)
    subsets = get_superset(I)
    print(f"Subsets of I {I} = {subsets}")
    # S -> (I - S) i.e. => S recommends (I-S)
    for S in subsets:
        I_S = I - S
        if len(I_S) <= 0: continue
        I_support = calculate_support(I)
        S_support = calculate_support(S)
        conf_score = I_support / S_support
        is_selected = 'SELECTED' if conf_score >= min_conf else 'REJECTED'
        print(f"{S} -> {I_S} with Confidance: {conf_score} {is_selected}")

Subsets of I {1, 3, 5} = [set(), {1}, {3}, {1, 3}, {5}, {1, 5}, {3, 5}, {1, 3, 5}]
set() -> {1, 3, 5} with Confidance: 0.4 REJECTED
{1} -> {3, 5} with Confidance: 0.6666666666666666 SELECTED
{3} -> {1, 5} with Confidance: 0.5 REJECTED
{1, 3} -> {5} with Confidance: 0.6666666666666666 SELECTED
{5} -> {1, 3} with Confidance: 0.5 REJECTED
{1, 5} -> {3} with Confidance: 1.0 SELECTED
{3, 5} -> {1} with Confidance: 0.6666666666666666 SELECTED
Subsets of I {2, 3, 5} = [set(), {2}, {3}, {2, 3}, {5}, {2, 5}, {3, 5}, {2, 3, 5}]
set() -> {2, 3, 5} with Confidance: 0.4 REJECTED
{2} -> {3, 5} with Confidance: 0.6666666666666666 SELECTED
{3} -> {2, 5} with Confidance: 0.5 REJECTED
{2, 3} -> {5} with Confidance: 1.0 SELECTED
{5} -> {2, 3} with Confidance: 0.5 REJECTED
{2, 5} -> {3} with Confidance: 0.6666666666666666 SELECTED
{3, 5} -> {2} with Confidance: 0.6666666666666666 SELECTED


# Working on Real dataset

The csv contains the Shopping cart entries

## Task 1: Data Collect and Extract

## Task 2: Transform (Coding)

## Task3: Run Algoritm

In [None]:
# parameters for pruning
min_support = 50
min_conf    = 0.5 # 60% of confidance score

# list required for processing the data
frequent_itemsets   = []
supports            = []
discarded           = []

## Task 4: Interprate

Decode the values and interprate the result