## Dataset re-split
Split original dataset into 4 different dataset. 
Variables (No. of transactions/No. of items/Average transaction width) are controlled as possible.

In [1]:
from functools import reduce
import pandas as pd

GROCERY_STORE_DATA_PATH = "./dataset/Groceries.csv"
df = pd.read_csv(GROCERY_STORE_DATA_PATH, index_col=0)

def avg_width(df):
    avg_width=(df.loc[0:,'items'].apply(lambda x: len(x)).sum())/100
    return avg_width

In [2]:
def resplit_dataset(df,init_transaction=0,end_transaction=100):
    dataset = pd.DataFrame()
    dataset["items"] = df.loc[init_transaction:end_transaction,"items"].apply(lambda x: set(x[1:-1].split(",")))
    dataset.reset_index(inplace=True,drop=True)
    dataset.index = dataset.index+1
    items = reduce(lambda a, b: a | b, dataset.values)[0]
    item_counts = len(items)
    avg_wid = avg_width(dataset)
    trans_counts = dataset.shape[0]
    
    return dataset, items, item_counts, avg_wid, trans_counts

In [3]:
# dataset 1 (changing transactions counts)

dataset1,items1,item_counts1,avg_width1,trans_counts1 = resplit_dataset(df,init_transaction=20,end_transaction=119)


print("items set:")
print(items1)
print("items count:")
print(item_counts1)
print("average width of transactions:")
print(avg_width1)
print("transactions counts:")
print(trans_counts1)

items set:
{'condensed milk', 'cling film/bags', 'chicken', 'tropical fruit', 'photo/film', 'berries', 'white wine', 'specialty bar', 'rolls/buns', 'root vegetables', 'rice', 'salt', 'spread cheese', 'sparkling wine', 'ham', 'bottled beer', 'coffee', 'canned vegetables', 'pickled vegetables', 'specialty chocolate', 'detergent', 'meat spreads', 'newspapers', 'packaged fruit/vegetables', 'bottled water', 'misc. beverages', 'red/blush wine', 'seasonal products', 'brown bread', 'baking powder', 'cream cheese ', 'herbs', 'softener', 'long life bakery product', 'yogurt', 'oil', 'canned fish', 'frozen dessert', 'hygiene articles', 'cat food', 'sliced cheese', 'brandy', 'salty snack', 'shopping bags', 'candles', 'semi-finished bread', 'pastry', 'processed cheese', 'abrasive cleaner', 'grapes', 'whole milk', 'margarine', 'beverages', 'chocolate marshmallow', 'other vegetables', 'butter', 'candy', 'whipped/sour cream', 'pork', 'dessert', 'flour', 'butter milk', 'cereals', 'napkins', 'chewing gum

In [4]:
#dataset 2

dataset2,items2,item_counts2,avg_width2,trans_counts2 = resplit_dataset(df,init_transaction=10,end_transaction=139)

print("items set:")
print(items2)
print("items count:")
print(item_counts2)
print("average width of transactions:")
print(avg_width2)
print("transactions counts:")
print(trans_counts2)

items set:
{'condensed milk', 'cling film/bags', 'chicken', 'tropical fruit', 'photo/film', 'berries', 'white wine', 'rolls/buns', 'specialty bar', 'root vegetables', 'rice', 'salt', 'spread cheese', 'sparkling wine', 'ham', 'bottled beer', 'canned fruit', 'coffee', 'canned vegetables', 'pickled vegetables', 'specialty chocolate', 'detergent', 'meat spreads', 'newspapers', 'packaged fruit/vegetables', 'bottled water', 'misc. beverages', 'red/blush wine', 'seasonal products', 'brown bread', 'baking powder', 'cream cheese ', 'herbs', 'softener', 'long life bakery product', 'yogurt', 'fish', 'oil', 'canned fish', 'frozen dessert', 'hygiene articles', 'cat food', 'sliced cheese', 'brandy', 'salty snack', 'shopping bags', 'candles', 'semi-finished bread', 'pastry', 'processed cheese', 'abrasive cleaner', 'grapes', 'whole milk', 'margarine', 'beverages', 'chocolate marshmallow', 'other vegetables', 'butter', 'candy', 'whipped/sour cream', 'pork', 'dessert', 'flour', 'cereals', 'butter milk',

In [5]:
#controlling variable of item counts

removed = {'male cosmetics', 'pip fruit', 'curd','ham','candy', 'candles'}
for i in dataset2['items']:
    for item in removed:
        if item in i:
            if len(i)>1:
                i.remove(item)

items2 = reduce(lambda a, b: a | b, dataset2.values)[0]
item_counts2 = len(items2)
avg_width2 = avg_width(dataset2)
trans_counts2 = dataset2.shape[0]

print("items set:")
print(items2)
print("items count:")
print(item_counts2)
print("average width of transactions:")
print(avg_width2)
print("transactions counts:")
print(trans_counts2)

items set:
{'condensed milk', 'cling film/bags', 'chicken', 'tropical fruit', 'photo/film', 'berries', 'white wine', 'rolls/buns', 'specialty bar', 'root vegetables', 'rice', 'salt', 'spread cheese', 'sparkling wine', 'bottled beer', 'canned fruit', 'coffee', 'canned vegetables', 'pickled vegetables', 'specialty chocolate', 'detergent', 'meat spreads', 'newspapers', 'packaged fruit/vegetables', 'bottled water', 'misc. beverages', 'red/blush wine', 'seasonal products', 'brown bread', 'baking powder', 'cream cheese ', 'herbs', 'softener', 'long life bakery product', 'yogurt', 'fish', 'oil', 'canned fish', 'frozen dessert', 'hygiene articles', 'cat food', 'sliced cheese', 'brandy', 'salty snack', 'shopping bags', 'semi-finished bread', 'pastry', 'processed cheese', 'abrasive cleaner', 'grapes', 'whole milk', 'margarine', 'beverages', 'chocolate marshmallow', 'other vegetables', 'butter', 'whipped/sour cream', 'pork', 'dessert', 'flour', 'cereals', 'butter milk', 'napkins', 'chewing gum', 

In [6]:
# dataset 3 (changing items counts)

dataset3,items3,item_counts3,avg_width3,trans_counts3 = resplit_dataset(df,init_transaction=150,end_transaction=249)

print("items set:")
print(items3)
print("items count:")
print(item_counts3)
print("average width of transactions:")
print(avg_width3)
print("transactions counts:")
print(trans_counts3)

items set:
{'condensed milk', 'cling film/bags', 'tropical fruit', 'photo/film', 'white wine', 'berries', 'rolls/buns', 'root vegetables', 'specialty bar', 'salt', 'spread cheese', 'bottled beer', 'ham', 'coffee', 'cocoa drinks', 'canned vegetables', 'make up remover', 'pickled vegetables', 'specialty chocolate', 'meat spreads', 'detergent', 'frozen fish', 'mustard', 'soft cheese', 'newspapers', 'packaged fruit/vegetables', 'bottled water', 'misc. beverages', 'red/blush wine', 'brown bread', 'baking powder', 'cream cheese ', 'herbs', 'long life bakery product', 'yogurt', 'oil', 'canned fish', 'hygiene articles', 'finished products', 'frozen dessert', 'sliced cheese', 'flower soil/fertilizer', 'salty snack', 'shopping bags', 'candles', 'semi-finished bread', 'popcorn', 'pastry', 'grapes', 'whole milk', 'margarine', 'skin care', 'beverages', 'specialty cheese', 'butter', 'other vegetables', 'cake bar', 'candy', 'whipped/sour cream', 'tea', 'pork', 'dessert', 'flour', 'dish cleaner', 'but

In [7]:
# dataset 4 (changing average width)

dataset4,items4,item_counts4,avg_width4,trans_counts4 = resplit_dataset(df,init_transaction=1000,end_transaction=1099)

print("items set:")
print(items4)
print("items count:")
print(item_counts4)
print("average width of transactions:")
print(avg_width4)
print("transactions counts:")
print(trans_counts4)

items set:
{'condensed milk', 'vinegar', 'chicken', 'tropical fruit', 'berries', 'white wine', 'root vegetables', 'rolls/buns', 'specialty bar', 'rice', 'salt', 'spread cheese', 'sparkling wine', 'bottled beer', 'ham', 'coffee', 'canned vegetables', 'pickled vegetables', 'soups', 'specialty chocolate', 'meat spreads', 'detergent', 'mustard', 'frozen fish', 'soft cheese', 'newspapers', 'bottled water', 'misc. beverages', 'red/blush wine', 'seasonal products', 'brown bread', 'baking powder', 'cream cheese ', 'herbs', 'toilet cleaner', 'long life bakery product', 'yogurt', 'dental care', 'oil', 'canned fish', 'hygiene articles', 'ready soups', 'frozen dessert', 'finished products', 'rum', 'cat food', 'sliced cheese', 'salty snack', 'shopping bags', 'semi-finished bread', 'mayonnaise', 'pastry', 'processed cheese', 'grapes', 'margarine', 'whole milk', 'sauces', 'skin care', 'beverages', 'chocolate marshmallow', 'other vegetables', 'butter', 'specialty cheese', 'cake bar', 'roll products ',

In [8]:
#controlling variable of item counts

removed = {'chicken', 'cream', 'fruit/vegetable juice', 'whipped/sour cream', 'skin care', 'artif. sweetener', 'detergent', 
           'sliced cheese', 'soft cheese', 'canned vegetables', 'onions', 'shopping bags', 'soda', 'meat spreads', 'salt', 
           'finished products', 'mustard', 'mayonnaise', 'chocolate marshmallow', 'bottled beer', 'candy','ice cream', 'salty snack',
          'pet care'}
for i in dataset4['items']:
    for item in removed:
        if item in i:
            if len(i)>1:
                i.remove(item)

items4 = reduce(lambda a, b: a | b, dataset4.values)[0]
item_counts4 = len(items4)
avg_width4=avg_width(dataset4)
trans_counts4 = dataset4.shape[0]

print("items set:")
print(items4)
print("items count:")
print(item_counts4)
print("average width of transactions:")
print(avg_width4)
print("transactions counts:")
print(trans_counts4)

items set:
{'condensed milk', 'vinegar', 'tropical fruit', 'berries', 'white wine', 'root vegetables', 'rolls/buns', 'specialty bar', 'rice', 'spread cheese', 'sparkling wine', 'ham', 'coffee', 'canned vegetables', 'pickled vegetables', 'soups', 'specialty chocolate', 'frozen fish', 'newspapers', 'bottled water', 'misc. beverages', 'red/blush wine', 'seasonal products', 'brown bread', 'baking powder', 'cream cheese ', 'herbs', 'toilet cleaner', 'long life bakery product', 'yogurt', 'dental care', 'oil', 'canned fish', 'hygiene articles', 'ready soups', 'frozen dessert', 'rum', 'cat food', 'shopping bags', 'semi-finished bread', 'pastry', 'processed cheese', 'grapes', 'margarine', 'whole milk', 'sauces', 'beverages', 'other vegetables', 'butter', 'specialty cheese', 'cake bar', 'roll products ', 'whipped/sour cream', 'pork', 'dessert', 'organic products', 'flour', 'dish cleaner', 'butter milk', 'cereals', 'napkins', 'chewing gum', 'pasta', 'sugar', 'nuts/prunes', 'pot plants', 'onions',

In [9]:
# vary in average width

added = {'specialty cheese', 'misc. beverages'}

for i in dataset4['items']:
    for item in added:
        if item not in i:
            i.add(item)

items4 = reduce(lambda a, b: a | b, dataset4.values)[0]
item_counts4 = len(items4)
avg_width4=avg_width(dataset4)
trans_counts4 = dataset4.shape[0]

print("items set:")
print(items4)
print("items count:")
print(item_counts4)
print("average width of transactions:")
print(avg_width4)
print("transactions counts:")
print(trans_counts4)

items set:
{'condensed milk', 'vinegar', 'tropical fruit', 'berries', 'white wine', 'root vegetables', 'rolls/buns', 'specialty bar', 'rice', 'spread cheese', 'sparkling wine', 'ham', 'coffee', 'canned vegetables', 'pickled vegetables', 'soups', 'specialty chocolate', 'frozen fish', 'newspapers', 'bottled water', 'misc. beverages', 'red/blush wine', 'seasonal products', 'brown bread', 'baking powder', 'cream cheese ', 'herbs', 'toilet cleaner', 'long life bakery product', 'yogurt', 'dental care', 'oil', 'canned fish', 'hygiene articles', 'ready soups', 'frozen dessert', 'rum', 'cat food', 'shopping bags', 'semi-finished bread', 'pastry', 'processed cheese', 'grapes', 'margarine', 'whole milk', 'sauces', 'beverages', 'other vegetables', 'butter', 'specialty cheese', 'cake bar', 'roll products ', 'whipped/sour cream', 'pork', 'dessert', 'organic products', 'flour', 'dish cleaner', 'butter milk', 'cereals', 'napkins', 'chewing gum', 'pasta', 'sugar', 'nuts/prunes', 'pot plants', 'onions',

In [10]:
setnames = ["dataset1","dataset2","dataset3","dataset4"]
itemcnts = [item_counts1,item_counts2,item_counts3,item_counts4]
avgw = [avg_width1,avg_width2,avg_width3,avg_width4]
transcnts = [trans_counts1,trans_counts2,trans_counts3,trans_counts4]

datasets = {
    'name':setnames,
    'item counts':itemcnts,
    'average width':avgw,
    'transactions counts':transcnts
}

datasets = pd.DataFrame(datasets)

In [11]:
print(datasets)

       name  item counts  average width  transactions counts
0  dataset1           97           4.11                  100
1  dataset2           97           4.83                  130
2  dataset3          107           4.19                  100
3  dataset4           97           6.71                  100


## Association Rules Mining Algorithm Comparison (Brute force & Apriori)

In [12]:
import pprint
import time
from itertools import combinations
from math import ceil


def bf_frequent_items(df, items, item_counts, min_sup=0.05, debug=False):
    """
    generate all possible frequent item sets by relative min support
    >>> {1: {(('I5',), 2), (('I2',), 7), (('I1',), 6), (('I3',), 6), (('I4',), 2)},
         2: {(('I1', 'I2'), 4), (('I1', 'I3'), 4), (('I1', 'I5'), 2), (('I3', 'I2'), 4), (('I4', 'I2'), 2), (('I5', 'I2'), 2)},
         3: {(('I1', 'I3', 'I2'), 2), (('I1', 'I5', 'I2'), 2)}
         }
    :param df: dataframe
    :param items:
    :param item_counts: num of items
    :param min_sup: fractional relative min support
    :return:
    """
    print("Find frequent item sets by Brute Force")
    print("-" * 100)

    frequent_sets = {}  # dictionary, key-> k, value-> k item sets
    min_threshold = ceil(df.shape[0] * min_sup)

    for k in range(1, 1 + item_counts):
        k_item_subsets = combinations(items, k)  # all possible k-item sets
        time_start = time.time()
        # check satisfied k-item sets
        filtered_k_subsets = {(
            tuple(k_item_subset), (set(k_item_subset) <= df["items"]).sum()) for k_item_subset in k_item_subsets
            if
            (set(k_item_subset) <= df["items"]).sum() >= min_threshold}
        print(f"Process {k}-item subsets in {time.time() - time_start: .5f} s")
        # if k subsets support can't satisfy, k + 1, ... can't satisfy
        if len(filtered_k_subsets) <= 0:
            break
        frequent_sets[k] = filtered_k_subsets

    if debug:
        print("Final frequent item sets")
        print("=" * 100)
        pprint.pprint(frequent_sets)
        print("=" * 100)
    return frequent_sets

In [13]:
def apriori_frequent_items(df, items, item_counts, min_sup=0.05, debug=False):
    """
    >>>{1: [(('I1',), 6), (('I2',), 7), (('I3',), 6), (('I4',), 2), (('I5',), 2)],
        2: [(('I1', 'I2'), 4),(('I1', 'I3'), 4),(('I1', 'I5'), 2),(('I2', 'I3'), 4),(('I2', 'I4'), 2),(('I2', 'I5'), 2)],
        3: [(('I1', 'I2', 'I3'), 2), (('I1', 'I2', 'I5'), 2)]
        }
    generate frequent item sets by Apriori algorithm
    :param df:
    :param items:
    :param item_counts:
    :param min_sup:
    :param debug: debug mode
    :return:
    """
    print("Find frequent item sets by Apriori algorithm")
    print("-" * 100)

    frequent_sets = {}
    hash_sets = {}
    min_threshold = ceil(df.shape[0] * min_sup)

    # initialized by 1 frequent items
    # all elements sorted by dictionary order
    time_start = time.time()
    frequent_k_item_sets = sorted(
        ((tuple(item_set), (set(item_set) <= df["items"]).sum()) for item_set in combinations(items, 1)
         if (set(item_set) <= df["items"]).sum() >= min_threshold),
        key=lambda x: x[0])
    print(f"Process 1-item subsets in {time.time() - time_start: .5f} s")
    hash_k_sets = {item_set for item_set in combinations(items, 1) if
                   (set(item_set) <= df["items"]).sum() >= min_threshold}

    frequent_sets[1] = frequent_k_item_sets
    if debug:
        print("1-item frequent sets")
        pprint.pprint(frequent_k_item_sets)

    hash_sets[1] = hash_k_sets
    if debug:
        print("1-item hash sets")
        pprint.pprint(hash_k_sets)

    # perform level-wise generation by join two k - 1 frequent sets and pruning
    for k in range(2, 1 + item_counts):
        time_start = time.time()
        cur_item_sets = []
        cur_hash_sets = set()
        for i in range(len(frequent_k_item_sets) - 1):
            for j in range(i + 1, len(frequent_k_item_sets)):
                # joining : find all candidate k item sets
                a, b = frequent_k_item_sets[i], frequent_k_item_sets[j]
                if a[0][:-1] == b[0][:-1] and a[0][-1] < b[0][-1]:
                    candidate_item_set = a[0] + (b[0][-1],)
                    # pruning : checking all k - 1 item subsets of candidate
                    candidate_subsets = set(
                        map(lambda x: tuple(sorted(x)), combinations(set(candidate_item_set), k - 1)))
                    if not candidate_subsets - hash_k_sets:
                        candidate_sup = (set(candidate_item_set) <= df["items"]).sum()
                        if candidate_sup >= min_threshold:
                            cur_item_sets.append((candidate_item_set, candidate_sup))
                            cur_hash_sets.add(candidate_item_set)
        print(f"Process {k}-item subsets in {time.time() - time_start: .5f} s")
        if len(cur_item_sets) <= 0:
            break

        if debug:
            print(f"{k}-item frequent item sets")
            print(cur_item_sets)

        frequent_sets[k] = cur_item_sets
        frequent_k_item_sets = cur_item_sets

        if debug:
            print(f"{k}-item hash sets")
            print(cur_hash_sets)

        hash_sets[k] = cur_hash_sets
        hash_k_sets = cur_hash_sets

    if debug:
        print("Final frequent item sets")
        print("=" * 100)
        pprint.pprint(frequent_sets)
        print("=" * 100)
    return frequent_sets


In [14]:
algos = {
    "Apriori": apriori_frequent_items,  # apriori
     "Brute Force": bf_frequent_items,  # brute force
}

try:
    for algo in algos.keys():
        print("=" * 100)
        for i in range(1,5):
            print("For dataset"+str(i)+" : ")
            d = vars()[datasets.loc[i-1,'name']]
            items = vars()[("items"+str(i))]
            item_counts = vars()[("item_counts"+str(i))]
            alg_freq_item_sets = algos[algo](d, items, item_counts, min_sup=0.05, debug=False)
        print("=" * 100)
except KeyboardInterrupt:
    print ('Stopped')

For dataset1 : 
Find frequent item sets by Apriori algorithm
----------------------------------------------------------------------------------------------------
Process 1-item subsets in  0.03395 s
Process 2-item subsets in  0.04700 s
Process 3-item subsets in  0.00098 s
For dataset2 : 
Find frequent item sets by Apriori algorithm
----------------------------------------------------------------------------------------------------
Process 1-item subsets in  0.01821 s
Process 2-item subsets in  0.03243 s
Process 3-item subsets in  0.01004 s
For dataset3 : 
Find frequent item sets by Apriori algorithm
----------------------------------------------------------------------------------------------------
Process 1-item subsets in  0.01667 s
Process 2-item subsets in  0.03929 s
Process 3-item subsets in  0.00000 s
For dataset4 : 
Find frequent item sets by Apriori algorithm
----------------------------------------------------------------------------------------------------
Process 1-item subs