## Dataset re-split
Split original dataset into 4 different dataset. 
Variables (No. of transactions/No. of items/Average transaction width) are controlled as possible.

In [1]:
from functools import reduce
import pandas as pd

GROCERY_STORE_DATA_PATH = "./dataset/Groceries.csv"
df = pd.read_csv(GROCERY_STORE_DATA_PATH, index_col=0)

def avg_width(df):
    avg_width=(df.loc[0:,'items'].apply(lambda x: len(x)).sum())/100
    return avg_width

def max_width(df):
    return df.loc[0:,'items'].apply(lambda x: len(x)).max()

In [2]:
def resplit_dataset(df,init_transaction=0,end_transaction=100):
    dataset = pd.DataFrame()
    dataset["items"] = df.loc[init_transaction:end_transaction,"items"].apply(lambda x: set(x[1:-1].split(",")))
    dataset.reset_index(inplace=True,drop=True)
    dataset.index = dataset.index+1
    items = reduce(lambda a, b: a | b, dataset.values)[0]
    item_counts = len(items)
    avg_wid = avg_width(dataset)
    trans_counts = dataset.shape[0]
    
    return dataset, items, item_counts, avg_wid, trans_counts

In [3]:
# dataset 1 (changing transactions counts)

dataset1,items1,item_counts1,avg_width1,trans_counts1 = resplit_dataset(df,init_transaction=20,end_transaction=119)


print("items set:")
print(items1)
print("items count:")
print(item_counts1)
print("average width of transactions:")
print(avg_width1)
print("transactions counts:")
print(trans_counts1)

items set:
{'seasonal products', 'candy', 'sliced cheese', 'packaged fruit/vegetables', 'bottled water', 'pastry', 'ham', 'chewing gum', 'herbs', 'berries', 'abrasive cleaner', 'soda', 'frankfurter', 'softener', 'bathroom cleaner', 'hard cheese', 'detergent', 'cereals', 'root vegetables', 'zwieback', 'waffles', 'citrus fruit', 'condensed milk', 'long life bakery product', 'curd cheese', 'whipped/sour cream', 'grapes', 'rolls/buns', 'white wine', 'canned fish', 'brandy', 'tropical fruit', 'flour', 'chocolate', 'butter milk', 'pork', 'onions', 'frozen dessert', 'whole milk', 'salt', 'processed cheese', 'semi-finished bread', 'shopping bags', 'beef', 'cream cheese ', 'chicken', 'domestic eggs', 'salty snack', 'meat spreads', 'newspapers', 'sausage', 'turkey', 'baking powder', 'oil', 'specialty chocolate', 'hamburger meat', 'red/blush wine', 'canned beer', 'misc. beverages', 'specialty fat', 'sugar', 'pip fruit', 'pasta', 'cling film/bags', 'fruit/vegetable juice', 'coffee', 'cat food', 'c

In [4]:
#dataset 2

dataset2,items2,item_counts2,avg_width2,trans_counts2 = resplit_dataset(df,init_transaction=10,end_transaction=139)

print("items set:")
print(items2)
print("items count:")
print(item_counts2)
print("average width of transactions:")
print(avg_width2)
print("transactions counts:")
print(trans_counts2)

items set:
{'seasonal products', 'candy', 'sliced cheese', 'packaged fruit/vegetables', 'bottled water', 'pastry', 'ham', 'chewing gum', 'herbs', 'liquor (appetizer)', 'berries', 'abrasive cleaner', 'soda', 'frankfurter', 'softener', 'fish', 'bathroom cleaner', 'hard cheese', 'cereals', 'detergent', 'root vegetables', 'zwieback', 'waffles', 'citrus fruit', 'condensed milk', 'long life bakery product', 'curd cheese', 'whipped/sour cream', 'grapes', 'rolls/buns', 'Instant food products', 'white wine', 'canned fish', 'canned fruit', 'brandy', 'tropical fruit', 'chocolate', 'flour', 'butter milk', 'pork', 'onions', 'frozen dessert', 'whole milk', 'salt', 'processed cheese', 'semi-finished bread', 'shopping bags', 'beef', 'chicken', 'cream cheese ', 'domestic eggs', 'salty snack', 'meat spreads', 'newspapers', 'sausage', 'turkey', 'baking powder', 'oil', 'specialty chocolate', 'hamburger meat', 'red/blush wine', 'canned beer', 'misc. beverages', 'specialty fat', 'sugar', 'pip fruit', 'pasta

In [5]:
#controlling variable of item counts

removed = {'male cosmetics', 'pip fruit', 'curd','ham','candy', 'candles'}
for i in dataset2['items']:
    for item in removed:
        if item in i:
            if len(i)>1:
                i.remove(item)

items2 = reduce(lambda a, b: a | b, dataset2.values)[0]
item_counts2 = len(items2)
avg_width2 = avg_width(dataset2)
trans_counts2 = dataset2.shape[0]

print("items set:")
print(items2)
print("items count:")
print(item_counts2)
print("average width of transactions:")
print(avg_width2)
print("transactions counts:")
print(trans_counts2)

items set:
{'seasonal products', 'sliced cheese', 'packaged fruit/vegetables', 'bottled water', 'pastry', 'chewing gum', 'herbs', 'liquor (appetizer)', 'berries', 'abrasive cleaner', 'soda', 'frankfurter', 'softener', 'fish', 'bathroom cleaner', 'hard cheese', 'cereals', 'detergent', 'root vegetables', 'zwieback', 'waffles', 'citrus fruit', 'condensed milk', 'long life bakery product', 'curd cheese', 'whipped/sour cream', 'grapes', 'rolls/buns', 'Instant food products', 'white wine', 'canned fish', 'canned fruit', 'brandy', 'tropical fruit', 'chocolate', 'flour', 'butter milk', 'pork', 'onions', 'frozen dessert', 'whole milk', 'salt', 'processed cheese', 'semi-finished bread', 'shopping bags', 'beef', 'chicken', 'cream cheese ', 'domestic eggs', 'salty snack', 'meat spreads', 'newspapers', 'sausage', 'turkey', 'baking powder', 'oil', 'specialty chocolate', 'hamburger meat', 'red/blush wine', 'canned beer', 'misc. beverages', 'specialty fat', 'sugar', 'pasta', 'cling film/bags', 'fruit/

In [6]:
# dataset 3 (changing items counts)

dataset3,items3,item_counts3,avg_width3,trans_counts3 = resplit_dataset(df,init_transaction=150,end_transaction=249)

print("items set:")
print(items3)
print("items count:")
print(item_counts3)
print("average width of transactions:")
print(avg_width3)
print("transactions counts:")
print(trans_counts3)

items set:
{'candy', 'sliced cheese', 'cake bar', 'bottled water', 'packaged fruit/vegetables', 'pastry', 'ham', 'dish cleaner', 'cleaner', 'herbs', 'chewing gum', 'liquor (appetizer)', 'cookware', 'berries', 'soda', 'frankfurter', 'soft cheese', 'hard cheese', 'cereals', 'detergent', 'root vegetables', 'zwieback', 'waffles', 'citrus fruit', 'condensed milk', 'long life bakery product', 'potato products', 'whipped/sour cream', 'grapes', 'rolls/buns', 'Instant food products', 'prosecco', 'white wine', 'canned fish', 'frozen fish', 'tropical fruit', 'chocolate', 'flour', 'pork', 'butter milk', 'onions', 'skin care', 'salt', 'whole milk', 'frozen dessert', 'semi-finished bread', 'shopping bags', 'dog food', 'pot plants', 'beef', 'cream cheese ', 'domestic eggs', 'salty snack', 'meat spreads', 'mustard', 'sausage', 'newspapers', 'female sanitary products', 'turkey', 'baking powder', 'specialty chocolate', 'oil', 'tea', 'hamburger meat', 'red/blush wine', 'canned beer', 'snack products', 'm

In [7]:
# dataset 4 (changing average width)

dataset4,items4,item_counts4,avg_width4,trans_counts4 = resplit_dataset(df,init_transaction=1000,end_transaction=1099)

print("items set:")
print(items4)
print("items count:")
print(item_counts4)
print("average width of transactions:")
print(avg_width4)
print("transactions counts:")
print(trans_counts4)

items set:
{'seasonal products', 'roll products ', 'sliced cheese', 'candy', 'cake bar', 'bottled water', 'pastry', 'ham', 'dish cleaner', 'herbs', 'chewing gum', 'skin care', 'liquor (appetizer)', 'berries', 'toilet cleaner', 'soda', 'frankfurter', 'soft cheese', 'hard cheese', 'cereals', 'condensed milk', 'root vegetables', 'zwieback', 'waffles', 'citrus fruit', 'detergent', 'long life bakery product', 'curd cheese', 'whipped/sour cream', 'grapes', 'rolls/buns', 'white wine', 'canned fish', 'cream', 'mayonnaise', 'nuts/prunes', 'instant coffee', 'frozen fish', 'vinegar', 'tropical fruit', 'flour', 'pork', 'butter milk', 'onions', 'dental care', 'ready soups', 'chocolate', 'white bread', 'whole milk', 'salt', 'semi-finished bread', 'processed cheese', 'frozen dessert', 'light bulbs', 'shopping bags', 'soups', 'pot plants', 'beef', 'cream cheese ', 'chicken', 'domestic eggs', 'salty snack', 'meat spreads', 'mustard', 'newspapers', 'sausage', 'pet care', 'turkey', 'baking powder', 'spec

In [8]:
#controlling variable of item counts

removed = {'chicken', 'cream', 'fruit/vegetable juice', 'whipped/sour cream', 'skin care', 'artif. sweetener', 'detergent', 
           'sliced cheese', 'soft cheese', 'canned vegetables', 'onions', 'shopping bags', 'soda', 'meat spreads', 'salt', 
           'finished products', 'mustard', 'mayonnaise', 'chocolate marshmallow', 'bottled beer', 'candy','ice cream', 'salty snack',
          'pet care'}
for i in dataset4['items']:
    for item in removed:
        if item in i:
            if len(i)>1:
                i.remove(item)

items4 = reduce(lambda a, b: a | b, dataset4.values)[0]
item_counts4 = len(items4)
avg_width4=avg_width(dataset4)
trans_counts4 = dataset4.shape[0]

print("items set:")
print(items4)
print("items count:")
print(item_counts4)
print("average width of transactions:")
print(avg_width4)
print("transactions counts:")
print(trans_counts4)

items set:
{'seasonal products', 'roll products ', 'cake bar', 'bottled water', 'pastry', 'ham', 'dish cleaner', 'herbs', 'chewing gum', 'liquor (appetizer)', 'berries', 'toilet cleaner', 'frankfurter', 'hard cheese', 'cereals', 'condensed milk', 'root vegetables', 'zwieback', 'waffles', 'citrus fruit', 'long life bakery product', 'curd cheese', 'whipped/sour cream', 'grapes', 'rolls/buns', 'white wine', 'canned fish', 'nuts/prunes', 'instant coffee', 'frozen fish', 'vinegar', 'tropical fruit', 'flour', 'pork', 'butter milk', 'dental care', 'ready soups', 'onions', 'chocolate', 'frozen dessert', 'whole milk', 'light bulbs', 'semi-finished bread', 'processed cheese', 'soups', 'shopping bags', 'pot plants', 'beef', 'cream cheese ', 'domestic eggs', 'newspapers', 'sausage', 'turkey', 'baking powder', 'specialty chocolate', 'oil', 'hamburger meat', 'liquor', 'red/blush wine', 'canned beer', 'misc. beverages', 'sugar', 'pip fruit', 'pasta', 'specialty vegetables', 'coffee', 'cat food', 'cur

In [9]:
# vary in average width

added = {'specialty cheese', 'misc. beverages'}

for i in dataset4['items']:
    for item in added:
        if item not in i:
            i.add(item)

items4 = reduce(lambda a, b: a | b, dataset4.values)[0]
item_counts4 = len(items4)
avg_width4=avg_width(dataset4)
trans_counts4 = dataset4.shape[0]

print("items set:")
print(items4)
print("items count:")
print(item_counts4)
print("average width of transactions:")
print(avg_width4)
print("transactions counts:")
print(trans_counts4)

items set:
{'seasonal products', 'roll products ', 'cake bar', 'bottled water', 'pastry', 'ham', 'dish cleaner', 'herbs', 'chewing gum', 'liquor (appetizer)', 'berries', 'toilet cleaner', 'frankfurter', 'hard cheese', 'cereals', 'condensed milk', 'root vegetables', 'zwieback', 'waffles', 'citrus fruit', 'long life bakery product', 'curd cheese', 'whipped/sour cream', 'grapes', 'rolls/buns', 'white wine', 'canned fish', 'nuts/prunes', 'instant coffee', 'frozen fish', 'vinegar', 'tropical fruit', 'flour', 'pork', 'butter milk', 'dental care', 'ready soups', 'onions', 'chocolate', 'white bread', 'whole milk', 'frozen dessert', 'semi-finished bread', 'processed cheese', 'light bulbs', 'soups', 'shopping bags', 'pot plants', 'beef', 'cream cheese ', 'domestic eggs', 'newspapers', 'sausage', 'turkey', 'baking powder', 'specialty chocolate', 'oil', 'hamburger meat', 'liquor', 'red/blush wine', 'misc. beverages', 'canned beer', 'sugar', 'pip fruit', 'specialty vegetables', 'coffee', 'cat food'

In [10]:
setnames = ["dataset1","dataset2","dataset3","dataset4"]
itemcnts = [item_counts1,item_counts2,item_counts3,item_counts4]
avgw = [avg_width1,avg_width2,avg_width3,avg_width4]
transcnts = [trans_counts1,trans_counts2,trans_counts3,trans_counts4]
maxw = [max_width(dataset1),max_width(dataset2),max_width(dataset3),max_width(dataset4)]

datasets = {
    'name':setnames,
    'item counts':itemcnts,
    'average width':avgw,
    'max width': maxw,
    'transactions counts':transcnts,
}

datasets = pd.DataFrame(datasets)

In [11]:
print(datasets)

       name  item counts  average width  max width  transactions counts
0  dataset1           97           4.11         13                  100
1  dataset2           97           4.83         14                  130
2  dataset3          107           4.19         23                  100
3  dataset4           97           6.71         22                  100


## Association Rules Mining Algorithm Comparison (Brute force & Apriori)

In [12]:
import pprint
import time
from itertools import combinations
from math import ceil


def bf_frequent_items(df, items, item_counts, min_sup=0.05, debug=False):
    """
    generate all possible frequent item sets by relative min support
    >>> {1: {(('I5',), 2), (('I2',), 7), (('I1',), 6), (('I3',), 6), (('I4',), 2)},
         2: {(('I1', 'I2'), 4), (('I1', 'I3'), 4), (('I1', 'I5'), 2), (('I3', 'I2'), 4), (('I4', 'I2'), 2), (('I5', 'I2'), 2)},
         3: {(('I1', 'I3', 'I2'), 2), (('I1', 'I5', 'I2'), 2)}
         }
    :param df: dataframe
    :param items:
    :param item_counts: num of items
    :param min_sup: fractional relative min support
    :return:
    """
    print("Find frequent item sets by Brute Force")
    print("-" * 100)

    frequent_sets = {}  # dictionary, key-> k, value-> k item sets
    min_threshold = ceil(df.shape[0] * min_sup)

    for k in range(1, 1 + item_counts):
        k_item_subsets = combinations(items, k)  # all possible k-item sets
        time_start = time.time()
        # check satisfied k-item sets
        filtered_k_subsets = {(
            tuple(k_item_subset), (set(k_item_subset) <= df["items"]).sum()) for k_item_subset in k_item_subsets
            if
            (set(k_item_subset) <= df["items"]).sum() >= min_threshold}
        print(f"Process {k}-item subsets in {time.time() - time_start: .5f} s")
        # if k subsets support can't satisfy, k + 1, ... can't satisfy
        if len(filtered_k_subsets) <= 0:
            break
        frequent_sets[k] = filtered_k_subsets

    if debug:
        print("Final frequent item sets")
        print("=" * 100)
        pprint.pprint(frequent_sets)
        print("=" * 100)
    return frequent_sets

In [13]:
def apriori_frequent_items(df, items, item_counts, min_sup=0.05, debug=False):
    """
    >>>{1: [(('I1',), 6), (('I2',), 7), (('I3',), 6), (('I4',), 2), (('I5',), 2)],
        2: [(('I1', 'I2'), 4),(('I1', 'I3'), 4),(('I1', 'I5'), 2),(('I2', 'I3'), 4),(('I2', 'I4'), 2),(('I2', 'I5'), 2)],
        3: [(('I1', 'I2', 'I3'), 2), (('I1', 'I2', 'I5'), 2)]
        }
    generate frequent item sets by Apriori algorithm
    :param df:
    :param items:
    :param item_counts:
    :param min_sup:
    :param debug: debug mode
    :return:
    """
    print("Find frequent item sets by Apriori algorithm")
    print("-" * 100)

    frequent_sets = {}
    hash_sets = {}
    min_threshold = ceil(df.shape[0] * min_sup)

    # initialized by 1 frequent items
    # all elements sorted by dictionary order
    time_start = time.time()
    frequent_k_item_sets = sorted(
        ((tuple(item_set), (set(item_set) <= df["items"]).sum()) for item_set in combinations(items, 1)
         if (set(item_set) <= df["items"]).sum() >= min_threshold),
        key=lambda x: x[0])
    print(f"Process 1-item subsets in {time.time() - time_start: .5f} s")
    hash_k_sets = {item_set for item_set in combinations(items, 1) if
                   (set(item_set) <= df["items"]).sum() >= min_threshold}

    frequent_sets[1] = frequent_k_item_sets
    if debug:
        print("1-item frequent sets")
        pprint.pprint(frequent_k_item_sets)

    hash_sets[1] = hash_k_sets
    if debug:
        print("1-item hash sets")
        pprint.pprint(hash_k_sets)

    # perform level-wise generation by join two k - 1 frequent sets and pruning
    for k in range(2, 1 + item_counts):
        time_start = time.time()
        cur_item_sets = []
        cur_hash_sets = set()
        for i in range(len(frequent_k_item_sets) - 1):
            for j in range(i + 1, len(frequent_k_item_sets)):
                # joining : find all candidate k item sets
                a, b = frequent_k_item_sets[i], frequent_k_item_sets[j]
                if a[0][:-1] == b[0][:-1] and a[0][-1] < b[0][-1]:
                    candidate_item_set = a[0] + (b[0][-1],)
                    # pruning : checking all k - 1 item subsets of candidate
                    candidate_subsets = set(
                        map(lambda x: tuple(sorted(x)), combinations(set(candidate_item_set), k - 1)))
                    if not candidate_subsets - hash_k_sets:
                        candidate_sup = (set(candidate_item_set) <= df["items"]).sum()
                        if candidate_sup >= min_threshold:
                            cur_item_sets.append((candidate_item_set, candidate_sup))
                            cur_hash_sets.add(candidate_item_set)
        print(f"Process {k}-item subsets in {time.time() - time_start: .5f} s")
        if len(cur_item_sets) <= 0:
            break

        if debug:
            print(f"{k}-item frequent item sets")
            print(cur_item_sets)

        frequent_sets[k] = cur_item_sets
        frequent_k_item_sets = cur_item_sets

        if debug:
            print(f"{k}-item hash sets")
            print(cur_hash_sets)

        hash_sets[k] = cur_hash_sets
        hash_k_sets = cur_hash_sets

    if debug:
        print("Final frequent item sets")
        print("=" * 100)
        pprint.pprint(frequent_sets)
        print("=" * 100)
    return frequent_sets


In [14]:
algos = {
    "Apriori": apriori_frequent_items,  # apriori
     "Brute Force": bf_frequent_items,  # brute force
}

try:
    for algo in algos.keys():
        print("=" * 100)
        for i in range(1,5):
            print("For dataset"+str(i)+" : ")
            d = vars()[datasets.loc[i-1,'name']]
            items = vars()[("items"+str(i))]
            item_counts = vars()[("item_counts"+str(i))]
            alg_freq_item_sets = algos[algo](d, items, item_counts, min_sup=0.05, debug=False)
        print("=" * 100)
except KeyboardInterrupt:
    print ('Stopped')

For dataset1 : 
Find frequent item sets by Apriori algorithm
----------------------------------------------------------------------------------------------------
Process 1-item subsets in  0.01044 s
Process 2-item subsets in  0.04714 s
Process 3-item subsets in  0.00000 s
For dataset2 : 
Find frequent item sets by Apriori algorithm
----------------------------------------------------------------------------------------------------
Process 1-item subsets in  0.02000 s
Process 2-item subsets in  0.04269 s
Process 3-item subsets in  0.00000 s
For dataset3 : 
Find frequent item sets by Apriori algorithm
----------------------------------------------------------------------------------------------------
Process 1-item subsets in  0.01802 s
Process 2-item subsets in  0.05570 s
Process 3-item subsets in  0.00000 s
For dataset4 : 
Find frequent item sets by Apriori algorithm
----------------------------------------------------------------------------------------------------
Process 1-item subs