In [3]:
import numpy as np
import pandas as pd
import itertools
from time import time
import itertools


In [4]:
df = pd.read_csv('groceries.csv', sep='delimiter', header=None, engine='python')
l = [m.values[0].split(',') for ii, m in df.iterrows()]

In [5]:
def prepdata(l,k=2,  report=False):
    C_k = []
    b = 0
    
    pairs = []
    for line in l:
        for c in itertools.combinations(line, k):
            yield frozenset(c)

        C_k = []
        # report progress
        # print every 1000th element to reduce clutter
        if report:
            if b % 1000 == 0:  
                print('processing bin ', b)
            b += 1


In [6]:
nitems = 20
for C_k in prepdata(l):
    print(C_k)
    
    nitems -= 1
    if nitems == 0: 
        break

frozenset({'semi-finished bread', 'citrus fruit'})
frozenset({'margarine', 'citrus fruit'})
frozenset({'ready soups', 'citrus fruit'})
frozenset({'margarine', 'semi-finished bread'})
frozenset({'ready soups', 'semi-finished bread'})
frozenset({'ready soups', 'margarine'})
frozenset({'tropical fruit', 'yogurt'})
frozenset({'tropical fruit', 'coffee'})
frozenset({'yogurt', 'coffee'})
frozenset({'yogurt', 'pip fruit'})
frozenset({'pip fruit', 'cream cheese'})
frozenset({'pip fruit', 'meat spreads'})
frozenset({'yogurt', 'cream cheese'})
frozenset({'yogurt', 'meat spreads'})
frozenset({'meat spreads', 'cream cheese'})
frozenset({'whole milk', 'other vegetables'})
frozenset({'condensed milk', 'other vegetables'})
frozenset({'long life bakery product', 'other vegetables'})
frozenset({'condensed milk', 'whole milk'})
frozenset({'whole milk', 'long life bakery product'})


In [7]:
# Naive Approach
for s in [10, 50, 100]: # support threshold
    t = time()
    
    C2 = {}
    for key in prepdata(l, k=2):
        if key not in C2:
            C2[key] = 1
        else:
            C2[key] += 1
            
    L2 = {}
    for key, n in C2.items():
        if n >= s:
            L2[key] = n
    t2 = time()
    print('{} items with >{} occurances which took {} seconds'.format(len(L2), s, t2-t))

2981 items with >10 occurances which took 0.19280409812927246 seconds
605 items with >50 occurances which took 0.146162748336792 seconds
207 items with >100 occurances which took 0.14271783828735352 seconds


We see an improvement in time the stronger the filter

In [8]:
# Apriori

for s in [10, 50, 100]:
    t = time()
    # find frequent 1-tuples (individual items)
    C1 = {}
    for key in prepdata(l, k=1, report=False):
        if key not in C1:
            C1[key] = 1
        else:
            C1[key] += 1    

    #print("{} items".format(len(C1)))

    # filter stage
    L1 = {}
    for key, count in C1.items():
        if count >= s:
            L1[key] = count

    C2_items = set([a.union(b) for a in L1.keys() for b in L1.keys()]) # List comprehensions in python

    # find frequent 2-tuples
    C2 = {}
    for key in prepdata(l, k=2):
        # filter out non-frequent tuples
        if key not in C2_items:
            continue

        # record frequent tuples
        if key not in C2:
            C2[key] = 1
        else:
            C2[key] += 1
    t2 = time()
    print('{} items with >{} occurances which took {} seconds'.format(len(L2), s, t2-t))

207 items with >10 occurances which took 0.34893083572387695 seconds
207 items with >50 occurances which took 0.21812891960144043 seconds
207 items with >100 occurances which took 0.20238089561462402 seconds


We see an improvement in time for higher thresholds. While initially slower than naive, improvements in time are more drastic using a priori

In [21]:
(5*1000000-673) - (5*1000000+673)

-1346

In [9]:
def run_hash(x=1, N=100, k=2):
    '''x=1 means that there are 2 hash tables, 0 based index'''
    t = time()
    mx_hash = [5*1000000-673,
               5*1000000+673, 
               5*1000000+673+1346, 
               5*(1000000+673*5),  
               5*(1000000+673*7)]
    
    hash_dict = {'max_hash':  mx_hash,
          'H': [np.zeros((h,), dtype=np.int) for h in mx_hash],
         'hash_cell':[]}

    for key in prepdata(l, k=k):
        for y in range(x):
            hash_dict['hash_cell'].append(hash(key) % hash_dict['max_hash'][y])
            hash_dict['hash_cell'][y] += 1

    H_good = [set(np.where(hash_dict['H'][n] >= N)[0]) for n in range(x)]
    hash_dict.pop('H')

    C2 = {}
    for key in prepdata(l, k=k):
        for y in range(x):
            hash_dict['hash_cell'].append(hash(key) % hash_dict['max_hash'][y])
            if hash_dict['hash_cell'][y] not in H_good[y]:
                continue

        # record frequent tuples
        if key not in C2:
            C2[key] = 1
        else:
            C2[key] += 1


    # filter stage
    L2 = {}
    for key, count in C2.items():
        if count >= N:
            L2[key] = count
    t2 = time()
    print('With {} tables and k={}: found {} items with >{} occurances and took {} seconds'.format(x+1,k,  len(L2), N, t2-t))
    
    return L2

In [10]:
for N in [10, 50, 100]:
    run_hash(N=N)

With 2 tables and k=2: found 2981 items with >10 occurances and took 0.4437587261199951 seconds
With 2 tables and k=2: found 605 items with >50 occurances and took 0.43692612648010254 seconds
With 2 tables and k=2: found 207 items with >100 occurances and took 0.47879695892333984 seconds


Wit an increasing threshold (S) the number of item sets decreased, there was also a marginal decrease in time required.

In [11]:
for x in range(5):
    run_hash(x=x)

With 1 tables and k=2: found 207 items with >100 occurances and took 0.27523303031921387 seconds
With 2 tables and k=2: found 207 items with >100 occurances and took 0.4901449680328369 seconds
With 3 tables and k=2: found 207 items with >100 occurances and took 0.6596391201019287 seconds
With 4 tables and k=2: found 207 items with >100 occurances and took 0.7746970653533936 seconds
With 5 tables and k=2: found 207 items with >100 occurances and took 0.9956839084625244 seconds


With more tables the time increased (became slower) but performance did not change.

In [12]:
for k in [3, 4, 5]:
    run_hash(k=k, N=10)

With 2 tables and k=3: found 6831 items with >10 occurances and took 1.500154972076416 seconds
With 2 tables and k=4: found 3137 items with >10 occurances and took 5.2792885303497314 seconds
With 2 tables and k=5: found 376 items with >10 occurances and took 18.97327160835266 seconds


Obviously, with increased k fewer items were found and the analysis took exponentially longer. There are occurances with k=5 but at very low frequencies.

# 1-NN

In [13]:
results = run_hash(k=4, N=50)

With 2 tables and k=4: found 12 items with >50 occurances and took 4.1445770263671875 seconds


In [14]:
dict_res = {}
for comb in list(results.keys()):
    a = np.array(list(comb))
    a.sort()
    for item in a:
        if item not in dict_res:
            dict_res[item] = []
            dict_res[item].append(list(a[a!=item]))
            continue
            
        if list(a[a!=item]) not in dict_res[item]:
                dict_res[item].append(list(a[a!=item]))



In [15]:
for key in dict_res:
    print(f'{key}:')
    print()
    print(dict_res[key])
    print()

root vegetables:

[['tropical fruit', 'whole milk', 'yogurt'], ['other vegetables', 'whole milk', 'yogurt'], ['other vegetables', 'tropical fruit', 'whole milk'], ['other vegetables', 'pip fruit', 'whole milk'], ['other vegetables', 'whipped/sour cream', 'whole milk'], ['other vegetables', 'rolls/buns', 'whole milk'], ['citrus fruit', 'other vegetables', 'whole milk']]

tropical fruit:

[['root vegetables', 'whole milk', 'yogurt'], ['other vegetables', 'root vegetables', 'whole milk'], ['other vegetables', 'whole milk', 'yogurt']]

whole milk:

[['root vegetables', 'tropical fruit', 'yogurt'], ['other vegetables', 'root vegetables', 'yogurt'], ['fruit/vegetable juice', 'other vegetables', 'yogurt'], ['other vegetables', 'root vegetables', 'tropical fruit'], ['other vegetables', 'tropical fruit', 'yogurt'], ['other vegetables', 'pip fruit', 'root vegetables'], ['other vegetables', 'pip fruit', 'yogurt'], ['other vegetables', 'root vegetables', 'whipped/sour cream'], ['other vegetables',

I did approach with various Ks and thresholds, I decided to focus on K=4 and a threshold of 50 as it is easy to digest the outcome. Looking at the items shown in the 1-NN seem to largely consist of staple foods. These would likely be of little interest.

In [16]:
C = []
for key in prepdata(l, k=1):
    C.append(list(key)[0])
from collections import Counter
Counter(C).most_common(20)

[('whole milk', 2513),
 ('other vegetables', 1903),
 ('rolls/buns', 1809),
 ('soda', 1715),
 ('yogurt', 1372),
 ('bottled water', 1087),
 ('root vegetables', 1072),
 ('tropical fruit', 1032),
 ('shopping bags', 969),
 ('sausage', 924),
 ('pastry', 875),
 ('citrus fruit', 814),
 ('bottled beer', 792),
 ('newspapers', 785),
 ('canned beer', 764),
 ('pip fruit', 744),
 ('fruit/vegetable juice', 711),
 ('whipped/sour cream', 705),
 ('brown bread', 638),
 ('domestic eggs', 624)]

Indeed, all of the item sets observed are found in a list of the 20 most common items (out of 169 unique items), which means that they are probably not of much interest as sets considering that their occurances are high otherwise.