# Algorithmic Data Science - Exercise Series 1

**Konstantinos Papadakis**

*Data Science and Machine Learning 03400149*

*k.i.papadakis@gmail.com*

## Exercise 1

<img src="images/6-3-1.png" width="600">

In [36]:
import itertools
import numpy as np

baskets = np.array([
    [1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6],
    [1, 3, 5], [2, 4, 6], [1, 3, 4], [2, 4, 5],
    [3, 5, 6], [1, 2, 4], [2, 3, 5], [3, 4, 6],
], dtype=int)
threshold = 4
n_buckets = 11

def hash_(i, j):
    return (i * j) % n_buckets


# (a) Compute the supports 
item_names, item_freqs = np.unique(baskets, return_counts=True)

pair_freqs = np.zeros((6, 6), dtype=int)
for basket in baskets:
    for x in basket:
        for y in basket:
            pair_freqs[x-1][y-1] += 1  # 0 based indexing, should be a triangular matrix normally

print('Singleton Counts')
for name, count in zip(item_names, item_freqs):
    print(f'{name}: {count}')
print()
print('Pair Counts')
print(pair_freqs)
print()

# (b) Hashes
print('Hashes')
for pair in itertools.combinations(item_names, 2):
    print(f'{pair}: {hash_(*pair)}')
print()

# (c) Frequent bucket
bucket_freqs = np.zeros(n_buckets, dtype=int)
for basket in baskets:
    for pair in itertools.combinations(basket, 2):
        bucket_freqs[hash_(*pair)] += 1

print('Frequent Buckets')
print(*(bucket for bucket, freq in enumerate(bucket_freqs) if freq >= threshold))
print()

# (c) PCY second pass pairs
pcy_freq_pairs = []
for pair in itertools.combinations(item_names, 2):
    if bucket_freqs[hash_(*pair)] >= threshold:
        pcy_freq_pairs.append(pair)
print('PCY Second Pass Pairs')
print(*(pcy_freq_pairs))

Singleton Counts
1: 4
2: 6
3: 8
4: 8
5: 6
6: 4

Pair Counts
[[4 2 3 2 1 0]
 [2 6 3 4 2 1]
 [3 3 8 4 4 2]
 [2 4 4 8 3 3]
 [1 2 4 3 6 2]
 [0 1 2 3 2 4]]

Hashes
(1, 2): 2
(1, 3): 3
(1, 4): 4
(1, 5): 5
(1, 6): 6
(2, 3): 6
(2, 4): 8
(2, 5): 10
(2, 6): 1
(3, 4): 1
(3, 5): 4
(3, 6): 7
(4, 5): 9
(4, 6): 2
(5, 6): 8

Frequent Buckets
1 2 4 8

PCY Second Pass Pairs
(1, 2) (1, 4) (2, 4) (2, 6) (3, 4) (3, 5) (4, 6) (5, 6)


<img src="images/6-3-2.png" width=700>

In [45]:
def hash_2(i, j):
    return (i * j) % n_buckets

n_buckets_2 = 9
bucket_freqs_2 = np.zeros(n_buckets_2, int)
first_pass_candidates = set(pcy_freq_pairs)  # should be a bitmap normally

# Stage 2
for basket in baskets:
    for pair in itertools.combinations(basket, 2):
        if pair in first_pass_candidates:
            bucket_freqs_2[hash_2(*pair)] += 1

stage_2_pairs = []
for pair in pcy_freq_pairs:
    if bucket_freqs_2[hash_2(*pair)] >= threshold:
        stage_2_pairs.append(pair)
        
print('Second Pass Bucket Frequencies')
print(bucket_freqs_2)
print()
print('Frequent Pairs after the Second Pass')
print(*stage_2_pairs)

Second Pass Bucket Frequencies
[0 5 5 0 6 0 0 0 6]

Frequent Pairs after the Second Pass
(1, 2) (1, 4) (2, 4) (2, 6) (3, 4) (3, 5) (4, 6) (5, 6)


The second hashing didn't reduce the frequent pairs, which is quite unfortunate.