# SDSC3001 - Assignment 3


## Question 1

In [None]:
import random


def reservoir_sampling(k, stream):
    """Return a random sample of at most ``k`` items drawn from ``stream``.

    The stream is consumed in a single pass. The first ``k`` items fill the
    sample; every later item (0-based position ``index``) draws a slot
    uniformly from ``0..index`` and overwrites that slot only if it falls
    inside the sample, i.e. with probability ``k / (index + 1)``.

    Args:
        k: Number of items to keep.
        stream: Any iterable; its length does not need to be known.

    Returns:
        A list holding the sampled items (fewer than ``k`` if the stream is
        shorter than ``k``).
    """
    sample = []
    for index, element in enumerate(stream):
        if index >= k:
            # Sample is full: the newcomer may evict a uniformly chosen slot.
            slot = random.randint(0, index)
            if slot < k:
                sample[slot] = element
            continue
        # Still filling the first k positions.
        sample.append(element)
    return sample

In [None]:
# Demo: sample 5 items from a simulated stream of the integers 0..999.
stream = range(1000)  # Simulate a data stream
sample_size = 5  # number of items to keep (passed as k)
result = reservoir_sampling(sample_size, stream)
print(f"Random sample of {sample_size} items:", result)

### Proof of correctness

Reservoir sampling maintains $k$ uniform samples from a stream: at any time point $t \ge k$, each of the $t$ elements seen so far is in the reservoir with probability $\frac{k}{t}$. This can be proven by induction on $t$.

Base case: when $t = k$, the reservoir is filled with the first $k$ elements, so each of these $k$ elements is in the reservoir with probability $1 = \frac{k}{k}$.

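Inductive step: assume that after filling the reservoir with the first $k$ elements and processing $t - 1 \ge k$ elements in total, each element $x_i$ ($i \le t-1$) is in the reservoir with probability $\frac{k}{t-1}$. Now consider the $t$-th element $x_t$. It enters the reservoir with probability $\frac{k}{t}$ and, when it does, replaces a uniformly chosen slot, so a given $x_i$ currently in the reservoir is replaced with probability $\frac{k}{t} \cdot \frac{1}{k} = \frac{1}{t}$. Hence the probability of $x_i$ staying in the reservoir (not being replaced by $x_t$, in other words) is $1-\frac{1}{t}$, while $x_t$ itself is in the reservoir with probability $\frac{k}{t}$.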

Therefore, the probability that $x_i$ is kept as a sample after step $t$ is the product of these two probabilities: $\frac{k}{t-1} \cdot (1-\frac{1}{t}) = \frac{k}{t-1} \cdot \frac{t-1}{t} = \frac{k}{t}$.

## Question 2

### Part A

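An itemset $I$ of size $m$ has $2^m - 1$ non-empty subsets (including $I$ itself), and each of them is at least as frequent as $I$. Hence, when mining the top-$k$ most frequent patterns, if $I$ is among the top $k$ then all of these subsets must fit within the top $k$ as well: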

$$
2^m - 1 \leq k \\
2^m \leq k + 1 \\
m \leq \log_2(k + 1) \\
\therefore m = \lceil \log_2(k + 1) \rceil
$$

### Part B

#### b.1

#### b.2

#### b.3

#### b.4
Question B4:

Set $k=500$. Run your Misra–Gries Algorithm on the "trans.txt" dataset and report the values of $L$ and $minSup(A)$ when setting $C=500000, 750000, 1000000$. To compute $minSup(A)$, you can refer to the file "patterns_Apriori.txt" containing all the frequent patterns of support at least $21$. Each line of "patterns_Apriori.txt" is in the form $id_1,id_2,...,id_l:sup$, where $id_1,id_2,...,id_l$ denotes a pattern $\{id_1,id_2,...,id_l\}$ and $sup$ is the support of this pattern. (Hint: the file "patterns_Apriori.txt" contains enough information. If your algorithm returns some pattern that is not in the "patterns_Apriori.txt" file, probably your algorithm is not implemented correctly.)

In [None]:
import sys
import math
from collections import Counter
from itertools import combinations


class FrequenctPatterns:
    """Approximate top-k frequent-pattern mining with a Misra-Gries summary.

    Attributes:
        transactions: One list of integer item ids per line of the
            transactions file.
        patterns: Maps a pattern (tuple of item ids in ascending order) to
            the support read from the reference patterns file.
    """

    def __init__(self, trans_path="trans.txt", patterns_path="patterns_Apriori.txt"):
        """Load the transactions and the reference pattern supports.

        Args:
            trans_path: File with one whitespace-separated transaction per line.
            patterns_path: File whose lines look like ``id_1,id_2,...,id_l:sup``.

        Raises:
            OSError: If either file cannot be opened.
            ValueError: If a non-blank line cannot be parsed.
        """
        self.transactions = []
        self.patterns = {}

        with open(trans_path, encoding="utf-8") as f:
            for line in f:
                transaction = list(map(int, line.split()))
                self.transactions.append(transaction)

        with open(patterns_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank or trailing empty lines instead of
                    # failing on the key/value unpack below.
                    continue
                key, value = line.split(":")
                key = tuple(sorted(map(int, key.split(","))))
                self.patterns[key] = int(value)

    def Misra_Gries(self, C, k=500):
        """Run Misra-Gries over the stream of transaction subsets.

        Every non-empty subset of size at most ``ceil(log2(k + 1))`` of each
        transaction is fed, in order, to a summary holding at most ``C``
        counters. The candidate set ``A`` contains every tracked pattern
        whose counter is at least the k-th largest counter minus
        ``L / (C + 1)``.

        Args:
            C: Maximum number of counters kept at any time.
            k: Number of top patterns targeted; must be at least 1.

        Returns:
            A tuple ``(L, min_sup, min_pattern)`` where ``L`` is the number
            of subsets streamed, ``min_sup`` is the smallest reference
            support among the patterns in ``A`` and ``min_pattern`` is the
            first pattern attaining it. If no pattern is tracked at the end,
            ``(L, sys.maxsize, "")`` is returned.

        Raises:
            ValueError: If ``k`` is smaller than 1.
            KeyError: If a candidate pattern is missing from ``self.patterns``.
        """
        if k < 1:
            raise ValueError("k must be a positive integer")

        m = math.ceil(math.log2(k + 1))  # maximum size of patterns we need to consider
        L = 0  # number of subsets in transactions processed
        counter = Counter()  # pattern frequency counter

        for transaction in self.transactions:
            # Stream the subsets one by one (same order as building the full
            # list first) so a long transaction never materialises all of
            # its subsets in memory at once.
            for size in range(1, min(m, len(transaction)) + 1):
                for combo in combinations(transaction, size):
                    subset = tuple(sorted(combo))
                    L += 1

                    if subset in counter or len(counter) < C:
                        # Already tracked, or a free counter is available.
                        counter[subset] += 1
                    else:
                        # No free counter: decrement all, dropping zeros.
                        for key in list(counter.keys()):
                            counter[key] -= 1
                            if counter[key] == 0:
                                del counter[key]

        counter_sorted = counter.most_common()
        if not counter_sorted:
            # Nothing is tracked (e.g. no transactions), so A is empty.
            return L, sys.maxsize, ""

        # With fewer than k tracked patterns the k-th largest estimate is 0
        # (untracked patterns have an estimate of 0), so every tracked
        # pattern becomes a candidate instead of raising IndexError.
        if len(counter_sorted) >= k:
            kth_count = counter_sorted[k - 1][1]
        else:
            kth_count = 0
        threshold = kth_count - L / (C + 1)
        A = [entry for entry in counter_sorted if entry[1] >= threshold]  # filtered patterns

        min_sup = sys.maxsize
        min_pattern = ""
        for item, _ in A:
            try:
                support = self.patterns[item]
            except KeyError:
                raise KeyError(
                    f"pattern {item} is not present in the reference patterns"
                ) from None
            # Strict comparison keeps the first pattern reaching the minimum.
            if support < min_sup:
                min_sup = support
                min_pattern = item

        return L, min_sup, min_pattern

In [None]:
# Build the miner once; with no arguments the constructor reads "trans.txt"
# and "patterns_Apriori.txt".
frequent_patterns = FrequenctPatterns()

In [None]:
# Run Misra-Gries (default k = 500) once per counter budget C and print L,
# the minimum reference support among the returned patterns, and the pattern
# that attains it.
for count in [500_000, 750_000, 1_000_000]:
    L, min_sup, min_pattern = frequent_patterns.Misra_Gries(count)
    print(f"C = {count}; value of {L = }, {min_sup = }, {min_pattern = }")