In [702]:
# !pip install apyori

In [703]:
import pandas as pd
import numpy as np
import sys
import numpy_indexed as npi
from itertools import combinations, product
from functools import reduce
from collections.abc import Iterable
import math
# use for vertify
from mlxtend.frequent_patterns import apriori, association_rules


In [704]:
testing_dataset_path = '/home/sokhorn/sokhorn/dataSet/data/sample_data_set.csv'
sample_dataset = pd.read_csv(
    testing_dataset_path, sep=',', usecols=[
        'InvoiceNo',
        'StockCode',
        'Quantity'
    ])
sample_dataset
item_sets = (
    sample_dataset.groupby(['InvoiceNo', 'StockCode', ])['Quantity']
    .sum().unstack().reset_index().fillna(0)
    .set_index("InvoiceNo")
)
# item_sets = item_sets.iloc[:, :10]
item_sets = item_sets.head(1000)
item_sets = item_sets.applymap(lambda x: 1 if x > 0 else 0)
item_sets.reindex(sorted(item_sets.columns), axis=1)
item_sets

StockCode,A,B,C,D
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,1,1,0
2,0,1,1,1
3,1,0,1,1
4,1,1,0,0


#### Calculating RC count for each row of our data set 

In [705]:
# RC count generation 
itemset_reduct, rc_values =  npi.count(item_sets.values)

In [706]:
def rc(item_np):
    np_hash = {}
    item_rc = []
    for item in item_np:
        key = " ".join(map(str, item))
        if np_hash.get(key):
            np_hash[key] += 1
        else:
            np_hash[key] = 1

    for item in np_hash:
        values = list(map(int,  item.split()))
        values.append(np_hash[item])
        item_rc.append(values)
    return item_rc


In [707]:
# RC counr and tranction after reduce
rc_np = np.array(rc(item_sets.values))
df_rc = pd.DataFrame(rc_np)
columns = list(item_sets.columns)
columns.append("RC")
df_rc.set_axis(columns, inplace=True, axis=1)
RC = df_rc['RC']
df_rc.drop(['RC'], axis=1, inplace=True)
df_rc


Unnamed: 0,A,B,C,D
0,1,1,1,0
1,0,1,1,1
2,1,0,1,1
3,1,1,0,0


In [708]:
# Support Count 
# reduce(function, input)
def support_k_itemst(k_itemst):
    s = 0
    for i in range(len(k_itemst)):
        s += reduce(lambda a, b: a & b, k_itemst[i] & RC[i])
    return s

In [709]:
def support_appriori(k_itemset, item_sets = df_rc):
    mask = (k_itemset == 1).all(axis=1) # get all row which value are equal to 1 
    return len(item_sets[mask])


**Generating 1 itemse base on support count of row**

In [710]:
C = {}
L = {}
k_items = []
Discard = {}
itemset_size = 1
min_support = 0.5
Discard.update({itemset_size: []})

### Remove every column which it support count are less than user defind support 

In [711]:
# remove each column who support count are less than min_support for 1 itemsets
rc_1_itemset = df_rc.sum(axis=0)
# remove all column which sum of row are ness than user defind support threshold
cut_our_cols = rc_1_itemset.loc[lambda x: x < min_support].index
# cut our every itemset which are sum of each row are less than user defind support
df_rc.drop(labels=cut_our_cols, axis=1, inplace=True)
# updpate the first item set which are in frequent 
Discard.update({itemset_size: cut_our_cols})

In [712]:
Discard

{1: Index([], dtype='object')}

In [713]:
# Frequent 1 itemsets
C.update({itemset_size: np.reshape(list(df_rc.columns), (-1, 1))})

In [714]:
import typing
import itertools


def join_step(itemsets: typing.List[tuple]):
    i = 0

    while i < len(itemsets):

        skip = 1

        *itemset_first, itemset_last = itemsets[i]

        tail_items = [itemset_last]
        tail_items_append = tail_items.append

        for j in range(i + 1, len(itemsets)):

            *itemset_n_first, itemset_n_last = itemsets[j]

            if itemset_first == itemset_n_first:  # k - 1, item are identical
                tail_items_append(itemset_n_last)
                skip += 1
            else:
                break

        itemset_first_tuple = tuple(itemset_first)
        for a, b in sorted(itertools.combinations(tail_items, 2)):
            yield list(itemset_first_tuple + (a,) + (b,))

        i += skip


def prune_step(itemsets: typing.Iterable[tuple], possible_itemsets: typing.List[tuple]):
    itemsets = [
        np.unique(subarr) for subarr in itemsets
    ]
    for possible_itemset in possible_itemsets:
        for i in range(len(possible_itemset) - 2):
            removed = possible_itemset[:i] + possible_itemset[i + 1:]
            if (np.array(removed) not in np.array(itemsets)):
                break
            else:
                yield possible_itemset
            yield possible_itemset


def apriori_gen(item_sets):
    posible_extenstion = join_step(item_sets)
    yield from prune_step(item_sets, posible_extenstion)


In [715]:
# def candidate_generation(Lk):
#     try:
#         k = len(Lk)
#         k_1_item = len(Lk[0]) - 1
#         for i in range(k):
#             L1 = list(Lk[i])[-k_1_item::]
#             for j in range(i + 1, k):
#                 L2 = list(Lk[j])[:k_1_item:]
#                 if L1 == L2:
#                     yield list(np.union1d(Lk[i], Lk[j]))
#                     # test next item the same or not, if no break this loop
#                     if(j + 1 < k):
#                         if((L1 != list(Lk[j + 1])[:k_1_item:])):
#                             break

#     except IndexError as err:
#         print(f"Error with {err}")


In [716]:
def get_frequent(itemesets, min_support, prev_discard):
    L = []
    support_count = []
    new_discard = []
    k = len(prev_discard)
    for i in range(len(itemesets)):
        discard_before = False
        result = itemesets[i]

        # if result in prev_discard than break this loop
        if k > 0:
            for it in prev_discard[k]:
                if set(it).issubset(set(result)):
                    discard_before = True
                    break
        
        if not discard_before:
            # print(f'item {mergeKItemIntoOne(item)}')
            count = support_k_itemst(df_rc[result].values)
            if count >= min_support:
                L.append(result)
                support_count.append(count)
            else:
                new_discard.append(result)
    return L, support_count, new_discard


In [717]:
# def get_frequent(itemesets, min_support, prev_discard):
#     k = len(prev_discard)
#     for i in range(len(itemesets)):
#         discard_before = False
#         result = itemesets[i]

#         # if result in prev_discard than break this loop
#         if k > 0:
#             for it in prev_discard[k]:
#                 if set(it).issubset(set(result)):
#                     discard_before = True
#                     break

#         if not discard_before:
#             # print(f'item {mergeKItemIntoOne(item)}')
#             count = support_appriori(df_rc[result])
#             status = True
#             if count >= min_support:
#                 status = False
#             else:
#                 status = True
#             yield result, count, status

In [718]:
support_count = {}
f, supp, new_discard, = get_frequent(C[itemset_size], min_support, Discard)

Discard.update({itemset_size: new_discard})
L.update({itemset_size: f})
support_count.update({itemset_size: supp})


In [719]:
# c1 = np.array(C[1]).flatten()


# def candidate_generation_1(Lk):
#     try:
#         k = len(Lk)
#         for i in range(k):
#             for j in range(i + 1, k):
#                 yield list(np.union1d(Lk[i], Lk[j]))
#     except IndexError as err:
#         print(f"Error with {err}")


# C.update({k: list(candidate_generation_1(L[k - 1]))})
# f, supp, new_discard = filter_item(get_frequent(C[k], min_support, Discard))
# L.update({k: f})
# Discard.update({k: new_discard})
# support_count.update({k: supp})


# k += 1
# c_k = list(candidate_generation(L[k - 1]))
# f, supp, new_discard = filter_item(
#     get_frequent(c_k, min_support, Discard))


In [737]:
def frequent_item_set_gen():
    k = itemset_size + 1
    while True:
        try:
            C.update(
                {
                    k: list(join_step(L[k - 1]))
                }
            )
            f, supp, new_discard = get_frequent(C[k], min_support, Discard)
            L.update({k: f})
            Discard.update({k: new_discard})
            support_count.update({k: supp})
            if(len(L[k]) == 0):
                break
            k += 1
        except ValueError as err:
            print(f"Hello, there some error with {err}")
            break
    return C, support_count, Discard


### All in one place

In [740]:
f, s, d = frequent_item_set_gen()

In [734]:
import matplotlib.pyplot as plt
import timeit

In [746]:
def for_loop_array():
    for i in range(10):
        pass


In [None]:
import numpy as np
import timeit

from matplotlib import pyplot as plt

start_time = timeit.default_timer()

r = range(1, 151)
dt = []
for i in r:
    frequent_item_set_gen()
    dt.append(timeit.default_timer()-start_time)
plt.plot(r, dt)
