In [1]:
import pandas as pd
import numpy as np
# Import all libraries for the rest of the blog post
from scipy.integrate import quad
import scipy.stats as st
import timeit
import math


In [2]:
df = pd.read_csv("/home/sokhorn/sokhorn/dataSet/sample/sampleTestFuzzy.csv")
df


Unnamed: 0,InvoiceID,ItemName,Qty
0,1,A,1
1,1,B,1
2,1,D,1
3,2,A,1
4,2,B,1
5,2,C,1
6,2,D,1
7,3,B,1
8,3,D,1
9,4,B,1


In [3]:
itemset = df.groupby(['InvoiceID', 'ItemName'])['Qty'].sum(
).unstack().reset_index().fillna(0).set_index("InvoiceID").astype(int)
itemset


ItemName,A,B,C,D,E,F
InvoiceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,0,1,0,0
2,1,1,1,1,0,0
3,0,1,0,1,0,0
4,0,1,1,1,1,0
5,1,0,1,0,1,0
6,0,1,0,1,0,1
7,1,0,0,0,1,1
8,0,0,1,0,0,1
9,0,1,1,0,0,1
10,1,1,1,1,0,1


In [4]:
def normalProbabilityDensity(x):
    constant = 1.0 / np.sqrt(2*np.pi)
    return(constant * np.exp((-x**2) / 2.0))


zoe_percentile, _ = quad(normalProbabilityDensity, np.NINF, 1.25)
mike_percentile, _ = quad(normalProbabilityDensity, np.NINF, 1.00)
print('Zoe: ', zoe_percentile)
print('Mike: ', mike_percentile)
standard_normal_table = pd.DataFrame(
    data=[],
    index=np.round(np.arange(0, 3.5, .1), 2),
    columns=np.round(np.arange(0.00, .1, .01), 2)
)

for index in standard_normal_table.index:
    for column in standard_normal_table.columns:
        z = np.round(index + column, 2)
        value, _ = quad(normalProbabilityDensity, np.NINF, z)
        standard_normal_table.loc[index, column] = value

standard_normal_table.index = standard_normal_table.index.astype(str)
standard_normal_table.columns = [str(column).ljust(
    4, '0') for column in standard_normal_table.columns]


Zoe:  0.894350226333146
Mike:  0.8413447460685435


In [5]:
def sampleSize(accurateResult, probabilityRequirement):
    n = 0
    z_value = st.norm.ppf((probabilityRequirement + 1) / 2)
    n = (pow(z_value, 2)) / (4 * pow(accurateResult, 2))
    n = n + 1
    return int(n)


Sample size = 16588


In [68]:
def instanceSelection(sampleSize, m):
    X = set({})
    a = 2
    b = 3
    x0 = 5
    c = x0
    X.update({x0})
    while(len(X) != sampleSize):
        x = (a * c + b) % m
        if not set({x}).issubset(X):
            X.update({x})
        else:
            break
        c = x
    return X


array([9, 3, 5, 1])

In [70]:
def randomDatabset(itemset, sample_size):
    random_selection = instanceSelection(
        sampleSize=sample_size, m=len(itemset))
    random_selection = np.fromiter(
        random_selection, int, len(random_selection))
    j_index = np.where(random_selection < itemset.index.values.max() - 1,
                       random_selection + 1,  random_selection)
    return itemset.iloc[j_index]


ItemName,A,B,C,D,E,F
InvoiceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10,1,1,1,1,0,1
5,1,0,1,0,1,0
7,1,0,0,0,1,1
3,0,1,0,1,0,0


In [92]:
# Sebastian Raschka 2014-2020
# myxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
import pandas as pd


def generate_new_combinations(old_combinations):
    items_types_in_previous_step = np.unique(old_combinations.flatten())
    for old_combination in old_combinations:
        max_combination = old_combination[-1]  # get a single item in the last
        mask = items_types_in_previous_step > max_combination
        valid_items = items_types_in_previous_step[mask]
        old_tuple = tuple(old_combination)
        for item in valid_items:

            yield from old_tuple
            yield item


def generate_new_combinations_low_memory(old_combinations, X, min_support,
                                         is_sparse):

    items_types_in_previous_step = np.unique(old_combinations.flatten())
    rows_count = X.shape[0]
    threshold = min_support * rows_count
    for old_combination in old_combinations:
        max_combination = old_combination[-1]
        mask = items_types_in_previous_step > max_combination
        valid_items = items_types_in_previous_step[mask]
        old_tuple = tuple(old_combination)
        if is_sparse:
            mask_rows = X[:, old_tuple].toarray().all(axis=1)
            X_cols = X[:, valid_items].toarray()
            supports = X_cols[mask_rows].sum(axis=0)
        else:
            mask_rows = X[:, old_tuple].all(axis=1)
            supports = X[mask_rows][:, valid_items].sum(axis=0)
        valid_indices = (supports >= threshold).nonzero()[0]
        for index in valid_indices:
            yield supports[index]
            yield from old_tuple
            yield valid_items[index]


def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
            low_memory=False, minimumInterest=0.07):

    start = timeit.default_timer()

    def _support(_x, _n_rows, _is_sparse):
        # print(f"X : {_x.astype(int)}")
        # print(f"sum : {np.sum(_x, axis=1)}")
        out = (np.sum(_x, axis=0) / _n_rows)
        return np.array(out).reshape(-1)

    def _min_interest(support_each_item, support, _n_rows):
        return abs(support - np.prod(support_each_item.sum(axis=0) / rows_count, axis=1))

    if min_support < 0.:
        raise ValueError('`min_support` must be a positive '
                         'number within the interval `(0, 1]`. '
                         'Got %s.' % min_support)

    if hasattr(df, "sparse"):
        # DataFrame with SparseArray (pandas >= 0.24)
        if df.size == 0:
            X = df.values
        else:
            X = df.sparse.to_coo().tocsc()
        is_sparse = True
    else:
        # dense DataFrame
        X = df.values
        is_sparse = False

    support = _support(X, X.shape[0], is_sparse)
    ary_col_idx = np.arange(X.shape[1])
    support_dict = {1: support[support >= min_support]}
    itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
    max_itemset = 1
    rows_count = float(X.shape[0])

    all_ones = np.ones((int(rows_count), 1))

    while max_itemset and max_itemset < (max_len or float('inf')):
        next_max_itemset = max_itemset + 1
        # conver from generator to numpy
        combin = generate_new_combinations(itemset_dict[max_itemset])
        combin = np.fromiter(combin, dtype=int)
        combin = combin.reshape(-1, next_max_itemset)
        # print(combin)
        if combin.size == 0:
            break
        if verbose:
            print(
                '\rProcessing %d combinations | Sampling itemset size %d' %
                (combin.size, next_max_itemset), end="")
        if is_sparse:
            _bools = X[:, combin[:, 0]] == all_ones
            for n in range(1, combin.shape[1]):
                _bools = _bools & (X[:, combin[:, n]] == all_ones)
        else:
            _bools = np.all(X[:, combin], axis=2)

        support = _support(np.array(_bools), rows_count, is_sparse)
        _mask = (support >= min_support).reshape(-1)
        mask_interest = (support == min_support)
        support_each_item = X[:, combin]  # maybe slow here

        s = np.array(support[_mask])
        i = np.array(combin[_mask])
        if any(_mask):
            itemset_dict[next_max_itemset] = i
            support_dict[next_max_itemset] = s

            max_itemset = next_max_itemset
        else:
            break

        # interest = _min_interest(
        #     support_each_item, support, rows_count)
        # min_interest_mask = (interest >= minimumInterest).reshape(-1)

        # if any(mask_interest):
        #     support_equal = np.array(support[mask_interest])
        #     item_equal = np.array(combin[mask_interest])
        #     print(item_equal)

    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]],
                             dtype='object')

        res = pd.concat((support, itemsets), axis=1)
        all_res.append(res)

    res_df = pd.concat(all_res)
    res_df.columns = ['support', 'itemsets']
    if use_colnames:
        mapping = {idx: item for idx, item in enumerate(df.columns)}
        res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
            mapping[i] for i in x]))
    res_df = res_df.reset_index(drop=True)
    if verbose:
        print()  # adds newline if verbose counter was used

    stop = timeit.default_timer()
    execution_time = stop - start
    # print(f"execution time : {execution_time}")
    return res_df, execution_time



0     0.5
1     0.7
2     0.6
3     0.6
4     0.3
5     0.5
6     0.3
7     0.3
8     0.3
9     0.4
10    0.6
11    0.3
12    0.3
13    0.3
14    0.3
15    0.3
Name: support, dtype: float64

In [11]:

def a_avgsupp(m, k, N):

    n_over_N = sum([math.pow(m / N, i) for i in range(k, m)])

    operation = 1 / (m - k + 1)
    return n_over_N * operation


def a_b_avg(supports):
    return min(supports), max(supports), sum(supports) / len(supports)


# def lean(support_count_i_itemset, M, avg):
#     less_than_avg = []
#     greater_than_avg = []
#     for j in support_count_i_itemset:
#         if j < avg:
#             less_than_avg.append(1)
#         elif j > avg:
#             greater_than_avg.append(1)
#     return (sum(less_than_avg) - sum(greater_than_avg)) / M


def actul_minimum_support(a, b, r_min, n=1):
    return ((pow(b, n) - pow(a, n)) * r_min + pow(a, n))


def lean(supports, avg, m):
    less_avg = np.apply_along_axis(
        lambda x: x < avg, arr=supports, axis=0).sum()
    greater_avg = np.apply_along_axis(
        lambda x: x > avg, arr=supports, axis=0).sum()
    return (less_avg - greater_avg) / m


def aproximate_minsupport(r_min, min_supp, max_min, n=1):
    a_n = pow(min_supp, n)
    b_n = pow(max_min, n)
    x_n = (r_min - (a_n / (a_n - b_n))) * (b_n - a_n)
    return x_n


In [64]:
def actual_min_support(dataset, r_minsupp, m, k=0, n=3):
    a = 1 / len((dataset))
    N = dataset.shape[1]
    itemset, time = apriori(df=dataset, min_support=r_minsupp)
    b = itemset['support'].max()
    avg = a_avgsupp(N=N, m=m, k=k)
    if a <= avg:
        r_minsupp = (r_minsupp * 2*(math.pow(avg, n) - pow(a, n))
                     ) - pow(a, n) / (2 * (pow(a, n) - pow(avg, n)))
        r_minsupp = pow(r_minsupp, 1 / n)
    elif avg <= b:
        r_minsupp = (r_minsupp * (2*(pow(b, n) - pow(avg, n)))) - \
            (pow(b, n) - 2 * pow(avg, n)) / (2 * (pow(b, n) - pow(avg, n)))
        r_minsupp = pow(r_minsupp, 1 / n)
    return r_minsupp


m = int(itemset.sum(axis=1).sum() / len(itemset))  # avg att per rows
actual_min_support(dataset=itemset, r_minsupp=0.7, m=m,)


0.4958040021794584

In [90]:
def fuzzy_frequent_itemset(dataset, f_support, k=0):
    size = sampleSize(accurateResult=0.01, probabilityRequirement=0.99)
    N = dataset.shape[1]
    random_selection = instanceSelection(sampleSize=size, m=len(itemset))

    SD = np.fromiter(
        random_selection, int, len(random_selection))
    
    a = itemset.shape[0]
    supports_1_itemset = itemset.sum(axis=1).values / a
    b = max(supports_1_itemset)

    m = int(dataset.sum(axis=1).sum() / len(dataset))  # avg att per rows
    avg = a_avgsupp(N=N, m=m, k=k)

    print(avg, m)


fuzzy_frequent_itemset(itemset, 1)


0.4375 3


array([5, 3, 3, 2])

In [108]:
a = RD
fre_apriori_itemsets = apriori(itemset, min_support=0.3, use_colnames=True)
m = int(a.sum(axis=1).sum() / len(a))  # avg att per rows
lean(supports=fre_apriori_itemsets[0]['support'], avg=0.23721, m=m)
# lean(supports=RD.sum(axis=1), avg=0.23721, m=m)

-5.333333333333333

Unnamed: 0,support,itemsets
0,0.5,(A)
1,0.7,(B)
2,0.6,(C)
3,0.6,(D)
4,0.3,(E)
5,0.5,(F)
6,0.3,"(B, A)"
7,0.3,"(A, C)"
8,0.3,"(A, D)"
9,0.4,"(B, C)"
