In [1]:
import pandas as pd
import numpy as np
# Import all libraries for the rest of the blog post
from scipy.integrate import quad
import scipy.stats as st


In [2]:
df = pd.read_csv("/home/sokhorn/sokhorn/dataSet/sample/sampleTestFuzzy.csv")
df


Unnamed: 0,InvoiceID,ItemName,Qty
0,1,A,1
1,1,B,1
2,1,D,1
3,2,A,1
4,2,B,1
5,2,C,1
6,2,D,1
7,3,B,1
8,3,D,1
9,4,B,1


In [3]:
itemset = df.groupby(['InvoiceID', 'ItemName'])['Qty'].sum(
).unstack().reset_index().fillna(0).set_index("InvoiceID").astype(int)
itemset


ItemName,A,B,C,D,E,F
InvoiceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,0,1,0,0
2,1,1,1,1,0,0
3,0,1,0,1,0,0
4,0,1,1,1,1,0
5,1,0,1,0,1,0
6,0,1,0,1,0,1
7,1,0,0,0,1,1
8,0,0,1,0,0,1
9,0,1,1,0,0,1
10,1,1,1,1,0,1


In [4]:
def normalProbabilityDensity(x):
    constant = 1.0 / np.sqrt(2*np.pi)
    return(constant * np.exp((-x**2) / 2.0))


zoe_percentile, _ = quad(normalProbabilityDensity, np.NINF, 1.25)
mike_percentile, _ = quad(normalProbabilityDensity, np.NINF, 1.00)
print('Zoe: ', zoe_percentile)
print('Mike: ', mike_percentile)
standard_normal_table = pd.DataFrame(
    data=[],
    index=np.round(np.arange(0, 3.5, .1), 2),
    columns=np.round(np.arange(0.00, .1, .01), 2)
)

for index in standard_normal_table.index:
    for column in standard_normal_table.columns:
        z = np.round(index + column, 2)
        value, _ = quad(normalProbabilityDensity, np.NINF, z)
        standard_normal_table.loc[index, column] = value

standard_normal_table.index = standard_normal_table.index.astype(str)
standard_normal_table.columns = [str(column).ljust(
    4, '0') for column in standard_normal_table.columns]
standard_normal_table


Zoe:  0.894350226333146
Mike:  0.8413447460685435


Unnamed: 0,0.00,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09
0.0,0.5,0.503989,0.507978,0.511966,0.515953,0.519939,0.523922,0.527903,0.531881,0.535856
0.1,0.539828,0.543795,0.547758,0.551717,0.55567,0.559618,0.563559,0.567495,0.571424,0.575345
0.2,0.57926,0.583166,0.587064,0.590954,0.594835,0.598706,0.602568,0.60642,0.610261,0.614092
0.3,0.617911,0.62172,0.625516,0.6293,0.633072,0.636831,0.640576,0.644309,0.648027,0.651732
0.4,0.655422,0.659097,0.662757,0.666402,0.670031,0.673645,0.677242,0.680822,0.684386,0.687933
0.5,0.691462,0.694974,0.698468,0.701944,0.705401,0.70884,0.71226,0.715661,0.719043,0.722405
0.6,0.725747,0.729069,0.732371,0.735653,0.738914,0.742154,0.745373,0.748571,0.751748,0.754903
0.7,0.758036,0.761148,0.764238,0.767305,0.77035,0.773373,0.776373,0.77935,0.782305,0.785236
0.8,0.788145,0.79103,0.793892,0.796731,0.799546,0.802337,0.805105,0.80785,0.81057,0.813267
0.9,0.81594,0.818589,0.821214,0.823814,0.826391,0.828944,0.831472,0.833977,0.836457,0.838913


In [5]:
def sampleSize(accurateResult, probabilityRequirement):
    n = 0
    z_value = st.norm.ppf((probabilityRequirement + 1) / 2)
    n = (pow(z_value, 2)) / (4 * pow(accurateResult, 2))
    n = n + 1
    return int(n)


size = sampleSize(accurateResult=0.01, probabilityRequirement=0.99)
print(
    f"Sample size = {size}")


Sample size = 16588


In [6]:
def instanceSelection(sampleSize, m):
    X = set({})
    a = 2
    b = 3
    x0 = 5
    c = x0
    X.update({x0})
    while(len(X) != sampleSize):
        x = (a * c + b) % m
        if not set({x}).issubset(X):
            X.update({x})
        else:
            break
        c = x
    return X


sample_size = sampleSize(accurateResult=0.01, probabilityRequirement=0.99)
sample_size = 5
random_selection = instanceSelection(sampleSize=sample_size, m=10)
random_selection = np.fromiter(random_selection, int, len(random_selection))
random_selection


array([9, 3, 5, 1])

In [7]:
def randomDatabset(itemset, sample_size=5):
    random_selection = instanceSelection(
        sampleSize=sample_size, m=len(itemset))
    random_selection = np.fromiter(
        random_selection, int, len(random_selection))
    j_index = np.where(random_selection < itemset.index.values.max() - 1,
                       random_selection + 1,  random_selection)
    return itemset.iloc[j_index]


RD = randomDatabset(itemset=itemset)
RD                                                                                                  


ItemName,A,B,C,D,E,F
InvoiceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10,1,1,1,1,0,1
5,1,0,1,0,1,0
7,1,0,0,0,1,1
3,0,1,0,1,0,0


In [8]:
import timeit

In [44]:
# Sebastian Raschka 2014-2020
# myxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
import pandas as pd


def generate_new_combinations(old_combinations):
    items_types_in_previous_step = np.unique(old_combinations.flatten())
    for old_combination in old_combinations:
        max_combination = old_combination[-1]  # get a single item in the last
        mask = items_types_in_previous_step > max_combination
        valid_items = items_types_in_previous_step[mask]
        old_tuple = tuple(old_combination)
        for item in valid_items:

            yield from old_tuple
            yield item


def generate_new_combinations_low_memory(old_combinations, X, min_support,
                                         is_sparse):

    items_types_in_previous_step = np.unique(old_combinations.flatten())
    rows_count = X.shape[0]
    threshold = min_support * rows_count
    for old_combination in old_combinations:
        max_combination = old_combination[-1]
        mask = items_types_in_previous_step > max_combination
        valid_items = items_types_in_previous_step[mask]
        old_tuple = tuple(old_combination)
        if is_sparse:
            mask_rows = X[:, old_tuple].toarray().all(axis=1)
            X_cols = X[:, valid_items].toarray()
            supports = X_cols[mask_rows].sum(axis=0)
        else:
            mask_rows = X[:, old_tuple].all(axis=1)
            supports = X[mask_rows][:, valid_items].sum(axis=0)
        valid_indices = (supports >= threshold).nonzero()[0]
        for index in valid_indices:
            yield supports[index]
            yield from old_tuple
            yield valid_items[index]


def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
            low_memory=False, mininterest=0.07):

    start = timeit.default_timer()

    def _support(_x, _n_rows, _is_sparse):
        out = (np.sum(_x, axis=0) / _n_rows)
        return np.array(out).reshape(-1)

    if min_support <= 0.:
        raise ValueError('`min_support` must be a positive '
                         'number within the interval `(0, 1]`. '
                         'Got %s.' % min_support)

    if hasattr(df, "sparse"):
        # DataFrame with SparseArray (pandas >= 0.24)
        if df.size == 0:
            X = df.values
        else:
            X = df.sparse.to_coo().tocsc()
        is_sparse = True
    else:
        # dense DataFrame
        X = df.values
        is_sparse = False

    support = _support(X, X.shape[0], is_sparse)
    ary_col_idx = np.arange(X.shape[1])
    support_dict = {1: support[support >= min_support]}
    itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
    max_itemset = 1
    rows_count = float(X.shape[0])

    all_ones = np.ones((int(rows_count), 1))

    while max_itemset and max_itemset < (max_len or float('inf')):
        next_max_itemset = max_itemset + 1
        # conver from generator to numpy
        combin = generate_new_combinations(itemset_dict[max_itemset])
        combin = np.fromiter(combin, dtype=int)
        combin = combin.reshape(-1, next_max_itemset)
        # print(combin)
        if combin.size == 0:
            break
        if verbose:
            print(
                '\rProcessing %d combinations | Sampling itemset size %d' %
                (combin.size, next_max_itemset), end="")
        if is_sparse:
            _bools = X[:, combin[:, 0]] == all_ones
            for n in range(1, combin.shape[1]):
                _bools = _bools & (X[:, combin[:, n]] == all_ones)
        else:
            _bools = np.all(X[:, combin], axis=2)

        support = _support(np.array(_bools), rows_count, is_sparse)
        _mask = (support >= min_support).reshape(-1)
        print(np.apply_along_axis(
            func1d=lambda x:  (x), arr=X[:, combin], axis=1), )
        if any(_mask):
            itemset_dict[next_max_itemset] = np.array(combin[_mask])
            support_dict[next_max_itemset] = np.array(support[_mask])
            max_itemset = next_max_itemset
        else:
            break
        print()
    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]],
                             dtype='object')

        res = pd.concat((support, itemsets), axis=1)
        all_res.append(res)

    res_df = pd.concat(all_res)
    res_df.columns = ['support', 'itemsets']
    if use_colnames:
        mapping = {idx: item for idx, item in enumerate(df.columns)}
        res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
                                                      mapping[i] for i in x]))
    res_df = res_df.reset_index(drop=True)

    if verbose:
        print()  # adds newline if verbose counter was used
    stop = timeit.default_timer()
    execution_time = stop - start
    # print(f"execution time : {execution_time}")
    return res_df, execution_time


print("Apriori Itemsets")
fre_apriori_itemsets = apriori(itemset, min_support=0.3, use_colnames=True)
fre_apriori_itemsets[0]


Apriori Itemsets
[[[1 1]
  [1 0]
  [1 1]
  [1 0]
  [1 0]
  [1 0]
  [1 1]
  [1 0]
  [1 0]
  [0 1]
  [0 0]
  [0 0]
  [1 0]
  [1 0]
  [0 0]]

 [[1 1]
  [1 1]
  [1 1]
  [1 0]
  [1 0]
  [1 1]
  [1 1]
  [1 0]
  [1 0]
  [1 1]
  [1 0]
  [1 0]
  [1 0]
  [1 0]
  [0 0]]

 [[0 1]
  [0 0]
  [0 1]
  [0 0]
  [0 0]
  [1 0]
  [1 1]
  [1 0]
  [1 0]
  [0 1]
  [0 0]
  [0 0]
  [1 0]
  [1 0]
  [0 0]]

 [[0 1]
  [0 1]
  [0 1]
  [0 1]
  [0 0]
  [1 1]
  [1 1]
  [1 1]
  [1 0]
  [1 1]
  [1 1]
  [1 0]
  [1 1]
  [1 0]
  [1 0]]

 [[1 0]
  [1 1]
  [1 0]
  [1 1]
  [1 0]
  [0 1]
  [0 0]
  [0 1]
  [0 0]
  [1 0]
  [1 1]
  [1 0]
  [0 1]
  [0 0]
  [1 0]]

 [[0 1]
  [0 0]
  [0 1]
  [0 0]
  [0 1]
  [1 0]
  [1 1]
  [1 0]
  [1 1]
  [0 1]
  [0 0]
  [0 1]
  [1 0]
  [1 1]
  [0 1]]

 [[1 0]
  [1 0]
  [1 0]
  [1 1]
  [1 1]
  [0 0]
  [0 0]
  [0 1]
  [0 1]
  [0 0]
  [0 1]
  [0 1]
  [0 1]
  [0 1]
  [1 1]]

 [[0 0]
  [0 1]
  [0 0]
  [0 0]
  [0 1]
  [0 1]
  [0 0]
  [0 0]
  [0 1]
  [1 0]
  [1 0]
  [1 1]
  [0 0]
  [0 1]
  [0 1]]

 [[0 1]

Unnamed: 0,support,itemsets
0,0.5,(A)
1,0.7,(B)
2,0.6,(C)
3,0.6,(D)
4,0.3,(E)
5,0.5,(F)
6,0.3,"(B, A)"
7,0.3,"(A, C)"
8,0.3,"(A, D)"
9,0.4,"(B, C)"


In [37]:
itemset

ItemName,A,B,C,D,E,F
InvoiceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,0,1,0,0
2,1,1,1,1,0,0
3,0,1,0,1,0,0
4,0,1,1,1,1,0
5,1,0,1,0,1,0
6,0,1,0,1,0,1
7,1,0,0,0,1,1
8,0,0,1,0,0,1
9,0,1,1,0,0,1
10,1,1,1,1,0,1


In [11]:
itemset.sum(axis=0)


ItemName
A    5
B    7
C    6
D    6
E    3
F    5
dtype: int64