In [298]:
import pandas as pd
import numpy as np
import numpy_indexed as npi
from itertools import combinations, product
from functools import reduce
from collections.abc import Iterable
import math
# use for vertify
from mlxtend.frequent_patterns import apriori, association_rules


In [299]:
real_dataset_path = "/home/sokhorn/sokhorn/dataSet/data/Online Retail.csv"
testing_dataset_path = '/home/sokhorn/sokhorn/dataSet/data/sample_data_set.csv'

sample_dataset = pd.read_csv(
    testing_dataset_path, sep=',', usecols=[
        'InvoiceNo',
        'StockCode',
        'Quantity'
    ])
sample_dataset


Unnamed: 0,InvoiceNo,StockCode,Quantity
0,536365,85123A,6
1,536365,71053,6
2,536365,84406B,8
3,536365,84029G,6
4,536365,84029E,6
5,536365,22752,2
6,536365,21730,6
7,536366,22633,6
8,536366,22632,6
9,536367,84879,32


In [300]:
item_sets = (
    sample_dataset.groupby(['InvoiceNo', 'StockCode', ])['Quantity']
    .sum().unstack().reset_index().fillna(0)
    .set_index("InvoiceNo")
)
# item_sets = item_sets.iloc[:, :10]
item_sets = item_sets.head(1000)


In [301]:
item_sets = item_sets.applymap(lambda x: 1 if x > 0 else 0)
item_sets.reindex(sorted(item_sets.columns), axis=1)
item_sets

StockCode,21730,22632,22633,22745,22748,22749,22752,71053,84029E,84029G,84406B,84879,85123A
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
536365,1,0,0,0,0,0,1,1,1,1,1,0,1
536366,0,1,1,0,0,0,0,0,0,0,0,0,0
536367,0,0,0,1,1,1,0,0,0,0,0,1,0
536368,0,0,0,1,1,1,0,0,0,0,0,1,0
536369,0,0,0,1,1,1,0,0,0,0,0,1,0
536370,0,0,0,0,0,1,0,0,0,0,0,0,0
536371,0,0,0,0,0,1,0,0,0,0,0,0,0


**Generating RC column for each tranctions**


In [302]:
def rc(item_np):
    np_hash = {}
    item_rc = []
    for item in item_np:
        key = " ".join(map(str, item))
        if np_hash.get(key):
            np_hash[key] += 1
        else:
            np_hash[key] = 1

    for item in np_hash:
        values = list(map(int,  item.split()))
        values.append(np_hash[item])
        item_rc.append(values)
    return item_rc


In [303]:
# RC count generation 
itemset_reduct, rc_values =  npi.count(item_sets.values)

In [304]:
rc_np = np.array(rc(item_sets.values))
df_rc = pd.DataFrame(rc_np)
columns = list(item_sets.columns)
columns.append("RC")
df_rc.set_axis(columns, inplace=True, axis=1)
RC = df_rc['RC']
df_rc.drop(['RC'], axis=1, inplace=True)
df_rc


Unnamed: 0,21730,22632,22633,22745,22748,22749,22752,71053,84029E,84029G,84406B,84879,85123A
0,1,0,0,0,0,0,1,1,1,1,1,0,1
1,0,1,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,1,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0


In [305]:
# reduce(function, input)
def support_k_itemst(k_itemst):
    s = 0
    for i in range(len(k_itemst)):
        s += reduce(lambda a, b: a & b, k_itemst[i] & RC[i])
    return s


In [306]:
# support count 
def count_ocurence(itemset, Tranctions):
    count = 0
    for i in range(len(Tranctions)):
        if set(itemset).issubset(set(Tranctions[i])):
            count += 1
    return count


In [307]:
def join_set_item(set_of_its, order_column_name):
    C = []
    k = len(set_of_its[0])  # get K itmset size
    for i in range(len(set_of_its)):
        set_i = set_of_its[i]
        for j in range(i + 1, len(set_of_its)):
            set_j = set_of_its[j]
            if(k > 3):
                if(set_i[::k-1] == set_j[::k-1]):
                    it_out = join_two_itemsets(set_i, set_j, order_column_name)
                    if(len(it_out)) > 0:
                        C.append(it_out)
            else:
                it_out = join_two_itemsets(set_i, set_j, order_column_name)
                if(len(it_out)) > 0:
                    C.append(it_out)
    return C


**Fk-1 x Fk-1**


In [308]:
def apriori_gen(Lk):
    k = len(Lk)
    result_list = []
    for i in range(k):
        L1 = list(Lk[i])[::k-1]
        for j in range(i + 1, k):
            L2 = list(Lk[j])[::k-1]
            if L1 == L2:
                result_list.append(np.union1d(Lk[i], Lk[j]))
            else:
                break
    return result_list


In [309]:
def join_two_itemsets(it1, it2, order):
    # it1.sort(key=lambda x: order.index(x))
    # it2.sort(key=lambda x: order.index(x))

    for i in range(len(it1) - 1):  # check befor the last one
        if it1[i] != it2[i]:
            return []

    if order.index(it1[-1]) < order.index(it2[-1]):
        return [it1] + [it2[-1]]
    return []


**Join Itemset**


In [310]:
def mergeKItemIntoOne(ab):
    result = []
    if(len(ab) != 0):
        if(len(ab[0]) == 1):
            return ab
        else:
            for item in ab:
                res = item[0] + item[1].split()
                result.append(res)
        return result


In [311]:
def get_frequent(itemesets, min_support, prev_discard):
    L = []
    support_count = []
    new_discard = []
    column_items = mergeKItemIntoOne(itemesets)

    k = len(prev_discard)
    for i in range(len(itemesets)):
        discard_before = False
        item = itemesets[i]
        result = []
        for i_item in item:
            if isinstance(i_item, list):
                for j in i_item:
                    result.append(j)
            else:
                result.append(i_item)

        if k > 0:
            for it in prev_discard[k]:
                if set(it).issubset(set(result)):
                    discard_before = True
                    break

        if not discard_before:
            # print(f'item {mergeKItemIntoOne(item)}')
            count = support_k_itemst(df_rc[column_items[i]].values)
            if count >= min_support:
                L.append(result)
                support_count.append(count)
            else:
                new_discard.append(result)
    return L, support_count, new_discard


In [312]:
def generate_next_itemset(L):
    k = L.shape[-1] + 1
    print(len(L))
    aa = np.array(list(combinations(L, k)))
    col_name = np.hstack((aa[:, 0, :], aa[:, 1, :]))
    return col_name


**Generating 1 itemse base on support count of row**


In [313]:
C = {}
L = {}
k_items = []
Discard = {}
itemset_size = 1
min_support = 1
Discard.update({itemset_size: []})


#### Generating 1 itemset


In [314]:
# remove each column who support count are less than min_support for 1 itemsets
rc_1_itemset = df_rc.sum(axis=0)
# remove all column which sum of row are ness than user defind support threshold
cut_our_cols = rc_1_itemset.loc[lambda x: x < min_support].index
cut_our_cols
# cut our every itemset which are sum of each row are less than user defind support
df_rc.drop(labels=cut_our_cols, axis=1, inplace=True)
df_rc


Unnamed: 0,21730,22632,22633,22745,22748,22749,22752,71053,84029E,84029G,84406B,84879,85123A
0,1,0,0,0,0,0,1,1,1,1,1,0,1
1,0,1,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,1,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0


In [315]:
C.update({itemset_size: np.reshape(list(df_rc.columns), (-1, 1))})

In [316]:
support_count = {}
f, supp, new_discard = get_frequent(
    C[itemset_size], min_support, Discard
)
Discard.update({itemset_size: new_discard})
L.update({itemset_size: f})
support_count.update({itemset_size: supp})


In [317]:
k = itemset_size + 1
while True:
    try:
        C.update({k: join_set_item(L[k - 1], list(df_rc.columns))})
        f, supp, new_discard = get_frequent(C[k], min_support, Discard)
        L.update({k: f})
        Discard.update({k: new_discard})
        support_count.update({k: supp})
        if(len(L[k]) == 0):
            break
        k += 1
    except:
        print("Hello error ")



 [[['21730'], '22632'], [['21730'], '22633'], [['21730'], '22745'], [['21730'], '22748'], [['21730'], '22749'], [['21730'], '22752'], [['21730'], '71053'], [['21730'], '84029E'], [['21730'], '84029G'], [['21730'], '84406B'], [['21730'], '84879'], [['21730'], '85123A'], [['22632'], '22633'], [['22632'], '22745'], [['22632'], '22748'], [['22632'], '22749'], [['22632'], '22752'], [['22632'], '71053'], [['22632'], '84029E'], [['22632'], '84029G'], [['22632'], '84406B'], [['22632'], '84879'], [['22632'], '85123A'], [['22633'], '22745'], [['22633'], '22748'], [['22633'], '22749'], [['22633'], '22752'], [['22633'], '71053'], [['22633'], '84029E'], [['22633'], '84029G'], [['22633'], '84406B'], [['22633'], '84879'], [['22633'], '85123A'], [['22745'], '22748'], [['22745'], '22749'], [['22745'], '22752'], [['22745'], '71053'], [['22745'], '84029E'], [['22745'], '84029G'], [['22745'], '84406B'], [['22745'], '84879'], [['22745'], '85123A'], [['22748'], '22749'], [['22748'], '22752'], [['22748'], '

In [None]:
C

In [318]:
L

{1: [['21730'],
  ['22632'],
  ['22633'],
  ['22745'],
  ['22748'],
  ['22749'],
  ['22752'],
  ['71053'],
  ['84029E'],
  ['84029G'],
  ['84406B'],
  ['84879'],
  ['85123A']],
 2: [['21730', '22752'],
  ['21730', '71053'],
  ['21730', '84029E'],
  ['21730', '84029G'],
  ['21730', '84406B'],
  ['21730', '85123A'],
  ['22632', '22633'],
  ['22745', '22748'],
  ['22745', '22749'],
  ['22745', '84879'],
  ['22748', '22749'],
  ['22748', '84879'],
  ['22749', '84879'],
  ['22752', '71053'],
  ['22752', '84029E'],
  ['22752', '84029G'],
  ['22752', '84406B'],
  ['22752', '85123A'],
  ['71053', '84029E'],
  ['71053', '84029G'],
  ['71053', '84406B'],
  ['71053', '85123A'],
  ['84029E', '84029G'],
  ['84029E', '84406B'],
  ['84029E', '85123A'],
  ['84029G', '84406B'],
  ['84029G', '85123A'],
  ['84406B', '85123A']],
 3: [['21730', '22752', '71053'],
  ['21730', '22752', '84029E'],
  ['21730', '22752', '84029G'],
  ['21730', '22752', '84406B'],
  ['21730', '22752', '85123A'],
  ['21730', '7105

**End Here**

**Computing the minimum-support for mining frequent pattern**


In [319]:
all_supportcount = support_count


In [320]:
sum(all_supportcount[1]) / len(all_supportcount)


2.6

In [321]:
np.arange(5).sum()/5


2.0

In [322]:
support_1_itemset = all_supportcount[1]
Aavesupp = np.array(support_1_itemset).sum() / len(support_1_itemset)
Aavesupp


1.0

In [323]:
m = len(df_rc.columns)


L - S - R
L : left gradent
S : symitri
R : right gradent


In [324]:
def avg(supports):
    return sum(supports) / len(supports)

In [325]:
def lean(support_count_i_itemset, M=m, avg=Aavesupp):
    less_than_avg = []
    greater_than_avg = []
    for j in support_count_i_itemset:
        if j < avg:
            less_than_avg.append(1)
        elif j > avg:
            greater_than_avg.append(1)
    # print(f"Less avg {less_than_avg}")
    # print(f"Greater avg {greater_than_avg}")
    return (sum(less_than_avg) - sum(greater_than_avg)) / M


we assume that Apriori(D, k) generates a set of all k-itemsets in D, where k â‰¥ 1. Without
any prior knowledge we could estimate a, b and A avesupp as follows.


In [326]:
a = 1 / m
b = max(support_count[max(support_count, key=support_count.get)])


In [327]:
index = np.arange(60)


In [328]:
a = np.ones(100)
a[-index] = 10
a

array([10.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
       10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
       10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
       10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
       10., 10., 10., 10., 10., 10., 10., 10., 10.])

In [329]:
def a_avgsupp(m, k, N):
    n_over_N = sum([math.pow(m / N, i) for i in range(k, m)])
    operation = 1 / (m - k + 1)
    return n_over_N * operation


a_avgsupp(5, 2, 1000)


6.28140625e-06

# Todo

calculate Linear approximation here bro


In [330]:
supports = reduce(lambda x, y: x+y, support_count.values())
len(supports)


116

In [331]:
lean(support_count[1])


0.0

In [332]:
# Computing the minimum-support for mining frequent pattern


#### Testing code


In [333]:
# support_k_itemst(df_rc[col_name[0]].values)


In [334]:
# supp = support_k_itemst(df_rc[col_name].values)
# supp


In [335]:
# supp_2_itemset =  [support_k_itemst(df_rc[col].values) for col in col_2_item]
# supp_2_itemset


sort_by_index = lambda cols : cols.index  
print(f"List before Sort {L[1]}")
l = list(L[1]).sort(key =sort_by_index(list(df_rc.columns)))
print(f"List after Sort {l}")


### Debug code


In [336]:
np.array(np.meshgrid(L[3], L[3]))

array([[['21730', '22752', '71053', ..., '84029G', '84406B', '85123A'],
        ['21730', '22752', '71053', ..., '84029G', '84406B', '85123A'],
        ['21730', '22752', '71053', ..., '84029G', '84406B', '85123A'],
        ...,
        ['21730', '22752', '71053', ..., '84029G', '84406B', '85123A'],
        ['21730', '22752', '71053', ..., '84029G', '84406B', '85123A'],
        ['21730', '22752', '71053', ..., '84029G', '84406B', '85123A']],

       [['21730', '21730', '21730', ..., '21730', '21730', '21730'],
        ['22752', '22752', '22752', ..., '22752', '22752', '22752'],
        ['71053', '71053', '71053', ..., '71053', '71053', '71053'],
        ...,
        ['84029G', '84029G', '84029G', ..., '84029G', '84029G',
         '84029G'],
        ['84406B', '84406B', '84406B', ..., '84406B', '84406B',
         '84406B'],
        ['85123A', '85123A', '85123A', ..., '85123A', '85123A',
         '85123A']]], dtype='<U6')

**Candedate Generation**


**F_k-1 x F_k-1** <img src="images/fk-1xfk-1.png"  />


In [337]:
# unq, _ = np.unique(k_2, axis=0, return_inverse=True)
# cnt = np.bincount(_)
# unq = unq.view(k_2.dtype).reshape(-1, k_2.shape[1])
# uniqe_item = np.column_stack((unq, cnt))
# uniqe_item


### Start Explore numpy


In [338]:
def cartesian(*arrays):
    mesh = np.meshgrid(*arrays)  # standard numpy meshgrid
    dim = len(mesh)  # number of dimensions
    elements = mesh[0].size  # number of elements, any index will do
    flat = np.concatenate(mesh).ravel()  # flatten the whole meshgrid
    reshape = np.reshape(flat, (dim, elements)).T  # reshape and transpose
    return reshape


In [339]:
b = cartesian(a, a, a)
print(b)


[[10. 10. 10.]
 [10. 10.  1.]
 [10. 10.  1.]
 ...
 [10. 10. 10.]
 [10. 10. 10.]
 [10. 10. 10.]]


In [340]:
def repeat_product(x, y):
    x_out = np.tile(x, len(y))
    y_out = np.repeat(y, len(x))
    print(f"x {x} , y {y}")
    output = np.transpose([x_out, y_out])
    return output


In [341]:
# mesh = np.array(np.meshgrid(df_rc, product_names))
# combinations = mesh.T.reshape(-1, 2)
# combinations


In [342]:
def combinations_of_2(l):
    for i, j in zip(*np.triu_indices(len(l), 1)):
        # print(l[i], l[j])
        yield l[i], l[j]


In [343]:
def combinations_of_2(l):
    for i, j in zip(*np.triu_indices(len(l), 1)):
        yield l[i], l[j]


In [344]:
support_k_itemst(df_rc[product_names].values)

NameError: name 'product_names' is not defined

In [None]:
cmb = combinations_of_2(product_names)
for i in list(cmb):
    print(i)


In [None]:
def apriori_gen_yield(Lk):
    k = len(Lk)
    for i in range(k):
        L1 = list(Lk[i])[::k-1]
        for j in range(i + 1, k):
            L2 = list(Lk[j])[::k-1]
            if L1 == L2:
                yield list(np.union1d(Lk[i], Lk[j]))
            else:
                break


In [None]:
df_rc[apriori_gen(i)]

In [None]:
df_rc[['21730','22752','71053','84029E','84029G']]

In [None]:
for i in apriori_gen_yield(L[4]):
    df_rc[i]

In [386]:

item = [
    ['A', 'B', 'C'],
    ['A', 'B', 'D'],
    ['A', 'B', 'E'],
    ['A', 'C', 'D'],
    ['B', 'C', 'D'],
    ['B', 'D', 'E'],
    ['C', 'D', 'E'],
]

# F 3 = {ABC,ABD,ABE,ACD,BCD,BDE,CDE}
# when we already sorted this array can loop to next item if it have, orderwise break loop


def apriori_gen(Lk):
    k = len(Lk)
    k_1_item = len(Lk[0]) - 1
    result_list = []
    for i in range(k):
        L1 = list(Lk[i])[-k_1_item::]
        for j in range(i + 1, k):
            L2 = list(Lk[j])[:k_1_item:]
            if L1 == L2:
                result_list.append(np.union1d(Lk[i], Lk[j]))
                # test next item the same or not, if no break this loop
                if(j + 1 < k):
                    if((L1 != list(Lk[j + 1])[:k_1_item:])):
                        break

    return result_list


b = apriori_gen(item)
b


[array(['A', 'B', 'C', 'D'], dtype='<U1'),
 array(['A', 'B', 'D', 'E'], dtype='<U1'),
 array(['A', 'C', 'D', 'E'], dtype='<U1'),
 array(['B', 'C', 'D', 'E'], dtype='<U1')]

In [349]:
a = np.arange(5)
a

array([0, 1, 2, 3, 4])

In [350]:
a[:2:]

array([0, 1])

In [351]:
a[-2::]

array([3, 4])