In [1]:
import pandas as pd
import numpy as np
import numpy_indexed as npi
from itertools import combinations, product
from functools import reduce
from collections.abc import Iterable
import math
import timeit
import matplotlib.pyplot as plt
import matplotlib
from mlxtend.frequent_patterns import association_rules


In [2]:
# Single place to repoint the notebook at another data directory; the three
# paths below resolve to exactly the same strings as before.
DATA_DIR = "/home/sokhorn/sokhorn/dataSet/data"
real_dataset_path = f"{DATA_DIR}/Online Retail.csv"
testing_dataset_path = f"{DATA_DIR}/sample_data_set.csv"
tranction_reduction_itemsets = f"{DATA_DIR}/sample_tranc_red.csv"

# Only the columns needed to build the invoice x item matrix are loaded.
sample_dataset = pd.read_csv(
    real_dataset_path,
    sep=',',
    usecols=['InvoiceNo', 'StockCode', 'Quantity', 'Country'],
)

# item_sets = (
#     sample_dataset[sample_dataset['Country'] == 'France'].groupby(['InvoiceNo', 'StockCode', ])['Quantity']
#     .sum().unstack().reset_index().fillna(0)
#     .set_index("InvoiceNo")
# )
# item_sets = (
#     sample_dataset.groupby(['InvoiceNo', 'StockCode', ])['Quantity']
#     .sum().unstack().reset_index().fillna(0)
#     .set_index("InvoiceNo")
# )
# item_sets = item_sets.applymap(lambda x: 1 if x > 0 else 0)
# item_sets.reindex(sorted(item_sets.columns), axis=1)


In [8]:
sample_dataset.head(-1)

Unnamed: 0,InvoiceNo,StockCode,Quantity,Country
0,536365,85123A,6,United Kingdom
1,536365,71053,6,United Kingdom
2,536365,84406B,8,United Kingdom
3,536365,84029G,6,United Kingdom
4,536365,84029E,6,United Kingdom
...,...,...,...,...
541918,581597,23256,4,France
541919,581598,22613,12,France
541920,581599,22899,6,France
541921,581600,23254,4,France


In [None]:
def itemsett_by_country(country):
    """Build the binary invoice x item matrix for one country.

    Reads the module-level ``sample_dataset`` frame, keeps the rows whose
    ``Country`` equals ``country``, sums ``Quantity`` per
    (``InvoiceNo``, ``StockCode``) pair and pivots the result so that each
    row is an invoice and each column a stock code.

    Parameters
    ----------
    country : value compared against the ``Country`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``InvoiceNo``; a cell is 1 when the summed quantity is
        greater than zero and 0 otherwise. Columns are in sorted order.
    """
    country_rows = sample_dataset[sample_dataset['Country'] == country]
    quantities = (
        country_rows.groupby(['InvoiceNo', 'StockCode'])['Quantity']
        .sum().unstack().reset_index().fillna(0)
        .set_index("InvoiceNo")
    )
    # Vectorised equivalent of applymap(lambda x: 1 if x > 0 else 0).
    item_sets_country = (quantities > 0).astype(int)
    # reindex() returns a new frame rather than sorting in place, so its
    # result has to be the value that is returned.
    return item_sets_country.reindex(sorted(item_sets_country.columns), axis=1)


In [None]:
item_sets = itemsett_by_country("France")
item_sets = item_sets.iloc[0: 10]
item_sets


In [None]:
def user_minsupp(item_sets, percent=60):
    """Return the minimum support *count* for a transaction table.

    Parameters
    ----------
    item_sets : any sized collection of transactions (``len()`` is used).
    percent : share of the transactions, in percent, that an itemset has to
        appear in. Defaults to 60, the value that used to be hard-coded.

    Returns
    -------
    float
        ``percent / 100 * len(item_sets)``.
    """
    return (percent / 100) * len(item_sets)


user_minsupp(item_sets) / len(item_sets)


### Traditional Apriori Algorithm


In [None]:
# Sebastian Raschka 2014-2020
# myxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
import pandas as pd


def generate_new_combinations(old_combinations):
    """Yield the flattened candidate itemsets that are one item longer.

    Every row of ``old_combinations`` is extended with each item that occurs
    anywhere in ``old_combinations`` and is strictly greater than the row's
    last item. The values of each candidate are yielded one after another
    (row items first, then the new item), so the caller reshapes the flat
    stream into rows.
    """
    known_items = np.unique(old_combinations.flatten())
    for row in old_combinations:
        prefix = tuple(row)
        # Only items larger than the row's last entry are appended.
        for candidate in known_items[known_items > row[-1]]:
            yield from prefix
            yield candidate


def generate_new_combinations_low_memory(old_combinations, X, min_support,
                                         is_sparse):
    """Yield frequent candidates together with their occurrence counts.

    For each row of ``old_combinations`` the rows of ``X`` in which all of
    the row's columns are truthy are selected, and the remaining candidate
    columns (items greater than the row's last item) are summed over them.
    A candidate is kept when its count is at least
    ``min_support * X.shape[0]``.

    The output is a flat stream: count, the items of the old combination,
    then the new item, repeated for every kept candidate. ``is_sparse``
    selects the code path that calls ``.toarray()`` on the column slices.
    """
    known_items = np.unique(old_combinations.flatten())
    min_count = min_support * X.shape[0]
    for row in old_combinations:
        prefix = tuple(row)
        candidates = known_items[known_items > row[-1]]
        if is_sparse:
            rows_with_prefix = X[:, prefix].toarray().all(axis=1)
            counts = X[:, candidates].toarray()[rows_with_prefix].sum(axis=0)
        else:
            rows_with_prefix = X[:, prefix].all(axis=1)
            counts = X[rows_with_prefix][:, candidates].sum(axis=0)
        for pos in (counts >= min_count).nonzero()[0]:
            yield counts[pos]
            yield from prefix
            yield candidates[pos]


def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
            low_memory=False):
    """Mine frequent itemsets level by level and time the run.

    Parameters
    ----------
    df : DataFrame whose columns are items; the column sums divided by the
        number of rows are used as the 1-itemset supports.
    min_support : itemsets are kept when their support is >= this value.
        Values <= 0 raise ``ValueError``.
    use_colnames : when true, the column indices inside each returned
        itemset are replaced by the matching entries of ``df.columns``.
    max_len : stop once itemsets of this length have been produced
        (``None`` means no limit).
    verbose : when truthy, print a progress line per level.
    low_memory : when true, candidates and their counts come from
        ``generate_new_combinations_low_memory`` instead of one large
        fancy-indexing operation.

    Returns
    -------
    (res_df, execution_time)
        ``res_df`` has the columns ``support`` and ``itemsets`` (frozensets);
        ``execution_time`` is the difference of two
        ``timeit.default_timer()`` readings taken at entry and exit.
    """
    start = timeit.default_timer()

    def _support(_x, _n_rows, _is_sparse):
        # Column sums divided by the row count, flattened to 1-D.
        # `_is_sparse` is accepted but not used here.

        out = (np.sum(_x, axis=0) / _n_rows)
        return np.array(out).reshape(-1)

    if min_support <= 0.:
        raise ValueError('`min_support` must be a positive '
                         'number within the interval `(0, 1]`. '
                         'Got %s.' % min_support)

    if hasattr(df, "sparse"):
        # DataFrame with SparseArray (pandas >= 0.24)
        if df.size == 0:
            X = df.values
        else:
            X = df.sparse.to_coo().tocsc()
        is_sparse = True
    else:
        # dense DataFrame
        X = df.values
        is_sparse = False
    support = _support(X, X.shape[0], is_sparse)
    ary_col_idx = np.arange(X.shape[1])
    # Both dicts are keyed by itemset length; level 1 holds the single items
    # that already meet the threshold.
    support_dict = {1: support[support >= min_support]}
    itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
    max_itemset = 1
    rows_count = float(X.shape[0])

    # Column of ones used for the element-wise comparison in the sparse path.
    all_ones = np.ones((int(rows_count), 1))

    while max_itemset and max_itemset < (max_len or float('inf')):
        next_max_itemset = max_itemset + 1

        # With exceptionally large datasets, the matrix operations can use a
        # substantial amount of memory. For low memory applications or large
        # datasets, set `low_memory=True` to use a slower but more memory-
        # efficient implementation.
        if low_memory:
            combin = generate_new_combinations_low_memory(
                itemset_dict[max_itemset], X, min_support, is_sparse)
            # slightly faster than creating an array from a list of tuples
            combin = np.fromiter(combin, dtype=int)
            # Each row is [count, item_1, ..., item_k].
            combin = combin.reshape(-1, next_max_itemset + 1)

            if combin.size == 0:
                break
            if verbose:
                print(
                    '\rProcessing %d combinations | Sampling itemset size %d' %
                    (combin.size, next_max_itemset), end="")

            itemset_dict[next_max_itemset] = combin[:, 1:]
            support_dict[next_max_itemset] = combin[:, 0].astype(float) \
                / rows_count
            max_itemset = next_max_itemset
        else:
            # convert the flat generator output into an array of candidates,
            # one row per candidate itemset
            combin = generate_new_combinations(itemset_dict[max_itemset])
            combin = np.fromiter(combin, dtype=int)
            combin = combin.reshape(-1, next_max_itemset)

            # no candidates were generated: nothing left to test
            if combin.size == 0:
                break
            if verbose:
                print(
                    '\rProcessing %d combinations | Sampling itemset size %d' %
                    (combin.size, next_max_itemset), end="")

            if is_sparse:
                _bools = X[:, combin[:, 0]] == all_ones
                for n in range(1, combin.shape[1]):
                    _bools = _bools & (X[:, combin[:, n]] == all_ones)
            else:
                # X[:, combin] has shape (rows, candidates, items per
                # candidate); a row supports a candidate when all of its
                # item columns are truthy.
                _bools = np.all(X[:, combin], axis=2)

            support = _support(np.array(_bools), rows_count, is_sparse)

            _mask = (support >= min_support).reshape(-1)

            if any(_mask):

                itemset_dict[next_max_itemset] = np.array(combin[_mask])
                support_dict[next_max_itemset] = np.array(support[_mask])
                max_itemset = next_max_itemset

            else:
                # Exit condition: no candidate of this length is frequent,
                # so there are no more itemsets to extend
                break

    # Assemble one (support, itemsets) frame per level and stack them.
    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]],
                             dtype='object')

        res = pd.concat((support, itemsets), axis=1)
        all_res.append(res)

    res_df = pd.concat(all_res)
    res_df.columns = ['support', 'itemsets']
    if use_colnames:
        mapping = {idx: item for idx, item in enumerate(df.columns)}
        res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
                                                      mapping[i] for i in x]))
    res_df = res_df.reset_index(drop=True)

    if verbose:
        print()  # adds newline if verbose counter was used
    stop = timeit.default_timer()
    execution_time = stop - start
    # print(f"execution time : {execution_time}")
    return res_df, execution_time


print("Apriori Itemsets")
# apriori() returns (itemsets DataFrame, elapsed time); unpack it so that
# fre_apriori_itemsets is the DataFrame itself rather than the whole tuple.
fre_apriori_itemsets, execution_time_apriori = apriori(
    item_sets, min_support=0.4, use_colnames=True)


### Transaction Reduction


In [None]:
# RC (repeat count) generation: npi.count returns two values, used here as
# the table of transaction rows and a parallel array of counts.
itemset_reductabc, rc_values = npi.count(item_sets.values)
itemset_reductabc = pd.DataFrame(
    itemset_reductabc, columns=item_sets.columns
).astype(int)


In [None]:
# Sebastian Raschka 2014-2020
# myxtend Machine Learning Library Extensions
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
import pandas as pd


# RC (repeat count) generation, recomputed in this cell so that it can be
# run on its own: rows returned by npi.count plus a parallel array of counts.
itemset_reductabc, rc_values = npi.count(item_sets.values)
itemset_reductabc = pd.DataFrame(
    itemset_reductabc, columns=item_sets.columns
).astype(int)


def generate_new_combinations(old_combinations):
    """Yield, as one flat stream, every candidate that extends a row by one item.

    A row of ``old_combinations`` may only be extended by an item that
    appears somewhere in ``old_combinations`` and is strictly greater than
    the row's last item. For each such pair the row's items are yielded
    first, followed by the new item.
    """
    seen_items = np.unique(old_combinations.flatten())
    for combination in old_combinations:
        head = tuple(combination)
        larger_items = seen_items[seen_items > combination[-1]]
        for new_item in larger_items:
            yield from head
            yield new_item


def generate_new_combinations_low_memory(old_combinations, X, min_support,
                                         is_sparse):
    """Yield ``count, *old_items, new_item`` for every frequent extension.

    For each row of ``old_combinations``, the rows of ``X`` where all of its
    columns are truthy are selected; each larger candidate item is counted
    over those rows and kept when the count reaches
    ``min_support * X.shape[0]``. With ``is_sparse`` the column slices are
    converted through ``.toarray()`` before being reduced.
    """
    seen_items = np.unique(old_combinations.flatten())
    required_count = min_support * X.shape[0]
    for combination in old_combinations:
        head = tuple(combination)
        larger_items = seen_items[seen_items > combination[-1]]
        if is_sparse:
            has_head = X[:, head].toarray().all(axis=1)
            hits = X[:, larger_items].toarray()[has_head].sum(axis=0)
        else:
            has_head = X[:, head].all(axis=1)
            hits = X[has_head][:, larger_items].sum(axis=0)
        for keep in (hits >= required_count).nonzero()[0]:
            yield hits[keep]
            yield from head
            yield larger_items[keep]


def apriori_tranction_reduction(df, min_support=0.5, use_colnames=False, max_len=None, rc_values=None, verbose=0):
    """Apriori over a de-duplicated transaction table with per-row counts.

    Each row of ``df`` stands for ``rc_values[i]`` identical transactions.
    The support of an itemset is the sum of ``rc_values`` over the rows that
    contain every item of the set, divided by ``rc_values.sum()`` (the
    number of transactions the rows stand for). The same weighted formula is
    used for single items and for longer itemsets, so supports lie in
    ``[0, 1]``.

    Parameters
    ----------
    df : DataFrame with one column per item; a truthy cell means the item is
        part of that row's transaction.
    min_support : itemsets are kept when their support is >= this value.
        Values <= 0 raise ``ValueError``.
    use_colnames : replace column indices in the result by ``df.columns``
        entries.
    max_len : stop once itemsets of this length have been produced
        (``None`` means no limit).
    rc_values : one count per row of ``df``. ``None`` counts every row once.
        A length that differs from the row count raises ``ValueError``.
    verbose : when truthy, print a progress line per level.

    Returns
    -------
    (res_df, execution_time)
        ``res_df`` has the columns ``support`` and ``itemsets`` (frozensets);
        ``execution_time`` is the elapsed ``timeit.default_timer()`` time.
    """
    start = timeit.default_timer()

    if min_support <= 0.:
        raise ValueError('`min_support` must be a positive '
                         'number within the interval `(0, 1]`. '
                         'Got %s.' % min_support)

    # The candidate test below indexes X with a 2-D array and reduces over
    # the third axis, which needs a plain ndarray.
    X = df.values
    n_rows = X.shape[0]

    if rc_values is None:
        weights = np.ones(n_rows, dtype=int)
    else:
        weights = np.asarray(rc_values).reshape(-1)
        if weights.shape[0] != n_rows:
            raise ValueError(
                '`rc_values` must hold one count per row of `df`. '
                'Got %d counts for %d rows.' % (weights.shape[0], n_rows))

    # Total number of transactions represented by the rows; `or 1` keeps an
    # empty table from dividing by zero (every support is then 0).
    tranction_size = weights.sum() or 1

    def _weighted_support(contains):
        # contains[r, c] is truthy when row r holds candidate c.
        return (contains * weights.reshape(-1, 1)).sum(axis=0) / tranction_size

    one_item_support = _weighted_support(X.astype(bool))
    frequent = one_item_support >= min_support
    support_dict = {1: one_item_support[frequent]}
    itemset_dict = {1: np.arange(X.shape[1])[frequent].reshape(-1, 1)}
    max_itemset = 1

    while max_itemset and max_itemset < (max_len or float('inf')):
        next_max_itemset = max_itemset + 1

        # Flat generator output -> one row per candidate itemset.
        combin = generate_new_combinations(itemset_dict[max_itemset])
        combin = np.fromiter(combin, dtype=int)
        combin = combin.reshape(-1, next_max_itemset)

        if combin.size == 0:  # no more itemsets to generate
            break
        if verbose:
            print(
                '\rProcessing %d combinations | Sampling itemset size %d' %
                (combin.size, next_max_itemset), end="")

        # X[:, combin] is (rows, candidates, items per candidate); a row
        # holds a candidate when all of its item cells are truthy.
        support = _weighted_support(np.all(X[:, combin], axis=2))
        _mask = (support >= min_support).reshape(-1)
        if not _mask.any():
            # No candidate of this length is frequent: stop extending.
            break
        itemset_dict[next_max_itemset] = np.array(combin[_mask])
        support_dict[next_max_itemset] = np.array(support[_mask])
        max_itemset = next_max_itemset

    # Assemble one (support, itemsets) frame per level and stack them.
    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]],
                             dtype='object')
        res = pd.concat((support, itemsets), axis=1)
        all_res.append(res)

    res_df = pd.concat(all_res)
    res_df.columns = ['support', 'itemsets']
    if use_colnames:
        mapping = {idx: item for idx, item in enumerate(df.columns)}
        res_df['itemsets'] = res_df['itemsets'].apply(
            lambda x: frozenset([mapping[i] for i in x])
        )
    res_df = res_df.reset_index(drop=True)

    if verbose:
        print()  # adds newline if verbose counter was used
    stop = timeit.default_timer()
    execution_time = stop - start
    return res_df, execution_time


print("Apriori Tranction Reduction Itemsets")
a, t = apriori_tranction_reduction(
    itemset_reductabc, min_support=1, use_colnames=True, rc_values=rc_values)
a


### Actual Minimum support


In [None]:
def generate_new_combinations(old_combinations):
    """Generate the next-size candidates as a flat sequence of item values.

    The pool of usable items is every distinct value in
    ``old_combinations``. A row is extended only with pool items strictly
    greater than its last item; for each extension the row's items are
    yielded and then the added item.
    """
    item_pool = np.unique(old_combinations.flatten())
    for base in old_combinations:
        base_items = tuple(base)
        extensions = item_pool[item_pool > base[-1]]
        for extra in extensions:
            yield from base_items
            yield extra


def generate_new_combinations_low_memory(old_combinations, X, min_support,
                                         is_sparse):
    """Generate frequent next-size candidates, each prefixed by its count.

    The flat output repeats the pattern: count, items of the old
    combination, added item. The count is the column sum of the added item
    over the rows of ``X`` in which all columns of the old combination are
    truthy, and a candidate is emitted only when that count is
    >= ``min_support * X.shape[0]``. ``is_sparse`` switches to the branch
    that densifies the column slices with ``.toarray()``.
    """
    item_pool = np.unique(old_combinations.flatten())
    cutoff = min_support * X.shape[0]
    for base in old_combinations:
        base_items = tuple(base)
        extensions = item_pool[item_pool > base[-1]]
        if is_sparse:
            matching_rows = X[:, base_items].toarray().all(axis=1)
            totals = X[:, extensions].toarray()[matching_rows].sum(axis=0)
        else:
            matching_rows = X[:, base_items].all(axis=1)
            totals = X[matching_rows][:, extensions].sum(axis=0)
        for slot in (totals >= cutoff).nonzero()[0]:
            yield totals[slot]
            yield from base_items
            yield extensions[slot]


def actual_apriori(df, min_support=0, use_colnames=False, max_len=None, verbose=0,
                   low_memory=False):
    """Apriori variant that keeps itemsets whose support is strictly greater
    than ``min_support``.

    Unlike ``apriori`` in this notebook, the threshold comparison is ``>``
    rather than ``>=``, ``min_support`` is not validated, the elapsed time is
    printed, and only the result frame is returned.

    Parameters
    ----------
    df : DataFrame whose columns are items.
    min_support : exclusive lower bound on support (default 0).
    use_colnames : replace column indices in the result by ``df.columns``
        entries.
    max_len : stop once itemsets of this length have been produced
        (``None`` means no limit).
    verbose : when truthy, print a progress line per level.
    low_memory : use ``generate_new_combinations_low_memory`` to build and
        count candidates.

    Returns
    -------
    pandas.DataFrame
        Columns ``support`` and ``itemsets`` (frozensets).
    """
    start = timeit.default_timer()

    def _support(_x, _n_rows, _is_sparse):
        # Column sums divided by the row count, flattened to 1-D.
        out = (np.sum(_x, axis=0) / _n_rows)
        return np.array(out).reshape(-1)

    if hasattr(df, "sparse"):
        # DataFrame with SparseArray (pandas >= 0.24)
        if df.size == 0:
            X = df.values
        else:
            X = df.sparse.to_coo().tocsc()
        is_sparse = True
    else:
        # dense DataFrame
        X = df.values
        is_sparse = False
    support = _support(X, X.shape[0], is_sparse)
    ary_col_idx = np.arange(X.shape[1])
    # Both dicts are keyed by itemset length.
    support_dict = {1: support[support > min_support]}
    itemset_dict = {1: ary_col_idx[support > min_support].reshape(-1, 1)}
    max_itemset = 1
    rows_count = float(X.shape[0])

    # Column of ones used for the element-wise comparison in the sparse path.
    all_ones = np.ones((int(rows_count), 1))

    while max_itemset and max_itemset < (max_len or float('inf')):
        next_max_itemset = max_itemset + 1

        # With exceptionally large datasets, the matrix operations can use a
        # substantial amount of memory. For low memory applications or large
        # datasets, set `low_memory=True` to use a slower but more memory-
        # efficient implementation.
        if low_memory:
            combin = generate_new_combinations_low_memory(
                itemset_dict[max_itemset], X, min_support, is_sparse)
            # slightly faster than creating an array from a list of tuples
            combin = np.fromiter(combin, dtype=int)
            # Each row is [count, item_1, ..., item_k].
            combin = combin.reshape(-1, next_max_itemset + 1)
            # The generator keeps candidates with count >= threshold, while
            # this function uses a strict comparison everywhere else; drop
            # the candidates that only tie with the threshold.
            combin = combin[combin[:, 0] / rows_count > min_support]

            if combin.size == 0:
                break
            if verbose:
                print(
                    '\rProcessing %d combinations | Sampling itemset size %d' %
                    (combin.size, next_max_itemset), end="")

            itemset_dict[next_max_itemset] = combin[:, 1:]
            support_dict[next_max_itemset] = combin[:, 0].astype(float) \
                / rows_count
            max_itemset = next_max_itemset
        else:
            # Flat generator output -> one row per candidate itemset.
            combin = generate_new_combinations(itemset_dict[max_itemset])
            combin = np.fromiter(combin, dtype=int)
            combin = combin.reshape(-1, next_max_itemset)

            if combin.size == 0:
                break
            if verbose:
                print(
                    '\rProcessing %d combinations | Sampling itemset size %d' %
                    (combin.size, next_max_itemset), end="")

            if is_sparse:
                _bools = X[:, combin[:, 0]] == all_ones
                for n in range(1, combin.shape[1]):
                    _bools = _bools & (X[:, combin[:, n]] == all_ones)
            else:
                # (rows, candidates, items per candidate) reduced over items.
                _bools = np.all(X[:, combin], axis=2)

            support = _support(np.array(_bools), rows_count, is_sparse)

            _mask = (support > min_support).reshape(-1)

            if not _mask.any():
                # No candidate of this length passes: stop extending.
                break
            itemset_dict[next_max_itemset] = np.array(combin[_mask])
            support_dict[next_max_itemset] = np.array(support[_mask])
            max_itemset = next_max_itemset

    # Assemble one (support, itemsets) frame per level and stack them.
    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]],
                             dtype='object')
        res = pd.concat((support, itemsets), axis=1)
        all_res.append(res)

    res_df = pd.concat(all_res)
    res_df.columns = ['support', 'itemsets']
    if use_colnames:
        mapping = {idx: item for idx, item in enumerate(df.columns)}
        res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
                                                      mapping[i] for i in x]))
    res_df = res_df.reset_index(drop=True)

    if verbose:
        print()  # adds newline if verbose counter was used
    stop = timeit.default_timer()
    execution_time = stop - start
    print(f"execution_time : {execution_time}")
    return res_df


In [None]:
act_frieq_itemset = actual_apriori(
    item_sets, use_colnames=True, min_support=0.4)


In [None]:
def lean(supports, avg, m):
    """Return (number of values below ``avg`` minus number above it) / ``m``.

    Values equal to ``avg`` count for neither side.
    """
    values = np.asanyarray(supports)
    below = (values < avg).sum()
    above = (values > avg).sum()
    return (below - above) / m


def aproximate_minsupport(r_min, min_supp, max_min, n=1):
    """Interpolate between ``min_supp**n`` and ``max_min**n`` at ``r_min``.

    With ``a_n = min_supp**n`` and ``b_n = max_min**n`` the result is
    ``a_n + r_min * (b_n - a_n)``. This is the expression
    ``(r_min - a_n / (a_n - b_n)) * (b_n - a_n)`` written without the
    division, so it is also defined when ``min_supp == max_min`` (the result
    is then ``a_n``).

    Parameters
    ----------
    r_min : relative position between the two bounds (0 gives ``a_n``,
        1 gives ``b_n``).
    min_supp, max_min : lower and upper support bounds.
    n : exponent applied to both bounds before interpolating (default 1).

    Returns
    -------
    The interpolated value, on the scale of the n-th powers.
    NOTE(review): for ``n > 1`` a caller presumably has to take the n-th
    root to get back to a support value - confirm against the intended
    formula.
    """
    a_n = pow(min_supp, n)
    b_n = pow(max_min, n)
    return a_n + r_min * (b_n - a_n)


In [None]:
supports = act_frieq_itemset['support']
a, b = supports.min(), supports.max()
m = len(supports)
avg = supports.sum() / m
# lean() is declared as lean(supports, avg, m): the average comes before the
# count.
lean_value = lean(supports, avg, m)
# m is the number of rows in the frequent-itemset table, so it is labelled
# as such.
print(f"Lean : {lean_value}\nMinimum support : {a}\nMaximum support : {b}\nAverage : {avg}\nNumber of itemsets : {m}")


In [None]:
def a_b_m_avg_lean_actual_support(supports, user_min_support):
    """Summarise a support column and derive the approximated minimum support.

    Parameters
    ----------
    supports : object providing ``min()``, ``max()``, ``sum()`` and ``len()``
        (the notebook passes the ``support`` column of an itemset frame).
    user_min_support : value forwarded to ``aproximate_minsupport`` as
        ``r_min``.

    Returns
    -------
    (a, b, avg, lean_value, act_min_support)
        Minimum, maximum and mean of ``supports``, the ``lean`` statistic,
        and ``aproximate_minsupport(user_min_support, a, b)``.
    """
    a, b = supports.min(), supports.max()
    m = len(supports)
    avg = supports.sum() / m
    # lean() is declared as lean(supports, avg, m): average first, then count.
    lean_value = lean(supports, avg, m)
    act_min_support = aproximate_minsupport(user_min_support, a, b)
    return a, b, avg, lean_value, act_min_support


In [None]:
# apriximate minimum support with linear strategy
aproximate_minsupport(0.2, a, b)
# apriximate minimum support with Polynomial strategy
aproximate_minsupport(0.2, a, b, n=3)


In [None]:
def apriori_actualMinimumSupport(min_support, data):
    """Run ``apriori`` and derive the approximated minimum support.

    Parameters
    ----------
    min_support : threshold passed to ``apriori`` and used as the user
        minimum support for the approximation.
    data : transaction DataFrame passed to ``apriori`` as ``df``.

    Returns
    -------
    (itemset_apriori, act_min)
        The frequent-itemset DataFrame and the last element returned by
        ``a_b_m_avg_lean_actual_support``.
    """
    # apriori() returns (DataFrame, elapsed time); only the frame can be
    # indexed by 'support'.
    itemset_apriori, _execution_time = apriori(min_support=min_support, df=data)
    act_min = a_b_m_avg_lean_actual_support(
        supports=itemset_apriori['support'], user_min_support=min_support)[-1]
    return itemset_apriori, act_min


# apriori_itemset, act_minsupport = apriori_actualMinimumSupport(
#     min_support=0.2, data=item_sets)


In [None]:
def association_maxAntecedent_maxConsequents(fre_itemset, min_threshold=0.6):
    """Derive confidence-based rules and report the largest rule sides.

    Returns ``(rules, max_antecedents, max_consequents, execution_time)``
    where the two maxima are the largest ``.str.len()`` found in the
    ``antecedents`` and ``consequents`` columns, and ``execution_time`` is
    the elapsed ``timeit.default_timer()`` time, which is also printed.
    """
    started_at = timeit.default_timer()
    rules = association_rules(
        fre_itemset, metric='confidence', min_threshold=min_threshold)
    largest = {
        side: rules[side].str.len().max()
        for side in ('antecedents', 'consequents')
    }
    execution_time = timeit.default_timer() - started_at
    print(f"execution time : {execution_time}")
    return rules, largest['antecedents'], largest['consequents'], execution_time


### Association Rule


In [None]:
# # Apriori
# apriori_rules, maxAntecedentsAp, maxConsequentsAp, execution_time_Ap = association_maxAntecedent_maxConsequents(
#     fre_apriori_itemsets, min_threshold=0.6)
# tranction_redu_apriori_rules, maxAntecedentsTR, maxConsequentsTR, execution_time_TR = association_maxAntecedent_maxConsequents(
#     freq_itemset_tranction_reduc, min_threshold=0.6)


In [None]:
# print(
#     f"Apriori\nRule = {apriori_rules.shape[0]}\nMinimum antecedents : {maxAntecedentsAp}\nMaximum consequents : {maxConsequentsAp}")
# print(
#     f"\nApriori Tranction Reduction\nRule = {tranction_redu_apriori_rules.shape[0]}\nMinimum antecedents : {maxAntecedentsTR}\nMaximum consequents : {maxConsequentsTR}")


In [None]:
def item_can_recommend(antecedents, consequents):
    """Map antecedent sizes to the consequent sizes seen for larger rules.

    Both arguments are Series whose elements support ``.str.len()``. For
    every ``i`` from 1 up to, but not including, the largest antecedent
    size, the result holds the set of consequent sizes of the rules whose
    antecedent size is strictly greater than ``i``. The largest antecedent
    size is printed.
    """
    antecedent_sizes = antecedents.str.len()
    consequent_sizes = consequents.str.len()
    largest = antecedent_sizes.max()
    print(f"antecedents : {largest}")
    return {
        size: set(consequent_sizes[antecedent_sizes > size])
        for size in np.arange(1, largest)
    }


# apriori_rules['consequents'][13616]
# apriori_rules['antecedents'][13616]
# print("Apriori :  When User Purchase : It can recommend up to")
# item_can_recommend(apriori_rules['antecedents'].str.len(
# ), apriori_rules['consequents'].str.len())


In [None]:
# print("Apriori Tranction Reduction :  When User Purchase : It can recommend up to")
# item_can_recommend(tranction_redu_apriori_rules['antecedents'].str.len(
# ), tranction_redu_apriori_rules['consequents'].str.len())


### Test


In [None]:
# supports = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
# apriori() computes support as a fraction of the rows, so the thresholds
# above 1 in this list cannot be met by any itemset there.
supports = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
            0.9, 1, 1.1, 1.2, 1.5, 1.6, 1.7, 1.8, 1.9, 2]
# plot execution time
time_exc_apriori = {}
time_exc_tranction_redc = {}
for support in supports:
    fre_apriori_itemsets, execution_time_apriori = apriori(
        item_sets, min_support=support, use_colnames=True)
    # rc_values was computed together with itemset_reductabc and has one
    # entry per row of that table, so that is the table it must be paired
    # with (not the un-reduced item_sets).
    fre_apriori_tranction_red_itemsets, execution_time_apriori_tranc_red = apriori_tranction_reduction(
        itemset_reductabc, min_support=support, use_colnames=True, rc_values=rc_values)
    time_exc_apriori[support] = execution_time_apriori
    time_exc_tranction_redc[support] = execution_time_apriori_tranc_red
    # print(
    #     f"execution_time_apriori : {execution_time_apriori}, execution_time_apriori_tranc_red : {execution_time_apriori_tranc_red}")


In [None]:
time_exc_apriori.values()


In [None]:
plt.xlabel("Support")
# The timings are differences of timeit.default_timer() readings, i.e.
# seconds, not milliseconds.
plt.ylabel("Time (s)")
# Dict views are turned into lists so that plot() receives plain sequences.
plt.plot(list(time_exc_apriori.keys()), list(time_exc_apriori.values()),
         marker="o", label="Apriori")
plt.plot(list(time_exc_tranction_redc.keys()),
         list(time_exc_tranction_redc.values()), marker="o", label="Tranction Reduction")
plt.legend()
plt.grid()


In [None]:
fre_apriori_itemsets, execution_time_apriori = apriori(
    item_sets, min_support=1, use_colnames=True)


fre_apriori_tranction_red_itemsets, execution_time_apriori_tranc_red = apriori_tranction_reduction(
    itemset_reductabc, min_support=1, use_colnames=True, rc_values=rc_values)


In [None]:
fre_apriori_itemsets, execution_time_apriori = apriori(
    item_sets, min_support=0.2, use_colnames=True)
fre_apriori_itemsets


In [None]:
rules = association_rules(fre_apriori_itemsets,
                          metric='confidence', min_threshold=0.6)
rules


In [None]:
item_can_recommend(rules['antecedents'], rules['consequents'])


In [None]:
item_sets = itemsett_by_country("France")
item_sets = item_sets.iloc[0: 10]
item_sets


In [None]:
# apriori() computes support as a fraction of the rows, so the thresholds
# above 1 in this list cannot be met by any itemset there.
supports = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
            0.9, 1, 1.1, 1.2, 1.5, 1.6, 1.7, 1.8, 1.9, 2]
# plot execution time
time_exc_apriori = {}
time_exc_tranction_redc = {}
for support in supports:
    fre_apriori_itemsets, execution_time_apriori = apriori(
        item_sets, min_support=support, use_colnames=True)
    # rc_values has one entry per row of itemset_reductabc, so the reduced
    # table (not item_sets) is what it has to be paired with.
    fre_apriori_tranction_red_itemsets, execution_time_apriori_tranc_red = apriori_tranction_reduction(
        itemset_reductabc, min_support=support, use_colnames=True, rc_values=rc_values)
    time_exc_apriori[support] = execution_time_apriori
    time_exc_tranction_redc[support] = execution_time_apriori_tranc_red


In [None]:
fre_apriori_itemsets, execution_time_apriori = apriori(
    item_sets, min_support=0.5, use_colnames=True)


In [None]:
def rules_by_country(country, apriori_itemset, tranction_reduction_itemsets, rc_values, supports, metric='confidence', min_threshold=0.6):
    """Mine itemsets and rules with both algorithms for a list of thresholds.

    Parameters
    ----------
    country : label of the country being processed.
        NOTE(review): the tables that are mined are the ones passed in, so
        this value does not influence the result - confirm whether the
        per-country table was meant to be built here.
    apriori_itemset : transaction table given to ``apriori``.
    tranction_reduction_itemsets : reduced table given to
        ``apriori_tranction_reduction``.
    rc_values : row counts that belong to ``tranction_reduction_itemsets``.
    supports : iterable of minimum-support thresholds to try.
    metric, min_threshold : forwarded to ``association_rules``.

    Returns
    -------
    tuple of six dicts keyed by threshold, in this order: Apriori itemsets,
    transaction-reduction itemsets, Apriori rules, transaction-reduction
    rules, Apriori run time, transaction-reduction run time. A threshold is
    left out of an algorithm's dicts when that algorithm found no itemsets.
    """
    itemset_apriori = {}
    itemset_tranction_reduction = {}

    apriori_rules = {}
    tranction_reduction_rules = {}

    time_exc_apriori = {}
    time_exc_tranction_redc = {}

    for min_support in supports:
        fre_apriori_itemsets, execution_time_apriori = apriori(
            apriori_itemset, min_support=min_support, use_colnames=True)

        fre_apriori_tranction_red_itemsets, execution_time_apriori_tranc_reduction = apriori_tranction_reduction(
            tranction_reduction_itemsets, min_support=min_support, use_colnames=True, rc_values=rc_values)

        # Rule mining is skipped for an empty itemset table.
        if not fre_apriori_itemsets['support'].empty:
            itemset_apriori[min_support] = fre_apriori_itemsets
            apriori_rules[min_support] = association_rules(
                fre_apriori_itemsets, metric=metric, min_threshold=min_threshold)
            time_exc_apriori[min_support] = execution_time_apriori

        if not fre_apriori_tranction_red_itemsets['support'].empty:
            itemset_tranction_reduction[min_support] = fre_apriori_tranction_red_itemsets
            # Rules for this algorithm come from its own itemsets, not from
            # the plain Apriori result.
            tranction_reduction_rules[min_support] = association_rules(
                fre_apriori_tranction_red_itemsets, metric=metric, min_threshold=min_threshold)
            time_exc_tranction_redc[min_support] = execution_time_apriori_tranc_reduction

    return itemset_apriori, itemset_tranction_reduction,  apriori_rules, tranction_reduction_rules, time_exc_apriori, time_exc_tranction_redc


In [None]:
# rc_values has one entry per row of itemset_reductabc, so the reduced table
# (not item_sets) is the one that has to be passed with it.
fre_apriori_tranction_red_itemsets, execution_time_apriori_tranc_reduction = apriori_tranction_reduction(
    itemset_reductabc, min_support=0.2, use_colnames=True, rc_values=rc_values)
fre_apriori_tranction_red_itemsets


In [None]:
counties = sample_dataset['Country'].unique()
counties[:4]


In [None]:
def rule_by_every_countries(counties, apriori_itemset, tranction_reduction_itemsets, rc_values,  supports):
    """Call ``rules_by_country`` for each country and regroup the results.

    Returns a tuple of six dicts keyed by country, in the same order as the
    tuple returned by ``rules_by_country``: Apriori itemsets,
    transaction-reduction itemsets, Apriori rules, transaction-reduction
    rules, Apriori run times, transaction-reduction run times.
    """
    # One dict per position of the rules_by_country() result.
    collected = tuple({} for _ in range(6))
    for country in counties:
        per_country = rules_by_country(
            country, apriori_itemset, tranction_reduction_itemsets, rc_values,  supports)
        for bucket, value in zip(collected, per_country):
            bucket[country] = value
    return collected


### Plot support against the number of rules it can produce


In [None]:
rules_by_country('France', item_sets, itemset_reductabc,
                 rc_values=rc_values, supports=supports)


Given this support, how many rules can be generated?


In [None]:
unitedKingdom = rule_by_every_countries(
    counties[:3],  apriori_itemset=item_sets, tranction_reduction_itemsets=itemset_reductabc, rc_values=rc_values, supports=supports)
unitedKingdom


In [None]:
def plotTimeWithSupportEachCountries(rule_by_each_countries, counties):
    """Plot run time against support for both algorithms, one panel per country.

    Parameters
    ----------
    rule_by_each_countries : indexable whose elements 4 and 5 are dicts
        keyed by country; each value maps a support threshold to a run time
        (Apriori in element 4, transaction reduction in element 5).
    counties : sequence of country labels to draw, one subplot each.
    """
    if len(counties) == 0:
        # plt.subplots() cannot build a grid with zero columns.
        return

    # squeeze=False always returns a 2-D array of axes, so ravel() also
    # works when a single country is plotted.
    fig, axes = plt.subplots(1, len(counties), figsize=(10, 5), squeeze=False)
    axes = axes.ravel()

    for axis, country in zip(axes, counties):
        apriori_supports_exc_time = rule_by_each_countries[4][country]
        apriori_tranction_reduction_supports_exc_time = rule_by_each_countries[5][country]

        axis.plot(
            list(apriori_supports_exc_time.keys()),
            list(apriori_supports_exc_time.values()), label="Apriori")
        axis.plot(
            list(apriori_tranction_reduction_supports_exc_time.keys()),
            list(apriori_tranction_reduction_supports_exc_time.values()), label="Tranction Reduction")
        # Axes.title is a Text object, not a method; titles are set through
        # set_title().
        axis.set_title(f"Apriori-{country}")
        axis.set_xlabel("Support")
        axis.set_ylabel("Time(s)")
        axis.legend(loc="best")

    plt.show()


plotTimeWithSupportEachCountries(unitedKingdom, counties[:3])


In [None]:
counties[:2]

In [None]:
unitedKingdom

In [None]:
x = np.linspace(0, 10)
fig, ax = plt.subplots(1, 3, figsize=(10, 5))
ax = ax.ravel()
for i in range(3):
    y = np.sin(x+i)
    ax[i].plot(x, y)
    ax[i].set_title(f"{i}")
plt.show()

In [None]:
support_with_time = [{0.2: 0.0336321360009606, 0.3: 0.02212243100075284, 0.4: 0.010942570999759482, 0.5: 0.0036832910009252373, 0.6: 0.00968838699918706,
                      0.7: 0.009691577000921825, 0.8: 0.0073171189997083275, 0.9: 0.010909566000918858, 1: 0.009190041000692872},
                     {0.2: 0.04421744100000069, 0.3: 0.050529127000118024, 0.4: 0.014789639999435167, 0.5: 0.008559862999391044,
                      0.6: 0.007137236001653946, 0.7: 0.013940286000433844, 0.8: 0.012399811999785015, 0.9: 0.01900387900059286, 1: 0.01578570700075943}
                     ]


In [None]:
import matplotlib.pyplot as plt


x = np.linspace(0, 10)
fig, ax = plt.subplots(1, 3, figsize=(10, 5))
ax = ax.ravel()
for i in range(3):
    y = np.sin(x+i)
    ax[i].plot(x, y)
    ax[i].set_title(f"{i}")
plt.show()


In [None]:
unitedKingdom[0]['France']


In [None]:
# rule_by_every_countries() needs the two transaction tables, the row counts
# and the support list in addition to the countries.
france = rule_by_every_countries(
    [counties[2]], apriori_itemset=item_sets,
    tranction_reduction_itemsets=itemset_reductabc,
    rc_values=rc_values, supports=supports)


### New DataSet


In [None]:
path = "/home/sokhorn/sokhorn/dataSet/groceries/Groceries_dataset.csv"
df = pd.read_csv(path)
df


In [None]:
# NOTE(review): this assumes the CSV has a 'Quantity' column - confirm
# against the file.
grocery_itemsets = (
    df.groupby(['Member_number', 'itemDescription'])['Quantity']
    .sum().unstack().reset_index().fillna(0)
    .set_index("Member_number")
    # 1 where the summed quantity is positive, 0 otherwise (vectorised
    # equivalent of applymap(lambda x: 1 if x > 0 else 0)).
    .gt(0).astype(int)
)
grocery_itemsets


In [None]:
itemsets, grocery_apriori_time = apriori(
    grocery_itemsets, min_support=0.05, use_colnames=True)
itemsets


In [None]:
rules = association_rules(itemsets, metric='confidence', min_threshold=0.3)
rules
