# Apriori Estimator
Implementation of Apriori Algorithm

In [609]:
import pandas as pd
import numpy as np
import itertools
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import sys
sys.path.append('..')
from utils.utils import flatten, is_subset_of

In [172]:
data_path = '../data/store_data.csv'

In [173]:
# def flatten(array):
#     """
#     Returns a list o flatten elements of every inner lists (or tuples)
#         ****RECURSIVE****
#     """
#     res = []
#     for el in array:
#         if isinstance(el, (list, tuple)):
#             res.extend(flatten(el))
#             continue
#         res.append(el)
#     return res
# def is_subset_of(a, b):
#     for da in a:
#         if da not in b:
#             return False
#     return True

In [519]:
class AprioriEstimator():
    
    def __init__(self, df_data):
        self.df_data = df_data
        self.dshape = df_data.shape
        self.le = LabelEncoder()
        self.extract_items(df_data)
    
    @property
    def data(self):
        return self.df_data
    @property
    def shape(self):
        return self.dshape
    
    @property
    def l1label(self):
        return list(zip(self.itemset, self.l1_count))
    @property
    def Encoder(self):
        return self.le 
    
    def extract_items(self, data):
        self.raw_items = []
        for index, row in data.iterrows():
            row = pd.Series(row).dropna().values
            self.raw_items.append(np.array(row).tolist())
        self.itemset, self.l1_count = np.unique(flatten(self.raw_items), return_counts=True)
        self.le.fit(self.itemset)
        self.encoded_data = [ self.le.transform(d) for d in self.raw_items ]
        

    def apriori_gen(self, lk, n):
        c = flatten(lk)
        c = np.unique(c)
        return [[dc, 0] for dc in itertools.combinations(c, n) ]

    def ffis(self, min_sup):
        lk = self.le.transform(self.itemset)[self.l1_count > min_sup]
        i=2
        lks = lk
        while len(lk) > 1:
            lks = lk
            ck = self.apriori_gen(lk, i)
            with tqdm(total = len(self.encoded_data)) as pbar:
                for row in self.encoded_data:
                    for index, c in enumerate(ck):
                        if is_subset_of(c[0], row):
                            ck[index][1] += 1;
                    pbar.update(1)
            lk = [ c  for c in ck if int(c[1]) > min_sup]
            i+=1
        return lks
            
    def support(self, com):
        com_s = set(com)
        count = 0
        for i, row in enumerate(self.encoded_data):
            if com_s.issubset(row) and set(row).issuperset(com_s):
                count += 1
        return count/self.dshape[0]
    
    def gen_rule(self, fis, min_cond):
        rules = []
        for m, m_c in fis:
            for i in range(len(m)-1):
                for com in itertools.combinations(m, i+1):
                    sm = self.support(m)
                    scom = self.support(com)
                    if sm/scom >= min_cond:
                         rules.append({'rule': [com, tuple(set(m).difference(com))], 'confidence': sm/scom, 'support': sm})
        return rules
    
    def print_out_rules(self, rules):
        for dr in rules:
            print(f"{self.le.inverse_transform(dr['rule'][0])} => {self.le.inverse_transform(dr['rule'][1])}, with confidence {dr['confidence']} and support {dr['support']}")

In [520]:
store_data_df = pd.read_csv(data_path, header=None)

In [521]:
store_data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [522]:
ape = AprioriEstimator(store_data_df[:1000])

In [532]:
aso = ape.ffis(10)

100%|██████████| 1000/1000 [00:11<00:00, 85.28it/s]
100%|██████████| 1000/1000 [02:47<00:00,  5.87it/s]
100%|██████████| 1000/1000 [00:40<00:00, 24.98it/s]


In [533]:
[(ape.Encoder.inverse_transform(a[0]).tolist(), a[1]) for a in aso]

[(['burgers', 'eggs', 'mineral water'], 13),
 (['chocolate', 'eggs', 'milk'], 16),
 (['chocolate', 'eggs', 'mineral water'], 14),
 (['chocolate', 'eggs', 'spaghetti'], 13),
 (['chocolate', 'french fries', 'mineral water'], 11),
 (['chocolate', 'french fries', 'spaghetti'], 13),
 (['chocolate', 'frozen vegetables', 'milk'], 13),
 (['chocolate', 'frozen vegetables', 'spaghetti'], 11),
 (['chocolate', 'green tea', 'mineral water'], 11),
 (['chocolate', 'ground beef', 'mineral water'], 11),
 (['chocolate', 'milk', 'mineral water'], 18),
 (['chocolate', 'milk', 'spaghetti'], 17),
 (['chocolate', 'mineral water', 'soup'], 12),
 (['chocolate', 'mineral water', 'spaghetti'], 16),
 (['cooking oil', 'eggs', 'mineral water'], 11),
 (['eggs', 'french fries', 'milk'], 11),
 (['eggs', 'milk', 'mineral water'], 16),
 (['eggs', 'mineral water', 'shrimp'], 12),
 (['eggs', 'mineral water', 'spaghetti'], 17),
 (['eggs', 'pancakes', 'spaghetti'], 11),
 (['escalope', 'mineral water', 'spaghetti'], 11),
 ([

In [546]:
rules = ape.gen_rule(aso, 0.1)

In [547]:
ape.print_out_rules(rules)

['burgers'] => ['mineral water' 'eggs'], with confidence 0.16249999999999998 and support 0.013
['burgers' 'eggs'] => ['mineral water'], with confidence 0.38235294117647056 and support 0.013
['burgers' 'mineral water'] => ['eggs'], with confidence 0.5416666666666666 and support 0.013
['eggs' 'mineral water'] => ['burgers'], with confidence 0.21666666666666667 and support 0.013
['milk'] => ['chocolate' 'eggs'], with confidence 0.11764705882352941 and support 0.016
['chocolate' 'eggs'] => ['milk'], with confidence 0.38095238095238093 and support 0.016
['chocolate' 'milk'] => ['eggs'], with confidence 0.34782608695652173 and support 0.016
['eggs' 'milk'] => ['chocolate'], with confidence 0.45714285714285713 and support 0.016
['chocolate' 'eggs'] => ['mineral water'], with confidence 0.3333333333333333 and support 0.014
['chocolate' 'mineral water'] => ['eggs'], with confidence 0.23728813559322035 and support 0.014
['eggs' 'mineral water'] => ['chocolate'], with confidence 0.233333333333333

In [527]:
rearrange_data = []
for index, row in store_data_df.iterrows():
    temp = [ 1 if names in set(row) else 0 for names, _ in ape.l1label]
    rearrange_data.append(temp)
rearr_df = pd.DataFrame(rearrange_data, columns=[name for name, _ in ape.l1label])

KeyboardInterrupt: 

In [345]:
rearr_df.head()

Unnamed: 0,almonds,antioxydant juice,asparagus,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,body spray,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,1,1,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [346]:
for index, row in store_data_df.iterrows():
    print([ 1 if names in set(row) else 0 for names, _ in ape.l1label])
    break

[1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0]


In [374]:
rearr_df[:500].to_csv('rearr_store_data.csv', index=False)

In [640]:
keys , counts = np.unique(flatten(ape.encoded_data), return_counts=True)
ht_arr = np.array([(i,j) for i,j in zip(keys, counts)], dtype=[('key', int), ('counts', int)])
rst = np.flip(np.sort(ht_arr, order='counts'))
rst = filter(lambda x: x if x[1]>0 else None, rst)
rst = [ [d]  for d in rst]
list(rst)
r = [1,2,3]
rst[0].append(r)

In [641]:
rst[0][1].append(3)

In [642]:
r

[1, 2, 3, 3]