# Apriori Estimator
Implementation of the Apriori algorithm for mining frequent itemsets from transaction data.

In [287]:
import pandas as pd
import numpy as np
import itertools
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [172]:
# Relative path to the CSV file that is loaded with pd.read_csv further down.
data_path = '../data/store_data.csv'

In [173]:
def flatten(array):
    """
    Return a flat list with the elements of every nested list or tuple.

    Nested lists/tuples are expanded recursively, in order; any other
    element (including strings and numpy arrays) is kept as a single item.
    """
    flat = []
    for item in array:
        if not isinstance(item, (list, tuple)):
            flat.append(item)
        else:
            # Recurse into the nested container and splice its items in place.
            flat += flatten(item)
    return flat
def is_subset_of(a, b):
    """Return True when every element of `a` is contained in `b`.

    An empty `a` is a subset of anything, so the result is True.
    """
    return all(member in b for member in a)

In [315]:
class AprioriEstimator():
    """Find frequent itemsets in a transaction table with the Apriori algorithm.

    Every row of `df_data` is treated as one transaction; NaN cells are
    dropped and the remaining cell values are the items of that transaction.
    Items are mapped to integer ids with a LabelEncoder, and all itemsets
    handled by `apriori_gen` / `ffis` are tuples of those ids.
    """

    def __init__(self, df_data):
        """Store the data frame and immediately extract and encode its items."""
        self.df_data = df_data
        self.dshape = df_data.shape
        self.le = LabelEncoder()
        self.extract_items(df_data)

    @property
    def data(self):
        """The data frame passed to the constructor."""
        return self.df_data

    @property
    def shape(self):
        """Shape of the data frame passed to the constructor."""
        return self.dshape

    @property
    def l1label(self):
        """List of (item, occurrence count) pairs for every distinct item."""
        return list(zip(self.itemset, self.l1_count))

    @property
    def Encoder(self):
        """The LabelEncoder fitted on the distinct items."""
        return self.le

    def extract_items(self, data):
        """Build the transaction lists, the distinct items and the encoded rows.

        Sets `raw_items` (one list of non-NaN cell values per row), `itemset`
        and `l1_count` (sorted distinct items and how often each occurs),
        fits the encoder on `itemset`, and sets `encoded_data` (each
        transaction as an array of encoded item ids).
        """
        self.raw_items = [
            np.array(pd.Series(row).dropna().values).tolist()
            for _, row in data.iterrows()
        ]
        # raw_items is exactly one level deep (a list of row lists).
        all_items = list(itertools.chain.from_iterable(self.raw_items))
        self.itemset, self.l1_count = np.unique(all_items, return_counts=True)
        self.le.fit(self.itemset)
        self.encoded_data = [self.le.transform(d) for d in self.raw_items]

    def apriori_gen(self, lk, n):
        """Generate candidate itemsets of size `n` from the frequent sets `lk`.

        `lk` may contain `[itemset, count]` pairs (as returned by `ffis`),
        bare itemsets (lists/tuples of items), or single items. Only the
        items are used; support counts are never mixed into the item pool.

        When every itemset in `lk` has size `n - 1`, the Apriori prune step
        is applied: a candidate is kept only if all of its (n-1)-subsets are
        present in `lk`, because a superset of an infrequent set cannot be
        frequent.

        Returns a list of `[candidate_tuple, 0]` pairs, with the items of each
        candidate in sorted order and the counter initialised to zero.
        """
        previous = set()
        for entry in lk:
            if isinstance(entry, (list, tuple)):
                is_pair = len(entry) == 2 and isinstance(entry[0], (list, tuple))
                members = entry[0] if is_pair else entry
            else:
                members = (entry,)
            previous.add(frozenset(members))

        items = sorted(set().union(*previous))
        # Pruning is only sound when lk really is the level directly below n.
        can_prune = n > 1 and all(len(known) == n - 1 for known in previous)

        candidates = []
        for combo in itertools.combinations(items, n):
            if can_prune and not all(
                frozenset(sub) in previous
                for sub in itertools.combinations(combo, n - 1)
            ):
                continue
            candidates.append([combo, 0])
        return candidates

    def ffis(self, min_sup):
        """Return the largest frequent itemsets together with their counts.

        An itemset is frequent when its count is strictly greater than
        `min_sup` (an absolute count, not a ratio). Single items are counted
        by occurrence (`l1_count`); larger itemsets are counted by the number
        of transactions that contain all of their items.

        Levels are grown one item at a time until no larger frequent itemset
        exists. The result is the last non-empty level as a list of
        `[itemset_tuple, count]` pairs; it is an empty list when no single
        item is frequent. A progress bar is shown for every counted level.
        """
        frequent = self.l1_count > min_sup
        encoded = self.le.transform(self.itemset)[frequent].tolist()
        counts = self.l1_count[frequent].tolist()
        # Level 1 uses the same [itemset, count] layout as every other level.
        level = [[(item,), count] for item, count in zip(encoded, counts)]

        # Sets give constant-time membership tests while counting support.
        transactions = [
            frozenset(np.asarray(row).tolist()) for row in self.encoded_data
        ]

        size = 2
        while len(level) > 1:
            candidates = self.apriori_gen(level, size)
            if not candidates:
                break
            for transaction in tqdm(transactions):
                for candidate in candidates:
                    if transaction.issuperset(candidate[0]):
                        candidate[1] += 1
            survivors = [c for c in candidates if int(c[1]) > min_sup]
            if not survivors:
                break
            # Keep the new level even if it holds a single itemset, so the
            # largest frequent itemset is not lost when the loop stops.
            level = survivors
            size += 1
        return level
            
        
        
        

In [316]:
# header=None: the CSV has no header row, so the first line is read as data
# and the columns get integer labels.
store_data_df = pd.read_csv(data_path, header=None)

In [317]:
# Preview the first rows; missing cells show up as NaN.
store_data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [318]:
# Build the estimator on the first 500 rows only.
ape = AprioriEstimator(store_data_df[:500])

In [319]:
# min_sup=5 is an absolute count: ffis keeps itemsets whose count is
# strictly greater than 5.
aso = ape.ffis(5)

100%|██████████| 500/500 [00:05<00:00, 97.29it/s]
100%|██████████| 500/500 [01:05<00:00,  8.06it/s]
100%|██████████| 500/500 [00:18<00:00, 26.60it/s]


In [321]:
# Each entry of aso is [encoded_itemset, count]; map the encoded ids back to
# item names for display.
[(ape.Encoder.inverse_transform(a[0]).tolist(), a[1]) for a in aso]

[(['burgers', 'eggs', 'milk'], 6),
 (['burgers', 'eggs', 'mineral water'], 8),
 (['burgers', 'eggs', 'turkey'], 6),
 (['burgers', 'french fries', 'milk'], 6),
 (['burgers', 'milk', 'mineral water'], 7),
 (['chocolate', 'eggs', 'milk'], 9),
 (['chocolate', 'eggs', 'mineral water'], 9),
 (['chocolate', 'eggs', 'spaghetti'], 7),
 (['chocolate', 'french fries', 'mineral water'], 7),
 (['chocolate', 'french fries', 'spaghetti'], 8),
 (['chocolate', 'frozen vegetables', 'milk'], 6),
 (['chocolate', 'frozen vegetables', 'mineral water'], 6),
 (['chocolate', 'milk', 'mineral water'], 10),
 (['chocolate', 'milk', 'spaghetti'], 8),
 (['chocolate', 'mineral water', 'soup'], 7),
 (['chocolate', 'mineral water', 'spaghetti'], 9),
 (['chocolate', 'mineral water', 'tomatoes'], 6),
 (['cooking oil', 'eggs', 'mineral water'], 7),
 (['eggs', 'french fries', 'milk'], 6),
 (['eggs', 'french fries', 'spaghetti'], 7),
 (['eggs', 'herb & pepper', 'mineral water'], 6),
 (['eggs', 'milk', 'mineral water'], 8),