In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
basket = pd.read_csv('basket_dataset/Market_Basket_Optimisation.csv', header=None)

In [3]:
basket.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [4]:
table = []

In [5]:
list_products = np.array(list(set(basket.values.flatten())))

In [6]:
list_products = list_products[list_products != 'nan']

In [7]:
try:
    for b in basket.loc:
        table.append(np.isin(list_products, b.values).astype(int))
except:
    pass

In [8]:
basket_encoded = pd.DataFrame(table, columns=list_products)

In [33]:
basket_encoded.head()

Unnamed: 0,cream,nonfat milk,blueberries,tomatoes,black tea,herb & pepper,yogurt cake,french fries,green beans,salad,...,antioxydant juice,whole wheat rice,magazines,honey,whole weat flour,muffins,fromage blanc,hot dogs,mushroom cream sauce,mashed potato
0,0,0,0,0,0,0,0,0,0,1,...,1,0,0,1,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [9]:
def generate_permutations(columns, n):
    if n == 1:
        return [(k,) for k in columns]
    permutations = []
    for i, c in enumerate(columns):
        permutations.extend([(c,) + p for p in generate_permutations(columns[i+1:], n-1)])
    return permutations

def generate_all_possible_combinations(columns, n=3):
    combinations = []
    for i in range(1, n+1):
        combinations.extend(generate_permutations(columns, i))
    return combinations 

In [32]:
generate_all_possible_combinations([1,2,3,4], 3)

[(1,),
 (2,),
 (3,),
 (4,),
 (1, 2),
 (1, 3),
 (1, 4),
 (2, 3),
 (2, 4),
 (3, 4),
 (1, 2, 3),
 (1, 2, 4),
 (1, 3, 4),
 (2, 3, 4)]

In [11]:
all_possible_basket_combinations = generate_all_possible_combinations(list_products, n=3)

In [12]:
def calculate_combination_probability(df, combination):
    logic = None
    for product in combination:
        if logic is None:
            logic = df[product] == 1
            continue
        logic &= (df[product] == 1)
    return len(df[logic]) / len(df)

In [13]:
probabilities = {}
for combination in all_possible_basket_combinations:
    probabilities[frozenset(combination)] = calculate_combination_probability(basket_encoded, combination)

In [14]:
def generate_rules(combination):
    possible_comb = generate_all_possible_combinations(combination)
    rules = []
    for if_ in possible_comb:
        for then_ in possible_comb:
            if set(if_).intersection(set(then_)).__len__() != 0:
                continue
            rules.append((if_, then_))
    return rules

In [15]:
rules_generator = list(filter(lambda x: len(x) > 1, all_possible_basket_combinations))

In [16]:
rules = []

for combination in rules_generator:
    rules.extend(generate_rules(combination))

In [17]:
rules = list(set(rules))

In [18]:
rules_confidence = {}
lift = {}
support = {}

In [19]:
for (if_, then_) in rules:
    support[(if_, then_)] = probabilities[frozenset(if_ + then_)]
    if probabilities[frozenset(if_)] != 0:
        rules_confidence[(if_, then_)] = probabilities[frozenset(if_ + then_)] / probabilities[frozenset(if_)]
        if probabilities[frozenset(then_)] != 0:
            lift[(if_, then_)] = probabilities[frozenset(if_ + then_)] / (probabilities[frozenset(if_)] * probabilities[frozenset(then_)])

In [20]:
def association_rules(support_lower=0.05, confidence_lower=0.9, lift_lower=1):
    rules_list = list(lift.keys())
    return list(filter(lambda x: support[x] > support_lower and rules_confidence[x] > confidence_lower and lift[x] > lift_lower, rules_list))

In [21]:
pd.DataFrame(association_rules(support_lower=0.0005))

Unnamed: 0,0,1
0,"(body spray, low fat yogurt)","(mineral water,)"
1,"(blueberries, red wine)","(spaghetti,)"
2,"(whole wheat pasta, mushroom cream sauce)","(french fries,)"
3,"(clothes accessories, salmon)","(mineral water,)"
4,"(salt, tomato sauce)","(french fries,)"
5,"(bug spray, almonds)","(spaghetti,)"
6,"(red wine, soup)","(mineral water,)"
7,"(salmon, chutney)","(mineral water,)"
8,"(white wine, toothpaste)","(avocado,)"
9,"(salt, tomato sauce)","(red wine,)"


### Using Decision Tree to help with rules selection

For example, we want to see milk pattern

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
tree = DecisionTreeClassifier(max_depth=5)

In [24]:
interested_product = 'milk'

In [25]:
y = basket_encoded[interested_product]
X = basket_encoded.drop(interested_product, axis=1)

In [26]:
tree.fit(X, y)

In [27]:
def get_association_rules_from_tree(tree, feature_names, out):
    left      = tree.tree_.children_left
    right     = tree.tree_.children_right
    threshold = tree.tree_.threshold
    features  = [feature_names[i] for i in tree.tree_.feature]

    # get ids of child nodes
    idx = np.argwhere(left == -1)[:,0]     

    def recurse(left, right, child, lineage=None):
        if lineage is None:
            buy = np.argmax(tree.tree_.value[child]) 
            lineage = [buy]
        if child in left:
            parent = np.where(left == child)[0].item()
            split = 'l'
        else:
            parent = np.where(right == child)[0].item()
            split = 'r'

        lineage.append((parent, split, threshold[parent], features[parent]))

        if parent == 0:
            lineage.reverse()
            return lineage
        else:
            return recurse(left, right, parent, lineage)
    
    rules = []

    currule = tuple()
    for child in idx:
        for node in recurse(left, right, child):
            if isinstance(node, np.int64):
                if node == 1:
                    rules.append((currule, out))
                currule = tuple()
                continue
            if node[1] == 'r':
                currule = (node[3],) + currule
    return rules


In [28]:
rules = get_association_rules_from_tree(tree, tree.feature_names_in_, out=('milk',))

In [29]:
support_milk = {}
confidence_milk = {}
lift_milk = {}

In [30]:
for comb1, comb2 in rules:
    support_milk[(comb1, comb2)] = calculate_combination_probability(basket_encoded, comb1 + comb2)
    confidence_milk[(comb1, comb2)] = support_milk[(comb1, comb2)] / calculate_combination_probability(basket_encoded, comb1)
    lift_milk[(comb1, comb2)] = confidence_milk[(comb1, comb2)] / calculate_combination_probability(basket_encoded, comb2)

In [31]:
lift_milk

{(('green beans', 'soup'), ('milk',)): 5.144718792866941,
 (('mushroom cream sauce', 'frozen vegetables'), ('milk',)): 3.177620430888405,
 (('cooking oil', 'frozen vegetables'), ('milk',)): 3.1776204308884046,
 (('pancakes', 'whole wheat pasta'), ('milk',)): 3.3073192239858913,
 (('spaghetti', 'whole wheat pasta'), ('milk',)): 3.5077628133183696,
 (('meatballs', 'whole wheat pasta'), ('milk',)): 6.430898491083676,
 (('cereals', 'shrimp', 'mineral water'), ('milk',)): 5.144718792866941,
 (('strong cheese', 'spaghetti', 'mineral water'),
  ('milk',)): 4.409758965314521,
 (('cereals', 'frozen smoothie', 'spaghetti', 'mineral water'),
  ('milk',)): 7.717078189300412,
 (('shrimp', 'soup', 'mineral water'), ('milk',)): 4.630246913580247,
 (('french wine', 'soup', 'mineral water'), ('milk',)): 6.430898491083676,
 (('frozen vegetables', 'soup', 'mineral water'),
  ('milk',)): 4.670863114576565}

### Class Implementation of Association Rules and Association Trees

In [34]:
from dataclasses import dataclass

In [35]:
@dataclass
class AssociationRule:
    if_: tuple
    then_: tuple
    support: float
    confidence: float
    lift: float

In [53]:
class AssociaitionRulesModel:
    def __init__(self, support_lower: float, confidence_lower: float, lift_lower: float, n_products=3):
        self.support_lower = support_lower
        self.confidence_lower = confidence_lower
        self.lift_lower = lift_lower
        self.n_products = n_products
    
    @staticmethod
    def generate_permutations(columns: list, n: int):
        if n == 1:
            return [(k,) for k in columns]
        permutations = []
        for i, c in enumerate(columns):
            permutations.extend([(c,) + p for p in generate_permutations(columns[i+1:], n-1)])
        return permutations

    @staticmethod
    def generate_all_possible_combinations(columns: list, n: int):
        combinations = []
        for i in range(1, n+1):
            combinations.extend(generate_permutations(columns, i))
        return combinations 
    
    @staticmethod
    def calculate_combination_probability(df: pd.DataFrame, combination: tuple):
        logic = None
        for product in combination:
            if logic is None:
                logic = df[product] == 1
                continue
            logic &= (df[product] == 1)
        return len(df[logic]) / len(df)
    
    def calculate_all_probabilities(self, X: pd.DataFrame):
        self.probabilities_ = {}
        for combination in self.all_possible_combinations_:
            self.probabilities_[frozenset(combination)] = calculate_combination_probability(X, combination)
        return self.probabilities_
    
    def generate_all_rules(self):
        rules = []
        rules_generator = list(filter(lambda x: len(x) > 1, self.all_possible_combinations_))
        for combination in rules_generator:
            rules.extend(self.generate_combination_rules(combination))
        self.all_rules_ = list(set(rules))
        return self.all_rules_
    
    def generate_combination_rules(self, combination: tuple):
        possible_comb = self.generate_all_possible_combinations(combination, self.n_products)
        rules = []
        for if_ in possible_comb:
            for then_ in possible_comb:
                if set(if_).intersection(set(then_)).__len__() != 0:
                    continue
                rules.append((if_, then_))
        return rules
    
    def calculate_rule_metric(self, rule: tuple):
        if_, then_ = rule
        support = self.probabilities_[frozenset(if_ + then_)]
        confidence = 0
        lift = 0
        if self.probabilities_[frozenset(if_)] != 0:
            confidence = self.probabilities_[frozenset(if_ + then_)] / self.probabilities_[frozenset(if_)]
            if self.probabilities_[frozenset(then_)] != 0:
                lift = self.probabilities_[frozenset(if_ + then_)] / (self.probabilities_[frozenset(if_)] * self.probabilities_[frozenset(then_)])
        association_rule = AssociationRule(if_=if_, then_=then_, support=support, confidence=confidence, lift=lift)
        return association_rule
    
    def calculate_metrics(self):
        self.association_metrics_ = []
        for rule in self.all_rules_:
            self.association_metrics_.append(self.calculate_rule_metric(rule))
        return self.association_metrics_
    
    def association_rules(self):
        return list(filter(lambda x: x.support > self.support_lower and x.confidence > self.confidence_lower and x.lift > self.lift_lower, self.association_metrics_))

    def fit(self, X: pd.DataFrame):
        self.feature_names_ = X.columns
        self.all_possible_combinations_ = self.generate_all_possible_combinations(X.columns, self.n_products)
        self.calculate_all_probabilities(X)
        self.generate_all_rules()
        self.calculate_metrics()
        self.association_rules_ = self.association_rules()
    
    def accuracy(self, Xtest: pd.DataFrame):
        test_rules = []
        for rule in self.association_rules_:
            if_ = rule.if_; then_ = rule.then_
            support = self.calculate_combination_probability(Xtest, if_ + then_)
            if_prob = self.calculate_combination_probability(Xtest, if_)
            then_prob = self.calculate_combination_probability(Xtest, then_)
            confidence = support / if_prob if if_prob != 0 else 0
            lift = confidence / then_prob if then_prob != 0 else 0
            test_rule = AssociationRule(if_, then_, support, confidence, lift)
            test_rules.append(test_rule)
        return test_rules
            

In [62]:
from sklearn.model_selection import KFold

In [64]:
train_test_idx = list(KFold().split(basket_encoded))

In [72]:
test_results = {}
for i, (train, test) in enumerate(train_test_idx):
    xtrain = basket_encoded.loc[train]
    xtest = basket_encoded.loc[test]
    model = AssociaitionRulesModel(0.0005, 0.5, 1, 2)
    model.fit(xtrain)
    test_results[i] = model.accuracy(xtest)

In [73]:
test_results

{0: [AssociationRule(if_=('tomato sauce',), then_=('spaghetti',), support=0.003997335109926716, confidence=0.23076923076923075, lift=1.273472850678733),
  AssociationRule(if_=('burger sauce',), then_=('spaghetti',), support=0.0006662225183211193, confidence=0.09090909090909091, lift=0.5016711229946524),
  AssociationRule(if_=('nonfat milk',), then_=('mineral water',), support=0.002664890073284477, confidence=0.3333333333333333, lift=1.3745421245421243),
  AssociationRule(if_=('mayonnaise',), then_=('mineral water',), support=0.0006662225183211193, confidence=0.125, lift=0.5154532967032966)],
 1: [AssociationRule(if_=('chocolate bread',), then_=('mineral water',), support=0.0006666666666666666, confidence=0.14285714285714285, lift=0.6105006105006104),
  AssociationRule(if_=('cream',), then_=('spaghetti',), support=0.0, confidence=0.0, lift=0.0)],
 2: [],
 3: [AssociationRule(if_=('extra dark chocolate',), then_=('mineral water',), support=0.002, confidence=0.2, lift=0.819672131147541),


### Adding Decision Trees to help with Association Rules Selection

In [83]:
class AssociationTree(AssociaitionRulesModel):
    def __init__(self, support_lower: float, confidence_lower: float, lift_lower: float, n_products=3):
        super().__init__(support_lower, confidence_lower, lift_lower, n_products)
        self.probabilities_ = {}
    
    @staticmethod
    def get_association_rules_from_tree(tree: DecisionTreeClassifier, feature_names: list, out: tuple):
        left      = tree.tree_.children_left
        right     = tree.tree_.children_right
        threshold = tree.tree_.threshold
        features  = [feature_names[i] for i in tree.tree_.feature]

        # get ids of child nodes
        idx = np.argwhere(left == -1)[:,0]     

        def recurse(left, right, child, lineage=None):
            if lineage is None:
                buy = np.argmax(tree.tree_.value[child]) 
                lineage = [buy]
            if child in left:
                parent = np.where(left == child)[0].item()
                split = 'l'
            else:
                parent = np.where(right == child)[0].item()
                split = 'r'

            lineage.append((parent, split, threshold[parent], features[parent]))

            if parent == 0:
                lineage.reverse()
                return lineage
            else:
                return recurse(left, right, parent, lineage)
        
        rules = []

        currule = tuple()
        for child in idx:
            for node in recurse(left, right, child):
                if isinstance(node, np.int64):
                    if node == 1:
                        rules.append((currule, out))
                    currule = tuple()
                    continue
                if node[1] == 'r':
                    currule = (node[3],) + currule
        return rules
    
    def generate_feature_name_rules(self, X, feature_name):
        tree = DecisionTreeClassifier(max_depth=self.n_products)
        y = X[feature_name]
        Xtree = X.drop(feature_name, axis=1)
        tree.fit(Xtree, y)
        return self.get_association_rules_from_tree(tree, tree.feature_names_in_, out=(feature_name,))

    def generate_all_rules(self, X: pd.DataFrame):
        self.all_rules_ = []
        for feature_name in self.feature_names_:
            rules = self.generate_feature_name_rules(X, feature_name)
            self.all_rules_.extend(rules)
    
    def calculate_rule_metric(self, rule: tuple):
        if_, then_ = rule
        if frozenset(if_ + then_) not in self.probabilities_.keys():
            p = self.calculate_combination_probability(self._X, if_ + then_)
            self.probabilities_[frozenset(if_ + then_)] = p
        if frozenset(if_) not in self.probabilities_.keys():
            p = self.calculate_combination_probability(self._X, if_)
            self.probabilities_[frozenset(if_)] = p
        if frozenset(then_) not in self.probabilities_.keys():
            p = self.calculate_combination_probability(self._X, then_)
            self.probabilities_[frozenset(then_)] = p
        support = self.probabilities_[frozenset(if_ + then_)]
        confidence = 0
        lift = 0
        if self.probabilities_[frozenset(if_)] != 0:
            confidence = self.probabilities_[frozenset(if_ + then_)] / self.probabilities_[frozenset(if_)]
            if self.probabilities_[frozenset(then_)] != 0:
                lift = self.probabilities_[frozenset(if_ + then_)] / (self.probabilities_[frozenset(if_)] * self.probabilities_[frozenset(then_)])
        association_rule = AssociationRule(if_=if_, then_=then_, support=support, confidence=confidence, lift=lift)
        return association_rule
    
    def fit(self, X: pd.DataFrame):
        self._X = X.copy()
        self.feature_names_ = X.columns
        self.generate_all_rules(X)
        self.calculate_metrics()
        self.association_rules_ = self.association_rules()

In [84]:
basket_encoded.columns.__len__()

120

In [96]:
assotree = AssociationTree(1/750, 0.5, 1, 5)
assotree.fit(basket_encoded)

In [98]:
pd.DataFrame(assotree.association_rules_)

Unnamed: 0,if_,then_,support,confidence,lift
0,"(black tea, turkey)","(eggs,)",0.001466,0.733333,4.080663
1,"(rice, herb & pepper)","(ground beef,)",0.001866,0.636364,6.476748
2,"(shrimp, soup, mineral water)","(milk,)",0.0016,0.6,4.630247
3,"(frozen vegetables, soup, mineral water)","(milk,)",0.003066,0.605263,4.670863
4,"(eggs, tomato sauce)","(chocolate,)",0.001466,0.55,3.356835
5,"(mushroom cream sauce, pasta)","(escalope,)",0.002533,0.95,11.976387
6,"(ham, olive oil)","(mineral water,)",0.002,0.555556,2.330661
7,"(frozen vegetables, soup)","(mineral water,)",0.005066,0.633333,2.656954
8,"(olive oil, frozen vegetables, soup)","(mineral water,)",0.001733,0.8125,3.408592
9,"(red wine, soup)","(mineral water,)",0.001866,0.933333,3.915511
