In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
basket = pd.read_csv('basket_dataset/Market_Basket_Optimisation.csv', header=None)

In [3]:
basket.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [4]:
table = []

In [5]:
list_products = np.array(list(set(basket.values.flatten())))

In [6]:
list_products = list_products[list_products != 'nan']

In [7]:
try:
    for b in basket.loc:
        table.append(np.isin(list_products, b.values).astype(int))
except:
    pass

In [8]:
basket_encoded = pd.DataFrame(table, columns=list_products)

In [9]:
def generate_permutations(columns, n):
    if n == 1:
        return [(k,) for k in columns]
    permutations = []
    for i, c in enumerate(columns):
        permutations.extend([(c,) + p for p in generate_permutations(columns[i+1:], n-1)])
    return permutations

def generate_all_possible_combinations(columns, n=3):
    combinations = []
    for i in range(1, n+1):
        combinations.extend(generate_permutations(columns, i))
    return combinations 

In [10]:
generate_all_possible_combinations([1,2,3,4,5], 3)

[(1,),
 (2,),
 (3,),
 (4,),
 (5,),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (2, 3),
 (2, 4),
 (2, 5),
 (3, 4),
 (3, 5),
 (4, 5),
 (1, 2, 3),
 (1, 2, 4),
 (1, 2, 5),
 (1, 3, 4),
 (1, 3, 5),
 (1, 4, 5),
 (2, 3, 4),
 (2, 3, 5),
 (2, 4, 5),
 (3, 4, 5)]

In [11]:
all_possible_basket_combinations = generate_all_possible_combinations(list_products)

In [12]:
all_possible_basket_combinations

[('chicken',),
 ('energy drink',),
 ('ketchup',),
 ('burger sauce',),
 ('honey',),
 ('light mayo',),
 ('herb & pepper',),
 ('strawberries',),
 ('pepper',),
 ('blueberries',),
 ('candy bars',),
 ('chocolate',),
 ('parmesan cheese',),
 ('green tea',),
 ('antioxydant juice',),
 ('mint green tea',),
 ('mayonnaise',),
 ('tea',),
 ('pet food',),
 ('soda',),
 ('carrots',),
 ('mashed potato',),
 ('tomatoes',),
 ('chocolate bread',),
 ('butter',),
 ('frozen vegetables',),
 ('mineral water',),
 ('white wine',),
 ('rice',),
 ('protein bar',),
 ('whole weat flour',),
 ('green grapes',),
 ('fresh tuna',),
 ('black tea',),
 ('ham',),
 ('bramble',),
 ('soup',),
 ('chili',),
 ('cottage cheese',),
 ('babies food',),
 ('shampoo',),
 ('melons',),
 ('corn',),
 ('eggs',),
 ('spinach',),
 ('hot dogs',),
 ('flax seed',),
 ('cookies',),
 ('cake',),
 ('green beans',),
 ('chutney',),
 ('cider',),
 ('spaghetti',),
 ('mint',),
 ('water spray',),
 ('napkins',),
 ('milk',),
 ('meatballs',),
 ('cream',),
 ('pasta',)

In [13]:
def calculate_combination_probability(df, combination):
    logic = None
    for product in combination:
        if logic is None:
            logic = df[product] == 1
            continue
        logic &= (df[product] == 1)
    return len(df[logic]) / len(df)

In [14]:
probabilities = {}
for combination in all_possible_basket_combinations:
    probabilities[frozenset(combination)] = calculate_combination_probability(basket_encoded, combination)

In [15]:
def generate_rules(combination):
    possible_comb = generate_all_possible_combinations(combination)
    rules = []
    for if_ in possible_comb:
        for then_ in possible_comb:
            if set(if_).intersection(set(then_)).__len__() != 0:
                continue
            rules.append((if_, then_))
    return rules

In [16]:
rules_generator = list(filter(lambda x: len(x) > 1, all_possible_basket_combinations))

In [17]:
rules = []

for combination in rules_generator:
    rules.extend(generate_rules(combination))

In [18]:
rules = list(set(rules))

In [24]:
rules_confidence = {}
lift = {}
support = {}

In [25]:
for (if_, then_) in rules:
    support[(if_, then_)] = probabilities[frozenset(if_ + then_)]
    if probabilities[frozenset(if_)] != 0:
        rules_confidence[(if_, then_)] = probabilities[frozenset(if_ + then_)] / probabilities[frozenset(if_)]
        if probabilities[frozenset(then_)] != 0:
            lift[(if_, then_)] = probabilities[frozenset(if_ + then_)] / (probabilities[frozenset(if_)] * probabilities[frozenset(then_)])

In [26]:
def association_rules(support_lower=0.05, confidence_lower=0.9, lift_lower=1):
    rules_list = list(lift.keys())
    return list(filter(lambda x: support[x] > support_lower and rules_confidence[x] > confidence_lower and lift[x] > lift_lower, rules_list))

In [37]:
pd.DataFrame(association_rules(support_lower=0.0005))

Unnamed: 0,0,1
0,"(shampoo, tomato juice)","(mineral water,)"
1,"(candy bars, olive oil)","(mineral water,)"
2,"(meatballs, red wine)","(spaghetti,)"
3,"(bug spray, almonds)","(spaghetti,)"
4,"(mint green tea, ham)","(french fries,)"
5,"(low fat yogurt, body spray)","(mineral water,)"
6,"(fromage blanc, champagne)","(frozen vegetables,)"
7,"(white wine, shallot)","(escalope,)"
8,"(mashed potato, brownies)","(chocolate,)"
9,"(soup, pasta)","(shrimp,)"
