## Market Basket Analysis with Python

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("input/Market_Basket_Optimisation.csv", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [2]:
# Create a list of transaction
df['Transactions']= df.values.tolist()
df['Transactions']

0       [shrimp, almonds, avocado, vegetables mix, gre...
1       [burgers, meatballs, eggs, nan, nan, nan, nan,...
2       [chutney, nan, nan, nan, nan, nan, nan, nan, n...
3       [turkey, avocado, nan, nan, nan, nan, nan, nan...
4       [mineral water, milk, energy bar, whole wheat ...
                              ...                        
7496    [butter, light mayo, fresh bread, nan, nan, na...
7497    [burgers, frozen vegetables, eggs, french frie...
7498    [chicken, nan, nan, nan, nan, nan, nan, nan, n...
7499    [escalope, green tea, nan, nan, nan, nan, nan,...
7500    [eggs, frozen smoothie, yogurt cake, low fat y...
Name: Transactions, Length: 7501, dtype: object

In [3]:
# Delete NaN from the transaction list
df['Transactions'] = df['Transactions'].apply(lambda x: [i for i in x if str(i) != "nan"])
df['Transactions']

0       [shrimp, almonds, avocado, vegetables mix, gre...
1                              [burgers, meatballs, eggs]
2                                               [chutney]
3                                       [turkey, avocado]
4       [mineral water, milk, energy bar, whole wheat ...
                              ...                        
7496                    [butter, light mayo, fresh bread]
7497    [burgers, frozen vegetables, eggs, french frie...
7498                                            [chicken]
7499                                [escalope, green tea]
7500    [eggs, frozen smoothie, yogurt cake, low fat y...
Name: Transactions, Length: 7501, dtype: object

In [4]:
# Convert the transaction list from a DataFrame column into a list of strings
transactions = list(df['Transactions'])
transactions.count(['burgers', 'meatballs', 'eggs'])

1

### 1.Count the number of rules

In [5]:
# Import library to count the number of permutations
from itertools import permutations

# Extract unique items.
unique_items = [item for transaction in transactions for item in transaction]

# Convert the unique item list from a string to a list
unique_item_list = list(set(unique_items))

# Compute and print rules.
rules = list(permutations(unique_item_list, 2))
print(len(rules))
print(rules)

14280
[('pepper', 'chicken'), ('pepper', 'bramble'), ('pepper', 'dessert wine'), ('pepper', 'champagne'), ('pepper', 'green grapes'), ('pepper', 'grated cheese'), ('pepper', 'oil'), ('pepper', 'vegetables mix'), ('pepper', 'black tea'), ('pepper', 'whole wheat pasta'), ('pepper', 'salad'), ('pepper', 'cider'), ('pepper', 'shallot'), ('pepper', 'milk'), ('pepper', 'red wine'), ('pepper', 'tomato sauce'), ('pepper', 'pancakes'), ('pepper', 'spinach'), ('pepper', 'burgers'), ('pepper', 'clothes accessories'), ('pepper', 'fresh bread'), ('pepper', 'meatballs'), ('pepper', 'ketchup'), ('pepper', 'herb & pepper'), ('pepper', 'fromage blanc'), ('pepper', 'energy bar'), ('pepper', 'water spray'), ('pepper', 'honey'), ('pepper', 'olive oil'), ('pepper', 'light mayo'), ('pepper', 'sandwich'), ('pepper', 'oatmeal'), ('pepper', 'yogurt cake'), ('pepper', 'cereals'), ('pepper', 'tomatoes'), ('pepper', 'cauliflower'), ('pepper', 'light cream'), ('pepper', 'mayonnaise'), ('pepper', 'turkey'), ('peppe

### 2.Support

In [6]:
#!pip install mlxtend

In [7]:
from mlxtend.preprocessing import TransactionEncoder

# Instantiate transaction encoder
encoder = TransactionEncoder().fit(transactions)

# One-hot encode itemsets by applying fit and transform
onehot = encoder.transform(transactions)

# Convert one-hot encoded data to DataFrame
onehot = pd.DataFrame(onehot, columns = encoder.columns_)

# Computing Support for Single Items
print(onehot.mean())

 asparagus           0.000133
almonds              0.020397
antioxydant juice    0.008932
asparagus            0.004666
avocado              0.033329
                       ...   
whole wheat pasta    0.029463
whole wheat rice     0.058526
yams                 0.011465
yogurt cake          0.027330
zucchini             0.009465
Length: 120, dtype: float64


In [8]:
# Define itemset that contains both eggs and ground beef
onehot['eggs_&_ground beef'] = np.logical_and(onehot['eggs'], onehot['ground beef'])

# Compute Support for itemset that contains both eggs and ground beef 
print(onehot['eggs_&_ground beef'].mean())

0.019997333688841486


### 3.Confidence

In [9]:
# Compute Support for the itemsets that contains eggs and ground beef
support_eggs_groundbeef = np.logical_and(onehot['eggs'], onehot['ground beef']).mean()
support_eggs = onehot['eggs'].mean()
support_groundbeef = onehot['ground beef'].mean()

# Compute and print Confidence {eggs -> ground beef}
confidence_eggs_to_groundbeef = support_eggs_groundbeef / support_eggs
print(confidence_eggs_to_groundbeef)

0.11127596439169138


### 4.Lift

In [10]:
# Compute and print Lift {eggs -> ground beef}
lift_eggs_to_groundbeef = support_eggs_groundbeef / (support_eggs * support_groundbeef)
print(lift_eggs_to_groundbeef)

1.1325386823637411


In [11]:
lift2 = confidence_eggs_to_groundbeef / support_groundbeef
print(lift2)

1.132538682363741


### 5. Apriori Algorithm

In [12]:
from mlxtend.frequent_patterns import apriori

# Compute frequent itemsets
frequent_itemsets = apriori(onehot, min_support = 0.0005,max_len = 4, use_colnames = True)
print(frequent_itemsets.head())

    support             itemsets
0  0.020397            (almonds)
1  0.008932  (antioxydant juice)
2  0.004666          (asparagus)
3  0.033329            (avocado)
4  0.004533        (babies food)


In [13]:
from mlxtend.frequent_patterns import apriori, association_rules

# Compute association rules
Rules = association_rules(frequent_itemsets, metric = "support", min_threshold = 0.005)
Rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(burgers),(almonds),0.087188,0.020397,0.005199,0.059633,2.923577,0.003421,1.041724
1,(almonds),(burgers),0.020397,0.087188,0.005199,0.254902,2.923577,0.003421,1.225089
2,(chocolate),(almonds),0.163845,0.020397,0.005999,0.036615,1.795099,0.002657,1.016834
3,(almonds),(chocolate),0.020397,0.163845,0.005999,0.294118,1.795099,0.002657,1.184553
4,(almonds),(eggs),0.020397,0.179709,0.006532,0.320261,1.782108,0.002867,1.206774
...,...,...,...,...,...,...,...,...,...
2057,"(eggs_&_ground beef, spaghetti)","(eggs, ground beef)",0.008932,0.019997,0.008932,1.000000,50.006667,0.008754,inf
2058,(eggs),"(ground beef, eggs_&_ground beef, spaghetti)",0.179709,0.008932,0.008932,0.049703,5.564540,0.007327,1.042904
2059,(ground beef),"(eggs, eggs_&_ground beef, spaghetti)",0.098254,0.008932,0.008932,0.090909,10.177748,0.008055,1.090175
2060,(eggs_&_ground beef),"(eggs, ground beef, spaghetti)",0.019997,0.008932,0.008932,0.446667,50.006667,0.008754,1.791086


In [14]:
filtered_rules = Rules[(Rules['antecedent support'] > 0.01) & 
                       (Rules['support'] > 0.009) & 
                       (Rules['confidence'] > 0.5) & 
                       (Rules['lift'] > 1.00)]

filtered_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
409,(eggs_&_ground beef),(eggs),0.019997,0.179709,0.019997,1.0,5.56454,0.016404,inf
691,(eggs_&_ground beef),(ground beef),0.019997,0.098254,0.019997,1.0,10.177748,0.018033,inf
833,(eggs_&_ground beef),(mineral water),0.019997,0.238368,0.010132,0.506667,2.125563,0.005365,1.543848
1432,"(eggs, ground beef)",(mineral water),0.019997,0.238368,0.010132,0.506667,2.125563,0.005365,1.543848
1442,"(eggs, ground beef)",(eggs_&_ground beef),0.019997,0.019997,0.019997,1.0,50.006667,0.019597,inf
1443,"(eggs, eggs_&_ground beef)",(ground beef),0.019997,0.098254,0.019997,1.0,10.177748,0.018033,inf
1444,"(ground beef, eggs_&_ground beef)",(eggs),0.019997,0.179709,0.019997,1.0,5.56454,0.016404,inf
1447,(eggs_&_ground beef),"(eggs, ground beef)",0.019997,0.019997,0.019997,1.0,50.006667,0.019597,inf
1515,"(mineral water, eggs_&_ground beef)",(eggs),0.010132,0.179709,0.010132,1.0,5.56454,0.008311,inf
1516,"(eggs, eggs_&_ground beef)",(mineral water),0.019997,0.238368,0.010132,0.506667,2.125563,0.005365,1.543848


In [15]:
# Computing support.
supportASAL = np.logical_and(onehot['asparagus'],onehot['almonds']).mean()
supportAS = onehot['asparagus'].mean()
supportAL = onehot['almonds'].mean()

# Compute and print confidence and lift.
confidence = supportASAL / supportAS
lift = supportASAL / (supportAS * supportAL)

# Print results.
print(supportAL, confidence, lift)

0.020397280362618318 0.0 0.0
