# Market Basket Analysis with Python

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("input/Market_Basket_Optimisation.csv", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [29]:
# Create a list of transaction
df['Transactions']= df.values.tolist()
df['Transactions']

0       [shrimp, almonds, avocado, vegetables mix, gre...
1       [burgers, meatballs, eggs, nan, nan, nan, nan,...
2       [chutney, nan, nan, nan, nan, nan, nan, nan, n...
3       [turkey, avocado, nan, nan, nan, nan, nan, nan...
4       [mineral water, milk, energy bar, whole wheat ...
                              ...                        
7496    [butter, light mayo, fresh bread, nan, nan, na...
7497    [burgers, frozen vegetables, eggs, french frie...
7498    [chicken, nan, nan, nan, nan, nan, nan, nan, n...
7499    [escalope, green tea, nan, nan, nan, nan, nan,...
7500    [eggs, frozen smoothie, yogurt cake, low fat y...
Name: Transactions, Length: 7501, dtype: object

In [30]:
# Delete NaN from the transaction list
df['Transactions'] = df['Transactions'].apply(lambda x: [i for i in x if str(i) != "nan"])
df['Transactions']

0       [shrimp, almonds, avocado, vegetables mix, gre...
1       [burgers, meatballs, eggs, [burgers, meatballs...
2                                    [chutney, [chutney]]
3                    [turkey, avocado, [turkey, avocado]]
4       [mineral water, milk, energy bar, whole wheat ...
                              ...                        
7496    [butter, light mayo, fresh bread, [butter, lig...
7497    [burgers, frozen vegetables, eggs, french frie...
7498                                 [chicken, [chicken]]
7499         [escalope, green tea, [escalope, green tea]]
7500    [eggs, frozen smoothie, yogurt cake, low fat y...
Name: Transactions, Length: 7501, dtype: object

In [31]:
# Convert the transaction list from a DataFrame column into a list of strings
transactions = list(df['Transactions'])
transactions.count(['burgers', 'meatballs', 'eggs'])

0

In [32]:
# Count the number of rules

# Import library to count the number of permutations
from itertools import permutations

# Extract unique items.
unique_items = [item for transaction in transactions for item in transaction]

# Convert the unique item list from a string to a list
unique_item_list = list(set(unique_items))

# Compute and print rules.
rules = list(permutations(unique_item_list, 2))
print(len(rules))
print(rules)

TypeError: unhashable type: 'list'

### 3.1. Support

In [10]:
!pip install mlxtend

In [11]:
# Import the library for encoding
from mlxtend.preprocessing import TransactionEncoder

# Instantiate transaction encoder
encoder = TransactionEncoder().fit(transactions)

In [12]:
# One-hot encode itemsets by applying fit and transform
onehot = encoder.transform(transactions)

# Convert one-hot encoded data to DataFrame
onehot = pd.DataFrame(onehot, columns = encoder.columns_)
print(onehot)

       asparagus  almonds  antioxydant juice  asparagus  avocado  babies food  \
0          False     True               True      False     True        False   
1          False    False              False      False    False        False   
2          False    False              False      False    False        False   
3          False    False              False      False     True        False   
4          False    False              False      False    False        False   
...          ...      ...                ...        ...      ...          ...   
7496       False    False              False      False    False        False   
7497       False    False              False      False    False        False   
7498       False    False              False      False    False        False   
7499       False    False              False      False    False        False   
7500       False    False              False      False    False        False   

      bacon  barbecue sauce

In [13]:
# Computing Support for Single Items
print(onehot.mean())

 asparagus           0.000133
almonds              0.020397
antioxydant juice    0.008932
asparagus            0.004666
avocado              0.033329
                       ...   
whole wheat pasta    0.029463
whole wheat rice     0.058526
yams                 0.011465
yogurt cake          0.027330
zucchini             0.009465
Length: 120, dtype: float64


In [14]:
# Define itemset that contains both eggs and ground beef
onehot['eggs_&_ground beef'] = np.logical_and(onehot['eggs'], onehot['ground beef'])

# Compute Support for itemset that contains both eggs and ground beef 
print(onehot['eggs_&_ground beef'].mean())

0.019997333688841486


In [15]:
# Drop the column of "eggs_&_ground beef" to keep the dataset simple 
onehot=onehot.drop('eggs_&_ground beef', axis=1)
onehot

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7497,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7498,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7499,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### 3.2. Confidence

In [16]:
# Compute Support for the itemsets that contains eggs and ground beef
support_eggs_groundbeef = np.logical_and(onehot['eggs'], onehot['ground beef']).mean()
support_eggs = onehot['eggs'].mean()
support_groundbeef = onehot['ground beef'].mean()

In [17]:
# Compute and print Confidence {eggs -> ground beef}
confidence_eggs_to_groundbeef = support_eggs_groundbeef / support_eggs
print(confidence_eggs_to_groundbeef)

0.11127596439169138


### 3.3. Lift

In [18]:
# Compute and print Lift {eggs -> ground beef}
lift_eggs_to_groundbeef = support_eggs_groundbeef / (support_eggs * support_groundbeef)
print(lift_eggs_to_groundbeef)

1.1325386823637411


In [19]:
lift2 = confidence_eggs_to_groundbeef / support_groundbeef
print(lift2)

1.132538682363741


## 4. Apriori Algorithm

In [20]:
# Import Apriori algorithm
from mlxtend.frequent_patterns import apriori

In [21]:
# Compute frequent itemsets
frequent_itemsets = apriori(onehot, min_support = 0.0005,max_len = 4, use_colnames = True)

# Print number of itemsets
print(len(frequent_itemsets))

19788


In [22]:
# Print frequent itemsets
print(frequent_itemsets.head())

    support             itemsets
0  0.020397            (almonds)
1  0.008932  (antioxydant juice)
2  0.004666          (asparagus)
3  0.033329            (avocado)
4  0.004533        (babies food)


In [23]:
# Import Apriori algorithm
from mlxtend.frequent_patterns import apriori, association_rules

# Compute association rules
Rules = association_rules(frequent_itemsets,
                          metric = "support",
                          min_threshold = 0.005)

In [24]:
# Print association rules
Rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(burgers),(almonds),0.087188,0.020397,0.005199,0.059633,2.923577,0.003421,1.041724
1,(almonds),(burgers),0.020397,0.087188,0.005199,0.254902,2.923577,0.003421,1.225089
2,(chocolate),(almonds),0.163845,0.020397,0.005999,0.036615,1.795099,0.002657,1.016834
3,(almonds),(chocolate),0.020397,0.163845,0.005999,0.294118,1.795099,0.002657,1.184553
4,(eggs),(almonds),0.179709,0.020397,0.006532,0.036350,1.782108,0.002867,1.016555
...,...,...,...,...,...,...,...,...,...
1935,"(olive oil, pancakes)",(spaghetti),0.010799,0.174110,0.005066,0.469136,2.694478,0.003186,1.555746
1936,"(spaghetti, pancakes)",(olive oil),0.025197,0.065858,0.005066,0.201058,3.052910,0.003407,1.169224
1937,(olive oil),"(spaghetti, pancakes)",0.065858,0.025197,0.005066,0.076923,3.052910,0.003407,1.056037
1938,(spaghetti),"(olive oil, pancakes)",0.174110,0.010799,0.005066,0.029096,2.694478,0.003186,1.018846


In [25]:
# Print the rules.
print(Rules)

                antecedents             consequents  antecedent support  \
0                 (burgers)               (almonds)            0.087188   
1                 (almonds)               (burgers)            0.020397   
2               (chocolate)               (almonds)            0.163845   
3                 (almonds)             (chocolate)            0.020397   
4                    (eggs)               (almonds)            0.179709   
...                     ...                     ...                 ...   
1935  (olive oil, pancakes)             (spaghetti)            0.010799   
1936  (spaghetti, pancakes)             (olive oil)            0.025197   
1937            (olive oil)   (spaghetti, pancakes)            0.065858   
1938            (spaghetti)   (olive oil, pancakes)            0.174110   
1939             (pancakes)  (olive oil, spaghetti)            0.095054   

      consequent support   support  confidence      lift  leverage  conviction  
0               0.

In [26]:
filtered_rules = Rules[(Rules['antecedent support'] > 0.01) & 
                       (Rules['support'] > 0.009) & 
                       (Rules['confidence'] > 0.5) & 
                       (Rules['lift'] > 1.00)]

In [27]:
filtered_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1406,"(ground beef, eggs)",(mineral water),0.019997,0.238368,0.010132,0.506667,2.125563,0.005365,1.543848
1592,"(frozen vegetables, ground beef)",(mineral water),0.016931,0.238368,0.009199,0.543307,2.279277,0.005163,1.667711
1736,"(ground beef, milk)",(mineral water),0.021997,0.238368,0.011065,0.50303,2.110308,0.005822,1.532552


In [28]:
# Computing support.
supportASAL = np.logical_and(onehot['asparagus'],onehot['almonds']).mean()
supportAS = onehot['asparagus'].mean()
supportAL = onehot['almonds'].mean()

# Compute and print confidence and lift.
confidence = supportASAL / supportAS
lift = supportASAL / (supportAS * supportAL)

# Print results.
print(supportAL, confidence, lift)

0.020397280362618318 0.0 0.0
