### Market Basket Analysis on Groceries Dataset

### Import Libraries

In [14]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

### Read the data, encode the data

This csv dataset has 3 columns:
1. Member_number: ID of customers
2. Date: Date of purchases
3. itemDescription: Description of product purchased

In [62]:
# Load the raw purchase log and preview its first rows.
csv_path = 'Groceries_dataset.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [63]:
# Show the dimensions and the column names of the raw data.
# Only the last expression of a cell is echoed, so a bare `df.shape` placed
# before another statement is evaluated and silently discarded; print it.
print(df.shape)
print(df.columns.tolist())

['Member_number', 'Date', 'itemDescription']


In [64]:
# The 'Date' column is not used for this task, so we drop it.
# Market basket analysis here focuses on which items are bought together;
# baskets are built per customer further down, not per shopping date.
#
# Reassigning (instead of `inplace=True`) and passing `errors='ignore'`
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError because 'Date' is already gone.
df = df.drop(columns='Date', errors='ignore')
df.head()

Unnamed: 0,Member_number,itemDescription
0,1808,tropical fruit
1,2552,whole milk
2,2300,pip fruit
3,1187,other vegetables
4,3037,whole milk


In [67]:
# Spot check: list every item recorded for one example customer.
member_id = 1808
is_member = df['Member_number'] == member_id
items = df.loc[is_member, 'itemDescription'].tolist()
print(items)

['tropical fruit', 'long life bakery product', 'meat', 'sugar', 'rolls/buns', 'semi-finished bread', 'whole milk', 'citrus fruit', 'candy', 'napkins']


### Convert to One Basket per Customer
- `df.groupby('Member_number')`: groups the DataFrame by the `Member_number` column.
- `df.groupby('Member_number')['itemDescription']`: selects only the `itemDescription` column for aggregation.
- `.apply(list)`: takes all `itemDescription` values within each group and combines them into a Python list.
- `.reset_index(name='Basket')`: converts the resulting Series back into a DataFrame; `Member_number` becomes a regular column instead of the index, and the lists are stored in a column named `Basket`.

In [68]:
# Collapse the purchase log into one row per customer whose 'Basket' cell
# holds the list of all item descriptions recorded for that customer.
items_by_member = df.groupby('Member_number')['itemDescription']
df = items_by_member.apply(list).reset_index(name='Basket')
df.head()

Unnamed: 0,Member_number,Basket
0,1000,"[soda, canned beer, sausage, sausage, whole mi..."
1,1001,"[frankfurter, frankfurter, beef, sausage, whol..."
2,1002,"[tropical fruit, butter milk, butter, frozen v..."
3,1003,"[sausage, root vegetables, rolls/buns, deterge..."
4,1004,"[other vegetables, pip fruit, root vegetables,..."


In [69]:
# Turn the 'Basket' column into a plain list of lists (one inner list per
# row of `df`) and print one basket as an example.
transactions = df['Basket'].to_list()
print(transactions[20])

['rolls/buns', 'liquor', 'curd', 'sliced cheese', 'baking powder', 'yogurt', 'tropical fruit', 'butter']


### TransactionEncoder

In [70]:
# One-hot encode the baskets: one row per basket, one boolean column per
# distinct item, True where the basket contains that item.
te = TransactionEncoder()
te.fit(transactions)
te_array = te.transform(transactions)
df = pd.DataFrame(data=te_array, columns=te.columns_)
df.head()

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,True,False,...,False,False,False,True,False,True,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [71]:
# Per-item support statistics on the one-hot encoded frame.
# Support of an item = (number of rows containing it) / (number of rows).
n = df.shape[0]
item_counts = df.sum(axis=0)  # column-wise sum: True counts as 1

max_freq = item_counts.max()
most_frequent_item = item_counts.idxmax()
mean_freq = item_counts.mean()
median_freq = item_counts.median()

# The format spec is ":.2f" rather than ": .2f": the leading space in the
# latter is the "space" sign flag, which inserts an extra blank before every
# non-negative number and produced a double space after each label.
print(f"Most frequent item: {most_frequent_item}")
print(f"max support: {max_freq / n:.2f}")

print(f"Mean support of items: {mean_freq / n:.2f}")
print(f"Median support of items: {median_freq / n:.2f}")


Most frequent item: whole milk
max support:  0.46
Mean support of items:  0.05
Median support of items:  0.02


## Find the frequent itemsets

In [72]:
# Mine the itemsets whose support is at least 0.05, keeping item names
# (not column indices) in the result, and record each itemset's size.
freq_items = apriori(df, min_support=0.05, use_colnames=True)
freq_items['length'] = freq_items['itemsets'].apply(len)
print(freq_items)

      support                                        itemsets  length
0    0.078502                                      (UHT-milk)       1
1    0.119548                                          (beef)       1
2    0.079785                                       (berries)       1
3    0.062083                                     (beverages)       1
4    0.158799                                  (bottled beer)       1
..        ...                                             ...     ...
160  0.050539  (whole milk, other vegetables, tropical fruit)       3
161  0.071832          (whole milk, other vegetables, yogurt)       3
162  0.065162                  (rolls/buns, soda, whole milk)       3
163  0.065931                (rolls/buns, yogurt, whole milk)       3
164  0.054387                      (whole milk, soda, yogurt)       3

[165 rows x 3 columns]


In [73]:
# Keep only the frequent pairs: itemsets that contain exactly 2 items and
# have support >= 0.05 (i.e. appear in at least 5% of transactions).

is_pair = freq_items['length'] == 2
meets_support = freq_items['support'] >= 0.05
freq_items[is_pair & meets_support]

Unnamed: 0,support,itemsets,length
57,0.050795,"(other vegetables, beef)",2
58,0.064135,"(whole milk, beef)",2
59,0.068497,"(bottled beer, other vegetables)",2
60,0.063109,"(rolls/buns, bottled beer)",2
61,0.055156,"(bottled beer, soda)",2
...,...,...,...
149,0.097486,"(soda, yogurt)",2
150,0.116470,"(whole milk, tropical fruit)",2
151,0.075680,"(yogurt, tropical fruit)",2
152,0.079785,"(whole milk, whipped/sour cream)",2


## Generate association rules from the frequent itemsets

In [74]:
# Derive association rules from the frequent itemsets, keeping the rules
# whose 'support' metric reaches the 0.05 threshold.
rules = association_rules(
    freq_items,
    metric='support',
    min_threshold=0.05,
)
print(rules)

              antecedents           consequents  antecedent support  \
0      (other vegetables)                (beef)            0.376603   
1                  (beef)    (other vegetables)            0.119548   
2            (whole milk)                (beef)            0.458184   
3                  (beef)          (whole milk)            0.119548   
4          (bottled beer)    (other vegetables)            0.158799   
..                    ...                   ...                 ...   
255  (whole milk, yogurt)                (soda)            0.150590   
256        (soda, yogurt)          (whole milk)            0.097486   
257          (whole milk)        (soda, yogurt)            0.458184   
258                (soda)  (whole milk, yogurt)            0.313494   
259              (yogurt)    (whole milk, soda)            0.282966   

     consequent support   support  confidence      lift  representativity  \
0              0.119548  0.050795    0.134877  1.128223               

In [75]:
# Rank the rules: highest support first, ties broken by highest confidence.
rules = rules.sort_values(by=['support', 'confidence'], ascending=False)
print(rules)

            antecedents         consequents  antecedent support  \
119  (other vegetables)        (whole milk)            0.376603   
118        (whole milk)  (other vegetables)            0.458184   
152        (rolls/buns)        (whole milk)            0.349666   
153        (whole milk)        (rolls/buns)            0.458184   
183              (soda)        (whole milk)            0.313494   
..                  ...                 ...                 ...   
89          (margarine)  (other vegetables)            0.116983   
63               (curd)        (rolls/buns)            0.120831   
62         (rolls/buns)              (curd)            0.349666   
88   (other vegetables)         (margarine)            0.376603   
46         (whole milk)           (chicken)            0.458184   

     consequent support   support  confidence      lift  representativity  \
119            0.458184  0.191380    0.508174  1.109106               1.0   
118            0.376603  0.191380    0.41

In [76]:
# Pull the 'support' column out of the rules table and print it.
support_values = rules.loc[:, 'support']
print(support_values)

119    0.191380
118    0.191380
152    0.178553
153    0.178553
183    0.151103
         ...   
89     0.050026
63     0.050026
62     0.050026
88     0.050026
46     0.050026
Name: support, Length: 260, dtype: float64


In [77]:
# Pull the 'confidence' column out of the rules table and print it.
confidence_values = rules.loc[:, 'confidence']
print(confidence_values)

119    0.508174
118    0.417693
152    0.510638
153    0.389698
183    0.481997
         ...   
89     0.427632
63     0.414013
62     0.143067
88     0.132834
46     0.109183
Name: confidence, Length: 260, dtype: float64


In [59]:
# Print the frequent-itemsets table under a heading line.
print("Frequent Itemsets:", freq_items, sep="\n")


Frequent Itemsets:
      support                                        itemsets  length
0    0.078502                                      (UHT-milk)       1
1    0.119548                                          (beef)       1
2    0.079785                                       (berries)       1
3    0.062083                                     (beverages)       1
4    0.158799                                  (bottled beer)       1
..        ...                                             ...     ...
160  0.050539  (whole milk, other vegetables, tropical fruit)       3
161  0.071832          (whole milk, other vegetables, yogurt)       3
162  0.065162                  (rolls/buns, soda, whole milk)       3
163  0.065931                (rolls/buns, yogurt, whole milk)       3
164  0.054387                      (whole milk, soda, yogurt)       3

[165 rows x 3 columns]
