### Market Basket Analysis

### Import Libraries

In [34]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

### Read the data, encode the data

In [37]:
df = pd.read_csv('Market_Basket_Optimisation.csv',header=None) #header=None, as the dataset does not have column names.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [41]:
df.shape

(7501, 20)

### Convert Rows into Transactions 
- Creates an empty list to store all transactions : transactions = [ ]
- Loops over each row in the DataFrame : len(df) = number of transactions
- Creates an empty list for one transaction : t = [ ]
- Loops over each column in the row : df.shape[1] = number of columns
- Fetches the value at: row i and column j by using df.iloc[i,j] # iloc is integer-based indexing
- Checks if the cell is NOT NaN as Apriori cannot process NaN and We only want actual items
- If df.iloc[i,j] is not NaN then converts the item to a string.  Add the string to transaction list : t
- After all columns of a row are processed then adds the completed transaction 't' to transactions 

In [42]:
transactions = []

for i in range(len(df)):
    t = []
    for j in range(df.shape[1]):
        item = df.iloc[i, j]
        if pd.notna(item):          # remove NaN
            t.append(str(item))
    transactions.append(t)

In [43]:
## Just for understanding
print(transactions[1])
print(transactions[2111])

['burgers', 'meatballs', 'eggs']
['mineral water', 'cider']


In [44]:
## Just for understanding
max_len = 0
for t in transactions :
    if len(t) > max_len :
        maxt = t
        max_len = len(t)
print(max_len)
print(maxt)

20
['shrimp', 'almonds', 'avocado', 'vegetables mix', 'green grapes', 'whole weat flour', 'yams', 'cottage cheese', 'energy drink', 'tomato juice', 'low fat yogurt', 'green tea', 'honey', 'salad', 'mineral water', 'salmon', 'antioxydant juice', 'frozen smoothie', 'spinach', 'olive oil']


### TransactionEncoder

In [45]:
te=TransactionEncoder()
te_array=te.fit(transactions).transform(transactions)
df=pd.DataFrame(te_array, columns=te.columns_)
df 

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7497,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7498,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7499,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Find the frequent itemsets

In [50]:
freq_items = apriori(df, min_support = 0.05, use_colnames = True) 
freq_items['length'] = freq_items['itemsets'].apply(lambda x:len(x))
print(freq_items) 

     support                    itemsets  length
0   0.087188                   (burgers)       1
1   0.081056                      (cake)       1
2   0.059992                   (chicken)       1
3   0.163845                 (chocolate)       1
4   0.080389                   (cookies)       1
5   0.051060               (cooking oil)       1
6   0.179709                      (eggs)       1
7   0.079323                  (escalope)       1
8   0.170911              (french fries)       1
9   0.063325           (frozen smoothie)       1
10  0.095321         (frozen vegetables)       1
11  0.052393             (grated cheese)       1
12  0.132116                 (green tea)       1
13  0.098254               (ground beef)       1
14  0.076523            (low fat yogurt)       1
15  0.129583                      (milk)       1
16  0.238368             (mineral water)       1
17  0.065858                 (olive oil)       1
18  0.095054                  (pancakes)       1
19  0.071457        

In [52]:
# Find all frequent 2-item combinations that appear in at least 5% of transactions.
# Select frequent itemsets such that: It contain exactly 2 items and have support ≥ 0.05

freq_items[(freq_items['length']==2 )& (freq_items['support']>=0.05)]

Unnamed: 0,support,itemsets,length
25,0.05266,"(mineral water, chocolate)",2
26,0.050927,"(eggs, mineral water)",2
27,0.059725,"(spaghetti, mineral water)",2


## Apply Apriori algorithm

In [54]:
rules = association_rules(freq_items, metric ='support', min_threshold=0.05 ) 
print(rules)

       antecedents      consequents  antecedent support  consequent support  \
0  (mineral water)      (chocolate)            0.238368            0.163845   
1      (chocolate)  (mineral water)            0.163845            0.238368   
2           (eggs)  (mineral water)            0.179709            0.238368   
3  (mineral water)           (eggs)            0.238368            0.179709   
4      (spaghetti)  (mineral water)            0.174110            0.238368   
5  (mineral water)      (spaghetti)            0.238368            0.174110   

    support  confidence      lift  representativity  leverage  conviction  \
0  0.052660    0.220917  1.348332               1.0  0.013604    1.073256   
1  0.052660    0.321400  1.348332               1.0  0.013604    1.122357   
2  0.050927    0.283383  1.188845               1.0  0.008090    1.062815   
3  0.050927    0.213647  1.188845               1.0  0.008090    1.043158   
4  0.059725    0.343032  1.439085               1.0  0.018223

In [55]:
rules = rules.sort_values(['support', 'confidence'], ascending =[False,False]) 
print(rules)

       antecedents      consequents  antecedent support  consequent support  \
4      (spaghetti)  (mineral water)            0.174110            0.238368   
5  (mineral water)      (spaghetti)            0.238368            0.174110   
1      (chocolate)  (mineral water)            0.163845            0.238368   
0  (mineral water)      (chocolate)            0.238368            0.163845   
2           (eggs)  (mineral water)            0.179709            0.238368   
3  (mineral water)           (eggs)            0.238368            0.179709   

    support  confidence      lift  representativity  leverage  conviction  \
4  0.059725    0.343032  1.439085               1.0  0.018223    1.159314   
5  0.059725    0.250559  1.439085               1.0  0.018223    1.102008   
1  0.052660    0.321400  1.348332               1.0  0.013604    1.122357   
0  0.052660    0.220917  1.348332               1.0  0.013604    1.073256   
2  0.050927    0.283383  1.188845               1.0  0.008090

In [56]:
# Extract support values and print results
support_values = rules['support']
print(support_values)
# confidence_values = rules['confidence']

4    0.059725
5    0.059725
1    0.052660
0    0.052660
2    0.050927
3    0.050927
Name: support, dtype: float64


In [57]:
# Extract confidence values and print results
confidence_values = rules['confidence']
print(confidence_values)

4    0.343032
5    0.250559
1    0.321400
0    0.220917
2    0.283383
3    0.213647
Name: confidence, dtype: float64


In [58]:
# Display Frequent Itemsets
print("Frequent Itemsets:")
print(freq_items) 


Frequent Itemsets:
     support                    itemsets  length
0   0.087188                   (burgers)       1
1   0.081056                      (cake)       1
2   0.059992                   (chicken)       1
3   0.163845                 (chocolate)       1
4   0.080389                   (cookies)       1
5   0.051060               (cooking oil)       1
6   0.179709                      (eggs)       1
7   0.079323                  (escalope)       1
8   0.170911              (french fries)       1
9   0.063325           (frozen smoothie)       1
10  0.095321         (frozen vegetables)       1
11  0.052393             (grated cheese)       1
12  0.132116                 (green tea)       1
13  0.098254               (ground beef)       1
14  0.076523            (low fat yogurt)       1
15  0.129583                      (milk)       1
16  0.238368             (mineral water)       1
17  0.065858                 (olive oil)       1
18  0.095054                  (pancakes)       1
1