In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

#loading dataset
retail = pd.read_csv("online_retail.csv", encoding='ISO-8859-1')
print(retail.tail())

         index InvoiceNo StockCode                      Description  Quantity  \
541904  541904    581587     22613      PACK OF 20 SPACEBOY NAPKINS        12   
541905  541905    581587     22899     CHILDREN'S APRON DOLLY GIRL          6   
541906  541906    581587     23254    CHILDRENS CUTLERY DOLLY GIRL          4   
541907  541907    581587     23255  CHILDRENS CUTLERY CIRCUS PARADE         4   
541908  541908    581587     22138    BAKING SET 9 PIECE RETROSPOT          3   

            InvoiceDate  UnitPrice  CustomerID Country  
541904  12/9/2011 12:50       0.85     12680.0  France  
541905  12/9/2011 12:50       2.10     12680.0  France  
541906  12/9/2011 12:50       4.15     12680.0  France  
541907  12/9/2011 12:50       4.15     12680.0  France  
541908  12/9/2011 12:50       4.95     12680.0  France  


In [8]:
#basic cleaning
#dropping rows with missing customerID
retail.dropna(subset=['CustomerID'], inplace=True)
#removing cancelled transactions
retail = retail[~retail['InvoiceNo'].astype(str).str.startswith('C')]
#removing extra spaces from descriptions
retail['Description'] = retail['Description'].str.strip()
#filter for france only (its a very large dataset..)
france_subset = retail[retail['Country'] == "France"]

print(f"Total Transactions: {retail.shape[0]}")
print(f"France Transactions: {france_subset.shape[0]}")
print(france_subset.head()) 

Total Transactions: 397924
France Transactions: 8342
    index InvoiceNo StockCode                      Description  Quantity  \
26     26    536370     22728        ALARM CLOCK BAKELIKE PINK        24   
27     27    536370     22727         ALARM CLOCK BAKELIKE RED        24   
28     28    536370     22726       ALARM CLOCK BAKELIKE GREEN        12   
29     29    536370     21724  PANDA AND BUNNIES STICKER SHEET        12   
30     30    536370     21883                  STARS GIFT TAPE        24   

       InvoiceDate  UnitPrice  CustomerID Country  
26  12/1/2010 8:45       3.75     12583.0  France  
27  12/1/2010 8:45       3.75     12583.0  France  
28  12/1/2010 8:45       3.75     12583.0  France  
29  12/1/2010 8:45       0.85     12583.0  France  
30  12/1/2010 8:45       0.65     12583.0  France  


In [10]:
#creating basket matrix
basket = (france_subset.groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))

#conerting to binary
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
basket_sets = basket.map(encode_units)
basket_sets = basket.applymap(encode_units) 

#dropping the 'Postage' column as it is usually a line item, not a real product
if 'POSTAGE' in france_subset.columns:
    basket_sets.drop('POSTAGE', inplace=True, axis=1)
print((f"Matrix Shape: {basket_sets.shape}"))
print(basket_sets.head()) 

Matrix Shape: (389, 1543)
Description  10 COLOUR SPACEBOY PEN  12 COLOURED PARTY BALLOONS  \
InvoiceNo                                                         
536370                            0                           0   
536852                            0                           0   
536974                            0                           0   
537065                            0                           0   
537463                            0                           0   

Description  12 EGG HOUSE PAINTED WOOD  12 MESSAGE CARDS WITH ENVELOPES  \
InvoiceNo                                                                 
536370                               0                                0   
536852                               0                                0   
536974                               0                                0   
537065                               0                                0   
537463                               0                

In [11]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
#finding frequent itemsets
#min-suppot=0.07 means items that appear in at least 7% of txns
#use_colnames=True gives actual product names
frequent_itemsets = apriori(basket_sets, min_support=0.07, use_colnames=True)

#sorting by support (most popular items first)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
print(f"Found {frequent_itemsets.shape[0]} frequent itemsets.")
print(frequent_itemsets.sort_values(by='support', ascending=False).head()) 

Found 90 frequent itemsets.
     support                            itemsets  length
22  0.771208                           (POSTAGE)       1
23  0.187661                (RABBIT NIGHT LIGHT)       1
27  0.179949     (RED TOADSTOOL LED NIGHT LIGHT)       1
21  0.172237  (PLASTERS IN TIN WOODLAND ANIMALS)       1
18  0.169666     (PLASTERS IN TIN CIRCUS PARADE)       1


In [12]:
#generating rules (confidence and lift)
#metric="lift": rules where the link is stronger than random chance
#min_threshold=1: keeping positive associations
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1) 

In [13]:
#sorting by the strongest connection (lift)
rules =  rules.sort_values(by='lift', ascending=False)

In [14]:
#cleaning up display by showing only key columns 
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10)) 

                                           antecedents  \
77               (POSTAGE, ALARM CLOCK BAKELIKE GREEN)   
80                          (ALARM CLOCK BAKELIKE RED)   
2                           (ALARM CLOCK BAKELIKE RED)   
3                         (ALARM CLOCK BAKELIKE GREEN)   
81                        (ALARM CLOCK BAKELIKE GREEN)   
76                 (POSTAGE, ALARM CLOCK BAKELIKE RED)   
7                          (ALARM CLOCK BAKELIKE PINK)   
6                           (ALARM CLOCK BAKELIKE RED)   
119  (SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...   
122                    (SET/6 RED SPOTTY PAPER PLATES)   

                                           consequents   support  confidence  \
77                          (ALARM CLOCK BAKELIKE RED)  0.071979    0.848485   
80               (POSTAGE, ALARM CLOCK BAKELIKE GREEN)  0.071979    0.756757   
2                         (ALARM CLOCK BAKELIKE GREEN)  0.079692    0.837838   
3                           (ALARM CLOCK 