In [1]:
import numpy as np 
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Read the transaction workbook into a DataFrame and preview the first rows.
data = pd.read_excel(io='Online Retail.xlsx')
data.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
# List every distinct country that appears in the transactions.
data['Country'].unique()


array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [4]:
# (rows, columns) of the frame as loaded, before any cleaning is applied.
data.shape

(541909, 8)

In [5]:
# Clean the transactions without mutating the existing frame in place.
# Every step returns a new object, so re-running this cell never writes into
# a filtered slice of an earlier frame (which raises SettingWithCopyWarning).
data = (
    data
    # Stripping extra spaces in the description
    .assign(Description=lambda frame: frame['Description'].str.strip())
    # Dropping the rows without any invoice number
    .dropna(axis=0, subset=['InvoiceNo'])
    # Invoice numbers are compared as text below, so store them as strings
    .astype({'InvoiceNo': 'str'})
)

# Dropping all transactions which were done on credit.
# These invoices carry a leading 'C' marker, so test the prefix rather than
# looking for a 'C' anywhere in the invoice number.
data = data[~data['InvoiceNo'].str.startswith('C')]


In [6]:
# (rows, columns) of `data` at this point in the notebook, for comparison
# with the shape reported right after loading.
data.shape

(532621, 8)

In [7]:
def build_basket(transactions, country):
    """Build the invoice x item quantity matrix for one country.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Frame with at least the columns 'Country', 'InvoiceNo',
        'Description' and 'Quantity'.
    country : str
        Value of the 'Country' column to keep.

    Returns
    -------
    pandas.DataFrame
        One row per 'InvoiceNo', one column per 'Description'. Each cell is
        the summed 'Quantity' of that item on that invoice, or 0 when the
        item does not appear on the invoice.
    """
    return (
        transactions[transactions['Country'] == country]
        .groupby(['InvoiceNo', 'Description'])['Quantity']
        .sum()
        # Pivot descriptions into columns; InvoiceNo stays as the index, so
        # no reset_index()/set_index() round-trip is needed.
        .unstack()
        # Invoice/item pairs that never occurred come out as NaN -> 0.
        .fillna(0)
    )


# Transactions done in France
basket_France = build_basket(data, "France")

# Transactions done in the United Kingdom
basket_UK = build_basket(data, "United Kingdom")

# Transactions done in Portugal
basket_Por = build_basket(data, "Portugal")

# Transactions done in Sweden
basket_Sweden = build_basket(data, "Sweden")


In [8]:
# Preview the France basket matrix: rows are invoices, columns are item
# descriptions, values are summed quantities.
basket_France.head()

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# (number of invoices, number of distinct item descriptions) for France.
basket_France.shape

(392, 1563)

In [10]:
# Defining the hot encoding function to make the data suitable
# for the concerned libraries
def hot_encode(x):
    """Convert a purchased quantity into a 0/1 presence flag.

    Parameters
    ----------
    x : int or float
        Summed quantity of one item on one invoice.

    Returns
    -------
    int
        1 when the quantity is positive, otherwise 0.

    Notes
    -----
    Every input maps to 0 or 1. Values strictly between 0 and 1, and NaN,
    previously matched neither branch and returned None; they now return
    1 and 0 respectively.
    """
    # NaN compares False against everything, so it lands in the 0 branch.
    return 1 if x > 0 else 0

# Encoding the datasets
# A vectorised comparison replaces the per-cell Python call made through
# DataFrame.applymap, which is deprecated in recent pandas and slow on the
# large UK matrix. The result is a boolean presence matrix (True when the
# summed quantity is positive), which is the input type apriori prefers.
basket_France = basket_France > 0
basket_UK = basket_UK > 0
basket_Por = basket_Por > 0
basket_Sweden = basket_Sweden > 0

# Kept for compatibility: the earlier version of this cell left
# `basket_encoded` bound to the encoded Sweden matrix.
basket_encoded = basket_Sweden


In [11]:
# Mine the itemsets that occur in at least 5% of the France invoices.
frq_items = apriori(basket_France, min_support=0.05, use_colnames=True)

# Turn the itemsets into rules filtered on lift with threshold 1, then rank
# them by confidence and, within equal confidence, by lift (both descending).
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
44,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.0,1.306667,0.017961,inf
258,"(RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...",(POSTAGE),0.05102,0.765306,0.05102,1.0,1.306667,0.011974,inf
270,"(RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...",(POSTAGE),0.053571,0.765306,0.053571,1.0,1.306667,0.012573,inf
300,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959
301,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796


In [12]:
# Shape of the France matrix after encoding. The encoding is element-wise,
# so the row and column counts are expected to be unchanged.
basket_France.shape

(392, 1563)

In [13]:
# United Kingdom: itemsets present in at least 3% of invoices.
frq_items = apriori(basket_UK, min_support=0.03, use_colnames=True)

# Rules filtered on lift with threshold 1, ranked by confidence then lift
# (both descending).
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
3,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.03766,0.050035,0.03091,0.820768,16.403939,0.029026,5.300203
4,(GREEN REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER),0.050035,0.051267,0.037553,0.750535,14.639752,0.034988,3.803076
5,(ROSES REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.051267,0.050035,0.037553,0.732497,14.639752,0.034988,3.551237
9,(JUMBO BAG PINK POLKADOT),(JUMBO BAG RED RETROSPOT),0.062088,0.10382,0.042053,0.677308,6.523895,0.035607,2.777201
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.046928,0.049821,0.03016,0.642694,12.900183,0.027822,2.659288


In [14]:
# (number of invoices, number of distinct item descriptions) for the UK.
basket_UK.shape

(18667, 4175)

In [15]:
# Portugal: itemsets present in at least 5% of invoices.
frq_items = apriori(basket_Por, min_support=0.05, use_colnames=True)

# Rules filtered on lift with threshold 1, ranked by confidence then lift
# (both descending).
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1170,(SET 12 COLOUR PENCILS DOLLY GIRL),(SET 12 COLOUR PENCILS SPACEBOY),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf
1171,(SET 12 COLOUR PENCILS SPACEBOY),(SET 12 COLOUR PENCILS DOLLY GIRL),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf
1172,(SET 12 COLOUR PENCILS DOLLY GIRL),(SET OF 4 KNICK KNACK TINS LONDON),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf
1173,(SET OF 4 KNICK KNACK TINS LONDON),(SET 12 COLOUR PENCILS DOLLY GIRL),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf
1174,(SET 12 COLOUR PENCILS DOLLY GIRL),(SET OF 4 KNICK KNACK TINS POPPIES),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf


In [16]:
# Sweden: itemsets present in at least 5% of invoices.
frq_items = apriori(basket_Sweden, min_support=0.05, use_colnames=True)

# Rules filtered on lift with threshold 1, ranked by confidence then lift
# (both descending). `frq_items` and `rules` are rebound here, replacing
# whatever an earlier cell stored under the same names.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(12 PENCILS SMALL TUBE SKULL),(PACK OF 72 SKULL CAKE CASES),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
1,(PACK OF 72 SKULL CAKE CASES),(12 PENCILS SMALL TUBE SKULL),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
4,(ASSORTED BOTTLE TOP MAGNETS),(36 DOILIES DOLLY GIRL),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
5,(36 DOILIES DOLLY GIRL),(ASSORTED BOTTLE TOP MAGNETS),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
180,(CHILDRENS CUTLERY DOLLY GIRL),(CHILDRENS CUTLERY CIRCUS PARADE),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf


In [17]:
# (number of rules, number of columns) of the current `rules` frame, i.e. the
# one produced by the most recently executed mining cell.
rules.shape

(104608, 9)

In [18]:
# Column names of the rules frame; the positional lookups further down
# depend on this column order.
rules.columns

Index(['antecedents', 'consequents', 'antecedent support',
       'consequent support', 'support', 'confidence', 'lift', 'leverage',
       'conviction'],
      dtype='object')

In [19]:
# Rows at positions 300-349 of the sorted rules. This is positional: the row
# labels are the pre-sort rule numbers, not 300-349.
rules.iloc[300:350]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2587,(LARGE RED BABUSHKA NOTEBOOK),"(LARGE YELLOW BABUSHKA NOTEBOOK, SET OF 3 CAKE...",0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
2589,(LARGE YELLOW BABUSHKA NOTEBOOK),"(LARGE RED BABUSHKA NOTEBOOK, SET OF 3 CAKE TI...",0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
2590,"(LARGE RED BABUSHKA NOTEBOOK, SET OF 3 CAKE TI...",(LARGE YELLOW BABUSHKA NOTEBOOK),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
2592,"(SET OF 3 CAKE TINS SKETCHBOOK, LARGE YELLOW B...",(LARGE RED BABUSHKA NOTEBOOK),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
2593,(LARGE RED BABUSHKA NOTEBOOK),"(SET OF 3 CAKE TINS SKETCHBOOK, LARGE YELLOW B...",0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
2595,(LARGE YELLOW BABUSHKA NOTEBOOK),"(LARGE RED BABUSHKA NOTEBOOK, SET OF 3 CAKE TI...",0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
2596,"(LARGE RED BABUSHKA NOTEBOOK, VICTORIAN SEWING...",(LARGE YELLOW BABUSHKA NOTEBOOK),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
2598,"(LARGE YELLOW BABUSHKA NOTEBOOK, VICTORIAN SEW...",(LARGE RED BABUSHKA NOTEBOOK),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
2599,(LARGE RED BABUSHKA NOTEBOOK),"(LARGE YELLOW BABUSHKA NOTEBOOK, VICTORIAN SEW...",0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
2601,(LARGE YELLOW BABUSHKA NOTEBOOK),"(LARGE RED BABUSHKA NOTEBOOK, VICTORIAN SEWING...",0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf


In [20]:
# Positions 200-249 of the sorted rules, restricted to the first two columns
# (antecedents and consequents).
rules.iloc[200:250, :2]

Unnamed: 0,antecedents,consequents
2207,"(SET OF 3 REGENCY CAKE TINS, TREASURE TIN BUFF...",(DOILEY STORAGE TIN)
2210,(DOILEY STORAGE TIN),"(SET OF 3 REGENCY CAKE TINS, TREASURE TIN BUFF..."
2212,"(TREASURE TIN GYMKHANA DESIGN, SET OF 3 REGENC...",(DOILEY STORAGE TIN)
2217,(DOILEY STORAGE TIN),"(TREASURE TIN GYMKHANA DESIGN, SET OF 3 REGENC..."
2224,"(GUMBALL COAT RACK, FAIRY CAKE FLANNEL ASSORTE...",(MOBILE VINTAGE HEARTS)
2225,"(GUMBALL COAT RACK, MOBILE VINTAGE HEARTS)",(FAIRY CAKE FLANNEL ASSORTED COLOUR)
2228,(FAIRY CAKE FLANNEL ASSORTED COLOUR),"(GUMBALL COAT RACK, MOBILE VINTAGE HEARTS)"
2229,(MOBILE VINTAGE HEARTS),"(GUMBALL COAT RACK, FAIRY CAKE FLANNEL ASSORTE..."
2237,"(POSTAGE, MOBILE VINTAGE HEARTS)",(FAIRY CAKE FLANNEL ASSORTED COLOUR)
2238,"(FAIRY CAKE FLANNEL ASSORTED COLOUR, POSTAGE)",(MOBILE VINTAGE HEARTS)


In [21]:
# Full, untruncated value of the second column (consequents) for the rule at
# position 2581.
rules.iat[2581, 1]

frozenset({'LARGE PURPLE BABUSHKA NOTEBOOK', 'MAGIC DRAWING SLATE DOLLY GIRL'})

In [22]:
# Full, untruncated value of the first column (antecedents) for the same rule
# position, 2581.
rules.iat[2581, 0]

frozenset({'SET OF 3 CAKE TINS PANTRY DESIGN',
           'SET OF 3 CAKE TINS SKETCHBOOK'})