# Importing the required Libraries

In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

Loading the Dataset from the local machine

In [2]:
# Location of the Online Retail workbook on the local machine; kept in one
# named constant so the path is easy to spot and change.
RETAIL_XLSX_PATH = r'C:\Users\user\Downloads\Online Retail.xlsx'
df = pd.read_excel(RETAIL_XLSX_PATH)

Performing EDA on the Data

In [3]:
# Preview the first 10 rows to inspect the columns and sample values.
df.head(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047.0,United Kingdom


In [4]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(541909, 8)

In [5]:
# List the column labels available in the frame.
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [6]:
# Distinct country labels present in the data (bracket access, matching how
# columns are selected elsewhere in this notebook).
df['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

Removing leading and trailing whitespace from the Description column

In [7]:
# Strip leading and trailing whitespace from every product description,
# overwriting the column on `df`.
df['Description'] = df['Description'].str.strip()

Dropping all the rows with no Invoice Number

In [8]:
# Drop rows whose InvoiceNo is missing. `inplace = True` mutates `df` directly
# instead of returning a new frame.
df.dropna(axis = 0, subset = ['InvoiceNo'], inplace = True)

In [9]:
# Shape after the dropna step; the recorded output equals the pre-drop shape,
# i.e. no rows were removed.
df.shape

(541909, 8)

Converting the InvoiceNo column to the string data type

In [10]:
# Cast invoice numbers to strings so the `.str` accessor can be used on them.
df['InvoiceNo'] = df['InvoiceNo'].astype(str)

In [11]:
# Shape check after the string conversion (a dtype change only).
df.shape

(541909, 8)

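Dropping all rows whose invoice number contains 'C' (in the Online Retail dataset a 'C' prefix on InvoiceNo marks a cancelled invoice, not a purchase on credit)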

In [12]:
# Keep only the rows whose InvoiceNo does not contain the letter 'C'.
df = df.loc[~df['InvoiceNo'].str.contains('C')]

Checking the dataset shape after dropping the rows

In [13]:
# Shape after removing the invoices that contain 'C'.
df.shape

(532621, 8)

In [14]:
# Element-wise missing-value mask (True where a cell is null).
# NOTE(review): this renders the full boolean frame; `df.isnull().sum()` would
# summarise the null count per column instead.
df.isnull()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
541904,False,False,False,False,False,False,False,False
541905,False,False,False,False,False,False,False,False
541906,False,False,False,False,False,False,False,False
541907,False,False,False,False,False,False,False,False


In [15]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output still shows negative minimum Quantity and
# UnitPrice values after the filtering steps above.
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,532621.0,532621.0,397924.0
mean,10.239972,3.847621,15294.315171
std,159.593551,41.758023,1713.169877
min,-9600.0,-11062.06,12346.0
25%,1.0,1.25,13969.0
50%,3.0,2.08,15159.0
75%,10.0,4.13,16795.0
max,80995.0,13541.33,18287.0


# Splitting the Dataset with respect to the countries

In [16]:
# Invoice-by-product quantity matrix for France: one row per InvoiceNo, one
# column per Description, summed Quantity as values, absent pairs set to 0.
basket_France = (
    df.loc[df['Country'].eq("France")]
      .groupby(['InvoiceNo', 'Description'])['Quantity']
      .sum()
      .unstack()
      .reset_index()
      .fillna(0)
      .set_index('InvoiceNo')
)

In [17]:
# Invoice-by-product quantity matrix for the Netherlands: one row per
# InvoiceNo, one column per Description, summed Quantity as values, absent
# pairs set to 0.
basket_Netherlands = (
    df.loc[df['Country'].eq("Netherlands")]
      .groupby(['InvoiceNo', 'Description'])['Quantity']
      .sum()
      .unstack()
      .reset_index()
      .fillna(0)
      .set_index('InvoiceNo')
)

In [18]:
# Invoice-by-product quantity matrix for Germany: one row per InvoiceNo, one
# column per Description, summed Quantity as values, absent pairs set to 0.
basket_Germany = (
    df.loc[df['Country'].eq("Germany")]
      .groupby(['InvoiceNo', 'Description'])['Quantity']
      .sum()
      .unstack()
      .reset_index()
      .fillna(0)
      .set_index('InvoiceNo')
)

In [19]:
# Invoice-by-product quantity matrix for EIRE: one row per InvoiceNo, one
# column per Description, summed Quantity as values, absent pairs set to 0.
basket_EIRE = (
    df.loc[df['Country'].eq("EIRE")]
      .groupby(['InvoiceNo', 'Description'])['Quantity']
      .sum()
      .unstack()
      .reset_index()
      .fillna(0)
      .set_index('InvoiceNo')
)

In [20]:
# Invoice-by-product quantity matrix for Spain: one row per InvoiceNo, one
# column per Description, summed Quantity as values, absent pairs set to 0.
basket_Spain = (
    df.loc[df['Country'].eq("Spain")]
      .groupby(['InvoiceNo', 'Description'])['Quantity']
      .sum()
      .unstack()
      .reset_index()
      .fillna(0)
      .set_index('InvoiceNo')
)

In [21]:
# Invoice-by-product quantity matrix for Portugal: one row per InvoiceNo, one
# column per Description, summed Quantity as values, absent pairs set to 0.
basket_Portugal = (
    df.loc[df['Country'].eq("Portugal")]
      .groupby(['InvoiceNo', 'Description'])['Quantity']
      .sum()
      .unstack()
      .reset_index()
      .fillna(0)
      .set_index('InvoiceNo')
)

In [22]:
# Invoice-by-product quantity matrix for Belgium: one row per InvoiceNo, one
# column per Description, summed Quantity as values, absent pairs set to 0.
basket_Belgium = (
    df.loc[df['Country'].eq("Belgium")]
      .groupby(['InvoiceNo', 'Description'])['Quantity']
      .sum()
      .unstack()
      .reset_index()
      .fillna(0)
      .set_index('InvoiceNo')
)

In [23]:
# Invoice-by-product quantity matrix for Finland: one row per InvoiceNo, one
# column per Description, summed Quantity as values, absent pairs set to 0.
basket_Finland = (
    df.loc[df['Country'].eq("Finland")]
      .groupby(['InvoiceNo', 'Description'])['Quantity']
      .sum()
      .unstack()
      .reset_index()
      .fillna(0)
      .set_index('InvoiceNo')
)

In [24]:
# Invoice-by-product quantity matrix for Sweden: one row per InvoiceNo, one
# column per Description, summed Quantity as values, absent pairs set to 0.
basket_Sweden = (
    df.loc[df['Country'].eq("Sweden")]
      .groupby(['InvoiceNo', 'Description'])['Quantity']
      .sum()
      .unstack()
      .reset_index()
      .fillna(0)
      .set_index('InvoiceNo')
)

# Encoding the Data

In [25]:
def hot_encode(x):
    """Map a basket quantity to a 0/1 purchase flag.

    Args:
        x: Numeric quantity of one product on one invoice.

    Returns:
        1 when ``x >= 1``, otherwise 0. Values that are neither ``<= 0`` nor
        ``>= 1`` (fractional quantities, NaN) also map to 0 instead of falling
        through and implicitly returning ``None``.
    """
    # A single two-way branch guarantees an integer flag for every input.
    return 1 if x >= 1 else 0

The function above converts each quantity into a 0/1 purchase flag, which is the one-hot encoded format that mlxtend's `apriori` function expects as input

# Encoding the dataset and building the Apriori association rule mining model for some selected countries based on their region

In [26]:
# One-hot encode the France basket: True where at least one unit of the product
# appears on the invoice (quantity >= 1, the same threshold as `hot_encode`),
# False otherwise. The vectorized comparison avoids the element-wise
# `applymap` call (deprecated in recent pandas) and yields the boolean frame
# that mlxtend's `apriori` accepts.
basket_encoded = basket_France >= 1
basket_France = basket_encoded

In [27]:
# Mine itemsets from the France basket with min_support 0.05, derive rules
# using lift with threshold 1, then rank by confidence and lift (descending).
frq_items = apriori(basket_France, min_support=0.05, use_colnames=True)
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
45,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.0,1.306667,0.017961,inf
258,"(RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...",(POSTAGE),0.05102,0.765306,0.05102,1.0,1.306667,0.011974,inf
270,"(RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...",(POSTAGE),0.053571,0.765306,0.053571,1.0,1.306667,0.012573,inf
300,"(SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959
302,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796


In [28]:
# One-hot encode the Netherlands basket: True where at least one unit of the
# product appears on the invoice (quantity >= 1, the same threshold as
# `hot_encode`), False otherwise. The vectorized comparison avoids the
# element-wise `applymap` call (deprecated in recent pandas) and yields the
# boolean frame that mlxtend's `apriori` accepts.
basket_encoded = basket_Netherlands >= 1
basket_Netherlands = basket_encoded

In [29]:
# Mine itemsets from the Netherlands basket with min_support 0.05, derive rules
# using lift with threshold 1, then rank by confidence and lift (descending).
frq_items = apriori(basket_Netherlands, min_support=0.05, use_colnames=True)
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
650,(FOLDING BUTTERFLY MIRROR HOT PINK),(FOLDING BUTTERFLY MIRROR RED),0.052632,0.052632,0.052632,1.0,19.0,0.049861,inf
651,(FOLDING BUTTERFLY MIRROR RED),(FOLDING BUTTERFLY MIRROR HOT PINK),0.052632,0.052632,0.052632,1.0,19.0,0.049861,inf
1352,"(FOOD CONTAINER SET 3 LOVE HEART, CARD DOLLY G...",(10 COLOUR SPACEBOY PEN),0.052632,0.052632,0.052632,1.0,19.0,0.049861,inf
1353,(10 COLOUR SPACEBOY PEN),"(FOOD CONTAINER SET 3 LOVE HEART, CARD DOLLY G...",0.052632,0.052632,0.052632,1.0,19.0,0.049861,inf
1382,"(STRAWBERRY LUNCH BOX WITH CUTLERY, CARD DOLLY...",(10 COLOUR SPACEBOY PEN),0.052632,0.052632,0.052632,1.0,19.0,0.049861,inf


In [30]:
# One-hot encode the Germany basket: True where at least one unit of the
# product appears on the invoice (quantity >= 1, the same threshold as
# `hot_encode`), False otherwise. The vectorized comparison avoids the
# element-wise `applymap` call (deprecated in recent pandas) and yields the
# boolean frame that mlxtend's `apriori` accepts.
basket_encoded = basket_Germany >= 1
basket_Germany = basket_encoded

In [31]:
# Mine itemsets from the Germany basket with min_support 0.05, derive rules
# using lift with threshold 1, then rank by confidence and lift (descending).
frq_items = apriori(basket_Germany, min_support=0.05, use_colnames=True)
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
34,(PLASTERS IN TIN STRONGMAN),(POSTAGE),0.070022,0.818381,0.067834,0.96875,1.18374,0.010529,5.811816
50,(RETROSPOT TEA SET CERAMIC 11 PC),(POSTAGE),0.056893,0.818381,0.054705,0.961538,1.174928,0.008145,4.722101
52,(ROUND SNACK BOXES SET OF 4 FRUITS),(POSTAGE),0.157549,0.818381,0.150985,0.958333,1.171012,0.022049,4.358862
102,"(ROUND SNACK BOXES SET OF4 WOODLAND, ROUND SNA...",(POSTAGE),0.131291,0.818381,0.124726,0.95,1.160829,0.01728,3.632385
32,(PLASTERS IN TIN SPACEBOY),(POSTAGE),0.107221,0.818381,0.100656,0.938776,1.147113,0.012909,2.966448


In [32]:
# One-hot encode the EIRE basket: True where at least one unit of the product
# appears on the invoice (quantity >= 1, the same threshold as `hot_encode`),
# False otherwise. The vectorized comparison avoids the element-wise
# `applymap` call (deprecated in recent pandas) and yields the boolean frame
# that mlxtend's `apriori` accepts.
basket_encoded = basket_EIRE >= 1
basket_EIRE = basket_encoded

In [33]:
# Mine itemsets from the EIRE basket with min_support 0.05, derive rules
# using lift with threshold 1, then rank by confidence and lift (descending).
frq_items = apriori(basket_EIRE, min_support=0.05, use_colnames=True)
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
195,"(REGENCY TEA PLATE PINK, REGENCY CAKESTAND 3 T...",(REGENCY TEA PLATE GREEN),0.055556,0.079861,0.055556,1.0,12.521739,0.051119,inf
184,"(REGENCY TEAPOT ROSES, REGENCY CAKESTAND 3 TIER)",(REGENCY SUGAR BOWL GREEN),0.052083,0.086806,0.052083,1.0,11.52,0.047562,inf
116,"(GREEN REGENCY TEACUP AND SAUCER, REGENCY CAKE...",(ROSES REGENCY TEACUP AND SAUCER),0.086806,0.166667,0.086806,1.0,6.0,0.072338,inf
128,"(REGENCY SUGAR BOWL GREEN, GREEN REGENCY TEACU...",(ROSES REGENCY TEACUP AND SAUCER),0.0625,0.166667,0.0625,1.0,6.0,0.052083,inf
140,"(GREEN REGENCY TEACUP AND SAUCER, REGENCY TEAP...",(ROSES REGENCY TEACUP AND SAUCER),0.052083,0.166667,0.052083,1.0,6.0,0.043403,inf


In [34]:
# One-hot encode the Spain basket: True where at least one unit of the product
# appears on the invoice (quantity >= 1, the same threshold as `hot_encode`),
# False otherwise. The vectorized comparison avoids the element-wise
# `applymap` call (deprecated in recent pandas) and yields the boolean frame
# that mlxtend's `apriori` accepts.
basket_encoded = basket_Spain >= 1
basket_Spain = basket_encoded

In [35]:
# Mine itemsets from the Spain basket with min_support 0.05, derive rules
# using lift with threshold 1, then rank by confidence and lift (descending).
frq_items = apriori(basket_Spain, min_support=0.05, use_colnames=True)
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
240,"(LUNCH BAG PINK POLKADOT, LUNCH BAG CARS BLUE)",(LUNCH BAG BLACK SKULL.),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
245,(LUNCH BAG BLACK SKULL.),"(LUNCH BAG PINK POLKADOT, LUNCH BAG CARS BLUE)",0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
64,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.055556,0.066667,0.055556,1.0,15.0,0.051852,inf
85,(LUNCH BAG BLACK SKULL.),(LUNCH BAG CARS BLUE),0.055556,0.066667,0.055556,1.0,15.0,0.051852,inf
235,"(PINK REGENCY TEACUP AND SAUCER, ROSES REGENCY...",(GREEN REGENCY TEACUP AND SAUCER),0.055556,0.066667,0.055556,1.0,15.0,0.051852,inf


In [36]:
# One-hot encode the Portugal basket: True where at least one unit of the
# product appears on the invoice (quantity >= 1, the same threshold as
# `hot_encode`), False otherwise. The vectorized comparison avoids the
# element-wise `applymap` call (deprecated in recent pandas) and yields the
# boolean frame that mlxtend's `apriori` accepts.
basket_encoded = basket_Portugal >= 1
basket_Portugal = basket_encoded

In [37]:
# Mine itemsets from the Portugal basket with min_support 0.05, derive rules
# using lift with threshold 1, then rank by confidence and lift (descending).
frq_items = apriori(basket_Portugal, min_support=0.05, use_colnames=True)
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1170,(SET 12 COLOUR PENCILS SPACEBOY),(SET 12 COLOUR PENCILS DOLLY GIRL),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf
1171,(SET 12 COLOUR PENCILS DOLLY GIRL),(SET 12 COLOUR PENCILS SPACEBOY),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf
1172,(SET 12 COLOUR PENCILS DOLLY GIRL),(SET OF 4 KNICK KNACK TINS LONDON),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf
1173,(SET OF 4 KNICK KNACK TINS LONDON),(SET 12 COLOUR PENCILS DOLLY GIRL),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf
1174,(SET OF 4 KNICK KNACK TINS POPPIES),(SET 12 COLOUR PENCILS DOLLY GIRL),0.051724,0.051724,0.051724,1.0,19.333333,0.049049,inf


In [38]:
# One-hot encode the Belgium basket: True where at least one unit of the
# product appears on the invoice (quantity >= 1, the same threshold as
# `hot_encode`), False otherwise. The vectorized comparison avoids the
# element-wise `applymap` call (deprecated in recent pandas) and yields the
# boolean frame that mlxtend's `apriori` accepts.
basket_encoded = basket_Belgium >= 1
basket_Belgium = basket_encoded

In [39]:
# Mine itemsets from the Belgium basket with min_support 0.05, derive rules
# using lift with threshold 1, then rank by confidence and lift (descending).
frq_items = apriori(basket_Belgium, min_support=0.05, use_colnames=True)
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
426,(SET/6 RED SPOTTY PAPER CUPS),(SET/6 RED SPOTTY PAPER PLATES),0.05102,0.05102,0.05102,1.0,19.6,0.048417,inf
427,(SET/6 RED SPOTTY PAPER PLATES),(SET/6 RED SPOTTY PAPER CUPS),0.05102,0.05102,0.05102,1.0,19.6,0.048417,inf
1383,"(SET/6 RED SPOTTY PAPER CUPS, POSTAGE)",(SET/6 RED SPOTTY PAPER PLATES),0.05102,0.05102,0.05102,1.0,19.6,0.048417,inf
1384,"(SET/6 RED SPOTTY PAPER PLATES, POSTAGE)",(SET/6 RED SPOTTY PAPER CUPS),0.05102,0.05102,0.05102,1.0,19.6,0.048417,inf
1385,(SET/6 RED SPOTTY PAPER CUPS),"(SET/6 RED SPOTTY PAPER PLATES, POSTAGE)",0.05102,0.05102,0.05102,1.0,19.6,0.048417,inf


In [40]:
# One-hot encode the Sweden basket: True where at least one unit of the product
# appears on the invoice (quantity >= 1, the same threshold as `hot_encode`),
# False otherwise. The vectorized comparison avoids the element-wise
# `applymap` call (deprecated in recent pandas) and yields the boolean frame
# that mlxtend's `apriori` accepts.
basket_encoded = basket_Sweden >= 1
basket_Sweden = basket_encoded

In [41]:
# Mine itemsets from the Sweden basket with min_support 0.05, derive rules
# using lift with threshold 1, then rank by confidence and lift (descending).
frq_items = apriori(basket_Sweden, min_support=0.05, use_colnames=True)
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(12 PENCILS SMALL TUBE SKULL),(PACK OF 72 SKULL CAKE CASES),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
1,(PACK OF 72 SKULL CAKE CASES),(12 PENCILS SMALL TUBE SKULL),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
4,(36 DOILIES DOLLY GIRL),(ASSORTED BOTTLE TOP MAGNETS),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
5,(ASSORTED BOTTLE TOP MAGNETS),(36 DOILIES DOLLY GIRL),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf
180,(CHILDRENS CUTLERY CIRCUS PARADE),(CHILDRENS CUTLERY DOLLY GIRL),0.055556,0.055556,0.055556,1.0,18.0,0.052469,inf


In [42]:
# One-hot encode the Finland basket: True where at least one unit of the
# product appears on the invoice (quantity >= 1, the same threshold as
# `hot_encode`), False otherwise. The vectorized comparison avoids the
# element-wise `applymap` call (deprecated in recent pandas) and yields the
# boolean frame that mlxtend's `apriori` accepts.
basket_encoded = basket_Finland >= 1
basket_Finland = basket_encoded

In [43]:
# Mine itemsets from the Finland basket with min_support 0.05, derive rules
# using lift with threshold 1, then rank by confidence and lift (descending).
frq_items = apriori(basket_Finland, min_support=0.05, use_colnames=True)
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules.head()



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(PACK OF 60 PINK PAISLEY CAKE CASES),(60 TEATIME FAIRY CAKE CASES),0.073171,0.073171,0.073171,1.0,13.666667,0.067817,inf
3,(60 TEATIME FAIRY CAKE CASES),(PACK OF 60 PINK PAISLEY CAKE CASES),0.073171,0.073171,0.073171,1.0,13.666667,0.067817,inf
16,"(SET 3 RETROSPOT TEA,COFFEE,SUGAR)",(CAKE PLATE LOVEBIRD PINK),0.073171,0.073171,0.073171,1.0,13.666667,0.067817,inf
17,(CAKE PLATE LOVEBIRD PINK),"(SET 3 RETROSPOT TEA,COFFEE,SUGAR)",0.073171,0.073171,0.073171,1.0,13.666667,0.067817,inf
62,(SET OF 60 PANTRY DESIGN CAKE CASES),(MINT KITCHEN SCALES),0.073171,0.073171,0.073171,1.0,13.666667,0.067817,inf
