In [1]:
# Import libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the raw transaction workbook and preview its first five rows.
retail = pd.read_excel(io='Online Retail.xlsx')
retail.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [7]:
# List every distinct value of the Country column, in order of first appearance.
retail['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Finland',
       'Austria', 'Bahrain', 'Israel', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [3]:
# Show the dtype pandas inferred for each column of the loaded frame.
retail.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

In [4]:
# Clean the transactions before any baskets are built:
#   1. trim surrounding whitespace from product descriptions,
#   2. drop rows that carry no invoice number,
#   3. store invoice numbers as strings,
#   4. discard invoices whose number contains a 'C'.
retail = (
    retail
    .assign(Description=lambda frame: frame['Description'].str.strip())
    .dropna(axis=0, subset=['InvoiceNo'])
    # Cast only after the NaN rows are gone so no literal 'nan' strings appear.
    .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'))
    .loc[lambda frame: ~frame['InvoiceNo'].str.contains('C')]
)

In [16]:
# Build the invoice x product quantity matrix for Australian orders:
# one row per invoice, one column per product description, 0 where the
# product is absent from the invoice.
basket = (
    retail.loc[lambda frame: frame['Country'] == "Australia"]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)



In [11]:
def encode_units(x):
    """Collapse a quantity into a 0/1 basket-membership flag.

    Parameters
    ----------
    x : number
        A single cell of the quantity matrix (the function is applied
        element-wise to the basket frame).

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0.

    Notes
    -----
    The earlier version used two independent ``if`` statements
    (``x <= 0`` and ``x >= 1``).  Values strictly between 0 and 1, and NaN,
    satisfied neither test and fell through to an implicit ``None``.  A
    single conditional makes the function total, so the encoded matrix can
    only ever contain 0 or 1.
    """
    # NaN compares False against everything, so it lands on the 0 branch.
    return 1 if x >= 1 else 0

# Binarise every cell of the basket matrix, then remove the 'POSTAGE'
# column (presumably a shipping-charge line rather than a product, so it
# should not take part in the rules).
basket_sets = (
    basket.applymap(encode_units)
    # errors='ignore': a country slice with no POSTAGE line has no such
    # column, and the plain drop would raise KeyError. Dropping in the chain
    # also avoids mutating the frame in place.
    .drop(columns='POSTAGE', errors='ignore')
)

In [12]:
# Mine frequent itemsets from the Australian baskets with a minimum support
# of 0.07, labelling itemsets by column name rather than column position.
frequent_itemsets = apriori(
    basket_sets,
    use_colnames=True,
    min_support=0.07,
)

In [13]:
# Turn the frequent itemsets into association rules, keeping rules whose
# lift is at least 1, and preview the first five.
rules = association_rules(
    frequent_itemsets,
    min_threshold=1,
    metric="lift",
)
rules.head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(RED RETROSPOT CAKE STAND),(36 PENCILS TUBE RED RETROSPOT),0.070175,0.070175,0.070175,1.0,14.25,0.065251,inf
1,(36 PENCILS TUBE RED RETROSPOT),(RED RETROSPOT CAKE STAND),0.070175,0.070175,0.070175,1.0,14.25,0.065251,inf
2,(SET OF 3 CAKE TINS PANTRY DESIGN),(36 PENCILS TUBE RED RETROSPOT),0.157895,0.070175,0.070175,0.444444,6.333333,0.059095,1.673684
3,(36 PENCILS TUBE RED RETROSPOT),(SET OF 3 CAKE TINS PANTRY DESIGN),0.070175,0.157895,0.070175,1.0,6.333333,0.059095,inf
4,(SET OF 6 SOLDIER SKITTLES),(4 TRADITIONAL SPINNING TOPS),0.122807,0.087719,0.070175,0.571429,6.514286,0.059403,2.128655


In [14]:
# Keep only the strong rules: confidence of at least 0.8 and lift of at least 6.
rules.loc[(rules['confidence'] >= 0.8) & (rules['lift'] >= 6)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(RED RETROSPOT CAKE STAND),(36 PENCILS TUBE RED RETROSPOT),0.070175,0.070175,0.070175,1.000000,14.250000,0.065251,inf
1,(36 PENCILS TUBE RED RETROSPOT),(RED RETROSPOT CAKE STAND),0.070175,0.070175,0.070175,1.000000,14.250000,0.065251,inf
3,(36 PENCILS TUBE RED RETROSPOT),(SET OF 3 CAKE TINS PANTRY DESIGN),0.070175,0.157895,0.070175,1.000000,6.333333,0.059095,inf
5,(4 TRADITIONAL SPINNING TOPS),(SET OF 6 SOLDIER SKITTLES),0.087719,0.122807,0.070175,0.800000,6.514286,0.059403,4.385965
6,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.105263,0.105263,0.105263,1.000000,9.500000,0.094183,inf
7,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.105263,0.105263,0.105263,1.000000,9.500000,0.094183,inf
9,(RED DINER WALL CLOCK),(ALARM CLOCK BAKELIKE GREEN),0.070175,0.105263,0.070175,1.000000,9.500000,0.062789,inf
10,(RED DINER WALL CLOCK),(ALARM CLOCK BAKELIKE RED),0.070175,0.105263,0.070175,1.000000,9.500000,0.062789,inf
17,(REGENCY SUGAR BOWL GREEN),(BAKING SET 9 PIECE RETROSPOT),0.070175,0.122807,0.070175,1.000000,8.142857,0.061557,inf
24,(BLACK/BLUE POLKADOT UMBRELLA),(RED RETROSPOT UMBRELLA),0.087719,0.070175,0.070175,0.800000,11.400000,0.064020,4.649123


In [15]:
# Repeat the basket -> frequent itemsets -> rules pipeline for German orders.
# Invoice x product quantity matrix, 0 where a product is absent.
basket2 = (
    retail[retail['Country'] == "Germany"]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

# Binarise the matrix and remove the 'POSTAGE' column (presumably a shipping
# charge rather than a product). errors='ignore' keeps the cell runnable when
# the selected country has no POSTAGE line; the plain in-place drop would
# raise KeyError in that case.
basket_sets2 = basket2.applymap(encode_units).drop(columns='POSTAGE', errors='ignore')
frequent_itemsets2 = apriori(basket_sets2, min_support=0.05, use_colnames=True)
rules2 = association_rules(frequent_itemsets2, metric="lift", min_threshold=1)

# Display the rules with lift of at least 4 and confidence of at least 0.5.
rules2[(rules2['lift'] >= 4) &
       (rules2['confidence'] >= 0.5)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(PLASTERS IN TIN CIRCUS PARADE),(PLASTERS IN TIN WOODLAND ANIMALS),0.115974,0.137856,0.067834,0.584906,4.242887,0.051846,2.076984
6,(PLASTERS IN TIN SPACEBOY),(PLASTERS IN TIN WOODLAND ANIMALS),0.107221,0.137856,0.061269,0.571429,4.145125,0.046488,2.01167
11,(RED RETROSPOT CHARLOTTE BAG),(WOODLAND CHARLOTTE BAG),0.070022,0.126915,0.059081,0.84375,6.648168,0.050194,5.587746
