## Importing libraries

In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

## Loading the dataset

In [2]:
# Read the raw transactions file (decoded as Latin-1) and preview the first rows.
df = pd.read_csv(
    "data/OnlineRetail.csv",
    encoding="latin1",
)
df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [3]:
# Show the distinct values of the Country column.
df.loc[:, "Country"].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

## For simplicity, only look at invoices where `Country == "France"`

In [4]:
# Keep only the rows whose Country is "France", then drop the Country column,
# which is constant after the filter.
is_france = df["Country"].eq("France")
df = df.loc[is_france].drop(columns=["Country"])
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID
26,536370,22728,ALARM CLOCK BAKELIKE PINK,24,12/1/2010 8:45,3.75,12583.0
27,536370,22727,ALARM CLOCK BAKELIKE RED,24,12/1/2010 8:45,3.75,12583.0
28,536370,22726,ALARM CLOCK BAKELIKE GREEN,12,12/1/2010 8:45,3.75,12583.0
29,536370,21724,PANDA AND BUNNIES STICKER SHEET,12,12/1/2010 8:45,0.85,12583.0
30,536370,21883,STARS GIFT TAPE,24,12/1/2010 8:45,0.65,12583.0


## Select just the columns we need

In [5]:
# Select the needed columns by name instead of by position: positional
# selection silently picks the wrong columns if the file's column order
# changes, and raises an IndexError if this cell is re-run (only three
# columns remain after the first run).
df = df[["InvoiceNo", "Description", "Quantity"]]
df.head()

Unnamed: 0,InvoiceNo,Description,Quantity
26,536370,ALARM CLOCK BAKELIKE PINK,24
27,536370,ALARM CLOCK BAKELIKE RED,24
28,536370,ALARM CLOCK BAKELIKE GREEN,12
29,536370,PANDA AND BUNNIES STICKER SHEET,12
30,536370,STARS GIFT TAPE,24


## Preprocessing dataset

### Remove the rows with a "POSTAGE" description

In [6]:
# Preview the rows whose description is exactly "POSTAGE".
df.loc[df["Description"].eq("POSTAGE")].head()

Unnamed: 0,InvoiceNo,Description,Quantity
45,536370,POSTAGE,3
5258,536852,POSTAGE,1
6676,536974,POSTAGE,2
7953,537065,POSTAGE,9
14166,537463,POSTAGE,4


In [7]:
# Drop the postage rows. Descriptions are only stripped of surrounding
# whitespace in a later cell, so strip them here for the comparison;
# otherwise a padded value such as "POSTAGE " would slip through this filter
# and later turn into a "POSTAGE" item.
df = df[df["Description"].str.strip() != "POSTAGE"]
df.head()

Unnamed: 0,InvoiceNo,Description,Quantity
26,536370,ALARM CLOCK BAKELIKE PINK,24
27,536370,ALARM CLOCK BAKELIKE RED,24
28,536370,ALARM CLOCK BAKELIKE GREEN,12
29,536370,PANDA AND BUNNIES STICKER SHEET,12
30,536370,STARS GIFT TAPE,24


### Remove leading and trailing whitespace from descriptions

In [8]:
# Strip leading/trailing whitespace from the item descriptions.
# `df` is the result of earlier filtering, so assigning a column on it in place
# triggers pandas' SettingWithCopyWarning; `assign` returns a new frame instead.
df = df.assign(Description=df["Description"].str.strip())
df.head()

Unnamed: 0,InvoiceNo,Description,Quantity
26,536370,ALARM CLOCK BAKELIKE PINK,24
27,536370,ALARM CLOCK BAKELIKE RED,24
28,536370,ALARM CLOCK BAKELIKE GREEN,12
29,536370,PANDA AND BUNNIES STICKER SHEET,12
30,536370,STARS GIFT TAPE,24


### Remove canceled invoices

In [9]:
# Preview the cancelled rows (invoice numbers containing "C").
# na=False counts missing or non-string invoice numbers as non-matches so the
# mask is purely boolean; regex=False matches "C" as a literal character.
df[df["InvoiceNo"].str.contains("C", regex=False, na=False)].head()

Unnamed: 0,InvoiceNo,Description,Quantity
19802,C537893,SILK PURSE BABUSHKA BLUE,-1
19803,C537893,CHILDS BREAKFAST SET SPACEBOY,-2
19804,C537893,DOLLY GIRL LUNCH BOX,-2
32695,C539104,LUNCH BAG DOLLY GIRL DESIGN,-3
32751,C539114,RECIPE BOX RETROSPOT,-3


In [10]:
# Drop the cancelled rows (invoice numbers containing "C").
# na=False keeps the mask purely boolean: without it, a missing or non-string
# invoice number produces NaN in the mask and the `~` / boolean indexing fails.
# regex=False matches "C" as a literal character.
is_cancelled = df["InvoiceNo"].str.contains("C", regex=False, na=False)
df = df[~is_cancelled]
df.head()

Unnamed: 0,InvoiceNo,Description,Quantity
26,536370,ALARM CLOCK BAKELIKE PINK,24
27,536370,ALARM CLOCK BAKELIKE RED,24
28,536370,ALARM CLOCK BAKELIKE GREEN,12
29,536370,PANDA AND BUNNIES STICKER SHEET,12
30,536370,STARS GIFT TAPE,24


## Create the basket dataframe

In [11]:
# Sum the quantity of every item per invoice, then pivot the descriptions into
# columns (one row per invoice); combinations that never occur are filled with 0.0.
quantity_per_item = df.groupby(["InvoiceNo", "Description"])["Quantity"].sum()
basket = quantity_per_item.unstack(level=-1, fill_value=0.0)
basket

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Convert basket values to one-hot
Define a one-hot encoder function that maps each summed quantity to a boolean (`True` if the item appears on the invoice), which is the input format the `apriori` function expects.

In [12]:
def one_hot_encoder(x):
    """Return True when the quantity ``x`` is strictly positive, else False."""
    # bool() turns the comparison result into a plain Python bool.
    return bool(x > 0)

# Vectorized one-hot conversion: True where the summed quantity is positive.
# This gives the same result as applying one_hot_encoder to every cell, but
# avoids DataFrame.applymap (deprecated since pandas 2.1) and the slow
# per-element Python call.
basket_sets = basket > 0
basket_sets

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE WOODLAND,...,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,YELLOW SHARK HELICOPTER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536370,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536852,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536974,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
537065,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
537463,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580986,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
581001,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
581171,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
581279,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Frequent itemsets

In [13]:
# Mine the itemsets with a support of at least 0.07, labelled by item name
# rather than column index, and list them from most to least frequent.
frequent_itemsets = apriori(
    basket_sets,
    use_colnames=True,
    min_support=0.07,
)
frequent_itemsets.sort_values("support", ascending=False)

Unnamed: 0,support,itemsets
22,0.191214,(RABBIT NIGHT LIGHT)
26,0.183463,(RED TOADSTOOL LED NIGHT LIGHT)
21,0.173127,(PLASTERS IN TIN WOODLAND ANIMALS)
18,0.170543,(PLASTERS IN TIN CIRCUS PARADE)
30,0.160207,(ROUND SNACK BOXES SET OF4 WOODLAND)
11,0.155039,(LUNCH BAG RED RETROSPOT)
14,0.144703,(LUNCH BOX WITH CUTLERY RETROSPOT)
33,0.139535,(SET/6 RED SPOTTY PAPER CUPS)
24,0.139535,(RED RETROSPOT MINI CASES)
19,0.139535,(PLASTERS IN TIN SPACEBOY)


## Association rules

In [14]:
# Derive the association rules using lift as the metric with a threshold of 1,
# then show only the rules whose confidence is 0.8 or higher.
rules = association_rules(
    frequent_itemsets,
    min_threshold=1,
    metric="lift",
)
rules.loc[rules["confidence"].ge(0.8)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.095607,0.098191,0.080103,0.837838,8.532717,0.070716,5.561154,0.976129
3,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.098191,0.095607,0.080103,0.815789,8.532717,0.070716,4.909561,0.978926
16,(SET/6 RED SPOTTY PAPER PLATES),(SET/20 RED RETROSPOT PAPER NAPKINS),0.129199,0.134367,0.103359,0.8,5.953846,0.085999,4.328165,0.95549
18,(SET/6 RED SPOTTY PAPER PLATES),(SET/6 RED SPOTTY PAPER CUPS),0.129199,0.139535,0.124031,0.96,6.88,0.106003,21.511628,0.981454
19,(SET/6 RED SPOTTY PAPER CUPS),(SET/6 RED SPOTTY PAPER PLATES),0.139535,0.129199,0.124031,0.888889,6.88,0.106003,7.837209,0.993243
20,"(SET/6 RED SPOTTY PAPER PLATES, SET/6 RED SPOT...",(SET/20 RED RETROSPOT PAPER NAPKINS),0.124031,0.134367,0.100775,0.8125,6.046875,0.08411,4.61671,0.952802
21,"(SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...",(SET/6 RED SPOTTY PAPER CUPS),0.103359,0.139535,0.100775,0.975,6.9875,0.086353,34.418605,0.955664
22,"(SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...",(SET/6 RED SPOTTY PAPER PLATES),0.103359,0.129199,0.100775,0.975,7.5465,0.087421,34.832041,0.967487
