# E-Commerce Apriori Implementation -- Joseph Kim

In [14]:
import pandas as pd
import numpy as np

from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules

import kaggle

In [11]:
# Fetch the dataset through the Kaggle API and unzip it into the local 'kaggle' folder.
kaggle.api.dataset_download_files(
    'lissetteg/ecommerce-dataset',
    path='kaggle',
    unzip=True,
)

Dataset URL: https://www.kaggle.com/datasets/lissetteg/ecommerce-dataset


  response_data.getheaders())


In [17]:
# Load the downloaded data; the CSV shipped in the Kaggle archive is named 'data-2.csv'.
csv_path = 'kaggle/data-2.csv'
df = pd.read_csv(csv_path)

Data Processing for mlxtend

In [None]:
# Trim leading/trailing whitespace from the product names. Description later becomes the
# pivot-table column key, so stray spaces would otherwise split one product across columns.
df = df.assign(Description=df['Description'].str.strip())
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [20]:
# Keep only rows with a strictly positive Quantity; zero and negative quantities are dropped.
has_positive_qty = df['Quantity'] > 0
df = df.loc[has_positive_qty]

In [22]:
# Preview the first ten rows of the current frame.
df.head(n=10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/2010 8:26,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/2010 8:26,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,12/1/2010 8:28,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,12/1/2010 8:28,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/2010 8:34,1.69,13047.0,United Kingdom


In [24]:
# Reshape the line items into an invoice x product matrix: one row per InvoiceNo, one column
# per Description, each value the summed Quantity (0 where the product is not on the invoice).
# This wide layout is what the apriori / association-rule steps work from.
basket = df.pivot_table(
    index='InvoiceNo',
    columns='Description',
    values='Quantity',
    aggfunc='sum',
    fill_value=0,
)
basket.head()

Description,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,...,returned,taig adjust,test,to push order througha s stock was,website fixed,wrongly coded 20713,wrongly coded 23343,wrongly marked,wrongly marked 23343,wrongly sold (22719) barcode
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536369,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
def convert_into_binary(x):
    """Map a quantity to a presence flag: 1 if ``x`` is greater than 0, otherwise 0.

    Apriori only needs to know whether an item was bought, not how many units,
    so any positive quantity collapses to 1.
    """
    return 1 if x > 0 else 0

In [27]:
# Binarise the basket: 1 where the invoice contains the product (quantity > 0), else 0.
# A vectorised comparison gives the same 0/1 values as applying convert_into_binary to every
# cell, but avoids one Python call per cell and does not require DataFrame.map (pandas >= 2.1).
# The cell stays idempotent: re-running it on an already-binarised frame returns the same frame.
basket = (basket > 0).astype('int64')

In [28]:
# Remove POSTAGE: it is a shipping charge rather than a product, so rules involving it would be misleading.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe to re-run:
# a second execution no longer raises KeyError once the column is already gone.
# NOTE(review): the pivot header above also shows non-product descriptions such as 'returned',
# 'test' and 'wrongly marked' -- consider whether those columns should be dropped here as well.
basket = basket.drop(columns=['POSTAGE'], errors='ignore')

Apriori Implementation -- Min Support and Association Rules

In [48]:
# Identify frequent itemsets (support >= MIN_SUPPORT).
# Support is the fraction of transactions -- rows of `basket`, i.e. invoices -- that contain
# the itemset, so an itemset is kept only if it appears on at least MIN_SUPPORT * len(basket) invoices.
MIN_SUPPORT = 0.03

# mlxtend recommends a boolean frame; non-bool input triggers a deprecation warning and is slower.
# The cast is lossless here because `basket` holds only 0/1 presence flags at this point.
frequent_itemsets = apriori(basket.astype(bool), min_support=MIN_SUPPORT, use_colnames=True)
frequent_itemsets



Unnamed: 0,support,itemsets
0,0.047427,(6 RIBBONS RUSTIC CHARM)
1,0.030691,(60 CAKE CASES VINTAGE CHRISTMAS)
2,0.041120,(60 TEATIME FAIRY CAKE CASES)
3,0.030542,(72 SWEETHEART FAIRY CAKE CASES)
4,0.048669,(ALARM CLOCK BAKELIKE GREEN)
...,...,...
130,0.040971,"(JUMBO BAG PINK POLKADOT, JUMBO BAG RED RETROS..."
131,0.033770,"(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG ..."
132,0.035956,"(JUMBO BAG RED RETROSPOT, JUMBO STORAGE BAG SUKI)"
133,0.031834,"(LUNCH BAG RED RETROSPOT, LUNCH BAG BLACK SKU..."


In [54]:
# Derive association rules from the frequent itemsets, keeping those with lift of at least 1.
# num_itemsets is given the number of rows (invoices) in `basket`.
association = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
    num_itemsets=len(basket),
)
# NOTE(review): an earlier note here reported 15 common itemsets -- confirm with len(association).
association.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.052195,0.048669,0.031784,0.608944,12.511932,1.0,0.029244,2.432722,0.970744,0.460101,0.588938,0.631003
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.048669,0.052195,0.031784,0.653061,12.511932,1.0,0.029244,2.731908,0.967146,0.460101,0.633956,0.631003
2,(GREEN REGENCY TEACUP AND SAUCER),(PINK REGENCY TEACUP AND SAUCER),0.050407,0.038041,0.031436,0.623645,16.393893,1.0,0.029519,2.55599,0.988847,0.551394,0.608762,0.725008
3,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.038041,0.050407,0.031436,0.826371,16.393893,1.0,0.029519,5.469083,0.976135,0.551394,0.817154,0.725008
4,(ROSES REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.05294,0.050407,0.038141,0.72045,14.292598,1.0,0.035472,3.396865,0.982022,0.58492,0.705611,0.73855


Association Analysis: Generating All Possible Rules

In [53]:
# Narrow to the strong rules: lift of at least 4 together with confidence of at least 0.5.
strong_lift = association['lift'] >= 4
strong_confidence = association['confidence'] >= 0.5
association.loc[strong_lift & strong_confidence]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(ALARM CLOCK BAKELIKE RED),(ALARM CLOCK BAKELIKE GREEN),0.052195,0.048669,0.031784,0.608944,12.511932,1.0,0.029244,2.432722,0.970744,0.460101,0.588938,0.631003
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED),0.048669,0.052195,0.031784,0.653061,12.511932,1.0,0.029244,2.731908,0.967146,0.460101,0.633956,0.631003
2,(GREEN REGENCY TEACUP AND SAUCER),(PINK REGENCY TEACUP AND SAUCER),0.050407,0.038041,0.031436,0.623645,16.393893,1.0,0.029519,2.55599,0.988847,0.551394,0.608762,0.725008
3,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.038041,0.050407,0.031436,0.826371,16.393893,1.0,0.029519,5.469083,0.976135,0.551394,0.817154,0.725008
4,(ROSES REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.05294,0.050407,0.038141,0.72045,14.292598,1.0,0.035472,3.396865,0.982022,0.58492,0.705611,0.73855
5,(GREEN REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER),0.050407,0.05294,0.038141,0.75665,14.292598,1.0,0.035472,3.891765,0.979403,0.58492,0.743047,0.73855
6,(JUMBO BAG PINK POLKADOT),(JUMBO BAG RED RETROSPOT),0.060489,0.103894,0.040971,0.67734,6.519558,1.0,0.034687,2.777246,0.901123,0.331992,0.639931,0.53585
8,(JUMBO SHOPPER VINTAGE RED PAISLEY),(JUMBO BAG RED RETROSPOT),0.058353,0.103894,0.03377,0.578723,5.570351,1.0,0.027708,2.127121,0.871323,0.262853,0.529881,0.451886
11,(JUMBO STORAGE BAG SUKI),(JUMBO BAG RED RETROSPOT),0.0588,0.103894,0.035956,0.611486,5.885704,1.0,0.029847,2.3065,0.881956,0.283699,0.566443,0.478783
13,(LUNCH BAG BLACK SKULL.),(LUNCH BAG RED RETROSPOT),0.06322,0.077672,0.031834,0.503535,6.482852,1.0,0.026923,1.857791,0.902823,0.291894,0.461726,0.456691
