In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the raw Online Retail transactions from a comma-separated file.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running the notebook elsewhere.
data = pd.read_csv(r'e:\documents\online_retail.csv', sep=',')

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 08:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 08:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 08:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 08:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 08:26,3.39,17850.0,United Kingdom


In [4]:
# Summarize column dtypes and non-null counts for the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      541909 non-null object
StockCode      541909 non-null object
Description    540455 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null object
UnitPrice      541909 non-null float64
CustomerID     406829 non-null float64
Country        541909 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


### Deteksi Missing Value

In [5]:
# Count the missing entries in each column.
data.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

### Terdapat missing value pada variabel Description dan CustomerID

### Mengatasi Missing Value

### Missing value yang diatasi hanya variabel Description saja karena variabel CustomerID tidak berpengaruh pada analisis association rule. Missing value diatasi dengan cara menghapus baris yang mengandung missing value.

In [6]:
# Remove, in place, every row whose Description is missing. Missing
# CustomerID values are deliberately left untouched.
data.dropna(subset=['Description'], axis='index', how='any', inplace=True)

In [7]:
# Re-count missing entries per column to confirm Description is now complete.
data.isna().sum()

InvoiceNo           0
StockCode           0
Description         0
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     133626
Country             0
dtype: int64

### Missing Value pada variabel Description sudah teratasi

### Selanjutnya menghilangkan transaksi yang merupakan transaksi kredit (diawali dengan huruf C pada invoice number)

In [8]:
# Invoice numbers are compared as text, so make the column string-typed first.
data['InvoiceNo'] = data['InvoiceNo'].astype('str')
# Credit transactions are the invoices whose number STARTS with "C".
# Match the prefix only: a substring/regex match would also drop any invoice
# that merely contains a "C" somewhere else in its number.
data = data[~data['InvoiceNo'].str.startswith('C')]

### Setelah membersihkan data, selanjutnya adalah membuat sebuah keranjang belanja yang di kenali berdasarkan InvoiceNo.

In [9]:
# Build the invoice x item matrix: one row per InvoiceNo, one column per
# Description, each cell holding the summed Quantity (0 where the item does
# not appear on the invoice).
basket = (
    data
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)
basket.head()

Description,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,I LOVE LONDON MINI RUCKSACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# (number of invoices, number of distinct item descriptions)
basket.shape

(20610, 4207)

### Kemudian melakukan encoding, dimana jika jumlah barang kurang dari atau sama dengan 0 maka sel keranjang tersebut bernilai 0 dan jika lebih dari atau sama dengan 1 maka nilainya adalah 1, sehingga jika sebuah nota membeli barang A sebanyak 10 buah maka hanya akan dihitung 1. Karena analisis yang digunakan mensyaratkan seperti itu.

In [11]:
def encode_units(x):
    """Binarize one basket cell: 1 if the item was bought, 0 otherwise.

    Parameters
    ----------
    x : int or float
        Summed quantity of a single item on a single invoice.

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0.
    """
    # One conditional covers every input, so the function always returns an
    # int: values strictly between 0 and 1, and NaN (which compares False
    # against 1), map to 0 instead of falling through and yielding None.
    return 1 if x >= 1 else 0
# Vectorized binarization: 1 where the summed quantity is at least 1, else 0.
# This yields the same 0/1 encoding that encode_units describes, but as a
# single array operation instead of one Python call per cell (the basket has
# roughly 20k x 4k cells).
basket_sets = (basket >= 1).astype(int)

In [12]:
# Preview the encoded basket; every cell should now be 0 or 1.
basket_sets.head()

Description,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,I LOVE LONDON MINI RUCKSACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
536369,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Selanjutnya adalah membuat variabel dimana terdiri dari beberapa barang yang sering / terbeli dari seluruh transaksi menggunakan perintah apriori. Dimana perintah yang di gunakan adalah apriori, dengan data dari basket_sets dengan minimum nilai support 0.02/ 2%.

In [14]:
# Mine the itemsets whose support is at least 0.02 (2% of invoices) and
# report them by item name rather than by column index.
frequent_itemsets = apriori(
    basket_sets,
    min_support=0.02,
    use_colnames=True,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.022707,(3 STRIPEY MICE FELTCRAFT)
1,0.023484,(4 TRADITIONAL SPINNING TOPS)
2,0.046337,(6 RIBBONS RUSTIC CHARM)
3,0.021203,(60 CAKE CASES DOLLY GIRL DESIGN)
4,0.029985,(60 CAKE CASES VINTAGE CHRISTMAS)
5,0.040175,(60 TEATIME FAIRY CAKE CASES)
6,0.029840,(72 SWEETHEART FAIRY CAKE CASES)
7,0.020767,(ALARM CLOCK BAKELIKE CHOCOLATE)
8,0.047550,(ALARM CLOCK BAKELIKE GREEN)
9,0.027656,(ALARM CLOCK BAKELIKE IVORY)


### Diperoleh sebanyak 352 data yang memiliki support >= 0.02

### Selanjutnya adalah membangun sebuah variabel yang memiliki aturan-aturan asosiasi dari masing-masing barang. Variabel rules merupakan hasil dari fungsi yang mencari asosiasi dimana data yang digunakan berasal dari frequent_itemsets, dengan nilai minimum dari lift ratio-nya adalah 1

In [15]:
# Derive association rules from the frequent itemsets, keeping only rules
# whose lift is at least 1.
rules = association_rules(
    frequent_itemsets,
    metric='lift',
    min_threshold=1,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(60 TEATIME FAIRY CAKE CASES),(PACK OF 72 RETROSPOT CAKE CASES),0.040175,0.064047,0.021980,0.547101,8.542243,0.019407,2.066585
1,(PACK OF 72 RETROSPOT CAKE CASES),(60 TEATIME FAIRY CAKE CASES),0.064047,0.040175,0.021980,0.343182,8.542243,0.019407,1.461326
2,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE PINK),0.047550,0.037991,0.020281,0.426531,11.227070,0.018475,1.677524
3,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE GREEN),0.037991,0.047550,0.020281,0.533844,11.227070,0.018475,2.043202
4,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),0.047550,0.050995,0.031053,0.653061,12.806462,0.028628,2.735368
5,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN),0.050995,0.047550,0.031053,0.608944,12.806462,0.028628,2.435585
6,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE RED ),0.037991,0.050995,0.022804,0.600255,11.770946,0.020867,2.374029
7,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE PINK),0.050995,0.037991,0.022804,0.447193,11.770946,0.020867,1.740226
8,(RED RETROSPOT CHARLOTTE BAG),(CHARLOTTE BAG PINK POLKADOT),0.050170,0.036050,0.025328,0.504836,14.003582,0.023519,1.946726
9,(CHARLOTTE BAG PINK POLKADOT),(RED RETROSPOT CHARLOTTE BAG),0.036050,0.050170,0.025328,0.702557,14.003582,0.023519,3.193320


### Bisa dilihat pada hasil di atas, barang 60 Teatime Fairy Cake Cases biasanya terbeli bersamaan dengan Pack Of 72 Retrospot Cake Cases dengan nilai support 0.021980. Yang artinya, dari keseluruhan transaksi, banyaknya transaksi yang memuat 60 Teatime Fairy Cake Cases dan Pack Of 72 Retrospot Cake Cases sekaligus adalah sekitar 2%.

### Kemudian selanjutnya adalah melakukan filter untuk nilai lift ratio minimal 2 dan tingkat confidence minimal 0.5

In [16]:
# Keep only the strong rules: lift of at least 2 and confidence of at least 0.5.
rules.loc[rules['lift'].ge(2) & rules['confidence'].ge(0.5)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(60 TEATIME FAIRY CAKE CASES),(PACK OF 72 RETROSPOT CAKE CASES),0.040175,0.064047,0.02198,0.547101,8.542243,0.019407,2.066585
3,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE GREEN),0.037991,0.04755,0.020281,0.533844,11.22707,0.018475,2.043202
4,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),0.04755,0.050995,0.031053,0.653061,12.806462,0.028628,2.735368
5,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN),0.050995,0.04755,0.031053,0.608944,12.806462,0.028628,2.435585
6,(ALARM CLOCK BAKELIKE PINK),(ALARM CLOCK BAKELIKE RED ),0.037991,0.050995,0.022804,0.600255,11.770946,0.020867,2.374029
8,(RED RETROSPOT CHARLOTTE BAG),(CHARLOTTE BAG PINK POLKADOT),0.05017,0.03605,0.025328,0.504836,14.003582,0.023519,1.946726
9,(CHARLOTTE BAG PINK POLKADOT),(RED RETROSPOT CHARLOTTE BAG),0.03605,0.05017,0.025328,0.702557,14.003582,0.023519,3.19332
10,(CHARLOTTE BAG SUKI DESIGN),(RED RETROSPOT CHARLOTTE BAG),0.042795,0.05017,0.0246,0.57483,11.457684,0.022453,2.234001
13,(STRAWBERRY CHARLOTTE BAG),(CHARLOTTE BAG SUKI DESIGN),0.035032,0.042795,0.020136,0.574792,13.43137,0.018637,2.251147
14,(CHARLOTTE BAG SUKI DESIGN),(WOODLAND CHARLOTTE BAG),0.042795,0.040514,0.022125,0.517007,12.76109,0.020391,1.986541


### Bisa dilihat pada hasil di atas, nilai confidence aturan 60 Teatime Fairy Cake Cases → Pack Of 72 Retrospot Cake Cases sebesar 0.547101, yang artinya, dari seluruh transaksi yang memuat 60 Teatime Fairy Cake Cases, sekitar 55% juga memuat Pack Of 72 Retrospot Cake Cases

### Sedangkan nilai confidence aturan sebaliknya, Pack Of 72 Retrospot Cake Cases → 60 Teatime Fairy Cake Cases, sebesar 0.343182, yang artinya, dari seluruh transaksi yang memuat Pack Of 72 Retrospot Cake Cases, sekitar 34% juga memuat 60 Teatime Fairy Cake Cases. Karena nilai confidence ini di bawah 0.5, aturan tersebut tidak muncul pada hasil filter di atas