In [1]:
import math
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from mlxtend.frequent_patterns import fpgrowth, apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

! pip install mlxtend

ModuleNotFoundError: No module named 'plotly'

# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here: 
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [2]:
# load the data set and show the first five transaction
df = pd.read_csv('https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


Get the unique product that has been purchased

In [3]:
items = set()
for col in df:
  arr = np.array(df[col].unique())
  items.update(arr)

items = list(items)
items

[nan,
 'Milk',
 'Meat',
 'Bread',
 'Bagel',
 'Pencil',
 'Diaper',
 'Eggs',
 'Wine',
 'Cheese']

## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [4]:
#create an itemset based on the products
itemset = set(items)

# encoding the feature

encoded_vals = []
for index, row in df.iterrows():
    rowset = set(row) 
    labels = {}
    uncommons = list(itemset - rowset)
    commons = list(itemset.intersection(rowset))
    for uc in uncommons:
        labels[uc] = 0
    for com in commons:
        labels[com] = 1
    encoded_vals.append(labels)
encoded_vals[0]

{nan: 0,
 'Bagel': 0,
 'Milk': 0,
 'Meat': 1,
 'Bread': 1,
 'Diaper': 1,
 'Pencil': 1,
 'Eggs': 1,
 'Wine': 1,
 'Cheese': 1}

In [5]:
  # create new dataframe from the encoded features
df = pd.DataFrame(encoded_vals) 
  # show the new dataframe
df.head()

Unnamed: 0,NaN,Bagel,Milk,Meat,Bread,Diaper,Pencil,Eggs,Wine,Cheese
0,0,0,0,1,1,1,1,1,1,1
1,0,0,1,1,1,1,1,0,1,1
2,1,0,1,1,0,0,0,1,1,1
3,1,0,1,1,0,0,0,1,1,1
4,1,0,0,1,0,0,1,0,1,0


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

In [6]:
df.drop([np.nan], axis=1, inplace=True)
df.head()

Unnamed: 0,Bagel,Milk,Meat,Bread,Diaper,Pencil,Eggs,Wine,Cheese
0,0,0,1,1,1,1,1,1,1
1,0,1,1,1,1,1,0,1,1
2,0,1,1,0,0,0,1,1,1
3,0,1,1,0,0,0,1,1,1
4,0,0,1,0,0,1,0,1,0


## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products. 
For this case study, we will min_support=0.2

In [7]:
from mlxtend.frequent_patterns import fpgrowth, apriori, association_rules

patterns = apriori(df, min_support = 0.2, use_colnames = True)

sets = patterns.sort_values('support', ascending = False)[:10]
sets.head()



Unnamed: 0,support,itemsets
3,0.504762,(Bread)
8,0.501587,(Cheese)
1,0.501587,(Milk)
2,0.47619,(Meat)
6,0.438095,(Eggs)


Then, we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [8]:
rules = association_rules(sets, metric="lift")
rules

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
1,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__ and __conviction__