Source: https://hands-on.cloud/implementation-of-eclat-algorithm-using-python/

In [1]:
# %pip install pyECLAT
import pandas as pd
import plotly.express as px

from pyECLAT import  Example2
from pyECLAT import ECLAT

In [2]:
# Load the example transactions shipped with pyECLAT and preview the first rows.
dataset = Example2().get()
dataset.head()
# Each row is one customer's purchase at a supermarket. Columns 0-6 hold the
# item names; baskets with fewer than seven items are padded with missing values.

Unnamed: 0,0,1,2,3,4,5,6
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams
1,burgers,meatballs,eggs,,,,
2,chutney,,,,,,
3,turkey,avocado,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,


In [3]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3001 entries, 0 to 3000
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       3001 non-null   object
 1   1       2315 non-null   object
 2   2       1774 non-null   object
 3   3       1374 non-null   object
 4   4       1048 non-null   object
 5   5       775 non-null    object
 6   6       581 non-null    object
dtypes: object(7)
memory usage: 164.2+ KB


## Using Eclat algorithm

In [19]:
# Wrap the transactions in an ECLAT instance and keep its `df_bin` frame for inspection.
eclat = ECLAT(data=dataset)
df_eclat = eclat.df_bin
df_eclat.head()
# Every row is a transaction and every column is a product that can appear in one.
# Judging by the preview, a cell is 1 when the product is in that transaction and 0 otherwise.

Unnamed: 0,white wine,eggplant,bacon,ketchup,nonfat milk,muffins,green tea,milk,ground beef,tea,...,salt,chili,body spray,shrimp,sandwich,pickles,candy bars,soup,salad,black tea
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Counting transactions per item (columns) and items per transaction (rows)

In [20]:
# Column-wise totals: how many transactions contain each item.
items_total = df_eclat.sum(axis="index")
items_total

white wine      50
eggplant        31
bacon           26
ketchup         11
nonfat milk     29
              ... 
pickles         17
candy bars      28
soup           163
salad           10
black tea       34
Length: 119, dtype: int64

In [21]:
# Row-wise totals: how many items are present in each transaction.
items_per_transaction = df_eclat.sum(axis="columns")
items_per_transaction

0       7
1       3
2       1
3       2
4       5
       ..
2996    1
2997    2
2998    3
2999    7
3000    5
Length: 3001, dtype: int64

## Visualizing items distribution

In [22]:
# Reshape the per-item totals into a two-column frame: item name and transaction count.
df = items_total.rename_axis("items").reset_index(name="transactions")

# Rank the items from most to least frequently purchased.
df_table = df.sort_values(by="transactions", ascending=False)

# Show the five most frequent items, shaded by their transaction count.
df_table.head(5).style.background_gradient(cmap="Blues")

Unnamed: 0,items,transactions
29,mineral water,711
23,spaghetti,549
47,eggs,532
57,chocolate,485
31,french fries,463


## Visualizing the frequently occurring items

In [8]:
# Constant root label so that every item tile hangs off a single parent node.
df_table["all"] = "Tree Map"

top_n = 50  # number of most frequent items to draw
treemap_source = df_table.head(top_n)

# Tile area and colour both encode the number of transactions containing the item.
fig = px.treemap(
    treemap_source,
    path=["all", "items"],
    values="transactions",
    color=df_table["transactions"].head(top_n),
    hover_data=["items"],
    color_continuous_scale="Blues",
)

fig.show()

## Mining frequent itemsets (the basis for association rules)

In [24]:
# Search for itemsets of two or more products that meet the support threshold.
#
# - min_support=0.06: presumably the minimum share of transactions an itemset
#   must appear in (the one itemset reported below has support ~0.0606).
# - max_combination: the size of the largest basket, since no itemset can be
#   longer than the longest transaction. Series.max() is used instead of the
#   builtin max() so the reduction is vectorised, and int() hands the library a
#   plain Python integer.
# - separator: presumably the string that joins item names in the result keys
#   (the reported key is "spaghetti & mineral water").
#
# NOTE: this cell is slow. The logged run took roughly eight minutes, most of
# it in the 6- and 7-item passes.
rule_indices, rule_supports = eclat.fit(
    min_support=0.06,
    min_combination=2,
    max_combination=int(items_per_transaction.max()),
    separator=' & ',
    verbose=True,
)

Combination 2 by 2


153it [00:01, 115.02it/s]


Combination 3 by 3


816it [00:05, 143.57it/s]


Combination 4 by 4


3060it [00:23, 132.49it/s]


Combination 5 by 5


8568it [01:01, 140.11it/s]


Combination 6 by 6


18564it [02:11, 141.07it/s]


Combination 7 by 7


31824it [04:10, 127.15it/s]


## Result

In [25]:
# Tabulate every itemset next to its support value.
result = pd.DataFrame(
    {
        "Item": list(rule_supports.keys()),
        "Support": list(rule_supports.values()),
    }
)
# Display the itemsets with the highest support first.
result.sort_values(by="Support", ascending=False)

Unnamed: 0,Item,Support
0,spaghetti & mineral water,0.060646


**Result**: With a minimum support of 0.06, the only itemset of two or more items that qualifies is spaghetti & mineral water (support ≈ 0.0606). In other words, these two products were bought together in about 6% of the 3,001 transactions in our dataset.