# Market Basket Analysis using Apriori

In [31]:
import numpy as np
import pandas as pd
import plotly.express as px
import networkx as nx

## Load the data

In [32]:
data = pd.read_csv("bread basket.csv")
data.head()

Unnamed: 0,Transaction,Item,date_time,period_day,weekday_weekend
0,1,Bread,30-10-2016 09:58,morning,weekend
1,2,Scandinavian,30-10-2016 10:05,morning,weekend
2,2,Scandinavian,30-10-2016 10:05,morning,weekend
3,3,Hot chocolate,30-10-2016 10:07,morning,weekend
4,3,Jam,30-10-2016 10:07,morning,weekend


In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20507 entries, 0 to 20506
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Transaction      20507 non-null  int64 
 1   Item             20507 non-null  object
 2   date_time        20507 non-null  object
 3   period_day       20507 non-null  object
 4   weekday_weekend  20507 non-null  object
dtypes: int64(1), object(4)
memory usage: 801.2+ KB


## Preprocessing

### Changing date to correct datatype

In [34]:
data["date_time"] = pd.to_datetime(data["date_time"])





In [35]:
data["Transaction"].nunique()

9465

### Creating different columns based on Date

In [36]:
data["date"] = data["date_time"].dt.date
data["time"] = data["date_time"].dt.time
data["month"] = data["date_time"].dt.month_name()
data["hour"] = data["date_time"].dt.hour

In [37]:
data["weekday"] = data["date_time"].dt.weekday.map(
    {
        0: "Monday",
        1: "Tuesday",
        2: "Wednesday",
        3: "Thursday",
        4: "Friday",
        5: "Saturday",
        6: "Sunday",
    }
)

In [38]:
data.drop("date_time", axis=1, inplace=True)

In [39]:
data.head()

Unnamed: 0,Transaction,Item,period_day,weekday_weekend,date,time,month,hour,weekday
0,1,Bread,morning,weekend,2016-10-30,09:58:00,October,9,Sunday
1,2,Scandinavian,morning,weekend,2016-10-30,10:05:00,October,10,Sunday
2,2,Scandinavian,morning,weekend,2016-10-30,10:05:00,October,10,Sunday
3,3,Hot chocolate,morning,weekend,2016-10-30,10:07:00,October,10,Sunday
4,3,Jam,morning,weekend,2016-10-30,10:07:00,October,10,Sunday


### Removing spaces and lowering cases

In [40]:
data["Item"] = data["Item"].str.strip().str.lower()

In [41]:
data.head()

Unnamed: 0,Transaction,Item,period_day,weekday_weekend,date,time,month,hour,weekday
0,1,bread,morning,weekend,2016-10-30,09:58:00,October,9,Sunday
1,2,scandinavian,morning,weekend,2016-10-30,10:05:00,October,10,Sunday
2,2,scandinavian,morning,weekend,2016-10-30,10:05:00,October,10,Sunday
3,3,hot chocolate,morning,weekend,2016-10-30,10:07:00,October,10,Sunday
4,3,jam,morning,weekend,2016-10-30,10:07:00,October,10,Sunday


### Exploratory Data Analysis

### Top 20 Products

In [42]:
top20 = data["Item"].value_counts().head(20)

In [43]:
fig = px.bar(data_frame=top20, text_auto=True, title="Top 20 Products")
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



Coffee is the most selling product <br>
Of course it is the best !!

### No of transaction per month

In [44]:
trans_per_month = data.groupby("month").agg({"Transaction": "count"})
trans_per_month

Unnamed: 0_level_0,Transaction
month,Unnamed: 1_level_1
April,1157
December,3339
February,3906
January,3356
March,3944
November,4436
October,369


In [45]:
fig = px.bar(data_frame=trans_per_month, text_auto=True, title="Transactions per Month")
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



This dataset only contains from April to October

### Orders received each day

In [46]:
each_day = data.groupby("weekday").agg({"Transaction": "count"})
each_day

Unnamed: 0_level_0,Transaction
weekday,Unnamed: 1_level_1
Friday,3124
Monday,2324
Saturday,4605
Sunday,3095
Thursday,2646
Tuesday,2392
Wednesday,2321


In [47]:
fig = px.bar(data_frame=each_day, title="Transactions per day", text_auto=True)
fig.update_layout(
    xaxis={
        "categoryorder": "array",
        "categoryarray": [
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
            "Sunday",
        ],
    }
)
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



Most sales were on saturday

### Orders received each hour

In [48]:
each_hour = data.groupby("hour").agg({"Transaction": "count"})
each_hour

Unnamed: 0_level_0,Transaction
hour,Unnamed: 1_level_1
1,1
7,24
8,645
9,1966
10,2666
11,3102
12,2854
13,2617
14,2640
15,2115


In [49]:
fig = px.bar(
    data_frame=each_hour,
    title="Transactions per hour",
    text_auto=True,
    height=800,
)
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



### Transactions during period of day

In [50]:
period_day = data.groupby("period_day").agg({"Transaction": "count"})
period_day

Unnamed: 0_level_0,Transaction
period_day,Unnamed: 1_level_1
afternoon,11569
evening,520
morning,8404
night,14


In [51]:
fig = px.bar(
    data_frame=period_day,
    title="Transactions per period of day",
    text_auto=True,
    orientation="h",
)
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



People prefer to order in morning and afternoon

## Apriori Algorithm

In [52]:
from mlxtend.frequent_patterns import association_rules, apriori

In [53]:
transaction = (
    data.groupby(["Transaction", "Item"]).agg(Count=("Item", "count")).reset_index()
)
transaction

Unnamed: 0,Transaction,Item,Count
0,1,bread,1
1,2,scandinavian,2
2,3,cookies,1
3,3,hot chocolate,1
4,3,jam,1
...,...,...,...
18882,9682,tacos/fajita,1
18883,9682,tea,1
18884,9683,coffee,1
18885,9683,pastry,1


Creating a mxn matrix using pivot table with m -> Transaction and n -> Items

In [54]:
basket = transaction.pivot_table(
    index="Transaction", columns="Item", values="Count", aggfunc="sum"
).fillna(0)
basket

Item,adjustment,afternoon with the baker,alfajores,argentina night,art tray,bacon,baguette,bakewell,bare popcorn,basket,...,the bart,the nomad,tiffin,toast,truffles,tshirt,valentine's card,vegan feast,vegan mincepie,victorian sponge
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
def encode(x):
    if x <= 0:
        return 0
    else:
        return 1

In [56]:
my_basket = basket.applymap(encode)  # type: ignore
my_basket

Item,adjustment,afternoon with the baker,alfajores,argentina night,art tray,bacon,baguette,bakewell,bare popcorn,basket,...,the bart,the nomad,tiffin,toast,truffles,tshirt,valentine's card,vegan feast,vegan mincepie,victorian sponge
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9681,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
frequent_items = apriori(my_basket, min_support=0.01, use_colnames=True)
frequent_items


DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type



Unnamed: 0,support,itemsets
0,0.036344,(alfajores)
1,0.016059,(baguette)
2,0.327205,(bread)
3,0.040042,(brownie)
4,0.103856,(cake)
...,...,...
56,0.023666,"(coffee, toast)"
57,0.014369,"(tea, sandwich)"
58,0.010037,"(cake, bread, coffee)"
59,0.011199,"(bread, pastry, coffee)"


In [58]:
rules = association_rules(frequent_items, metric="lift", min_threshold=1)
rules.sort_values(by="confidence", ascending=False, inplace=True)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
31,(toast),(coffee),0.033597,0.478394,0.023666,0.704403,1.472431,0.007593,1.764582,0.332006
29,(spanish brunch),(coffee),0.018172,0.478394,0.010882,0.598837,1.251766,0.002189,1.300235,0.204851
18,(medialuna),(coffee),0.061807,0.478394,0.035182,0.569231,1.189878,0.005614,1.210871,0.170091
23,(pastry),(coffee),0.086107,0.478394,0.047544,0.552147,1.154168,0.006351,1.164682,0.146161
1,(alfajores),(coffee),0.036344,0.478394,0.019651,0.540698,1.130235,0.002264,1.135648,0.119574
17,(juice),(coffee),0.038563,0.478394,0.020602,0.534247,1.11675,0.002154,1.119919,0.108738
25,(sandwich),(coffee),0.071844,0.478394,0.038246,0.532353,1.112792,0.003877,1.115384,0.109205
6,(cake),(coffee),0.103856,0.478394,0.054728,0.526958,1.101515,0.005044,1.102664,0.10284
27,(scone),(coffee),0.034548,0.478394,0.018067,0.522936,1.093107,0.001539,1.093366,0.088224
13,(cookies),(coffee),0.054411,0.478394,0.028209,0.518447,1.083723,0.002179,1.083174,0.0817


In [59]:
import plotly.graph_objects as go

# Assuming 'rules' is the result of association_rules
G = nx.Graph()

for index, rule in rules.iterrows():
    antecedents = ", ".join(rule["antecedents"])
    consequents = ", ".join(rule["consequents"])
    lift_value = round(rule["lift"], 2)
    G.add_edge(antecedents, consequents, weight=lift_value)

pos = nx.spring_layout(G, seed=42)

edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color="#888"),
    hoverinfo="none",
    mode="lines",
)

node_x = []
node_y = []
node_names = []
node_text = []  # List to store text for each node

for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_names.append(node)

    # Extract lift value from edge
    neighbors = G.neighbors(node)
    if neighbors:
        lift_value = G[node][list(neighbors)[0]]["weight"]
        node_text.append(f"{node}\nLift: {lift_value}")
    else:
        node_text.append(f"{node}")

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode="markers+text",  # Combine markers and text
    hoverinfo="text",
    textposition="bottom center",  # Adjust text position
    textfont=dict(size=8),  # Adjust text size
    text=node_text,  # Display node names and lift values
    marker=dict(
        showscale=True,
        colorscale="YlGnBu",
        size=10,
        colorbar=dict(
            thickness=15, title="Node Connections", xanchor="left", titleside="right"
        ),
    ),
)

fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        title="Association Rules Network Graph",
        showlegend=False,
        hovermode="closest",
        margin=dict(b=0, l=0, r=0, t=40),  # Adjust top margin for the title
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    ),
)

fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [60]:
import plotly.graph_objects as go

# Assuming 'rules' is the result of association_rules
G = nx.Graph()

for index, rule in rules.iterrows():
    antecedents = ", ".join(rule["antecedents"])
    consequents = ", ".join(rule["consequents"])
    lift_value = round(rule["lift"], 2)
    G.add_edge(antecedents, consequents, weight=lift_value)

# Circular layout
pos = nx.circular_layout(G)

edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color="#888"),
    hoverinfo="none",
    mode="lines",
)

node_x = []
node_y = []
node_names = []
node_text = []  # List to store text for each node

for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_names.append(node)

    # Extract lift value from edge
    neighbors = G.neighbors(node)
    if neighbors:
        lift_value = G[node][list(neighbors)[0]]["weight"]
        node_text.append(f"{node}\nLift: {lift_value}")
    else:
        node_text.append(f"{node}")

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode="markers+text",  # Combine markers and text
    hoverinfo="text",
    textposition="bottom center",  # Adjust text position
    textfont=dict(size=8),  # Adjust text size
    text=node_text,  # Display node names and lift values
    marker=dict(
        showscale=True,
        colorscale="YlGnBu",
        size=10,
        colorbar=dict(
            thickness=15, title="Node Connections", xanchor="left", titleside="right"
        ),
    ),
)

fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        title="Association Rules Circular Graph",
        showlegend=False,
        hovermode="closest",
        margin=dict(b=0, l=0, r=0, t=40),  # Adjust top margin for the title
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    ),
)

fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.

