In [1]:
# Import packages
import pandas as pd
import numpy as np
import datetime as dt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import matplotlib.pyplot as plt
import networkx as nx

import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Import order product dataset
df = pd.read_csv('order_products.csv', usecols=['order_id', 'product_id'])
df.head()

Unnamed: 0,order_id,product_id
0,10,24
1,10,83
2,10,16
3,10,24
4,10,83


In [3]:
# Import product label mapping
products = pd.read_csv('products.csv')
products.head()

Unnamed: 0,product_id,department_id,product_name
0,61,19,cookies cakes
1,104,13,spices seasonings
2,94,7,tea
3,38,1,frozen meals
4,5,13,marinades meat preparation


In [4]:
# Merge product labels
df = pd.merge(df, products, how='left', on='product_id').\
    drop(["product_id", "department_id"], axis=1)
df.head()

Unnamed: 0,order_id,product_name
0,10,fresh fruits
1,10,fresh vegetables
2,10,fresh herbs
3,10,fresh fruits
4,10,fresh vegetables


In [5]:
# Get modes and unique values
df.astype(object).describe()

Unnamed: 0,order_id,product_name
count,2019501,2019501
unique,200000,134
top,790903,fresh fruits
freq,137,226039


In [6]:
# Check product frequencies
df['product_name'].value_counts()

fresh fruits                  226039
fresh vegetables              212611
packaged vegetables fruits    109596
yogurt                         90751
packaged cheese                61502
                               ...  
kitchen supplies                 561
baby bath body care              515
baby accessories                 504
beauty                           387
frozen juice                     279
Name: product_name, Length: 134, dtype: int64

In [7]:
# Pivot the data - lines as orders and products as columns
pt = pd.pivot_table(df, index='order_id', columns='product_name', 
                    aggfunc=lambda x: 1 if len(x)>0 else 0).fillna(0)
pt.head()

product_name,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,baking ingredients,baking supplies decor,beauty,beers coolers,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Check in how many orders was product yogurt sold
pt['fresh fruits'].sum()

111199.0

In [34]:
# Apply the APRIORI algorithm to get frequent itemsets
# Rules supported in at least 5% of the transactions (more info at http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/)
frequent_itemsets = apriori(pt, min_support=0.05, use_colnames=True)
frequent_itemsets
#frequent_itemsets.sort_values(by='support').head()

Unnamed: 0,support,itemsets
0,0.076635,(baking ingredients)
1,0.163865,(bread)
2,0.067765,(breakfast bakery)
3,0.074330,(butter)
4,0.069305,(candy chocolate)
...,...,...
151,0.051295,"(packaged vegetables fruits, yogurt, milk)"
152,0.051915,"(packaged cheese, packaged vegetables fruits, ..."
153,0.062535,"(fresh vegetables, milk, packaged vegetables f..."
154,0.068325,"(packaged cheese, fresh vegetables, packaged v..."


In [35]:
# Generate the association rules - by confidence
rulesConfidence = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.50)
rulesConfidence.sort_values(by='confidence', ascending=False, inplace=True)
rulesConfidence.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
65,"(fresh herbs, fresh fruits)",(fresh vegetables),0.070135,0.44436,0.061815,0.881372,1.983463,0.03065,4.683872
123,"(fresh vegetables, packaged vegetables fruits,...",(fresh fruits),0.087995,0.555995,0.07624,0.866413,1.558311,0.027315,3.323711
113,"(fresh vegetables, milk, packaged vegetables f...",(fresh fruits),0.073075,0.555995,0.062535,0.855765,1.539159,0.021906,3.078336
34,(fresh herbs),(fresh vegetables),0.093005,0.44436,0.078655,0.845707,1.903203,0.037327,3.601205
118,"(packaged cheese, fresh vegetables, packaged v...",(fresh fruits),0.08197,0.555995,0.068325,0.833537,1.49918,0.02275,2.667284


In [36]:
# Generate the association rules - by lift
rulesLift = association_rules(frequent_itemsets, metric="lift", min_threshold=1.5)
rulesLift.sort_values(by='lift', ascending=False, inplace=True)
rulesLift.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
29,"(fresh vegetables, fresh fruits)",(fresh herbs),0.31756,0.093005,0.061815,0.194656,2.092964,0.03228,1.126221
30,(fresh herbs),"(fresh vegetables, fresh fruits)",0.093005,0.31756,0.061815,0.664642,2.092964,0.03228,2.034958
31,(fresh vegetables),"(fresh herbs, fresh fruits)",0.44436,0.070135,0.061815,0.13911,1.983463,0.03065,1.080121
28,"(fresh herbs, fresh fruits)",(fresh vegetables),0.070135,0.44436,0.061815,0.881372,1.983463,0.03065,4.683872
129,"(fresh vegetables, yogurt)","(packaged vegetables fruits, fresh fruits)",0.14466,0.26987,0.07624,0.527029,1.952899,0.037201,1.54371


#### Rules are of type `frozenset`, which is a Python built-in type that behaves similarly to sets except that it is immutable. Frozensets have functions to check if there are subsets, supersets, etc. More info at https://www.journaldev.com/22850/python-frozenset


### EXPLORE FREQUENT_ITEMSETS

In [37]:
# Add a column with the length
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

# Length=2 and Support>=0.2
frequent_itemsets[(frequent_itemsets['length'] == 2) & (frequent_itemsets['support'] >= 0.2)]

Unnamed: 0,support,itemsets,length
75,0.31756,"(fresh vegetables, fresh fruits)",2
85,0.26987,"(packaged vegetables fruits, fresh fruits)",2
100,0.234555,"(fresh vegetables, packaged vegetables fruits)",2


In [38]:
len(frequent_itemsets[(frequent_itemsets['length'] == 2)])

70

In [39]:
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.076635,(baking ingredients),1
1,0.163865,(bread),1
2,0.067765,(breakfast bakery),1
3,0.074330,(butter),1
4,0.069305,(candy chocolate),1
...,...,...,...
151,0.051295,"(packaged vegetables fruits, yogurt, milk)",3
152,0.051915,"(packaged cheese, packaged vegetables fruits, ...",3
153,0.062535,"(fresh vegetables, milk, packaged vegetables f...",4
154,0.068325,"(packaged cheese, fresh vegetables, packaged v...",4


In [40]:
# Generate the association rules - by lift
rulesLift = association_rules(frequent_itemsets, metric="lift", min_threshold=0)
rulesLift.sort_values(by='confidence', ascending=False, inplace=True)
rulesLift.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
189,"(fresh herbs, fresh fruits)",(fresh vegetables),0.070135,0.44436,0.061815,0.881372,1.983463,0.03065,4.683872
372,"(fresh vegetables, packaged vegetables fruits,...",(fresh fruits),0.087995,0.555995,0.07624,0.866413,1.558311,0.027315,3.323711
344,"(fresh vegetables, milk, packaged vegetables f...",(fresh fruits),0.073075,0.555995,0.062535,0.855765,1.539159,0.021906,3.078336
86,(fresh herbs),(fresh vegetables),0.093005,0.44436,0.078655,0.845707,1.903203,0.037327,3.601205
358,"(packaged cheese, fresh vegetables, packaged v...",(fresh fruits),0.08197,0.555995,0.068325,0.833537,1.49918,0.02275,2.667284


In [41]:
rulesLift[(rulesLift['confidence']>=0.8)&(rulesLift['lift']>=1.5)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
189,"(fresh herbs, fresh fruits)",(fresh vegetables),0.070135,0.44436,0.061815,0.881372,1.983463,0.03065,4.683872
372,"(fresh vegetables, packaged vegetables fruits,...",(fresh fruits),0.087995,0.555995,0.07624,0.866413,1.558311,0.027315,3.323711
344,"(fresh vegetables, milk, packaged vegetables f...",(fresh fruits),0.073075,0.555995,0.062535,0.855765,1.539159,0.021906,3.078336
86,(fresh herbs),(fresh vegetables),0.093005,0.44436,0.078655,0.845707,1.903203,0.037327,3.601205


In [42]:
rulesLift

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
189,"(fresh herbs, fresh fruits)",(fresh vegetables),0.070135,0.444360,0.061815,0.881372,1.983463,0.030650,4.683872
372,"(fresh vegetables, packaged vegetables fruits,...",(fresh fruits),0.087995,0.555995,0.076240,0.866413,1.558311,0.027315,3.323711
344,"(fresh vegetables, milk, packaged vegetables f...",(fresh fruits),0.073075,0.555995,0.062535,0.855765,1.539159,0.021906,3.078336
86,(fresh herbs),(fresh vegetables),0.093005,0.444360,0.078655,0.845707,1.903203,0.037327,3.601205
358,"(packaged cheese, fresh vegetables, packaged v...",(fresh fruits),0.081970,0.555995,0.068325,0.833537,1.499180,0.022750,2.667284
...,...,...,...,...,...,...,...,...,...
57,(fresh fruits),(hot dogs bacon sausage),0.555995,0.084190,0.054425,0.097888,1.162698,0.007616,1.015184
247,(fresh fruits),"(packaged vegetables fruits, frozen produce)",0.555995,0.066985,0.054415,0.097870,1.461067,0.017172,1.034235
76,(fresh fruits),(soup broth bouillon),0.555995,0.083365,0.053880,0.096907,1.162446,0.007529,1.014996
81,(fresh fruits),(spreads),0.555995,0.078185,0.053560,0.096332,1.232101,0.010090,1.020081


In [43]:
data_scatter = dict(type='scatter',
    y=rulesLift['confidence'],
    x=rulesLift['lift'],
    #text=rulesLift.index,
    #mode='markers',
    #marker=dict(
    #size=rulesLift['support'],
    hovertemplate=#'Grand Prix: ' + df_racetracks["name_x"] + '<br>'
                    #'RuleID: ' + rulesLift.index + '<br>'+
                    'Lift: ' + rulesLift["lift"].astype(str) + '<br>'+
                    'Confidence: ' + rulesLift['confidence'].astype(str) + '<br>'+
                    'Support: ' + rulesLift['support'].astype(str) + '<br>'+    
                    'Antecedents: ' + rulesLift['antecedents'].astype(str) + '<br>'+
                    'Consequents: ' + rulesLift['consequents'].astype(str) + '<br>'
    '<extra></extra>',
    #color=scatterdf['Avg Salary'],  # set color equal to a variable
    #color_continuous_scale='mint',  # one of plotly colorscales
    #showscale=False,
    mode='markers',
    marker=dict(size=8,
                #size=rulesLift['support'],
                color=rulesLift['support'],
                colorscale='oranges',
                showscale=True,
                line_width=2),
    )
layout = dict(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)'
)
fig = go.Figure(data=data_scatter, layout=layout)
fig.add_hline(y=0.8, line_width=2, line_dash="dash", line_color="green")
fig.add_vline(x=1.5, line_width=2, line_dash="dash", line_color="green")
fig

In [44]:
# Best Complementary products by Stregnth of Lift and Cofidence
rulesLift[(rulesLift['confidence']>=0.8)&(rulesLift['lift']>=1.5)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
189,"(fresh herbs, fresh fruits)",(fresh vegetables),0.070135,0.44436,0.061815,0.881372,1.983463,0.03065,4.683872
372,"(fresh vegetables, packaged vegetables fruits,...",(fresh fruits),0.087995,0.555995,0.07624,0.866413,1.558311,0.027315,3.323711
344,"(fresh vegetables, milk, packaged vegetables f...",(fresh fruits),0.073075,0.555995,0.062535,0.855765,1.539159,0.021906,3.078336
86,(fresh herbs),(fresh vegetables),0.093005,0.44436,0.078655,0.845707,1.903203,0.037327,3.601205


In [45]:
data_scatter = dict(type='scatter',
    y=rulesLift['confidence'],
    x=rulesLift['lift'],
    #text=rulesLift.index,
    #mode='markers',
    #marker=dict(
    #size=rulesLift['support'],
    hovertemplate=#'Grand Prix: ' + df_racetracks["name_x"] + '<br>'
                    #'RuleID: ' + rulesLift.index + '<br>'+
                    'Lift: ' + rulesLift["lift"].astype(str) + '<br>'+
                    'Confidence: ' + rulesLift['confidence'].astype(str) + '<br>'+
                    'Support: ' + rulesLift['support'].astype(str) + '<br>'+    
                    'Antecedents: ' + rulesLift['antecedents'].astype(str) + '<br>'+
                    'Consequents: ' + rulesLift['consequents'].astype(str) + '<br>'
    '<extra></extra>',
    #color=scatterdf['Avg Salary'],  # set color equal to a variable
    #color_continuous_scale='mint',  # one of plotly colorscales
    #showscale=False,
    mode='markers',
    marker=dict(size=8,
                #size=rulesLift['support'],
                color=rulesLift['support'],
                colorscale='oranges',
                showscale=True,
                line_width=2),
    )

layout = dict(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)'
)
fig = go.Figure(data=data_scatter,layout=layout)
fig.add_hline(y=0.5, line_width=2, line_dash="dash", line_color="green")
fig.add_vline(x=1.2, line_width=2, line_dash="dash", line_color="green")

In [21]:
# Best Complementary products by Stregnth of Lift and Cofidence
rulesLift[(rulesLift['confidence']>=0.5)&(rulesLift['lift']>=1.2)].sort_values(by='support',ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
53,(fresh fruits),(fresh vegetables),0.555995,0.444360,0.317560,0.571156,1.285346,0.070498,1.295670
52,(fresh vegetables),(fresh fruits),0.444360,0.555995,0.317560,0.714646,1.285346,0.070498,1.555978
72,(packaged vegetables fruits),(fresh fruits),0.365415,0.555995,0.269870,0.738530,1.328304,0.066701,1.698112
103,(packaged vegetables fruits),(fresh vegetables),0.365415,0.444360,0.234555,0.641887,1.444519,0.072179,1.551576
102,(fresh vegetables),(packaged vegetables fruits),0.444360,0.365415,0.234555,0.527849,1.444519,0.072179,1.344030
...,...,...,...,...,...,...,...,...,...
220,"(fresh fruits, refrigerated)",(fresh vegetables),0.086430,0.444360,0.051445,0.595222,1.339503,0.013039,1.372702
334,"(milk, yogurt)",(packaged vegetables fruits),0.095705,0.365415,0.051295,0.535970,1.466743,0.016323,1.367551
182,"(eggs, packaged vegetables fruits)",(fresh vegetables),0.068650,0.444360,0.050675,0.738165,1.661186,0.020170,2.122097
183,"(eggs, fresh vegetables)",(packaged vegetables fruits),0.084310,0.365415,0.050675,0.601056,1.644858,0.019867,1.590660


# Subs

In [22]:
# Apply the APRIORI algorithm to get frequent itemsets
# Rules supported in at least 5% of the transactions (more info at http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/)
frequent_itemsets = apriori(pt, min_support=0.03, use_colnames=True)
frequent_itemsets.sort_values(by='support').head()

Unnamed: 0,support,itemsets
210,0.03006,"(juice nectars, yogurt)"
129,0.03017,"(eggs, soy lactosefree)"
239,0.030175,"(refrigerated, soy lactosefree)"
370,0.030265,"(packaged vegetables fruits, yogurt, fresh fru..."
173,0.03029,"(fresh herbs, packaged cheese)"


In [23]:
# Generate the association rules - by lift
rulesLift = association_rules(frequent_itemsets, metric="lift", min_threshold=0)
rulesLift.sort_values(by='lift', ascending=True, inplace=True)
rulesLift.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
203,(fresh fruits),(soft drinks),0.555995,0.08731,0.039585,0.071197,0.815447,-0.008959,0.982652
202,(soft drinks),(fresh fruits),0.08731,0.555995,0.039585,0.453384,0.815447,-0.008959,0.81228
197,(fresh fruits),(paper goods),0.555995,0.063575,0.03209,0.057716,0.907847,-0.003257,0.993782
196,(paper goods),(fresh fruits),0.063575,0.555995,0.03209,0.504758,0.907847,-0.003257,0.896542
266,(water seltzer sparkling water),(fresh vegetables),0.193005,0.44436,0.083355,0.43188,0.971915,-0.002409,0.978033
267,(fresh vegetables),(water seltzer sparkling water),0.44436,0.193005,0.083355,0.187584,0.971915,-0.002409,0.993328
317,(milk),(water seltzer sparkling water),0.243325,0.193005,0.04661,0.191555,0.992485,-0.000353,0.998206
316,(water seltzer sparkling water),(milk),0.193005,0.243325,0.04661,0.241496,0.992485,-0.000353,0.997589
315,(soy lactosefree),(milk),0.168355,0.243325,0.04122,0.24484,1.006225,0.000255,1.002006
314,(milk),(soy lactosefree),0.243325,0.168355,0.04122,0.169403,1.006225,0.000255,1.001262
