# Association rules - Olist dataset

#### Appling Machine Learning on Olist dataset in order to find out whether there are rules governing products that are bought together or not. In this specific case, we are going to analyse the products' category.

<img src="https://miro.medium.com/max/980/1*WzlhttgnH7PBuxUcnO-ciw.jpeg" width="300">



In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
pip install mlxtend

Note: you may need to restart the kernel to use updated packages.


In [3]:
#Open dataframes
dataframeOrdem = pd.read_excel("fOrder.xlsx")
dataframeProduto = pd.read_excel("dProducts.xlsx")

#Order do not have product category
dataframeOrdem.head(5)

Unnamed: 0,order_id,product_id,seller_id
0,00010242fe8c5a6d1ba2dd792cb16214,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202
1,00018f77f2f0320c557190d7a144bdd3,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36
2,000229ec398224ef6ca0657da4fc703e,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d
3,00024acbcdf0a6daa1e931b038114c75,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4
4,00042b26cf59d7ce69dfabb4e55b4fd9,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87


In [4]:
#Creating a new dataframe, merging dataframe Order and product categorie by product id and dropping missing values
dataframeMerged = pd.merge(dataframeOrdem, dataframeProduto, left_on='product_id', right_on='product_id').dropna()

dataframeMerged.head(5)

Unnamed: 0,order_id,product_id,seller_id,product_category_name
0,00010242fe8c5a6d1ba2dd792cb16214,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,cool_stuff
1,130898c0987d1801452a8ed92a670612,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,cool_stuff
2,532ed5e14e24ae1f0d735b91524b98b9,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,cool_stuff
3,6f8c31653edb8c83e1a739408b5ff750,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,cool_stuff
4,7d19f4ef4d04461989632411b7e588b9,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,cool_stuff


In [5]:
#Grouping dataframe by order and transforming it from pandas object to list, keeping only category names
grouped = dataframeMerged.groupby("order_id")
productOfGrouped = grouped['product_category_name'].apply(list).to_list()
productOfGrouped[0:5]

[['cool_stuff'],
 ['pet_shop'],
 ['moveis_decoracao'],
 ['perfumaria'],
 ['ferramentas_jardim']]

In [6]:
#Deleting orders one item only in order to increase both confidence and lift
for lista in productOfGrouped:
    if len(lista) == 1:
        productOfGrouped.remove(lista)

In [7]:
#One hot encoding

from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
teArray = te.fit(productOfGrouped).transform(productOfGrouped)
teDataframe = pd.DataFrame(teArray, columns=te.columns_)

teDataframe.head()



Unnamed: 0,agro_industria_e_comercio,alimentos,alimentos_bebidas,artes,artes_e_artesanato,artigos_de_festas,artigos_de_natal,audio,automotivo,bebes,...,pet_shop,portateis_casa_forno_e_cafe,portateis_cozinha_e_preparadores_de_alimentos,relogios_presentes,seguros_e_servicos,sinalizacao_e_seguranca,tablets_impressao_imagem,telefonia,telefonia_fixa,utilidades_domesticas
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [8]:
from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(teDataframe, min_support = 0.00005, use_colnames = True) 
frequent_itemsets.sort_values(by=['support'], ascending = True).head(10)

#Support is defined as frequency that both items occur together in the basket divided by total of transactions in the dataset
#Since there are too many orders with no category variability, support is low

Unnamed: 0,support,itemsets
113,5.9e-05,"(esporte_lazer, ferramentas_jardim)"
130,5.9e-05,"(moveis_sala, moveis_escritorio)"
90,5.9e-05,"(moveis_decoracao, beleza_saude)"
87,5.9e-05,"(bebidas, beleza_saude)"
107,5.9e-05,"(esporte_lazer, construcao_ferramentas_ilumina..."
83,5.9e-05,"(fashion_bolsas_e_acessorios, bebes)"
79,5.9e-05,"(automotivo, telefonia)"
114,5.9e-05,"(esporte_lazer, informatica_acessorios)"
93,5.9e-05,"(beleza_saude, relogios_presentes)"
116,5.9e-05,"(esporte_lazer, perfumaria)"


In [9]:
from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)
#rules
rules.sort_values(by=['lift'], ascending = False).drop(['antecedent support', 'consequent support', 'leverage', 'conviction'], axis=1)

#Lift > 1 means that there's a significative association rule

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(casa_conforto),(cama_mesa_banho),0.00084,0.187773,1.823128
