# Regras de associação - Apriori

## 1. Introdução

### 1.1 Importação e carga do dataset

In [2]:
import sklearn
import pandas as pd
import numpy as np

In [3]:
# Load the raw file with no header row. ';' is used as the separator, so each
# comma-separated transaction line lands in column 0 (see the preview below).
dfs = pd.read_csv(
    'mercado2.csv',
    sep=';',
    engine='python',
    header=None,
)

### 1.2 Análise das compras por transação

In [4]:
# Number of items in each transaction: split the comma-separated text in
# column 0 and take the length of each resulting list. This yields one count
# per row instead of broadcasting the first row's count to the whole column.
dfs['qt_itens'] = dfs[0].str.split(",").str.len()

In [5]:
# Vectorised per-row item count. It replaces a row-by-row loop that assigned
# through chained indexing (dfs['qt_itens'].iloc[i] = ...), which triggers
# SettingWithCopyWarning and does not update the frame at all when pandas
# copy-on-write is enabled.
dfs['qt_itens'] = dfs[0].str.split(",").str.len()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [6]:
# Preview the first rows to compare the raw text with the qt_itens column.
dfs.head()

Unnamed: 0,0,qt_itens
0,"shrimp,almonds,avocado,vegetables mix,green gr...",20
1,"burgers,meatballs,eggs",3
2,chutney,1
3,"turkey,avocado",2
4,"mineral water,milk,energy bar,whole wheat rice...",5


In [7]:
# Summary statistics of the numeric columns (here: items per transaction).
dfs.describe()

Unnamed: 0,qt_itens
count,7501.0
mean,3.914545
std,2.90554
min,1.0
25%,2.0
50%,3.0
75%,5.0
max,20.0


## 2. Regras de associação  - algoritmo Apriori

In [8]:
from apyori import apriori

### 2.1 Preparação dos dados para o algoritmo

In [9]:
# One list of item names per transaction, split from the comma-separated text
# in column 0. Taking dfs.values.tolist() instead would give rows of
# [unsplit string, qt_itens], which is not a usable list of items.
transactions = dfs[0].str.split(",").tolist()

In [10]:
# Spread each transaction over one column per item position; shorter
# transactions are padded with missing values on the right.
# NOTE: this rebinds dfs, so the cell must not be re-run on its own output.
dfs = dfs[0].str.split(pat=",", expand=True)

In [11]:
# Replace the padding (missing values) with 0 so empty slots are easy to skip
# later. Rebinding the result avoids in-place mutation of the frame.
dfs = dfs.fillna(0)

In [12]:
# Preview the one-item-per-column layout after padding with 0.
dfs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,chutney,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,turkey,avocado,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,mineral water,milk,energy bar,whole wheat rice,green tea,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# Build the apriori input: one list of item names per transaction.
# Padding cells were filled with 0 earlier, so any cell whose text is '0' is
# dropped. Iterating the rows directly covers every column (instead of a
# hard-coded 20) and evaluates dfs.values once rather than on each cell access.
transactions = [
    [str(item) for item in row if str(item) != '0']
    for row in dfs.values
]

### 2.2 Execução do algoritmo Apriori

#### 2.2.1 Iteração 1 - parâmetros: suporte mínimo - 0.01 e confiança mínima - 0.2

In [136]:
# apyori's apriori() yields its results lazily (it is a generator), so the
# rules are materialised into a list. Otherwise `rules` is exhausted after the
# first loop over it and re-running the extraction cell silently yields nothing.
rules = list(apriori(transactions, min_support=0.01, min_confidence=0.2))

In [137]:
# Fresh, independent accumulators: one list per column of the rules table.
# They are filled by the extraction loop in the following cell.
supports, confidences, lifts = [], [], []  # rule support / confidence / lift
bases, adds = [], []                       # items_base / items_add of each rule

In [138]:
# Flatten the apriori output: every ordered statistic (base -> add) of every
# record becomes one entry in each accumulator list. The support stored is the
# record's support, repeated for each of its statistics.
for record in rules:
    for stat in record.ordered_statistics:
        supports.append(record.support)
        confidences.append(stat.confidence)
        lifts.append(stat.lift)
        bases.append(list(stat.items_base))
        adds.append(list(stat.items_add))

In [139]:
# Assemble the rules table; keyword order fixes the column order.
df_result = pd.DataFrame(
    dict(
        support=supports,
        confidence=confidences,
        lift=lifts,
        base=bases,
        adds=adds,
    )
)

##### OBS.: Excluindo os resultados com apenas 1 item (compra de apenas 1 item)

In [140]:
# Keep only rules whose base (left-hand side) holds at least one item.
df_result = df_result.loc[df_result['base'].str.len().ge(1)]

#### 2.2.1.1 Resumo do suporte, confiança e lift

In [141]:
# Distribution of support, confidence and lift across the remaining rules.
df_result.describe()

Unnamed: 0,support,confidence,lift
count,162.0,162.0,162.0
mean,0.020105,0.288604,1.640048
std,0.010826,0.075359,0.37042
min,0.010132,0.200306,0.902495
25%,0.012665,0.228827,1.359001
50%,0.016398,0.265724,1.572121
75%,0.02303,0.332569,1.855226
max,0.059725,0.506667,3.291994


##### // (df_result.support > 0.015) & (df_result.confidence > 0.28) & (df_result.lift > 1.7)

#### 2.2.1.2 Filtro dos primeiros 30 resultados, ordenados em ordem decrescente de support, confidence, lift

In [142]:
# Top 30 rules, ranked by support, then confidence, then lift (all descending).
df_result.sort_values(
    ['support', 'confidence', 'lift'],
    ascending=[False, False, False],
).iloc[:30]

Unnamed: 0,support,confidence,lift,base,adds
103,0.059725,0.343032,1.439085,[spaghetti],[mineral water]
102,0.059725,0.250559,1.439085,[mineral water],[spaghetti]
29,0.05266,0.3214,1.348332,[chocolate],[mineral water]
30,0.05266,0.220917,1.348332,[mineral water],[chocolate]
51,0.050927,0.283383,1.188845,[eggs],[mineral water]
52,0.050927,0.213647,1.188845,[mineral water],[eggs]
87,0.047994,0.37037,1.553774,[milk],[mineral water]
88,0.047994,0.201342,1.553774,[mineral water],[milk]
78,0.040928,0.416554,1.747522,[ground beef],[mineral water]
80,0.039195,0.398915,2.291162,[ground beef],[spaghetti]


#### 2.2.1.3 Filtro dos primeiros 30 resultados, ordenados por lift crescente e, em caso de empate, por support e confidence decrescentes

In [105]:
# 30 rules with the lowest lift; ties are broken by higher support, then
# higher confidence.
df_result.sort_values(
    ['lift', 'support', 'confidence'],
    ascending=[True, False, False],
).iloc[:30]

Unnamed: 0,support,confidence,lift,base,adds
130,0.010532,0.131012,0.729019,[cookies],[eggs]
131,0.010532,0.058605,0.729019,[eggs],[cookies]
144,0.011065,0.061573,0.77623,[eggs],[escalope]
145,0.011065,0.139496,0.77623,[escalope],[eggs]
86,0.010399,0.129353,0.789486,[cookies],[chocolate]
85,0.010399,0.063466,0.789486,[chocolate],[cookies]
194,0.013865,0.141113,0.825652,[ground beef],[french fries]
193,0.013865,0.081123,0.825652,[french fries],[ground beef]
199,0.033729,0.197348,0.827912,[french fries],[mineral water]
200,0.033729,0.141499,0.827912,[mineral water],[french fries]


#### 2.2.2 Iteração 2 - parâmetros: suporte mínimo - 0.03, confiança mínima - 0.35 e lift mínimo - 1.1

In [130]:
# Second run with stricter thresholds. apyori's apriori() is a generator, so
# the result is materialised into a list; otherwise `rules` is exhausted after
# one pass and re-running the extraction cell silently yields nothing.
rules = list(
    apriori(transactions, min_support=0.03, min_confidence=0.35, min_lift=1.1)
)

In [131]:
# Reset the five accumulators (support, confidence, lift, items_base,
# items_add) to new, distinct empty lists before collecting this run's rules.
supports, confidences, lifts, bases, adds = ([] for _ in range(5))

In [132]:
# Flatten this run's apriori output into the accumulator lists: one entry per
# ordered statistic, with the parent record's support repeated for each.
for record in rules:
    stats = list(record.ordered_statistics)
    supports.extend(record.support for _ in stats)
    confidences.extend(stat.confidence for stat in stats)
    lifts.extend(stat.lift for stat in stats)
    bases.extend(list(stat.items_base) for stat in stats)
    adds.extend(list(stat.items_add) for stat in stats)

In [133]:
# Assemble the rules table for this run; the name list fixes the column order.
df_result = pd.DataFrame(dict(zip(
    ['support', 'confidence', 'lift', 'base', 'adds'],
    [supports, confidences, lifts, bases, adds],
)))

##### OBS.: Excluindo os resultados com apenas 1 item (compra de apenas 1 item)

In [134]:
# Keep only rules whose base (left-hand side) holds at least one item, matching
# the filter used in iteration 1. The former extra clause
# `| (adds.str.len() >= 1)` made the condition true for any rule with a
# non-empty `adds` (which appears to be every rule apyori emits), so the
# filter removed nothing.
df_result = df_result[df_result['base'].str.len() >= 1]

#### 2.2.2.1 Filtro dos primeiros 30 resultados, ordenados em ordem decrescente de support, confidence, lift

In [135]:
# Top 30 rules of this run, ranked by support, then confidence, then lift
# (all descending).
df_result.sort_values(
    ['support', 'confidence', 'lift'],
    ascending=[False, False, False],
).iloc[:30]

Unnamed: 0,support,confidence,lift,base,adds
3,0.047994,0.37037,1.553774,[milk],[mineral water]
1,0.040928,0.416554,1.747522,[ground beef],[mineral water]
2,0.039195,0.398915,2.291162,[ground beef],[spaghetti]
0,0.035729,0.374825,1.572463,[frozen vegetables],[mineral water]
4,0.033729,0.354839,1.488616,[pancakes],[mineral water]
