In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [85]:
# header=None: the first line of the file is a transaction, not a header row
dataset = pd.read_csv(
    './Market_Basket_Optimisation.csv',
    header=None,
)
# resulting shape is (7501, 20)

In [86]:
# Preview the loaded transactions: one row per basket, one column per item slot
# (baskets with fewer items leave the remaining slots empty).
dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,butter,light mayo,fresh bread,,,,,,,,,,,,,,,,,
7497,burgers,frozen vegetables,eggs,french fries,magazines,green tea,,,,,,,,,,,,,,
7498,chicken,,,,,,,,,,,,,,,,,,,
7499,escalope,green tea,,,,,,,,,,,,,,,,,,


In [87]:
# Build the complete product list: every distinct item in `dataset`, in order
# of first appearance when scanning row by row, left to right.
# The table is flattened once and de-duplicated by pandas, instead of calling
# `dataset.values` per cell and doing a linear `not in` scan over a growing list.
all_items = pd.Series(dataset.to_numpy().ravel())  # ravel() is row-major
# Empty item slots are NaN and are not products; drop them before de-duplicating.
product = all_items.dropna().drop_duplicates().reset_index(drop=True)
product

0              shrimp
1             almonds
2             avocado
3      vegetables mix
4        green grapes
            ...      
115      burger sauce
116           oatmeal
117         asparagus
118             cream
119           napkins
Length: 120, dtype: object

In [88]:
# Allocate the all-zero dummies matrix: one row per transaction in `dataset`,
# one column per entry of `product` (columns are labelled with the product names).
zero_matrix = np.zeros(shape=(len(dataset), len(product)))
dummies = pd.DataFrame(data=zero_matrix, columns=product)
dummies

Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,...,melons,cauliflower,green beans,ketchup,bramble,burger sauce,oatmeal,asparagus,cream,napkins
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
# Walk over `dataset` and flag, in `dummies`, every product bought in each row:
# row i gets a 1 in the column of each item that appears in row i of `dataset`.
#
# Resolve the column position of every cell with a single lookup; values that
# are not a column label (the NaN placeholders of empty slots) come back as -1.
item_pos = dummies.columns.get_indexer(dataset.to_numpy().ravel())
# Row number of each flattened cell (row-major, matching ravel() above).
row_pos = np.repeat(np.arange(dataset.shape[0]), dataset.shape[1])
purchased = item_pos != -1
# Set all flags at once on a NumPy copy, then write the result back into the
# existing frame so `dummies` is updated in place as before.
onehot = dummies.to_numpy(copy=True)
onehot[row_pos[purchased], item_pos[purchased]] = 1
dummies.iloc[:, :] = onehot

In [93]:
# Mine frequent itemsets with a minimum support of 0.05.
# `dummies` holds 0.0/1.0 floats; apriori is meant to receive a boolean one-hot
# frame, so convert first (0.0 -> False, 1.0 -> True). The supports are unchanged.
itemsets = apriori(dummies.astype(bool), use_colnames=True, min_support=0.05)
# Most frequent itemsets first.
itemsets = itemsets.sort_values(by="support", ascending=False)
print('-'*20, '频繁项集', '-'*20)
print(itemsets)

-------------------- 频繁项集 --------------------
     support                    itemsets
3   0.238368             (mineral water)
7   0.179709                      (eggs)
14  0.174110                 (spaghetti)
11  0.170911              (french fries)
17  0.163845                 (chocolate)
2   0.132116                 (green tea)
9   0.129583                      (milk)
22  0.098254               (ground beef)
13  0.095321         (frozen vegetables)
20  0.095054                  (pancakes)
6   0.087188                   (burgers)
24  0.081056                      (cake)
15  0.080389                   (cookies)
23  0.079323                  (escalope)
1   0.076523            (low fat yogurt)
0   0.071457                    (shrimp)
19  0.068391                  (tomatoes)
5   0.065858                 (olive oil)
4   0.063325           (frozen smoothie)
8   0.062525                    (turkey)
18  0.059992                   (chicken)
26  0.059725  (spaghetti, mineral water)
10  0.0585

In [94]:
# Derive association rules from the frequent itemsets with a minimum lift of 1,
# sorted from highest to lowest lift.
# The result is already a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
rules = (
    association_rules(itemsets, metric='lift', min_threshold=1)
    .sort_values(by='lift', ascending=False)
)
# Save the sorted rules next to the notebook.
rules.to_csv('rules.csv')
print('-'*20, '关联规则', '-'*20)
rules

-------------------- 关联规则 --------------------


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(spaghetti),(mineral water),0.17411,0.238368,0.059725,0.343032,1.439085,0.018223,1.159314
1,(mineral water),(spaghetti),0.238368,0.17411,0.059725,0.250559,1.439085,0.018223,1.102008
2,(chocolate),(mineral water),0.163845,0.238368,0.05266,0.3214,1.348332,0.013604,1.122357
3,(mineral water),(chocolate),0.238368,0.163845,0.05266,0.220917,1.348332,0.013604,1.073256
4,(eggs),(mineral water),0.179709,0.238368,0.050927,0.283383,1.188845,0.00809,1.062815
5,(mineral water),(eggs),0.238368,0.179709,0.050927,0.213647,1.188845,0.00809,1.043158
