In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
dataset = pd.read_csv('D:\data engine/Market_Basket_Optimisation.csv',header=None)
print(dataset)
print(dataset.shape)

                 0                  1            2                 3   \
0            shrimp            almonds      avocado    vegetables mix   
1           burgers          meatballs         eggs               NaN   
2           chutney                NaN          NaN               NaN   
3            turkey            avocado          NaN               NaN   
4     mineral water               milk   energy bar  whole wheat rice   
...             ...                ...          ...               ...   
7496         butter         light mayo  fresh bread               NaN   
7497        burgers  frozen vegetables         eggs      french fries   
7498        chicken                NaN          NaN               NaN   
7499       escalope          green tea          NaN               NaN   
7500           eggs    frozen smoothie  yogurt cake    low fat yogurt   

                4                 5     6               7             8   \
0     green grapes  whole weat flour  yams  cot

##基于mlxtend的数据关联分析
##数据整理

In [30]:
temp_list=[]
for i in range(0,dataset.shape[0]):
    temp_str=''
    for j in range(0,dataset.shape[1]):
        if str(dataset.values[i,j]) != 'nan':
            temp_str += str(dataset.values[i,j])+'|'
    temp_list.append(temp_str)
dataset_new = pd.DataFrame(data=temp_list)
dataset_new.columns = ['MarketBasket']
print(dataset_new)
dataset_new.to_csv('temp_data.csv',index=True)

                                           MarketBasket
0     shrimp|almonds|avocado|vegetables mix|green gr...
1                               burgers|meatballs|eggs|
2                                              chutney|
3                                       turkey|avocado|
4     mineral water|milk|energy bar|whole wheat rice...
...                                                 ...
7496                     butter|light mayo|fresh bread|
7497  burgers|frozen vegetables|eggs|french fries|ma...
7498                                           chicken|
7499                                escalope|green tea|
7500   eggs|frozen smoothie|yogurt cake|low fat yogurt|

[7501 rows x 1 columns]


##对数据进行one-hot编码

In [14]:
dataset_new_hot_encoded = dataset_new.drop('MarketBasket',1).join(dataset_new.MarketBasket.str.get_dummies('|'))
print(dataset_new_hot_encoded)

       asparagus  almonds  antioxydant juice  asparagus  avocado  babies food  \
0              0        1                  1          0        1            0   
1              0        0                  0          0        0            0   
2              0        0                  0          0        0            0   
3              0        0                  0          0        1            0   
4              0        0                  0          0        0            0   
...          ...      ...                ...        ...      ...          ...   
7496           0        0                  0          0        0            0   
7497           0        0                  0          0        0            0   
7498           0        0                  0          0        0            0   
7499           0        0                  0          0        0            0   
7500           0        0                  0          0        0            0   

      bacon  barbecue sauce

In [15]:
dataset_new_hot_encoded = dataset_new_hot_encoded.dropna(axis=1)
print(dataset_new_hot_encoded)

       asparagus  almonds  antioxydant juice  asparagus  avocado  babies food  \
0              0        1                  1          0        1            0   
1              0        0                  0          0        0            0   
2              0        0                  0          0        0            0   
3              0        0                  0          0        1            0   
4              0        0                  0          0        0            0   
...          ...      ...                ...        ...      ...          ...   
7496           0        0                  0          0        0            0   
7497           0        0                  0          0        0            0   
7498           0        0                  0          0        0            0   
7499           0        0                  0          0        0            0   
7500           0        0                  0          0        0            0   

      bacon  barbecue sauce

##挖掘频繁项集

In [16]:
itemsets = apriori(dataset_new_hot_encoded,use_colnames=True, min_support=0.05)
itemsets = itemsets.sort_values(by="support" , ascending=False) 
print('-'*20, '频繁项集', '-'*20)
print(itemsets)

-------------------- 频繁项集 --------------------
     support                    itemsets
16  0.238368             (mineral water)
6   0.179709                      (eggs)
21  0.174110                 (spaghetti)
8   0.170911              (french fries)
3   0.163845                 (chocolate)
12  0.132116                 (green tea)
15  0.129583                      (milk)
13  0.098254               (ground beef)
10  0.095321         (frozen vegetables)
18  0.095054                  (pancakes)
0   0.087188                   (burgers)
1   0.081056                      (cake)
4   0.080389                   (cookies)
7   0.079323                  (escalope)
14  0.076523            (low fat yogurt)
19  0.071457                    (shrimp)
22  0.068391                  (tomatoes)
17  0.065858                 (olive oil)
9   0.063325           (frozen smoothie)
23  0.062525                    (turkey)
2   0.059992                   (chicken)
27  0.059725  (spaghetti, mineral water)
24  0.0585


# 根据频繁项集计算关联规则

In [17]:
rules =  association_rules(itemsets, metric='lift', min_threshold=1)
rules = rules.sort_values(by="lift" , ascending=False) 
print('-'*20, '关联规则', '-'*20)
print(rules)

-------------------- 关联规则 --------------------
       antecedents      consequents  antecedent support  consequent support  \
0      (spaghetti)  (mineral water)            0.174110            0.238368   
1  (mineral water)      (spaghetti)            0.238368            0.174110   
2      (chocolate)  (mineral water)            0.163845            0.238368   
3  (mineral water)      (chocolate)            0.238368            0.163845   
4           (eggs)  (mineral water)            0.179709            0.238368   
5  (mineral water)           (eggs)            0.238368            0.179709   

    support  confidence      lift  leverage  conviction  
0  0.059725    0.343032  1.439085  0.018223    1.159314  
1  0.059725    0.250559  1.439085  0.018223    1.102008  
2  0.052660    0.321400  1.348332  0.013604    1.122357  
3  0.052660    0.220917  1.348332  0.013604    1.073256  
4  0.050927    0.283383  1.188845  0.008090    1.062815  
5  0.050927    0.213647  1.188845  0.008090    1.04

#基于efficient_apriori的数据关联分析

In [18]:
from efficient_apriori import apriori

#生成transactions数据

In [28]:
transactions=[]
for i in range(0,dataset.shape[0]):
    temp=[]
    for j in range(0,dataset.shape[1]):
        if str(dataset.values[i,j]) !='nan':
            temp.append(dataset.values[i,j])
        transactions.append(temp)

In [29]:
itemsets,rules = apriori(transactions, min_support=0.05, min_confidence=0.2)
print('频繁项集：', itemsets)
print('关联规则：', rules)

频繁项集： {1: {('low fat yogurt',): 11480, ('green tea',): 19820, ('frozen smoothie',): 9500, ('olive oil',): 9880, ('mineral water',): 35760, ('shrimp',): 10720, ('eggs',): 26960, ('burgers',): 13080, ('turkey',): 9380, ('whole wheat rice',): 8780, ('milk',): 19440, ('french fries',): 25640, ('soup',): 7580, ('spaghetti',): 26120, ('frozen vegetables',): 14300, ('cookies',): 12060, ('cooking oil',): 7660, ('chicken',): 9000, ('chocolate',): 24580, ('tomatoes',): 10260, ('pancakes',): 14260, ('grated cheese',): 7860, ('ground beef',): 14740, ('escalope',): 11900, ('cake',): 12160}, 2: {('eggs', 'mineral water'): 7640, ('mineral water', 'spaghetti'): 8960, ('chocolate', 'mineral water'): 7900}}
关联规则： [{mineral water} -> {eggs}, {eggs} -> {mineral water}, {spaghetti} -> {mineral water}, {mineral water} -> {spaghetti}, {mineral water} -> {chocolate}, {chocolate} -> {mineral water}]
