# P073 关联规则挖掘-购物篮数据拆分

In [2]:
# 关联分析是指从大量数据中发现数据项之间有趣的关联关系。
# 典型例子是“购物篮分析”：例如 67% 的顾客会在购买啤酒的同时购买尿布，因此通过合理的啤酒和尿布货架摆放或捆绑销售，可以提高超市的服务质量和效益。

In [4]:
import numpy as np
import pandas as pd

In [6]:
# Toy basket data: each string is one transaction, with the purchased
# items separated by single spaces.
data = dict(
    products=[
        'bread eggs',
        'bread eggs milk',
        'milk cheese',
        'bread butter cheese',
        'eggs milk',
        'bread milk butter cheese',
    ],
)

In [8]:
data

{'products': ['bread eggs',
  'bread eggs milk',
  'milk cheese',
  'bread butter cheese',
  'eggs milk',
  'bread milk butter cheese']}

In [10]:
# Label the transactions 1..N. N is derived from the data instead of being
# hard-coded, so the index length always matches the number of rows.
transactions = pd.DataFrame(
    data=data,
    index=range(1, len(data['products']) + 1),
)

In [12]:
transactions

Unnamed: 0,products
1,bread eggs
2,bread eggs milk
3,milk cheese
4,bread butter cheese
5,eggs milk
6,bread milk butter cheese


In [14]:
# Split every basket string on whitespace into one column per item;
# baskets with fewer items are left with missing cells on the right.
expanded = (
    transactions["products"]
    .str.split(pat=None, expand=True)
)

In [16]:
expanded

Unnamed: 0,0,1,2,3
1,bread,eggs,,
2,bread,eggs,milk,
3,milk,cheese,,
4,bread,butter,cheese,
5,eggs,milk,,
6,bread,milk,butter,cheese


# P074 关联规则挖掘-计算购买商品的去重列表

In [20]:
# Accumulator for the distinct product names; a set so duplicates collapse.
products = set()

In [24]:
# Collect every distinct product name across all item columns.
# Missing padding cells are dropped explicitly with dropna() rather than by
# truthiness alone: depending on the pandas version/dtype such a cell may be
# None or NaN, and NaN is truthy, so `if product:` would let it through.
for column in expanded.columns:
    for product in expanded[column].dropna().unique():
        # Still skip empty strings, as the original truthiness check did.
        if product:
            products.add(product)

In [26]:
products

{'bread', 'butter', 'cheese', 'eggs', 'milk'}

In [28]:
# Turn the set into an alphabetically sorted list so the column order
# used later is deterministic.
products = sorted(products)

In [30]:
products

['bread', 'butter', 'cheese', 'eggs', 'milk']

# P075 关联规则挖掘-实现one-hot编码

In [33]:
expanded

Unnamed: 0,0,1,2,3
1,bread,eggs,,
2,bread,eggs,milk,
3,milk,cheese,,
4,bread,butter,cheese,
5,eggs,milk,,
6,bread,milk,butter,cheese


In [35]:
products

['bread', 'butter', 'cheese', 'eggs', 'milk']

In [37]:
# Start from an all-zero matrix: one row per transaction, one column per
# product. Cells are switched to 1 afterwards.
transactions_encoded = np.zeros(
    shape=(expanded.shape[0], len(products)),
    dtype=np.int8,
)

In [40]:
transactions_encoded

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int8)

In [50]:
# One-hot encode: set cell (transaction, product) to 1 when the transaction's
# item row contains that product. enumerate() replaces the hand-rolled
# zip(range(len(...)), ...) pairing and avoids positional tuple indexing.
for row_index, basket in enumerate(expanded.values):
    for index, product in enumerate(products):
        if product in basket:
            transactions_encoded[row_index, index] = 1

In [52]:
transactions_encoded

array([[1, 0, 0, 1, 0],
       [1, 0, 0, 1, 1],
       [0, 0, 1, 0, 1],
       [1, 1, 1, 0, 0],
       [0, 0, 0, 1, 1],
       [1, 1, 1, 0, 1]], dtype=int8)

In [56]:
# Wrap the 0/1 matrix in a DataFrame so each column is addressable by
# product name.
transactions_encoded_df = pd.DataFrame(data=transactions_encoded, columns=products)

In [58]:
transactions_encoded_df

Unnamed: 0,bread,butter,cheese,eggs,milk
0,1,0,0,1,0
1,1,0,0,1,1
2,0,0,1,0,1
3,1,1,1,0,0
4,0,0,0,1,1
5,1,1,1,0,1


# P076 关联规则挖掘-计算商品的支持度

In [61]:
# 某个商品A的支持度
# 包含这个商品的交易数目 / 所有的交易数目

In [67]:
# Support of a single item: number of transactions containing it divided
# by the total number of transactions (computed per column).
support = transactions_encoded_df.sum().div(len(transactions_encoded_df))

In [69]:
support

bread     0.666667
butter    0.333333
cheese    0.500000
eggs      0.500000
milk      0.666667
dtype: float64

# P077 关联规则挖掘-计算多个商品的支持度

In [72]:
# 商品（A, B）的支持度
# 同时包含商品（A, B）的交易数目 / 所有的交易数目

In [74]:
transactions_encoded_df

Unnamed: 0,bread,butter,cheese,eggs,milk
0,1,0,0,1,0
1,1,0,0,1,1
2,0,0,1,0,1
3,1,1,1,0,0
4,0,0,0,1,1
5,1,1,1,0,1


In [78]:
# Support of the itemset {butter, bread}: transactions containing both
# items divided by the total number of transactions.
sup_butter_bread = (
    int(transactions_encoded_df[["butter", "bread"]].eq(1).all(axis=1).sum())
    / len(transactions_encoded_df)
)

In [80]:
sup_butter_bread

0.3333333333333333

In [82]:
# Support of the itemset {butter, milk}: transactions containing both
# items divided by the total number of transactions.
sup_butter_milk = (
    int(transactions_encoded_df[["butter", "milk"]].eq(1).all(axis=1).sum())
    / len(transactions_encoded_df)
)

In [84]:
sup_butter_milk

0.16666666666666666

# P078 关联规则挖掘-计算关联规则以及置信度

In [87]:
# 关联规则 A->B 的置信度，等于
# 同时包含（A,B）的交易数目 / 包含A的交易数目

In [89]:
transactions_encoded_df

Unnamed: 0,bread,butter,cheese,eggs,milk
0,1,0,0,1,0
1,1,0,0,1,1
2,0,0,1,0,1
3,1,1,1,0,0
4,0,0,0,1,1
5,1,1,1,0,1


In [91]:
# Confidence of the rule cheese -> bread: transactions containing both
# cheese and bread divided by the transactions containing cheese.
conf_cheese_bread = (
    int(transactions_encoded_df[["cheese", "bread"]].eq(1).all(axis=1).sum())
    / int(transactions_encoded_df["cheese"].eq(1).sum())
)

In [93]:
conf_cheese_bread

0.6666666666666666

In [None]:
# Confidence of the rule cheese -> bread: transactions containing both
# cheese and bread divided by the transactions containing cheese.
# NOTE(review): the name `conf_chees_bread` looks like a misspelling of
# `conf_cheese_bread`, and this cell computes the same expression as the
# earlier confidence cell. Confirm whether a different rule (e.g.
# bread -> cheese) was intended before renaming or removing it.
conf_chees_bread = (
    int(transactions_encoded_df[["cheese", "bread"]].eq(1).all(axis=1).sum())
    / int(transactions_encoded_df["cheese"].eq(1).sum())
)