In [1]:
import pandas as pd

In [2]:
# Toy transaction table: one row per user, with the purchased items kept as a
# single comma-separated string.
data = pd.DataFrame({
    '用户': [1, 2, 3, 4, 5],
    '购买商品列表': ['A, B, C', 'A, B', 'B, C', 'A, B, C, D', 'B, C, D'],
})

In [3]:
data

Unnamed: 0,用户,购买商品列表
0,1,"A, B, C"
1,2,"A, B"
2,3,"B, C"
3,4,"A, B, C, D"
4,5,"B, C, D"


### 一些概念

##### 事务库

如上面表格数据就是一个事务库


##### 事务

每一条记录就是一个事务

##### 项和项集

每条记录中，每件商品为一个“项”，如“A”；项集为若干个项组成的集合，如“A,B,C”

##### 关联规则

形如 A -> B 的表达式，A为前件，B为后件，表示一个用户如果购买了商品A，往往也会购买商品B；前件和后件也可以是包含多个项的项集，如 {A,B} -> {C}

##### 支持度

项集的支持度定义为包含该项集的事务在所有事务中所占的比率，如上{A,B}项集在事务1,2,4出现过，则它的支持度为3/5 = 60%

##### 频繁项集

支持度不低于设定阈值（最小支持度）的项集

##### 置信度

关联规则X -> Y的置信度为在购买项集X的基础上购买项集Y的概率 $P(Y|X) = \frac{P(XY)}{P(X)}$，可理解为项集Y在包含项集X的事务中出现的频繁程度。

##### 强关联规则（最终目标）

在实际应用中，通常是先寻找满足最小支持度的频繁项集，然后在频繁项集中寻找满足最小置信度的关联规则，这样的关联规则称为强关联规则。

### Apriori核心思想  ([əpriˈɔri])

用于快速挖掘频繁项集。其依据是：如果一个长度为n-1的项集不是频繁项集，那么包含它的长度为n的项集也不可能是频繁项集，无需再考虑。所以，先计算长度为1的项集，挖掘出其中的频繁项集；再将长度为1的频繁项集两两组合，挖掘其中长度为2的频繁项集；依此类推，直到找不到更长的频繁项集为止。

In [4]:
!pip install mlxtend



In [5]:
# Load the patient-symptom spreadsheet (path is relative to the working
# directory). Per the displayed output, each row holds a patient id (病人编号)
# and a comma-separated symptom string (病人症状).
df = pd.read_excel('./data/病症.xlsx')

In [6]:
df

Unnamed: 0,病人编号,病人症状
0,1,"消化不良,便秘"
1,2,"心悸,失眠"
2,3,"腰疼,脱发,眼干"
3,4,"腹胀,便秘,哮喘,胸闷气短,消化不良"
4,5,"神经衰弱,失眠,月经不调"
...,...,...
995,996,"心悸,眼干,月经不调"
996,997,"腰疼,脱发,眼干,易怒,月经不调"
997,998,"心悸,眼干,月经不调,鼻炎"
998,999,"脱发,眼干,月经不调"


In [7]:
# Split each patient's comma-separated symptom string into a list, giving one
# transaction (list of items) per row of the table.
symptoms = [record.split(',') for record in df['病人症状'].tolist()]

In [12]:
symptoms

[['消化不良', '便秘'],
 ['心悸', '失眠'],
 ['腰疼', '脱发', '眼干'],
 ['腹胀', '便秘', '哮喘', '胸闷气短', '消化不良'],
 ['神经衰弱', '失眠', '月经不调'],
 ['神经衰弱', '消化不良', '月经不调'],
 ['失眠', '眼干', '月经不调'],
 ['腹胀', '便秘', '哮喘', '胸闷气短', '消化不良'],
 ['腰疼', '脱发', '眼干', '心悸'],
 ['神经衰弱', '消化不良', '月经不调'],
 ['腰疼', '眼干', '月经不调'],
 ['心悸', '腹胀', '便秘', '消化不良'],
 ['心悸', '月经不调', '消化不良'],
 ['心悸', '失眠', '月经不调'],
 ['心悸', '神经衰弱', '消化不良', '便秘'],
 ['失眠', '月经不调', '胸闷气短'],
 ['心悸', '失眠', '脱发', '眼干', '月经不调'],
 ['哮喘', '胸闷气短'],
 ['心悸', '月经不调', '消化不良'],
 ['消化不良', '月经不调'],
 ['腹胀', '便秘', '消化不良'],
 ['失眠', '月经不调'],
 ['腰疼', '脱发', '眼干', '易怒'],
 ['失眠', '月经不调'],
 ['哮喘', '腰疼'],
 ['心悸', '腹胀'],
 ['失眠', '眼干', '月经不调'],
 ['失眠', '眼干', '月经不调'],
 ['消化不良', '便秘'],
 ['失眠', '月经不调'],
 ['消化不良', '便秘'],
 ['心悸', '神经衰弱', '消化不良', '便秘'],
 ['哮喘', '鼻炎', '脱发'],
 ['心悸', '腹胀'],
 ['心悸', '失眠', '月经不调'],
 ['腰疼', '眼干', '便秘'],
 ['心悸', '失眠'],
 ['心悸', '神经衰弱', '易怒', '消化不良'],
 ['神经衰弱', '消化不良'],
 ['心悸', '失眠', '月经不调'],
 ['哮喘', '胸闷气短'],
 ['心悸', '失眠', '眼干', '月经不调', '消化不良'],
 ['哮喘', '胸闷气短'],
 ['消化不良', '

In [8]:
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the transactions: `fit` learns the distinct items and
# `transform` emits a boolean matrix with one row per transaction and one
# column per item.
# NOTE: this rebinds `data`, which earlier in the notebook held the toy
# purchase DataFrame.
TE = TransactionEncoder()
TE.fit(symptoms)
data = TE.transform(symptoms)

In [9]:
print(data)

[[ True False False ... False False False]
 [False False  True ... False False False]
 [False False False ...  True False False]
 ...
 [False False False ... False False  True]
 [False False False ... False False False]
 [False  True False ... False False False]]


In [10]:
# Wrap the encoded matrix in a DataFrame, labelling each column with the item
# names exposed by the encoder (TE.columns_) so itemsets can be reported by name.
df_sym = pd.DataFrame(data, columns=TE.columns_)

In [11]:
df_sym

Unnamed: 0,便秘,哮喘,失眠,心悸,易怒,月经不调,消化不良,眼干,神经衰弱,耳鸣耳聋,胸闷气短,脱发,腰疼,腹胀,鼻炎
0,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False
1,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,False,True,True,False,False
3,True,True,False,False,False,False,True,False,False,False,True,False,False,True,False
4,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False
996,False,False,False,False,True,True,False,True,False,False,False,True,True,False,False
997,False,False,False,True,False,True,False,True,False,False,False,False,False,False,True
998,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False


In [13]:
from mlxtend.frequent_patterns import apriori

# Mine frequent itemsets from the one-hot frame. min_support=0.1 keeps itemsets
# that occur in at least 10% of the transactions; use_colnames=True reports
# itemsets by column (symptom) name instead of column index.
items = apriori(df_sym, min_support=0.1, use_colnames=True)

In [19]:
items.columns

Index(['support', 'itemsets'], dtype='object')

In [21]:
# Rank the frequent itemsets from most to least common. sort_values returns a
# new frame, so `items` itself keeps its original order.
items_sorted = items.sort_values(by='support', ascending=False)
items_sorted

Unnamed: 0,support,itemsets
4,0.484,(月经不调)
3,0.452,(心悸)
2,0.393,(失眠)
5,0.344,(消化不良)
13,0.318,"(失眠, 月经不调)"
6,0.311,(眼干)
15,0.249,"(心悸, 月经不调)"
12,0.218,"(失眠, 心悸)"
7,0.188,(神经衰弱)
0,0.184,(便秘)


In [23]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the frequent itemsets and keep only those whose
# confidence is at least 0.7 (confidence is the library's default metric; it is
# spelled out here so the meaning of the threshold is visible).
rules = association_rules(items, metric='confidence', min_threshold=0.7)

In [24]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(便秘),(消化不良),0.184,0.344,0.16,0.869565,2.527806,0.096704,5.029333
1,(失眠),(月经不调),0.393,0.484,0.318,0.80916,1.671819,0.127788,2.70384
2,(神经衰弱),(消化不良),0.188,0.344,0.164,0.87234,2.535873,0.099328,5.138667
3,(脱发),(眼干),0.151,0.311,0.143,0.94702,3.04508,0.096039,13.004875
4,(腰疼),(眼干),0.16,0.311,0.136,0.85,2.733119,0.08624,4.593333
5,"(失眠, 心悸)",(月经不调),0.218,0.484,0.167,0.766055,1.582758,0.061488,2.205647
6,"(神经衰弱, 心悸)",(消化不良),0.108,0.344,0.1,0.925926,2.691645,0.062848,8.856


### 说明

* antecedents: 前件

* consequents: 后件

* lift：该关联规则的提升度

$lift(\{A\} \rightarrow \{B\}) = \frac{support(\{A,B\})}{support(A)\times support(B)}$

lift = 1 表示A和B相互独立；lift > 1 表示正相关，值越大表明A和B的关联度越强；lift < 1 表示负相关


* leverage： 关联规则的杠杆率

$leverage(\{A\} \rightarrow \{B\}) = support(\{A,B\}) - support(A) \times support(B)$

值越大表明A和B的关联度越强

* conviction：关联规则的确信度

$conv(\{A\} \rightarrow \{B\}) = \frac{1-support(B)}{1-conf(\{A\} \rightarrow \{B\})}$

值越大表明A和B的关联度越强

In [None]:
for i, j in rules.iterrows():
    X = j['antecedents']
    Y = j['consequents']
    x = ', '.join([item for item])