In [24]:
import time
import pandas as pd
import numpy as np
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable DATA_DIR so the notebook runs on other machines.
data = (
    pd.read_csv(r"C:\Users\86131\Desktop\毕设\data\movies.csv")
    # One-hot encode the `genres` column with the vectorised `.str` accessor
    # (no explicit loop needed). The "(no genres listed)" column exists because
    # some rows carry that literal value in `genres`.
    .pipe(lambda movies: movies.join(movies["genres"].str.get_dummies()))
    .drop(columns="genres")
    # Keep only the indicator columns as data; identify rows by id and title.
    .set_index(["movieId", "title"])
)
data.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [70]:
def _support(x,row):
    # axis=0，按行走，算的是每一列的和,=1按列走，算的是每一行的和
    ans = np.sum(x,axis=0) / row
    # reshape(-1),二维数组变一维
    return np.array(ans).reshape(-1)

def generate_new_combinations(old_combinations):
    """Generate candidate itemsets one item larger than the given itemsets.

    Parameters
    ----------
    old_combinations : numpy.ndarray
        2-D array with one itemset per row. Each row is assumed to be sorted
        ascending, because only its last entry is used as the row's largest
        item.

    Yields
    ------
    int
        The entries of every new combination, one value at a time: first the
        items of the old combination, then one extra item that is strictly
        greater than the old combination's last item. The output is a flat
        stream, so consumers must regroup it in chunks of
        ``old_combinations.shape[1] + 1``.
    """
    # Every distinct item that occurs in any of the previous itemsets (sorted).
    known_items = np.unique(old_combinations.ravel())
    for combo in old_combinations:
        prefix = tuple(combo)
        largest = prefix[-1]
        # Only extend with items above the current largest one, so each
        # candidate is produced once and stays in ascending order.
        for extra in known_items[known_items > largest]:
            yield from prefix
            yield extra

def Apriori(df, min_support=0.5, use_colnames=False):
    """Mine frequent itemsets from a one-hot encoded DataFrame (Apriori).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transaction, one column per item. Single-item supports are
        column sums divided by the row count, so values are expected to be
        0/1 (or boolean).
    min_support : float, default 0.5
        Minimum fraction of rows in which an itemset must occur to be kept.
        Must be greater than 0.
    use_colnames : bool, default False
        If True, itemsets hold the column labels of ``df``; otherwise they
        hold positional column indices.

    Returns
    -------
    pandas.DataFrame
        Columns ``support`` and ``itemsets`` (frozensets), ordered by itemset
        size, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``min_support`` is not greater than 0. With such a threshold every
        subset of columns would be "frequent" and the search would enumerate
        all of them.
    """
    if min_support <= 0:
        raise ValueError(
            "`min_support` must be a positive number, got %s." % min_support
        )

    value = df.values
    n_rows = value.shape[0]
    support = _support(value, n_rows)
    # Positional id for every column: 0 .. n_columns - 1.
    column_ids = np.arange(value.shape[1])

    # Frequent 1-itemsets: supports, and ids as a single-column 2-D array.
    is_frequent = support >= min_support
    support_dict = {1: support[is_frequent]}
    itemset_dict = {1: column_ids[is_frequent].reshape(-1, 1)}

    max_itemset = 1
    # Level-wise search: build size-(k+1) candidates from the frequent
    # k-itemsets, keep those that reach min_support, and stop when none do.
    while max_itemset:
        next_max_itemset = max_itemset + 1

        # generate_new_combinations yields a flat stream of candidate entries
        # (candidates only, not yet checked for support); regroup it into one
        # row per candidate.
        combin = np.fromiter(
            generate_new_combinations(itemset_dict[max_itemset]), dtype=int
        )
        combin = combin.reshape(-1, next_max_itemset)

        if combin.size == 0:
            break

        # value[:, combin] has shape (rows, candidates, k); a row supports a
        # candidate only if all k of its columns are truthy.
        _bools = np.all(value[:, combin], axis=2)

        support = _support(np.array(_bools), n_rows)
        # Prune the candidates that fall below the support threshold.
        _mask = (support >= min_support).reshape(-1)
        if not _mask.any():
            # Exit condition: nothing frequent at this size.
            break

        itemset_dict[next_max_itemset] = np.array(combin[_mask])
        support_dict[next_max_itemset] = np.array(support[_mask])
        max_itemset = next_max_itemset

    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]], dtype="object")

        res = pd.concat((support, itemsets), axis=1)
        all_res.append(res)

    res_df = pd.concat(all_res)
    res_df.columns = ["support", "itemsets"]
    if use_colnames:
        # Translate positional column ids back into the DataFrame's labels.
        mapping = dict(enumerate(df.columns))
        res_df["itemsets"] = res_df["itemsets"].apply(
            lambda x: frozenset([mapping[i] for i in x])
        )
    res_df = res_df.reset_index(drop=True)

    return res_df

In [69]:
# Find the genre combinations present in at least 5% of the rows of `data`,
# reported with genre names instead of column positions.
Apriori(
    data,
    min_support=0.05,
    use_colnames=True,
)

[[ 1  2]
 [ 1  3]
 [ 1  4]
 [ 1  5]
 [ 1  6]
 [ 1  8]
 [ 1  9]
 [ 1 11]
 [ 1 14]
 [ 1 15]
 [ 1 16]
 [ 1 17]
 [ 2  3]
 [ 2  4]
 [ 2  5]
 [ 2  6]
 [ 2  8]
 [ 2  9]
 [ 2 11]
 [ 2 14]
 [ 2 15]
 [ 2 16]
 [ 2 17]
 [ 3  4]
 [ 3  5]
 [ 3  6]
 [ 3  8]
 [ 3  9]
 [ 3 11]
 [ 3 14]
 [ 3 15]
 [ 3 16]
 [ 3 17]
 [ 4  5]
 [ 4  6]
 [ 4  8]
 [ 4  9]
 [ 4 11]
 [ 4 14]
 [ 4 15]
 [ 4 16]
 [ 4 17]
 [ 5  6]
 [ 5  8]
 [ 5  9]
 [ 5 11]
 [ 5 14]
 [ 5 15]
 [ 5 16]
 [ 5 17]
 [ 6  8]
 [ 6  9]
 [ 6 11]
 [ 6 14]
 [ 6 15]
 [ 6 16]
 [ 6 17]
 [ 8  9]
 [ 8 11]
 [ 8 14]
 [ 8 15]
 [ 8 16]
 [ 8 17]
 [ 9 11]
 [ 9 14]
 [ 9 15]
 [ 9 16]
 [ 9 17]
 [11 14]
 [11 15]
 [11 16]
 [11 17]
 [14 15]
 [14 16]
 [14 17]
 [15 16]
 [15 17]
 [16 17]]
[[ 1  2  5]
 [ 1  2  6]
 [ 1  2  8]
 [ 1  2 15]
 [ 1  2 17]
 [ 1  8 15]
 [ 1  8 17]
 [ 5  8 15]
 [ 5  8 17]
 [ 5 15 17]
 [ 6  8 15]
 [ 6  8 17]
 [ 8 15 17]]


Unnamed: 0,support,itemsets
0,0.187641,(Action)
1,0.129645,(Adventure)
2,0.062718,(Animation)
3,0.068158,(Children)
4,0.385547,(Comedy)
5,0.123075,(Crime)
6,0.447649,(Drama)
7,0.079963,(Fantasy)
8,0.10039,(Horror)
9,0.058817,(Mystery)
