In [2]:
import numpy as np
import pandas as pd

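# old_combinations is a 2-D array holding the itemsets produced by the previous step.
# Using the property of frequent itemsets, new candidate itemsets (one item larger) are generated from it.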
def generate_new_combinations(old_combinations):
    """Lazily yield candidate itemsets that are one item larger than the input.

    Parameters
    ----------
    old_combinations : numpy.ndarray
        2-D array with one itemset (a row of item indices) per row.

    Yields
    ------
    item index
        A flat stream of item indices: for every generated candidate, the
        items of the source row followed by the one appended item. The caller
        is expected to reshape the stream into rows.
    """
    # Flatten and de-duplicate to get every item that occurs in any itemset;
    # np.unique also returns them in ascending order.
    known_items = np.unique(old_combinations.flatten())
    for row in old_combinations:
        prefix = tuple(row)
        # Only extend with items strictly greater than the row's last item
        # (the comparison produces a boolean mask over known_items).
        # NOTE(review): this avoids duplicate candidates only if each row is
        # sorted ascending -- confirm against the caller.
        larger_items = known_items[known_items > row[-1]]
        for extra in larger_items:
            for member in prefix:
                yield member
            yield extra


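# Low-memory variant: handles one previous itemset at a time and yields only the candidates that reach min_support.
# The original note also claims it is faster and cache-friendly -- NOTE(review): unverified, time both modes before relying on it.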
def generate_new_combinations_low_memory(old_combinations, X, min_support):
    """Yield only the candidate itemsets that already satisfy ``min_support``.

    Candidates are built and counted one source itemset at a time, so the
    full candidate array is never materialised.

    Parameters
    ----------
    old_combinations : numpy.ndarray
        2-D array with one itemset (a row of column indices into ``X``) per row.
    X : numpy.ndarray
        2-D transaction array, rows are transactions and columns are items.
        Presumably 0/1 or boolean values -- not validated here.
    min_support : float
        Minimum fraction of rows that must contain a candidate.

    Yields
    ------
    number
        A flat stream; for every accepted candidate: its absolute support
        count, then the items of the source itemset, then the appended item.
    """
    items_types_in_previous_step = np.unique(old_combinations.flatten())
    rows_count = X.shape[0]
    for old_combination in old_combinations:
        max_combination = old_combination[-1]
        # Only items greater than the last item of the itemset may be appended.
        mask = items_types_in_previous_step > max_combination
        valid_items = items_types_in_previous_step[mask]
        old_tuple = tuple(old_combination)
        # Rows that contain every item of the current itemset.
        mask_rows = X[:, old_tuple].all(axis=1)
        # For each appendable item: number of those rows that also contain it.
        supports = X[mask_rows][:, valid_items].sum(axis=0)
        # Compare the relative support (count / rows), exactly as apriori()
        # does for the other itemsets. Comparing the raw count against
        # min_support * rows_count is not equivalent in floating point
        # (0.07 * 100 == 7.000000000000001), which silently dropped itemsets
        # lying exactly on the threshold in low-memory mode only.
        valid_indices = (supports / rows_count >= min_support).nonzero()[0]
        for index in valid_indices:
            yield supports[index]
            yield from old_tuple
            yield valid_items[index]


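# verbose 0: print nothing.
# verbose 1: print key progress information while the algorithm runs, e.g. the size of the candidate itemsets currently being processed.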
def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0, low_memory=False):
    """Find frequent itemsets in a one-hot encoded transaction table (Apriori).

    Frequent 1-itemsets are found first; frequent k-itemsets are then used to
    build candidate (k+1)-itemsets, which are filtered by support, until no
    candidate survives or ``max_len`` is reached.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transaction, one column per item. Cell values are summed
        to count occurrences, so they are presumably 0/1 or booleans -- this
        is not validated here.
    min_support : float, default 0.5
        Minimum fraction of rows that must contain an itemset. Must be > 0.
    use_colnames : bool, default False
        If true, the returned itemsets hold the column labels of ``df``
        instead of positional column indices.
    max_len : int or None, default None
        Largest itemset size to generate; ``None`` (or 0) means no limit.
    verbose : int, default 0
        Any truthy value prints one progress line per itemset size.
    low_memory : bool, default False
        If true, candidates are generated and filtered one source itemset at
        a time by ``generate_new_combinations_low_memory`` instead of testing
        all candidates in a single array operation.

    Returns
    -------
    pandas.DataFrame
        Columns ``support`` and ``itemsets`` (frozensets), ordered by itemset
        size, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``min_support`` is not positive.
    """

    def _support(_x, _n_rows):
        # Sum down each column and divide by the row count: the fraction of
        # rows in which each column is set.
        out = np.sum(_x, axis=0) / _n_rows
        # If `out` is not a plain ndarray this converts it to one; in every
        # case the result is flattened to 1-D.
        return np.array(out).reshape(-1)

    if min_support <= 0.0:
        raise ValueError("最小支持度应当为正数 %s." % min_support)

    # X is indexed as [row, column].
    X = df.values
    rows_count = float(X.shape[0])
    support = _support(X, X.shape[0])
    ary_col_idx = np.arange(X.shape[1])

    # Both dicts are keyed by itemset size k and are aligned entry by entry:
    #   itemset_dict[k] -> 2-D array, one row of column indices per frequent k-itemset
    #   support_dict[k] -> 1-D array holding the support of each of those itemsets
    frequent_single = support >= min_support
    support_dict = {1: support[frequent_single]}
    itemset_dict = {1: ary_col_idx[frequent_single].reshape(-1, 1)}
    max_itemset = 1

    while max_itemset and max_itemset < (max_len or float("inf")):
        next_max_itemset = max_itemset + 1

        if low_memory:
            # Low-memory mode: the generator has already applied min_support.
            combin = generate_new_combinations_low_memory(
                itemset_dict[max_itemset], X, min_support
            )
            # slightly faster than creating an array from a list of tuples
            combin = np.fromiter(combin, dtype=int)
            # Each row is [support count, item_1, ..., item_k].
            combin = combin.reshape(-1, next_max_itemset + 1)

            if combin.size == 0:
                break
            if verbose:
                # Report the number of candidate rows, not the element count.
                print(
                    "\rProcessing %d combinations | Sampling itemset size %d"
                    % (combin.shape[0], next_max_itemset),
                    end="",
                )

            itemset_dict[next_max_itemset] = combin[:, 1:]
            support_dict[next_max_itemset] = combin[:, 0].astype(float) / rows_count
            max_itemset = next_max_itemset

        else:
            # combin is a 2-D array of candidate itemsets (2-item candidates,
            # then 3-item candidates, ...), one candidate per row.
            combin = generate_new_combinations(itemset_dict[max_itemset])
            combin = np.fromiter(combin, dtype=int)
            combin = combin.reshape(-1, next_max_itemset)
            if combin.size == 0:
                break
            if verbose:
                # Report the number of candidate rows, not the element count.
                print(
                    "\rProcessing %d combinations | Sampling itemset size %d"
                    % (combin.shape[0], next_max_itemset),
                    end="",
                )

            # X[:, combin] has shape (rows, candidates, k); a candidate is
            # present in a row when all k of its items are set.
            _bools = np.all(X[:, combin], axis=2)

            support = _support(_bools, rows_count)
            _mask = (support >= min_support).reshape(-1)
            if not _mask.any():
                # Exit condition: no candidate of this size is frequent.
                break
            itemset_dict[next_max_itemset] = combin[_mask]
            support_dict[next_max_itemset] = support[_mask]
            max_itemset = next_max_itemset

    # Assemble one (support, itemsets) frame per itemset size, then stack them.
    all_res = []
    for k in sorted(itemset_dict):
        support_col = pd.Series(support_dict[k])
        itemsets_col = pd.Series(
            [frozenset(i) for i in itemset_dict[k]], dtype="object"
        )
        all_res.append(pd.concat((support_col, itemsets_col), axis=1))

    res_df = pd.concat(all_res)
    res_df.columns = ["support", "itemsets"]
    if use_colnames:
        # Translate positional column indices back to the labels of df.
        mapping = {idx: item for idx, item in enumerate(df.columns)}
        res_df["itemsets"] = res_df["itemsets"].apply(
            lambda x: frozenset([mapping[i] for i in x])
        )
    res_df = res_df.reset_index(drop=True)

    if verbose:
        print()  # adds newline if verbose counter was used

    return res_df

In [6]:
import time

# NOTE(review): absolute, machine-specific path -- the notebook cannot be re-run
# elsewhere until this points at a shared/relative data directory.
MOVIES_CSV = r'C:\Users\86131\Desktop\毕设\data\movies.csv'

movies_raw = pd.read_csv(MOVIES_CSV)
# One-hot encode the `genres` column (Series.str.get_dummies splits on "|" by
# default) and index by (movieId, title) so that only item columns remain.
# Built as a single expression so re-running the cell is idempotent.
movies = (
    movies_raw.drop('genres', axis=1)
    .join(movies_raw.genres.str.get_dummies())
    .set_index(['movieId', 'title'])
)

# perf_counter() is the high-resolution clock meant for measuring short
# durations; time.time() can be too coarse for a call that takes milliseconds.
t = time.perf_counter()
ret = apriori(movies, min_support=0.05, use_colnames=True, low_memory=True)
# NOTE(review): 'coast' is presumably a typo for 'cost'; the label is left
# unchanged so it still matches the recorded cell output.
print(f'coast:{time.perf_counter() - t:.4f}s')
ret

coast:0.0040s


Unnamed: 0,support,itemsets
0,0.187641,(Action)
1,0.129645,(Adventure)
2,0.062718,(Animation)
3,0.068158,(Children)
4,0.385547,(Comedy)
5,0.123075,(Crime)
6,0.447649,(Drama)
7,0.079963,(Fantasy)
8,0.10039,(Horror)
9,0.058817,(Mystery)


In [3]:
from itertools import combinations

import numpy as np
import pandas as pd

def association_rules(df, metric="confidence", min_threshold=0.8, support_only=False):
    """Derive association rules from a table of frequent itemsets.

    Every itemset in ``df`` with at least two items is split into each
    possible non-empty antecedent / consequent pair. A rule is kept when its
    score for ``metric`` is at least ``min_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``support`` and ``itemsets``; each entry of
        ``itemsets`` is an iterable of items.
    metric : str, default "confidence"
        Metric used for filtering. One of "antecedent support",
        "consequent support", "support", "confidence", "lift", "leverage",
        "conviction", "zhangs_metric". Overridden by ``support_only``.
    min_threshold : float, default 0.8
        Minimum score a rule needs in order to be returned.
    support_only : bool, default False
        If true, rules are filtered on their own support and the supports of
        antecedent and consequent are not looked up; every metric column
        except ``support`` is NaN in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``antecedents``, ``consequents`` and one column per metric.
        Empty (columns only) when no rule reaches the threshold.

    Raises
    ------
    ValueError
        If ``df`` has no rows, lacks a mandatory column, or ``metric`` is unknown.
    KeyError
        If the antecedent or consequent of a rule is not itself listed in
        ``df`` (only when ``support_only`` is false).
    """
    if not df.shape[0]:
        raise ValueError(
            "The input DataFrame `df` containing " "the frequent itemsets is empty."
        )

    # check for mandatory columns
    if not all(col in df.columns for col in ["support", "itemsets"]):
        # The message used to carry a run of indentation spaces from a
        # backslash continuation inside the string literal.
        raise ValueError(
            "Dataframe needs to contain the columns 'support' and 'itemsets'"
        )

    def conviction_helper(sAC, sA, sC):
        # conviction = (1 - sC) / (1 - confidence); infinite when confidence is 1.
        confidence = sAC / sA
        conviction = np.empty(confidence.shape, dtype=float)
        if not len(conviction.shape):
            # Scalar input: promote everything to 1-element arrays so the
            # masked assignment below works.
            conviction = conviction[np.newaxis]
            confidence = confidence[np.newaxis]
            sAC = sAC[np.newaxis]
            sA = sA[np.newaxis]
            sC = sC[np.newaxis]
        conviction[:] = np.inf
        below_one = confidence < 1.0
        conviction[below_one] = (1.0 - sC[below_one]) / (1.0 - confidence[below_one])

        return conviction

    def zhangs_metric_helper(sAC, sA, sC):
        denominator = np.maximum(sAC * (1 - sA), sA * (sC - sAC))
        numerator = metric_dict["leverage"](sAC, sA, sC)

        with np.errstate(divide="ignore", invalid="ignore"):
            # ignoring the divide by 0 warning since it is addressed in the below np.where
            zhangs_metric = np.where(denominator == 0, 0, numerator / denominator)

        return zhangs_metric

    # metrics for association rules; every entry takes (sAC, sA, sC) =
    # (support of the whole itemset, of the antecedent, of the consequent)
    metric_dict = {
        "antecedent support": lambda _, sA, __: sA,
        "consequent support": lambda _, __, sC: sC,
        "support": lambda sAC, _, __: sAC,
        "confidence": lambda sAC, sA, _: sAC / sA,
        "lift": lambda sAC, sA, sC: metric_dict["confidence"](sAC, sA, sC) / sC,
        "leverage": lambda sAC, sA, sC: metric_dict["support"](sAC, sA, sC) - sA * sC,
        "conviction": lambda sAC, sA, sC: conviction_helper(sAC, sA, sC),
        "zhangs_metric": lambda sAC, sA, sC: zhangs_metric_helper(sAC, sA, sC),
    }

    columns_ordered = [
        "antecedent support",
        "consequent support",
        "support",
        "confidence",
        "lift",
        "leverage",
        "conviction",
        "zhangs_metric",
    ]

    # check for metric compliance
    if support_only:
        metric = "support"
    elif metric not in metric_dict:
        # List every accepted metric; the old text named only two of them.
        raise ValueError(
            "Metric must be one of {}, got '{}'".format(sorted(metric_dict), metric)
        )

    # get dict of {frequent itemset} -> support
    frequent_items_dict = dict(
        zip((frozenset(items) for items in df["itemsets"].values), df["support"].values)
    )

    # prepare buckets to collect frequent rules
    rule_antecedents = []
    rule_consequents = []
    rule_supports = []

    # iterate over all frequent itemsets
    for itemset, sAC in frequent_items_dict.items():
        # every antecedent size from len-1 down to 1 (consequent is never empty)
        for idx in range(len(itemset) - 1, 0, -1):
            for c in combinations(itemset, r=idx):
                antecedent = frozenset(c)
                consequent = itemset.difference(antecedent)

                if support_only:
                    # support doesn't need these; NaN placeholders keep the
                    # collected values numeric.
                    sA = np.nan
                    sC = np.nan
                else:
                    try:
                        sA = frequent_items_dict[antecedent]
                        sC = frequent_items_dict[consequent]
                    except KeyError as e:
                        s = (
                            str(e) + ". You are likely getting this error"
                            " because the DataFrame is missing"
                            " antecedent and/or consequent"
                            " information."
                            " You can try using the"
                            " `support_only=True` option"
                        )
                        # Chain the original error so the missing key stays
                        # visible in the traceback.
                        raise KeyError(s) from e

                # check for the threshold
                score = metric_dict[metric](sAC, sA, sC)
                if score >= min_threshold:
                    rule_antecedents.append(antecedent)
                    rule_consequents.append(consequent)
                    rule_supports.append([sAC, sA, sC])

    # check if frequent rule was generated
    if not rule_supports:
        return pd.DataFrame(columns=["antecedents", "consequents"] + columns_ordered)

    # generate metrics; after the transpose the rows are sAC, sA, sC
    rule_supports = np.array(rule_supports).T.astype(float)
    df_res = pd.DataFrame(
        data=list(zip(rule_antecedents, rule_consequents)),
        columns=["antecedents", "consequents"],
    )

    if support_only:
        sAC = rule_supports[0]
        for m in columns_ordered:
            df_res[m] = np.nan
        df_res["support"] = sAC
    else:
        sAC = rule_supports[0]
        sA = rule_supports[1]
        sC = rule_supports[2]
        for m in columns_ordered:
            df_res[m] = metric_dict[m](sAC, sA, sC)

    return df_res

In [4]:
# Derive rules from the frequent itemsets, keeping those whose lift is at least 1.25.
rules = association_rules(df=ret, min_threshold=1.25, metric="lift")
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Action),(Adventure),0.187641,0.129645,0.062615,0.333698,2.57394,0.038289,1.306247,0.752735
1,(Adventure),(Action),0.129645,0.187641,0.062615,0.482977,2.57394,0.038289,1.571224,0.702576
2,(Thriller),(Action),0.194416,0.187641,0.067235,0.345829,1.843034,0.030754,1.241814,0.567807
3,(Action),(Thriller),0.187641,0.194416,0.067235,0.358315,1.843034,0.030754,1.25542,0.563072
4,(Comedy),(Romance),0.385547,0.163827,0.090741,0.235357,1.43662,0.027578,1.093547,0.494622
5,(Romance),(Comedy),0.163827,0.385547,0.090741,0.553885,1.43662,0.027578,1.377341,0.363467
6,(Crime),(Thriller),0.123075,0.194416,0.058407,0.474562,2.440963,0.034479,1.533167,0.673177
7,(Thriller),(Crime),0.194416,0.123075,0.058407,0.300422,2.440963,0.034479,1.253506,0.732792
8,(Romance),(Drama),0.163827,0.447649,0.095874,0.585213,1.307302,0.022537,1.331649,0.281121
9,(Drama),(Romance),0.447649,0.163827,0.095874,0.214171,1.307302,0.022537,1.064065,0.425574


In [5]:
# Sort by lift in descending order to see which rules have the largest lift.
rules_sort = rules.sort_values("lift", ascending=False)
rules_sort

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(Adventure),(Action),0.129645,0.187641,0.062615,0.482977,2.57394,0.038289,1.571224,0.702576
0,(Action),(Adventure),0.187641,0.129645,0.062615,0.333698,2.57394,0.038289,1.306247,0.752735
6,(Crime),(Thriller),0.123075,0.194416,0.058407,0.474562,2.440963,0.034479,1.533167,0.673177
7,(Thriller),(Crime),0.194416,0.123075,0.058407,0.300422,2.440963,0.034479,1.253506,0.732792
2,(Thriller),(Action),0.194416,0.187641,0.067235,0.345829,1.843034,0.030754,1.241814,0.567807
3,(Action),(Thriller),0.187641,0.194416,0.067235,0.358315,1.843034,0.030754,1.25542,0.563072
4,(Comedy),(Romance),0.385547,0.163827,0.090741,0.235357,1.43662,0.027578,1.093547,0.494622
5,(Romance),(Comedy),0.163827,0.385547,0.090741,0.553885,1.43662,0.027578,1.377341,0.363467
8,(Romance),(Drama),0.163827,0.447649,0.095874,0.585213,1.307302,0.022537,1.331649,0.281121
9,(Drama),(Romance),0.447649,0.163827,0.095874,0.214171,1.307302,0.022537,1.064065,0.425574
