# Association Rule

- アソシエーションルール抽出の各パラメータ（ルール採用の閾値など）について，適切な値を決めたい

In [10]:
import sys
project_dir_path = "/Users/keisukeonoue/ws/lukasiewicz_2"
sys.path.append(project_dir_path)

import os

import pandas as pd
from sklearn.model_selection import KFold

from src.association_rule import get_rules
from src.association_rule import ArrangeRules


# Name of the label column; every other column is treated as a feature.
conclusion_name = 'Outcome'

# Location of the discretized Pima Indian Diabetes table inside the project.
data_dir_path = os.path.join(project_dir_path, "data/pima_indian_diabetes")
file_path_2 = os.path.join(data_dir_path, "diabetes_discretized.csv")

# The first CSV column is read as the index and then discarded in favour of a
# fresh 0..n-1 index.
df_origin_2 = (
    pd.read_csv(file_path_2, index_col=0)
    .reset_index(drop=True)
)

# Feature columns = all columns except the label column.
feature_names = list(df_origin_2.columns.drop([conclusion_name]))

# Sweep the rule-selection threshold and, for each value, mine rules on every
# K-fold training split and record how many rules of each kind were obtained.
#
# Results, one entry per threshold (in sweep order):
#   rules_df_kf_list  -- list of the per-fold rule DataFrames
#   KB_origin_kf_list -- list of the per-fold knowledge bases from construct_KB()
#   rule_info_list    -- dict mapping a count name to its per-fold values
rules_df_kf_list = []
KB_origin_kf_list = []
rule_info_list = []

# Single source of truth for the fold count (splitter and progress message).
n_splits = 5

# KFold without shuffling always yields the same splits, so one splitter can
# be shared by every threshold instead of being rebuilt in each iteration.
kf = KFold(n_splits=n_splits)

# NOTE(review): the notebook text treats min_threshold as a confidence
# threshold -- confirm against get_rules.
for min_threshold in [i / 10 for i in range(10)]:

    rules_df_kf = []
    KB_origin_kf = []

    # Per-fold counts for this threshold; each list ends up with n_splits values.
    result_dict = {
        'all': [],
        '→ target': [],
        '→ (target, something)': [],
        'all (decomposed)': [],
        '→ target (decomposed)': [],
    }

    for i, (train_idx, test_idx) in enumerate(kf.split(df_origin_2)):
        print(f"fold: {i+1} of {n_splits}")

        # Rules are mined from the training rows only; test_idx is not used here.
        # Slice first, then copy, so only the training rows are duplicated.
        rules_df = get_rules(
            df_origin_2.iloc[train_idx].copy(),
            min_threshold=min_threshold,
            conclusion_name=conclusion_name,
        )

        rule_processor = ArrangeRules(
            rules_df,
            feature_names=feature_names,
            conclusion_name=conclusion_name,
        )
        KB_origin = rule_processor.construct_KB()

        # Classify each rule by whether its consequent is exactly the label
        # or the label together with other items.
        n_target_only = 0
        n_target_mixed = 0
        for consequent in rules_df['consequents']:
            items = list(consequent)
            if conclusion_name not in items:
                continue
            if len(items) == 1:
                n_target_only += 1
            else:
                n_target_mixed += 1

        result_dict['all'].append(rules_df.shape[0])
        result_dict['→ target'].append(n_target_only)
        result_dict['→ (target, something)'].append(n_target_mixed)
        # The size of the knowledge base is reported as the number of rules
        # after splitting multi-item consequents into single-item ones.
        result_dict['all (decomposed)'].append(len(KB_origin))
        # Every rule whose consequent contains the label contributes exactly
        # one label-concluding rule after decomposition.
        result_dict['→ target (decomposed)'].append(
            n_target_only + n_target_mixed
        )

        rules_df_kf.append(rules_df)
        KB_origin_kf.append(KB_origin)

    rules_df_kf_list.append(rules_df_kf)
    KB_origin_kf_list.append(KB_origin_kf)
    rule_info_list.append(result_dict)


fold: 1 of 5
fold: 2 of 5
fold: 3 of 5
fold: 4 of 5




fold: 5 of 5
fold: 1 of 5
fold: 2 of 5
fold: 3 of 5




fold: 4 of 5
fold: 5 of 5
fold: 1 of 5




fold: 2 of 5
fold: 3 of 5
fold: 4 of 5
fold: 5 of 5
fold: 1 of 5




fold: 2 of 5
fold: 3 of 5
fold: 4 of 5
fold: 5 of 5
fold: 1 of 5
fold: 2 of 5




fold: 3 of 5
fold: 4 of 5
fold: 5 of 5
fold: 1 of 5
fold: 2 of 5
fold: 3 of 5




fold: 4 of 5
fold: 5 of 5
fold: 1 of 5
fold: 2 of 5
fold: 3 of 5
fold: 4 of 5
fold: 5 of 5
fold: 1 of 5
fold: 2 of 5
fold: 3 of 5
fold: 4 of 5
fold: 5 of 5
fold: 1 of 5
fold: 2 of 5
fold: 3 of 5
fold: 4 of 5




fold: 5 of 5
fold: 1 of 5
fold: 2 of 5
fold: 3 of 5
fold: 4 of 5
fold: 5 of 5




all                  : 全ルール数

→ target             : consequents が正解ラベルのみのルールの数

→ (target, something): consequents が複数項目かつ正解ラベルを含むようなルールの数

all (decomposed)     : consequents を１つずつに分解した際のルール数

→ target (decomposed): 分解後の consequents が正解ラベルのルールの数

※ 注意： 各項目ごとに値が5個ずつ存在し，これは実際の実験の際に KFold でデータを5分割するためである．

---

- confidence の閾値を 0.0 から 0.9 まで 0.1 刻みで動かし，上記項目の値の変化を確認したところ，confidence が 0.8 以上では '→ target (decomposed)' の値が 0 になった．
- '→ target (decomposed)' の値が 0 だと，学習させた予測モデルが予測の際に与えたルールを守るかを調べることが難しくなるので，今回の実験において，ルール採用の閾値としての confidence の値は 0.0 ~ 0.7 の間とする．


In [12]:
# Print the per-fold rule counts next to the threshold that produced them.
# The thresholds 0.0, 0.1, ..., 0.9 are regenerated here in the same order in
# which the sweep filled rule_info_list.
confidence_grid = (step / 10 for step in range(10))
for confidence, fold_counts in zip(confidence_grid, rule_info_list):
    print(f'confidence {confidence}: {fold_counts}')

confidence 0.0: {'all': [1880, 1816, 1656, 1688, 1770], '→ target': [24, 22, 21, 25, 24], '→ (target, something)': [24, 22, 18, 26, 24], 'all (decomposed)': [3504, 3328, 2974, 3040, 3198], '→ target (decomposed)': [48, 44, 39, 51, 48]}
confidence 0.1: {'all': [1880, 1816, 1656, 1688, 1770], '→ target': [24, 22, 21, 25, 24], '→ (target, something)': [24, 22, 18, 26, 24], 'all (decomposed)': [3504, 3328, 2974, 3040, 3198], '→ target (decomposed)': [48, 44, 39, 51, 48]}
confidence 0.2: {'all': [1704, 1639, 1490, 1536, 1605], '→ target': [24, 21, 21, 25, 24], '→ (target, something)': [8, 7, 4, 14, 9], 'all (decomposed)': [2996, 2829, 2521, 2615, 2738], '→ target (decomposed)': [32, 28, 25, 39, 33]}
confidence 0.3: {'all': [1331, 1289, 1185, 1211, 1265], '→ target': [15, 13, 12, 20, 15], '→ (target, something)': [0, 0, 0, 1, 0], 'all (decomposed)': [2098, 2011, 1819, 1849, 1945], '→ target (decomposed)': [15, 13, 12, 21, 15]}
confidence 0.4: {'all': [973, 911, 840, 862, 912], '→ target': [3