In [1]:
import json
import os

import cvxpy as cp
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from src.misc import is_symbol
from src.setup_problem_primal import SetupPrimal
from src.evaluation import evaluate_model
from src.rulefit import RuleFitClassifier, ArrangeRules

# データの読み込み

In [2]:
# Root of the project checkout.
# The original machine-specific location is kept as the default so existing
# runs are unaffected; set the LUKASIEWICZ_PROJECT_DIR environment variable to
# run the notebook from a different checkout without editing this cell.
project_dir_path = os.environ.get(
    "LUKASIEWICZ_PROJECT_DIR",
    "/Users/keisukeonoue/ws/lukasiewicz_2",
)
data_dir_path = os.path.join(project_dir_path, "data/pima_indian_diabetes")

# Input files
# file_path_1: cleaned / normalized table, file_path_2: discretized table.
file_path_1 = os.path.join(data_dir_path, "diabetes_cleaned_normalized.csv")
file_path_2 = os.path.join(data_dir_path, "diabetes_discretized.csv")

In [3]:
# Regular (continuous) data.
# The first CSV column is read as the index and then discarded, so every row
# is labelled 0..n-1 and labels coincide with positions.
df_origin_1 = (
    pd.read_csv(file_path_1, index_col=0)
    .reset_index(drop=True)
)
y_origin_1 = df_origin_1["Outcome"]
X_origin_1 = df_origin_1.drop(columns=["Outcome"])

# Discretized data (used to extract the logical constraints).
df_origin_2 = (
    pd.read_csv(file_path_2, index_col=0)
    .reset_index(drop=True)
)
y_origin_2 = df_origin_2["Outcome"]
X_origin_2 = df_origin_2.drop(columns=["Outcome"])

# Preview the first rows of both tables.
display(df_origin_1.head(), df_origin_2.head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.461538,0.675325,0.5,0.583333,0.484277,0.493261,0.644444,1
1,0.076923,0.266234,0.411765,0.483333,0.264151,0.245283,0.222222,0
2,0.615385,0.902597,0.382353,0.0,0.160377,0.533693,0.244444,1
3,0.076923,0.292208,0.411765,0.383333,0.311321,0.079964,0.0,0
4,0.384615,0.467532,0.529412,0.0,0.232704,0.110512,0.2,0


Unnamed: 0,Outcome,Pregnancies_Low,Pregnancies_Medium,Pregnancies_High,Glucose_Low,Glucose_Medium,Glucose_High,BloodPressure_Low,BloodPressure_Medium,BloodPressure_High,...,SkinThickness_High,BMI_Low,BMI_Medium,BMI_High,DiabetesPedigreeFunction_Low,DiabetesPedigreeFunction_Medium,DiabetesPedigreeFunction_High,Age_Low,Age_Medium,Age_High
0,1,0,1,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,0,1,0
1,0,1,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
2,1,0,1,0,0,0,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0
3,0,1,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
4,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0


# 実験設定

In [4]:
# Experiment settings.
# This dict is also what gets serialized at the end of the notebook, so it
# collects both the configuration and (under 'result') the per-fold metrics.
settings = {
    'path': os.path.join(project_dir_path, "experiment_result/tmp/version_test"),
    'source_paths': [file_path_1, file_path_2],
    'experiment_name': 'pima_indian_diabetes_test',
    'seed': 42,
    'n_unsupervised': 15,
    'c1': 10,
    'c2': 10,
    'result': {}
}

# Create the output directory together with its "rules" sub-directory.
# os.makedirs creates missing intermediate directories, and exist_ok=True
# makes the call idempotent. Unlike an `if not exists(path)` guard, this also
# (re)creates "rules" when the output directory already exists without it,
# which the rule-file export further down relies on.
os.makedirs(os.path.join(settings['path'], "rules"), exist_ok=True)

In [5]:
# Single hold-out split (80 % train / 20 % test) over the row labels of the
# continuous table; the seed comes from the experiment settings.
train_idx, test_idx = train_test_split(
    df_origin_1.index,
    test_size=0.2,
    random_state=settings['seed'],
)
print("fold: 1 of 1")

# Container for this fold's evaluation results.
settings['result']['fold_0'] = {}

fold: 1 of 1


# 論理制約の抽出

In [6]:
# Train / test arrays taken from the discretized table.
# train_idx / test_idx were drawn from df_origin_1's 0..n-1 index, so they are
# used positionally here.
# NOTE(review): assumes both CSVs contain the same rows in the same order — confirm.
# .iloc with an index array already returns new data, so no prior .copy() is needed.
X_train = X_origin_2.iloc[train_idx].to_numpy()
y_train = y_origin_2.iloc[train_idx].to_numpy()
X_test = X_origin_2.iloc[test_idx].to_numpy()
y_test = y_origin_2.iloc[test_idx].to_numpy()
feature_names = list(X_origin_2.columns)

# Rule acquisition.
# The seed is read from the experiment settings (instead of a literal 42) so
# that the seed recorded in the saved settings is the one actually used.
rulefit = RuleFitClassifier(
    rfmode='classify',
    tree_generator=RandomForestClassifier(random_state=settings['seed']),
    random_state=settings['seed'],
    exp_rand_tree_size=False
)

rulefit.fit(X_train, y_train, feature_names=feature_names)

# Rule formatting: keep only rules with a non-zero coefficient, turn them into
# a knowledge base and store a text copy under the experiment directory.
rules_df = rulefit.get_rules(exclude_zero_coef=True)
rule_processor = ArrangeRules(
    rules_df,
    feature_names=feature_names,
    conclusion_name="Outcome"
)
KB_origin = rule_processor.construct_KB()
rule_processor.save_KB_as_txt(os.path.join(settings['path'], 'rules/rules_0.txt'))

In [7]:
# Build the data used to evaluate the logical constraints.
#
# Step 1: encode every rule that mentions "Outcome" as {predicate: value}:
#   - predicate not preceded by '¬'      -> 1
#   - negated predicate other than Outcome -> 0
#   - negated Outcome                      -> -1
rules_tmp = []
for rule in KB_origin:
    if "Outcome" not in rule:
        continue

    encoded = {}
    for pos, token in enumerate(rule):
        if is_symbol(token):
            continue
        negated = pos > 0 and rule[pos - 1] == '¬'
        if not negated:
            encoded[token] = 1
        elif token != "Outcome":
            encoded[token] = 0
        else:
            encoded[token] = -1

    rules_tmp.append(encoded)

# Step 2: for each encoded rule, find the test rows whose discretized features
# match the rule's premise, and remember the Outcome value the rule expects.
# The test slice is loop-invariant, so it is taken once up front.
df_test_discrete = df_origin_2.loc[test_idx]

rule_violation_check = {}

for h, rule in enumerate(rules_tmp):
    outcome = rule['Outcome']

    # A boolean mask is used instead of a string-built DataFrame.query():
    # it also works for column names that are not valid Python identifiers,
    # and a rule without any premise selects every test row instead of
    # raising on an empty query expression.
    premise_mask = pd.Series(True, index=df_test_discrete.index)
    for column, value in rule.items():
        if column != "Outcome":
            premise_mask &= df_test_discrete[column] == value

    satisfying_idxs = df_test_discrete.index[premise_mask.to_numpy()]

    rule_violation_check[h] = (satisfying_idxs, outcome)

# Test data: continuous features with the label renamed to 'target' and
# re-coded from {0, 1} to {-1, 1}.
df_tmp = df_origin_1.iloc[test_idx, :].rename(columns={'Outcome': 'target'})
df_tmp['target'] = df_tmp['target'].replace(0, -1)

input_for_test = {
    'data': df_tmp,
    'rule': rule_violation_check
}

# 入力データの整形

In [8]:
# Training data.
# L maps every column of the discretized table (including "Outcome") to a
# frame holding the continuous training features plus a 'target' column: that
# discretized column re-coded from {0, 1} to {-1, 1} and aligned on the index.
X_train_continuous = X_origin_1.iloc[train_idx, :]
L = {
    col_name: X_train_continuous.assign(
        target=df_origin_2[col_name].replace(0, -1)
    )
    for col_name in df_origin_2.columns
}

# Unsupervised points: uniform random samples in [0, 1), seeded for
# reproducibility. Every predicate shares the same array.
np.random.seed(settings['seed'])
n_features = X_origin_1.shape[1]
arr_u = np.random.rand(settings['n_unsupervised'], n_features)
U = dict.fromkeys(L, arr_u)

# S stacks, per predicate, the labelled feature rows on top of the
# unsupervised points.
S = {
    key: np.vstack([frame.drop(columns=['target']).to_numpy(), arr_u])
    for key, frame in L.items()
}



In [9]:
# Problem dimensions handed to the optimization set-up.
len_u = settings['n_unsupervised']   # number of unsupervised points
len_l = len(train_idx)               # number of labelled training rows
len_s = len_u + len_l                # rows in each stacked S array

len_j, len_h = len(L), len(KB_origin)   # entries in L / rules in the knowledge base
len_i = 2 * len_u   # NOTE(review): meaning of the factor 2 is defined by SetupPrimal — confirm

In [10]:

# Bundle everything SetupPrimal reads into one dictionary: the data sets,
# the problem dimensions, the objective coefficients and the knowledge base.
input_dict = dict(
    L=L,
    U=U,
    S=S,
    len_j=len_j,
    len_l=len_l,
    len_u=len_u,
    len_s=len_s,
    len_h=len_h,
    len_i=len_i,
    c1=settings['c1'],
    c2=settings['c2'],
    KB_origin=KB_origin,
    target_predicate='Outcome',
    # Optional key left disabled in this experiment:
    # kernel_function="~~logistic regression~~",
)

# 予測モデルの学習

In [11]:
# Build the primal problem from the prepared inputs; main() yields the
# objective and the list of constraints that are passed on to CVXPY.
problem_instance = SetupPrimal(input_dict)
objective_function, constraints = problem_instance.main()

# Assemble and solve; verbose=True prints the compilation and solver log.
problem = cp.Problem(objective=objective_function, constraints=constraints)
result = problem.solve(verbose=True)

_define_cvxpy_variables took 0.00033402442932128906 seconds!
load_rules took 0.0016987323760986328 seconds!
obj coeff
c1: 10
c2: 10
logistic_regression_loss took 0.24641895294189453 seconds!
consistency constraints
_construct_consistency_constraints took 1.48297119140625 seconds!
                                     CVXPY                                     
                                     v1.3.2                                    
(CVXPY) Jun 23 10:08:01 AM: Your problem has 176 variables, 24244 constraints, and 0 parameters.




(CVXPY) Jun 23 10:08:02 AM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Jun 23 10:08:02 AM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Jun 23 10:08:02 AM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Jun 23 10:08:03 AM: Compiling problem (target solver=ECOS).
(CVXPY) Jun 23 10:08:03 AM: Reduction chain: Dcp2Cone -> CvxAttr2Constr -> ConeMatrixStuffing -> ECOS
(CVXPY) Jun 23 10:08:03 AM: Applying reduction Dcp2Cone
(CVXPY) Jun 23 10:08:05 AM: Applying reduction CvxAttr2Constr
(CVXPY) Jun 23 10:08:05 AM: Applying reduction ConeMatrixStuffing
(CVXPY) Jun 23 10:08:18 AM: Applying reduction ECOS
(CVXPY) Ju

  return np.log(values[0])


# 予測

In [12]:
# Test split: continuous features and the {-1, 1}-coded label.
# These replace the X_test / y_test arrays that were built from the
# discretized table for rule extraction.
test_frame = input_for_test['data']
y_test = test_frame['target']
X_test = test_frame.drop(columns=['target'])

In [13]:
# Look up the trained predicate for the target and evaluate it on the test
# features; `.value` extracts the numeric result of the expression.
problem_info = problem_instance.problem_info
p_name = problem_info['target_predicate']
p_trained = problem_info['predicates_dict'][p_name]
y_pred = p_trained(X_test).value

# Threshold the scores at 0.5 into the {-1, 1} label coding used by y_test.
y_pred_interpreted = np.where(y_pred >= 0.5, 1, -1)

# 予測結果の評価

In [14]:
# Score the predictions (raw and thresholded) against the test labels; the
# rule information in input_for_test and the test indices are passed along
# for the evaluation helper.
result = evaluate_model(y_test, y_pred, y_pred_interpreted, input_for_test, test_idx)

# Store the metrics for this fold under the model's label.
fold_results = settings['result']['fold_0']
fold_results['logistic regression (L)'] = result



accuracy: 0.7835820895522388
precision: 0.7027027027027027
recall: 0.5909090909090909
f1: 0.6419753086419754
auc: 0.8398989898989898

[[79 11]
 [18 26]]


# 実験結果の保存

In [None]:
# Persist the settings together with the collected results as pretty-printed
# JSON inside the experiment directory.
result_path = os.path.join(settings['path'], 'result.json')
with open(result_path, 'w') as fp:
    json.dump(settings, fp, indent=4)