In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
project_dir_path = '/home/onoue/ws/lukasiewicz_1'
sys.path.append(project_dir_path)
import shutil

import pandas as pd
import numpy as np
import cvxpy as cp

from sklearn.model_selection import train_test_split

from src.setup_problem_primal_modular import Setup
from src.objective_function import specimen_construct_objective_function_loss_v3
from src.evaluation import EvaluateModel

# 提案アルゴリズム用データフォーマット

- 実験設定（seed, その他）

1. 前処理はすでに終わっているものとする（入力データとして diabetes_discretized.csv をとる）
2. seed 値 と train/test の比率を決める
3. 教師有りデータ（L_Predicate.csv）と教師無しデータ（U.csv）を作成し，rules.txt と一緒に train ディレクトリに格納
4. 最終的に予測を行う Predicate についてのテストデータ（ex. L_Outcome.csv）を作成し test ディレクトリに格納

In [4]:
# Build the input directories consumed by the proposed algorithm:
#   train/: L_<Predicate>.csv (supervised), U.csv (unsupervised), rules.txt
#   test/ : L_Outcome.csv
# The input table is the already-discretized diabetes data; its first CSV
# column is used as the row index so train/test rows keep their original ids.
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)
X = data.drop(["Outcome"], axis=1)
y = data["Outcome"]

# Experiment settings: one seed drives both the split and the random U.csv.
random_state = 42
test_size = 0.2

X_train, X_test, _, _ = train_test_split(X, y,
                                         test_size=test_size,
                                         random_state=random_state)

train_index = X_train.index
test_index = X_test.index


def _write_labeled_csv(features, labels, dir_path, predicate_name):
    """Write the supervised file ``L_<predicate_name>.csv`` and return its frame.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns to store; copied, never modified.
    labels : pandas.Series
        Labels aligned with ``features``; every 0 is rewritten to -1 and the
        result is stored in a ``target`` column.
    dir_path : str
        Existing directory that receives the CSV file.
    predicate_name : str
        Name used to build the file name ``L_<predicate_name>.csv``.

    Returns
    -------
    pandas.DataFrame
        The frame that was written (features plus ``target``).
    """
    labeled = features.copy()
    labeled['target'] = labels.replace(0, -1)
    labeled.to_csv(os.path.join(dir_path, "L_" + predicate_name + '.csv'))
    return labeled


# ---- train split ----------------------------------------------------------
train_data = data.loc[train_index, :]
outcome = train_data['Outcome']
features = train_data.drop(['Outcome'], axis=1)
feature_names = list(features.columns)

train_data_dir_path = os.path.join(project_dir_path, 'inputs/pima_indian_diabetes/train')
# makedirs (not mkdir) so that missing parent directories are created as well.
os.makedirs(train_data_dir_path, exist_ok=True)

df = _write_labeled_csv(features, outcome, train_data_dir_path, "Outcome")
display(df)

# One supervised file per feature: the feature's own column is the target.
for feature_name in feature_names:
    _write_labeled_csv(features, features[feature_name],
                       train_data_dir_path, feature_name)

# Unsupervised points: random binary vectors with one column per feature.
data_num = 15
data_dim = len(feature_names)

# Seeded generator so that U.csv is identical on every run.
rng = np.random.default_rng(random_state)
arr_U = rng.integers(0, 2, size=(data_num, data_dim))
df_U = pd.DataFrame(arr_U)
df_U.to_csv(os.path.join(train_data_dir_path, 'U.csv'))

# Copy rules.txt next to the training data.
source_path = './data/rules.txt'
destination_path = os.path.join(train_data_dir_path, "rules.txt")
shutil.copy(source_path, destination_path)


# ---- test split -----------------------------------------------------------
test_data = data.loc[test_index, :]

outcome = test_data['Outcome']
features = test_data.drop(['Outcome'], axis=1)
feature_names = list(features.columns)

test_data_dir_path = os.path.join(project_dir_path, 'inputs/pima_indian_diabetes/test')
os.makedirs(test_data_dir_path, exist_ok=True)

# Only the predicate that is finally predicted (Outcome) gets a test file.
df = _write_labeled_csv(features, outcome, test_data_dir_path, "Outcome")
display(df)


Unnamed: 0,Pregnancies_Low,Pregnancies_Medium,Pregnancies_High,Glucose_Low,Glucose_Medium,Glucose_High,BloodPressure_Low,BloodPressure_Medium,BloodPressure_High,SkinThickness_Low,...,BMI_Low,BMI_Medium,BMI_High,DiabetesPedigreeFunction_Low,DiabetesPedigreeFunction_Medium,DiabetesPedigreeFunction_High,Age_Low,Age_Medium,Age_High,target
275,1,0,0,0,1,0,0,1,0,0,...,0,0,1,0,1,0,1,0,0,-1
555,0,1,0,0,1,0,0,1,0,0,...,1,0,0,1,0,0,0,1,0,-1
699,1,0,0,0,1,0,0,1,0,1,...,0,0,1,0,0,1,1,0,0,-1
73,1,0,0,0,1,0,0,0,1,1,...,0,1,0,1,0,0,1,0,0,-1
170,0,1,0,0,1,0,0,1,0,1,...,0,1,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,1,0,0,0,1,0,0,1,0,0,...,0,1,0,1,0,0,1,0,0,-1
128,1,0,0,0,1,0,0,0,1,0,...,0,1,0,1,0,0,0,1,0,1
315,1,0,0,0,1,0,0,1,0,0,...,0,1,0,1,0,0,1,0,0,-1
505,0,0,1,1,0,0,0,1,0,1,...,0,1,0,1,0,0,0,1,0,-1


Unnamed: 0,Pregnancies_Low,Pregnancies_Medium,Pregnancies_High,Glucose_Low,Glucose_Medium,Glucose_High,BloodPressure_Low,BloodPressure_Medium,BloodPressure_High,SkinThickness_Low,...,BMI_Low,BMI_Medium,BMI_High,DiabetesPedigreeFunction_Low,DiabetesPedigreeFunction_Medium,DiabetesPedigreeFunction_High,Age_Low,Age_Medium,Age_High,target
419,1,0,0,0,1,0,0,1,0,0,...,1,0,0,1,0,0,1,0,0,1
186,0,1,0,0,0,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,1
556,1,0,0,0,1,0,0,1,0,0,...,0,1,0,1,0,0,1,0,0,-1
738,1,0,0,0,1,0,1,0,0,1,...,0,1,0,0,1,0,1,0,0,-1
320,1,0,0,0,1,0,1,0,0,1,...,1,0,0,0,1,0,1,0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626,1,0,0,0,1,0,0,1,0,1,...,1,0,0,1,0,0,1,0,0,-1
487,1,0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,1,-1
638,0,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,1,1,0,0,1
248,0,0,1,0,1,0,0,1,0,0,...,0,1,0,1,0,0,1,0,0,-1


# モデルの訓練

In [3]:
# Collect the training inputs, build the optimisation problem and solve it.
data_dir_path = os.path.join(project_dir_path, 'inputs/pima_indian_diabetes')

# os.listdir returns entries in arbitrary order; sort them so the order of
# the supervised/unsupervised files handed to Setup is the same on every run.
file_list = sorted(os.listdir(os.path.join(data_dir_path, 'train')))

# Supervised files are the CSVs whose name starts with 'L'.
L_files = [filename for filename in file_list
           if filename.startswith('L') and filename.endswith('.csv')]

# Unsupervised files are the CSVs whose name starts with 'U'.
U_files = [filename for filename in file_list
           if filename.startswith('U') and filename.endswith('.csv')]

file_names_dict = {
    'supervised': L_files,
    'unsupervised': U_files,
    'rule': ['rules.txt']
}

# Only the consistency constraints are switched on for this run.
constraints_flag_dict = {
    'pointwise': False,
    'logical': False,
    'consistency': True
}

problem_instance = Setup(data_dir_path, file_names_dict, specimen_construct_objective_function_loss_v3)
# NOTE(review): c1 / c2 are presumably weights of the objective terms -- confirm in Setup.main.
objective, constraints = problem_instance.main(c1=10, c2=10, constraints_flag_dict=constraints_flag_dict)

problem = cp.Problem(objective, constraints)
# verbose=True prints the CVXPY compilation and solver log below the cell.
result = problem.solve(verbose=True)

load_data took 0.08819174766540527 seconds!
load_rules took 0.0005688667297363281 seconds!
specimen_construct_objective_function_loss_v3 took 0.5862741470336914 seconds!
consistency constraints
__call__ took 5.844139575958252 seconds!
                                     CVXPY                                     
                                     v1.3.2                                    




(CVXPY) Dec 24 08:13:40 AM: Your problem has 484 variables, 24244 constraints, and 0 parameters.
(CVXPY) Dec 24 08:13:42 AM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Dec 24 08:13:42 AM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Dec 24 08:13:42 AM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Dec 24 08:13:44 AM: Compiling problem (target solver=ECOS).
(CVXPY) Dec 24 08:13:44 AM: Reduction chain: Dcp2Cone -> CvxAttr2Constr -> ConeMatrixStuffing -> ECOS
(CVXPY) Dec 24 08:13:44 AM: Applying reduction Dcp2Cone
(CVXPY) Dec 24 08:13:50 AM: Applying reduction CvxAttr2Constr
(CVXPY) Dec 24 08:13:51 AM: App

  return np.log(values[0])


# モデルの評価

In [None]:
# EvaluateModel is already imported in the import cell at the top of the
# notebook, so it is not re-imported here.
# Wrap the solved problem instance and run the project's evaluation routine.
evaluate_model = EvaluateModel(problem_instance)
evaluate_model.evaluate()

# 表埋め 手順

- 元データファイル： diabetes_cleaned.csv, diabetes_discretized.csv

In [32]:
# Load the held-out Outcome file from the test directory; the first CSV
# column is the row index.
file_path = os.path.join(data_dir_path, "test", "L_Outcome.csv")
test_data = pd.read_csv(file_path, index_col=0)

# Split the label column off from the predicate inputs.
y_test = test_data['target']
X_test = test_data.drop(columns=['target'])

# Apply the Outcome predicate from the problem instance to the test inputs
# and map its values onto {1, -1} labels using a 0.5 threshold.
p = problem_instance.predicates_dict['Outcome']
y_pred = p(X_test).value
is_positive = y_pred >= 0.5
y_pred_interpreted = np.where(is_positive, 1, -1)

In [40]:
# Display the test feature frame (the 'target' column has been dropped);
# the rendered output below is the truncated pandas view.
X_test

Unnamed: 0,Pregnancies_Low,Pregnancies_Medium,Pregnancies_High,Glucose_Low,Glucose_Medium,Glucose_High,BloodPressure_Low,BloodPressure_Medium,BloodPressure_High,SkinThickness_Low,...,SkinThickness_High,BMI_Low,BMI_Medium,BMI_High,DiabetesPedigreeFunction_Low,DiabetesPedigreeFunction_Medium,DiabetesPedigreeFunction_High,Age_Low,Age_Medium,Age_High
419,1,0,0,0,1,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0
186,0,1,0,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,0,1
556,1,0,0,0,1,0,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
738,1,0,0,0,1,0,1,0,0,1,...,0,0,1,0,0,1,0,1,0,0
320,1,0,0,0,1,0,1,0,0,1,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626,1,0,0,0,1,0,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0
487,1,0,0,0,0,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
638,0,1,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,1,1,0,0
248,0,0,1,0,1,0,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0


## ルール違反 カウント

- 「→ Outcome」or「→ ¬ Outcome」が残るようにしたい

- 例えば「¬ Glucose_Low ⊗ ¬ Age_Low ⊗ ¬ BMI_Low → Outcome」ならば，「Glucose_Low == 0, Age_Low == 0, BMI_Low == 0」の X_test の要素について，Outcome(x_test) == 0 ならばルール違反となる

1. KB_origin から「,'→', 'Outcome']」または「,'→', '¬', 'Outcome']」で終わる rule のみ抜き出して，is_symbol(item) == False となる item に対して，直前の要素が '¬' でないならば item == 1，'¬' ならば item == 0（Outcome の場合は -1）とし，下のような辞書を作成．

{
    item1: 1,
    item2: 0,
    item3: 1,
    Outcome: 1 （or -1）
}

2. 辞書が示す条件を満たす X_test の要素に対して，対応する y_pred_interpreted を調べ，辞書の Outcome の値と一致する件数（一致しないものがルール違反）と，条件を満たす要素の総数を数える．

In [46]:
from src.misc import is_symbol

In [14]:
def _concludes_in_outcome(rule):
    """Return True when ``rule`` ends with ``→ Outcome`` or ``→ ¬ Outcome``.

    Rules that merely mention Outcome somewhere else (e.g. in the premise)
    are rejected, because the check below reads every rule as
    "premise on the features -> expected Outcome label".
    """
    tokens = list(rule)
    return (tokens[-2:] == ['→', 'Outcome']
            or tokens[-3:] == ['→', '¬', 'Outcome'])


def _rule_to_assignment(rule):
    """Translate a tokenised rule into a ``{name: required value}`` dict.

    Every non-symbol token maps to 1 when it is not directly preceded by
    '¬'.  A negated token maps to 0, except a negated Outcome, which maps
    to -1 (the label used for the negative class).
    """
    assignment = {}
    for idx, item in enumerate(rule):
        if is_symbol(item):
            continue
        negated = idx > 0 and rule[idx - 1] == '¬'
        if not negated:
            assignment[item] = 1
        elif item != "Outcome":
            assignment[item] = 0
        else:
            assignment[item] = -1
    return assignment


# Keep only the rules whose conclusion is (¬)Outcome.
rules_tmp = [_rule_to_assignment(rule)
             for rule in problem_instance.KB_origin
             if _concludes_in_outcome(rule)]

# Align the predicted labels with the index of X_test so that rows can be
# selected with the boolean masks built below.
idx_tmp = X_test.index
y_pred_interpreted = pd.DataFrame(y_pred_interpreted, index=idx_tmp)
# The frame has a single column of labels; use it as a Series so the counts
# printed below are plain integers.
y_pred_labels = y_pred_interpreted.iloc[:, 0]

for rule in rules_tmp:
    outcome = rule["Outcome"]

    # Boolean mask instead of a query string: it works for column names that
    # are not valid identifiers, and an empty premise selects every row
    # instead of raising on an empty expression.
    premise_holds = pd.Series(True, index=X_test.index)
    for column, value in rule.items():
        if column != "Outcome":
            premise_holds &= X_test[column] == value

    tmp = y_pred_labels[premise_holds]
    # First line: rows whose prediction agrees with the rule's conclusion.
    # Second line: rows that satisfy the rule's premise.
    print(int((tmp == outcome).sum()))
    print(tmp.shape[0])



NameError: name 'is_symbol' is not defined