In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project's `src` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import KFold

In [3]:
# Input files used by this notebook, relative to the working directory:
#   file_path_1 -> CSV read into df_origin_1
#   file_path_2 -> CSV read into df_origin_2
#   file_path_3 -> text file with one rule (formula) per line
file_path_1, file_path_2, file_path_3 = (
    "data/pima_indian_diabetes/diabetes_cleaned_normalized.csv",
    "data/pima_indian_diabetes/diabetes_discretized.csv",
    "data/pima_indian_diabetes/rules_3.txt",
)

In [4]:
# Read both CSV files; in each, the first column becomes the row index.
df_origin_1, df_origin_2 = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (file_path_1, file_path_2)
)

# Preview the first rows of each frame (rendered one after the other).
display(df_origin_1.head(), df_origin_2.head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.461538,0.675325,0.5,0.583333,0.484277,0.493261,0.644444,1
1,0.076923,0.266234,0.411765,0.483333,0.264151,0.245283,0.222222,0
2,0.615385,0.902597,0.382353,0.0,0.160377,0.533693,0.244444,1
3,0.076923,0.292208,0.411765,0.383333,0.311321,0.079964,0.0,0
5,0.384615,0.467532,0.529412,0.0,0.232704,0.110512,0.2,0


Unnamed: 0,Outcome,Pregnancies_Low,Pregnancies_Medium,Pregnancies_High,Glucose_Low,Glucose_Medium,Glucose_High,BloodPressure_Low,BloodPressure_Medium,BloodPressure_High,...,SkinThickness_High,BMI_Low,BMI_Medium,BMI_High,DiabetesPedigreeFunction_Low,DiabetesPedigreeFunction_Medium,DiabetesPedigreeFunction_High,Age_Low,Age_Medium,Age_High
0,1,0,1,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,0,1,0
1,0,1,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
2,1,0,1,0,0,0,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0
3,0,1,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
5,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0


In [5]:
# Experiment configuration read by the cells below:
#   seed           -> seed for NumPy's global random generator
#   n_splits       -> number of K-fold partitions
#   n_unsupervised -> number of randomly generated unlabeled points
#   c1, c2         -> constants handed to the optimisation setup
#                     (their exact role is defined in the project's `src` package)
settings = dict(
    seed=42,
    n_splits=5,
    n_unsupervised=15,
    c1=10,
    c2=10,
)

In [6]:
# K-fold splitter with the configured number of folds (default KFold options otherwise).
kf = KFold(n_splits=settings['n_splits'])

# Map fold number -> (train positions, test positions) as yielded by kf.split.
idx_split = dict(enumerate(kf.split(df_origin_1)))

In [7]:
# Separate the "Outcome" label column from the remaining feature columns.
y_origin_1 = df_origin_1["Outcome"]
X_origin_1 = df_origin_1.drop(columns="Outcome")

In [8]:
# Only fold 0 of the K-fold split is used in the rest of the notebook.
train_index = idx_split[0][0]
test_index = idx_split[0][1]

In [9]:
# Training data

# L: one labeled frame per column of df_origin_2. Each frame holds the
# training-fold feature rows plus a 'target' column taken from that column,
# with 0 recoded to -1 (values are aligned on the row index).
L = {
    col_name: X_origin_1.iloc[train_index, :].assign(
        target=df_origin_2[col_name].replace(0, -1)
    )
    for col_name in df_origin_2.columns
}

# Unlabeled points: uniform random samples in [0, 1), one column per feature.
# The global NumPy seed is set first so the draw is reproducible.
np.random.seed(seed=settings['seed'])
arr_u = np.random.rand(settings['n_unsupervised'], X_origin_1.shape[1])

# U: every key of L refers to the same unlabeled array object.
U = dict.fromkeys(L, arr_u)

# S: labeled feature rows (without 'target') stacked on top of the unlabeled points.
S = {
    key: np.vstack([frame.drop(columns='target').values, arr_u])
    for key, frame in L.items()
}


In [10]:
# Rules

from src.misc import is_symbol

# KB_origin: one whitespace-tokenised formula per non-empty line of the rules file.
# - The file contains the non-ASCII negation sign '¬', so it is decoded explicitly
#   as UTF-8 instead of relying on the platform's default encoding (otherwise the
#   '¬' comparison below can silently never match on non-UTF-8 locales).
# - Blank lines are skipped so they do not end up as empty formulas, which would
#   inflate len(KB_origin).
with open(file_path_3, 'r', encoding='utf-8') as file:
    KB_origin = [line.split() for line in file if line.strip()]

# rules_tmp: for every formula that mentions "Outcome", a dict mapping each
# non-symbol token to a code describing how it appears in the formula:
#    1 -> the token is not directly preceded by '¬'
#    0 -> a token other than "Outcome" directly preceded by '¬'
#   -1 -> "Outcome" directly preceded by '¬'
# Tokens for which is_symbol() is true are skipped (presumably the logical
# connectives — confirm against src.misc). If a token occurs more than once in a
# formula, the last occurrence wins.
rules_tmp = []
for rule in KB_origin:
    if "Outcome" not in rule:
        continue

    tmp = {}
    for idx, item in enumerate(rule):
        if is_symbol(item):
            continue

        # The first token has no predecessor, so it can never be negated.
        negated = idx > 0 and rule[idx - 1] == '¬'
        if not negated:
            tmp[item] = 1
        elif item != "Outcome":
            tmp[item] = 0
        else:
            tmp[item] = -1

    rules_tmp.append(tmp)

rules_tmp

[{'Pregnancies_Low': 1, 'Outcome': -1},
 {'Pregnancies_High': 1, 'Outcome': 1},
 {'Glucose_Low': 1, 'Outcome': -1},
 {'Glucose_High': 1, 'Outcome': 1},
 {'BMI_Low': 1, 'Outcome': -1},
 {'BMI_Medium': 1, 'Outcome': 1},
 {'DiabetesPedigreeFunction_Low': 1, 'Outcome': -1},
 {'Age_Low': 1, 'Outcome': -1},
 {'Age_Medium': 1, 'Outcome': 1},
 {'Pregnancies_Medium': 0,
  'Glucose_High': 1,
  'BMI_Low': 0,
  'DiabetesPedigreeFunction_Low': 0,
  'BMI_Medium': 1,
  'Outcome': 1},
 {'Glucose_Low': 0,
  'DiabetesPedigreeFunction_Low': 0,
  'Age_Medium': 1,
  'BloodPressure_Medium': 1,
  'BMI_Low': 0,
  'Glucose_Medium': 1,
  'Outcome': 1},
 {'Glucose_High': 0,
  'Glucose_Low': 0,
  'Pregnancies_High': 0,
  'DiabetesPedigreeFunction_Low': 1,
  'BloodPressure_Medium': 0,
  'BloodPressure_Low': 0,
  'Age_Low': 0,
  'SkinThickness_Medium': 0,
  'Outcome': -1},
 {'BMI_Low': 0,
  'Glucose_Medium': 1,
  'Pregnancies_High': 0,
  'SkinThickness_Low': 0,
  'DiabetesPedigreeFunction_Medium': 0,
  'BloodPressu

In [11]:
# Problem dimensions handed to the optimisation setup.
len_u = settings['n_unsupervised']   # number of unlabeled points
len_l = len(train_index)             # number of rows in the training fold
len_j = len(L)                       # number of labeled frames (keys of L)
len_h = len(KB_origin)               # number of formulas in the knowledge base

len_s = len_l + len_u                # labeled + unlabeled points
len_i = 2 * len_u                    # twice the unlabeled count (meaning defined by the setup code — TODO confirm)

In [12]:
# Test data

# Test-fold rows with the label column renamed to 'target' and 0 recoded to -1.
df_new = (
    df_origin_1.iloc[test_index, :]
    .rename(columns={'Outcome': 'target'})
    .assign(target=lambda frame: frame['target'].replace(0, -1))
)

# Keyed by predicate name; only 'Outcome' has test data here.
test_data = {'Outcome': df_new}




In [28]:
from src.misc import linear_kernel

# Single mapping of inputs that is passed to Setup and Predicate_dual below.
# Built group by group; the resulting key order matches the original literal.

# Labeled frames, unlabeled points, and their stacked feature matrices.
inputs_luka = {'L': L, 'U': U, 'S': S}

# Problem dimensions.
inputs_luka.update({
    'len_j': len_j,
    'len_l': len_l,
    'len_u': len_u,
    'len_s': len_s,
    'len_h': len_h,
    'len_i': len_i,
})

# Constants copied from the settings dict.
inputs_luka.update({name: settings[name] for name in ('c1', 'c2')})

# Knowledge base, the predicate of interest, and the kernel function.
inputs_luka.update({
    'KB_origin': KB_origin,
    'target_predicate': 'Outcome',
    'kernel_function': linear_kernel,
})

In [29]:
import cvxpy as cp

# Setup and ObjectiveFunction both come from the same project module; the
# alternative import from src.objective_function_single_task is not used here.
from src.setup_problem_dual_single_task_2 import ObjectiveFunction, Setup

# Build the objective and constraints from the collected inputs.
problem_instance = Setup(inputs_luka, ObjectiveFunction)
objective_function, constraints = problem_instance.main()

# Solve with CVXPY; verbose=True prints the compilation and solver log.
problem = cp.Problem(objective_function, constraints)
result = problem.solve(verbose=True)

load_rules took 0.01969313621520996 seconds!
define_cvxpy_variables took 0.0010578632354736328 seconds!
shape of P: (1807, 1807)
finish l
finish h
finish s
finish l h
finish l s
finish h s
_construct_P_j took 1.0457019805908203 seconds!
construct_constraints took 0.16031622886657715 seconds!
                                     CVXPY                                     
                                     v1.3.2                                    
(CVXPY) Feb 03 09:04:25 AM: Your problem has 36756 variables, 1257 constraints, and 0 parameters.




(CVXPY) Feb 03 09:04:26 AM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Feb 03 09:04:26 AM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Feb 03 09:04:26 AM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Feb 03 09:04:26 AM: Compiling problem (target solver=OSQP).
(CVXPY) Feb 03 09:04:26 AM: Reduction chain: FlipObjective -> CvxAttr2Constr -> Qp2SymbolicQp -> QpMatrixStuffing -> OSQP
(CVXPY) Feb 03 09:04:26 AM: Applying reduction FlipObjective
(CVXPY) Feb 03 09:04:26 AM: Applying reduction CvxAttr2Constr
(CVXPY) Feb 03 09:04:26 AM: Applying reduction Qp2SymbolicQp
(CVXPY) Feb 03 09:04:26 AM: Applying reduc



(CVXPY) Feb 03 09:04:31 AM: Applying reduction OSQP
(CVXPY) Feb 03 09:04:31 AM: Finished problem compilation (took 5.409e+00 seconds).
-------------------------------------------------------------------------------
                                Numerical solver                               
-------------------------------------------------------------------------------
(CVXPY) Feb 03 09:04:31 AM: Invoking solver OSQP  to obtain a solution.
-----------------------------------------------------------------
           OSQP v0.6.3  -  Operator Splitting QP Solver
              (c) Bartolomeo Stellato,  Goran Banjac
        University of Oxford  -  Stanford University 2021
-----------------------------------------------------------------
problem:  variables n = 38563, constraints m = 39820
          nnz(P) + nnz(A) = 1092163
settings: linear system solver = qdldl,
          eps_abs = 1.0e-05, eps_rel = 1.0e-05,
          eps_prim_inf = 1.0e-04, eps_dual_inf = 1.0e-04,
          rho = 1.0

In [31]:
from src.setup_problem_dual_single_task_2 import Predicate_dual

# Build the predicate object from the same inputs dict that was given to Setup.
# NOTE(review): the captured output below shows a hyper-parameter study over 'b'
# with min/max bounds — presumably triggered by this constructor; confirm in src.
p_trained = Predicate_dual(inputs_luka)

[I 2024-02-03 09:08:06,497] A new study created in memory with name: no-name-27cdc463-9e9f-4b0e-9f0a-b07aad1f70d6
[I 2024-02-03 09:08:06,603] Trial 0 finished with value: 0.6753731343283582 and parameters: {'b': -3.6192514370449054}. Best is trial 0 with value: 0.6753731343283582.
[I 2024-02-03 09:08:06,664] Trial 1 finished with value: 0.6753731343283582 and parameters: {'b': -1.9308180404775186}. Best is trial 0 with value: 0.6753731343283582.



min_bound: -3.779336212229724
max_bound: 1.2206637877702762



[I 2024-02-03 09:08:06,701] Trial 2 finished with value: 0.6753731343283582 and parameters: {'b': -3.712092338481778}. Best is trial 0 with value: 0.6753731343283582.
[I 2024-02-03 09:08:06,757] Trial 3 finished with value: 0.6940298507462687 and parameters: {'b': -0.3754725752360071}. Best is trial 3 with value: 0.6940298507462687.
[I 2024-02-03 09:08:06,790] Trial 4 finished with value: 0.6753731343283582 and parameters: {'b': -3.393588105788927}. Best is trial 3 with value: 0.6940298507462687.
[I 2024-02-03 09:08:06,842] Trial 5 finished with value: 0.6753731343283582 and parameters: {'b': -1.7526080584681503}. Best is trial 3 with value: 0.6940298507462687.
[I 2024-02-03 09:08:06,871] Trial 6 finished with value: 0.6735074626865671 and parameters: {'b': -1.2130248918770907}. Best is trial 3 with value: 0.6940298507462687.
[I 2024-02-03 09:08:06,921] Trial 7 finished with value: 0.333955223880597 and parameters: {'b': 0.0825635465891632}. Best is trial 3 with value: 0.69402985074626