In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import time

import numpy as np
import pandas as pd
import cvxpy as cp

from src.format_winston_data import WinstonDataFormatter
from src.format_winston_data import generate_and_save_unsupervised_data

from src.setup_problem_v2 import Setup

# データの準備

In [11]:
source_dir_path = "./data"
save_dir_path = "./inputs/winston_full"

# Run the formatter on the raw data directory and save its output into
# save_dir_path (the L_albatross(x).csv read below is expected to land there).
data_formatter = WinstonDataFormatter(source_dir_path)
data_formatter.format_and_save_data(save_dir_path)

# Generate the unsupervised data set alongside it: 20 points, 3 dimensions.
generate_and_save_unsupervised_data(save_dir_path, data_num=20, data_dim=3)

# Sanity check: load one of the saved supervised files and look at it.
albatross_csv_path = os.path.join(save_dir_path, "L_albatross(x).csv")
df_test = pd.read_csv(albatross_csv_path, index_col=0)
display(df_test.head())
print(df_test.shape)

Done!
Done!


Unnamed: 0,R,G,B,label
0,0.48664,0.440085,0.342231,-1.0
1,0.427333,0.406967,0.37666,-1.0
2,0.509321,0.512104,0.39858,-1.0
3,0.581645,0.572609,0.54955,-1.0
4,0.505475,0.517679,0.42228,-1.0


(6500, 4)


In [16]:
source_dir_path = "./data"
save_dir_path = "./inputs/winston_10"

# Same pipeline as the full data set, but keeping only 2 samples per animal
# (the recorded output below shows 10 rows in the resulting file).
# NOTE(review): the row indices in the recorded output look randomly sampled and
# no seed is set in this notebook -- confirm whether the helper accepts one.
data_formatter = WinstonDataFormatter(source_dir_path)
data_formatter.format_and_save_data(save_dir_path, sample_num_per_animal=2)

# Generate the unsupervised data set alongside it: 20 points, 3 dimensions.
generate_and_save_unsupervised_data(save_dir_path, data_num=20, data_dim=3)

# Sanity check: load one of the saved supervised files and look at it.
albatross_csv_path = os.path.join(save_dir_path, "L_albatross(x).csv")
df_test = pd.read_csv(albatross_csv_path, index_col=0)
display(df_test.head())
print(df_test.shape)

Done!
Done!


Unnamed: 0,R,G,B,label
6187,0.459693,0.491589,0.324279,-1.0
5243,0.447529,0.434663,0.30718,-1.0
232,0.515541,0.511313,0.49294,-1.0
507,0.541042,0.428629,0.278256,-1.0
1653,0.489753,0.476355,0.410386,-1.0


(10, 4)


In [17]:
source_dir_path = "./data"
save_dir_path = "./inputs/winston_100"

# Same pipeline as the full data set, but keeping only 20 samples per animal
# (the recorded output below shows 100 rows in the resulting file).
# NOTE(review): the row indices in the recorded output look randomly sampled and
# no seed is set in this notebook -- confirm whether the helper accepts one.
data_formatter = WinstonDataFormatter(source_dir_path)
data_formatter.format_and_save_data(save_dir_path, sample_num_per_animal=20)

# Generate the unsupervised data set alongside it: 20 points, 3 dimensions.
generate_and_save_unsupervised_data(save_dir_path, data_num=20, data_dim=3)

# Sanity check: load one of the saved supervised files and look at it.
albatross_csv_path = os.path.join(save_dir_path, "L_albatross(x).csv")
df_test = pd.read_csv(albatross_csv_path, index_col=0)
display(df_test.head())
print(df_test.shape)

Done!
Done!


Unnamed: 0,R,G,B,label
5840,0.529877,0.468574,0.448287,-1.0
5283,0.4913,0.460999,0.391749,-1.0
5750,0.5,0.5,0.5,-1.0
5758,0.562385,0.490812,0.424986,-1.0
5702,0.47789,0.4777,0.443831,-1.0


(100, 4)


# 学習

## 各 p に対して，教師データの数が 10 個

In [77]:
data_dir_path = './inputs/winston_10'

# os.listdir returns entries in arbitrary order; sort them so the order of the
# data sets handed to Setup is identical on every run and on every machine.
file_list = sorted(os.listdir(data_dir_path))

# Supervised files start with 'L', unsupervised files with 'U'. Setup receives
# the names without the '.csv' extension, so strip it with os.path.splitext
# (robust even if '.csv' happens to appear elsewhere in the name).
L_files = [os.path.splitext(filename)[0] for filename in file_list
           if filename.startswith('L') and filename.endswith('.csv')]

U_files = [os.path.splitext(filename)[0] for filename in file_list
           if filename.startswith('U') and filename.endswith('.csv')]

file_names_dict = {
    'supervised': L_files,
    'unsupervised': U_files,
    'rule': ['rules_2']
}

# Build the CVXPY objective and constraints for this data set.
problem_instance = Setup(data_dir_path, file_names_dict)
objective, constraints = problem_instance.main(c1=10, c2=0.3)

# Time problem construction + solve with a monotonic, high-resolution clock;
# time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
problem = cp.Problem(objective, constraints)
result = problem.solve(verbose=True)
end_time = time.perf_counter()
print()
print(f'学習時間: {end_time - start_time} 秒')

Loading data ...
Done in 0.04738140106201172 seconds! 

Loading rules ...
Done in 0.00019621849060058594 seconds! 

Identifying predicates ...
Done in 0.0019109249114990234 seconds! 

Constructing objective function ...
Done in 0.03390622138977051 seconds! 

Constructing constraints ...
Done in 0.6918408870697021 seconds! 

All done. 

                                     CVXPY                                     
                                     v1.3.2                                    
(CVXPY) Oct 28 04:55:39 PM: Your problem has 498 variables, 3450 constraints, and 0 parameters.
(CVXPY) Oct 28 04:55:39 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Oct 28 04:55:39 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 28 04:55:39 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
--------------------------------------------------------------

In [37]:
# for formula in problem_instance.KB_origin:
#     print(formula)

# print()

# for new_formula in problem_instance.KB:
#     print(new_formula)

In [78]:
# Test data: evaluate against the full data set directory.
test_data_dir_path = './inputs/winston_full'
test_animals = ['albatross', 'cheetah', 'ostrich', 'penguin', 'zebra']

# The recorded output of `test_data` further down shows a dict keyed like
# 'albatross(x)' whose values are arrays of 4-column rows ending in a -1/+1 label.
test_data = problem_instance.prepare_test_data(test_data_dir_path, test_animals)

def test_trained_predicate(problem_instance, test_data):
    result_dict = {}
    p_dict = problem_instance.predicates_dict

    for p_name, p_data in test_data.items():
        pred_vals = []
        preds = []

        p = p_dict[p_name]
        cnt = 0

        for data in p_data:
            x, ans = data[:-1], data[-1]
            pred_val = p(x).value

            pred_vals.append(pred_val)

            if (pred_val >= 0.5 and ans == 1) or (pred_val < 0.5 and ans == -1):
                cnt += 1

            pred = (pred_val >= 0.5 and ans == 1) or (pred_val < 0.5 and ans == -1)
            
            preds.append(pred)

        p_arr = np.hstack([p_data, np.array(pred_vals).reshape(-1, 1), np.array(preds).reshape(-1, 1)])
        p_df = pd.DataFrame(p_arr, columns=['r', 'g', 'b', 'Ans', 'pred_val', 'pred'])

        result_dict[p_name] = p_df
        
        print(cnt)
        print(f'Accuracy of {p_name}: {cnt / len(p_data)}')

    return result_dict


res_dict = test_trained_predicate(problem_instance, test_data)

5200
Accuracy of albatross(x): 0.8
5200
Accuracy of cheetah(x): 0.8
5200
Accuracy of ostrich(x): 0.8
5200
Accuracy of penguin(x): 0.8
5200
Accuracy of zebra(x): 0.8


In [55]:
len(problem_instance.L)

31

In [57]:
test_animals

['albatross', 'cheetah', 'ostrich', 'penguin', 'zebra']

In [None]:
# NOTE(review): pd.read_csv() is called without its required file path argument,
# so this cell raises TypeError if it is executed (it has no execution count).
# FIXME: pass the intended CSV path, or remove this leftover cell.
pd.read_csv()

In [56]:
problem_instance.L

array([[[ 0.51548432,  0.4533864 ,  0.38860145, -1.        ],
        [ 0.46517115,  0.46238609,  0.45014919, -1.        ],
        [ 0.50268406,  0.51702762,  0.50812705,  1.        ],
        ...,
        [ 0.53528694,  0.52298571,  0.32538035, -1.        ],
        [ 0.46227439,  0.42251048,  0.42294233,  1.        ],
        [ 0.5535101 ,  0.54497347,  0.4811481 ,  1.        ]],

       [[ 0.54052452,  0.4937535 ,  0.4089825 ,  1.        ],
        [ 0.42204827,  0.39576059,  0.32168631,  1.        ],
        [ 0.44926467,  0.44488827,  0.41302272, -1.        ],
        ...,
        [ 0.47105572,  0.4983398 ,  0.43885245,  1.        ],
        [ 0.46053577,  0.47561262,  0.53212055, -1.        ],
        [ 0.34924947,  0.34241552,  0.30371739, -1.        ]],

       [[ 0.59524781,  0.59223666,  0.49026362, -1.        ],
        [ 0.4051761 ,  0.44384515,  0.35118756, -1.        ],
        [ 0.47100261,  0.45500622,  0.33649725, -1.        ],
        ...,
        [ 0.52775279,  0.45

In [53]:
test_data

{'albatross(x)': array([[ 0.48664018,  0.44008494,  0.3422309 , -1.        ],
        [ 0.42733264,  0.4069667 ,  0.37666028, -1.        ],
        [ 0.50932126,  0.512104  ,  0.39857979, -1.        ],
        ...,
        [ 0.51725972,  0.50938418,  0.4982856 , -1.        ],
        [ 0.47030804,  0.43970613,  0.37393799, -1.        ],
        [ 0.39154442,  0.40050628,  0.42496127, -1.        ]]),
 'cheetah(x)': array([[ 0.48664018,  0.44008494,  0.3422309 , -1.        ],
        [ 0.42733264,  0.4069667 ,  0.37666028, -1.        ],
        [ 0.50932126,  0.512104  ,  0.39857979, -1.        ],
        ...,
        [ 0.51725972,  0.50938418,  0.4982856 , -1.        ],
        [ 0.47030804,  0.43970613,  0.37393799, -1.        ],
        [ 0.39154442,  0.40050628,  0.42496127, -1.        ]]),
 'ostrich(x)': array([[ 0.48664018,  0.44008494,  0.3422309 , -1.        ],
        [ 0.42733264,  0.4069667 ,  0.37666028, -1.        ],
        [ 0.50932126,  0.512104  ,  0.39857979, -1.       

In [32]:
res_dict['albatross(x)']

Unnamed: 0,r,g,b,Ans,pred_val,pred
0,0.486640,0.440085,0.342231,-1.0,8.279502e-12,1.0
1,0.427333,0.406967,0.376660,-1.0,8.752989e-12,1.0
2,0.509321,0.512104,0.398580,-1.0,1.092828e-11,1.0
3,0.581645,0.572609,0.549550,-1.0,1.354286e-11,1.0
4,0.505475,0.517679,0.422280,-1.0,1.149094e-11,1.0
...,...,...,...,...,...,...
6495,0.618658,0.583356,0.521907,-1.0,1.284633e-11,1.0
6496,0.506803,0.401962,0.288625,-1.0,5.974145e-12,1.0
6497,0.517260,0.509384,0.498286,-1.0,1.200926e-11,1.0
6498,0.470308,0.439706,0.373938,-1.0,8.984672e-12,1.0


In [38]:
# Ground-truth answers for the albatross predicate as a plain list.
tmp = res_dict['albatross(x)']['Ans'].tolist()

# (label + 1) / 2 rescales the -1/+1 answers to 0/1.
arr1 = (np.array(tmp) + 1) / 2

In [39]:
# Raw predicate outputs for the albatross predicate, used as ranking scores.
tmp = res_dict['albatross(x)']['pred_val'].tolist()
arr2 = np.array(tmp)
arr2

array([8.27950248e-12, 8.75298876e-12, 1.09282826e-11, ...,
       1.20092640e-11, 8.98467200e-12, 9.83753997e-12])

In [30]:
from sklearn.metrics import roc_curve

ModuleNotFoundError: No module named 'sklearn'

In [40]:
import numpy as np
from sklearn import metrics

# y holds the answers rescaled to 0/1 (arr1 = (Ans + 1) / 2) and scores holds
# the raw predicate outputs.
y = arr1
scores = arr2

# The positive class is encoded as 1 in y. The previous pos_label=2 matched no
# sample at all, which leaves the true-positive rate undefined.
fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=1)



In [43]:
metrics.roc_auc_score(y, scores)

0.6069573224852072

In [None]:
# Answers and scores for the albatross predicate, pulled out as plain lists.
tmp1 = res_dict['albatross(x)']['Ans'].tolist()
tmp2 = res_dict['albatross(x)']['pred_val'].tolist()

# (label + 1) / 2 rescales the -1/+1 answers to 0/1.
arr1 = (np.array(tmp1) + 1) / 2
arr2 = np.array(tmp2)
y, scores = arr1, arr2

metrics.roc_auc_score(y, scores)

In [None]:
res_dict['cheetah(x)']

In [44]:
# Answers and scores for the cheetah predicate, pulled out as plain lists.
tmp1 = res_dict['cheetah(x)']['Ans'].tolist()
tmp2 = res_dict['cheetah(x)']['pred_val'].tolist()

# (label + 1) / 2 rescales the -1/+1 answers to 0/1.
arr1 = (np.array(tmp1) + 1) / 2
arr2 = np.array(tmp2)
y, scores = arr1, arr2

metrics.roc_auc_score(y, scores)

0.4688070266272189

In [46]:
score_dict = {}

# Iterating a dict yields its keys, i.e. the predicate names.
for predicate in res_dict:
    # Pull the answer and score columns out as plain lists first.
    tmp1 = res_dict[predicate]['Ans'].tolist()
    tmp2 = res_dict[predicate]['pred_val'].tolist()

    # (label + 1) / 2 rescales the -1/+1 answers to 0/1.
    arr1 = (np.array(tmp1) + 1) / 2
    arr2 = np.array(tmp2)
    y, scores = arr1, arr2

    # ROC AUC of the raw predicate outputs against the 0/1 answers.
    roc_auc = metrics.roc_auc_score(y, scores)
    score_dict[predicate] = roc_auc

score_dict

{'albatross(x)': 0.6069573224852072,
 'cheetah(x)': 0.4688070266272189,
 'ostrich(x)': 0.5000685650887574,
 'penguin(x)': 0.5360473372781065,
 'zebra(x)': 0.5319147189349113}

In [52]:
np.array(list(score_dict.values())).mean()

0.5287589940828402

In [79]:
# Test data.
# NOTE(review): './inputs/winston_10' is the same directory that was passed to
# Setup as the training data directory for this section, so the metrics below
# are presumably measured on the training samples -- confirm this is intended.
test_data_dir_path = './inputs/winston_10'
test_animals = ['albatross', 'cheetah', 'ostrich', 'penguin', 'zebra']

test_data = problem_instance.prepare_test_data(test_data_dir_path, test_animals)

def test_trained_predicate(problem_instance, test_data):
    result_dict = {}
    p_dict = problem_instance.predicates_dict

    for p_name, p_data in test_data.items():
        pred_vals = []
        preds = []

        p = p_dict[p_name]
        cnt = 0

        for data in p_data:
            x, ans = data[:-1], data[-1]
            pred_val = p(x).value

            pred_vals.append(pred_val)

            if (pred_val >= 0.5 and ans == 1) or (pred_val < 0.5 and ans == -1):
                cnt += 1

            pred = (pred_val >= 0.5 and ans == 1) or (pred_val < 0.5 and ans == -1)
            
            preds.append(pred)

        p_arr = np.hstack([p_data, np.array(pred_vals).reshape(-1, 1), np.array(preds).reshape(-1, 1)])
        p_df = pd.DataFrame(p_arr, columns=['r', 'g', 'b', 'Ans', 'pred_val', 'pred'])

        result_dict[p_name] = p_df
        
        print(cnt)
        print(f'Accuracy of {p_name}: {cnt / len(p_data)}')

    return result_dict


res_dict = test_trained_predicate(problem_instance, test_data)

8
Accuracy of albatross(x): 0.8
8
Accuracy of cheetah(x): 0.8
8
Accuracy of ostrich(x): 0.8
8
Accuracy of penguin(x): 0.8
8
Accuracy of zebra(x): 0.8


In [80]:
score_dict = {}

# Iterating a dict yields its keys, i.e. the predicate names.
for predicate in res_dict:
    # Pull the answer and score columns out as plain lists first.
    tmp1 = res_dict[predicate]['Ans'].tolist()
    tmp2 = res_dict[predicate]['pred_val'].tolist()

    # (label + 1) / 2 rescales the -1/+1 answers to 0/1.
    arr1 = (np.array(tmp1) + 1) / 2
    arr2 = np.array(tmp2)
    y, scores = arr1, arr2

    # ROC AUC of the raw predicate outputs against the 0/1 answers.
    roc_auc = metrics.roc_auc_score(y, scores)
    score_dict[predicate] = roc_auc

score_dict

{'albatross(x)': 0.6875,
 'cheetah(x)': 0.5,
 'ostrich(x)': 0.5625,
 'penguin(x)': 0.25,
 'zebra(x)': 0.5625}

In [81]:
np.array(list(score_dict.values())).mean()

0.5125

In [85]:
problem_instance.L[0]

array([[ 0.51548432,  0.4533864 ,  0.38860145, -1.        ],
       [ 0.46517115,  0.46238609,  0.45014919, -1.        ],
       [ 0.50268406,  0.51702762,  0.50812705,  1.        ],
       [ 0.51562029,  0.47202459,  0.40856782,  1.        ],
       [ 0.50500199,  0.50415852,  0.46693104,  1.        ],
       [ 0.57393299,  0.50544942,  0.29529846,  1.        ],
       [ 0.48814955,  0.45962042,  0.28293   , -1.        ],
       [ 0.53528694,  0.52298571,  0.32538035, -1.        ],
       [ 0.46227439,  0.42251048,  0.42294233,  1.        ],
       [ 0.5535101 ,  0.54497347,  0.4811481 ,  1.        ]])

In [83]:
problem_instance.w_j.value

array([[ 2.60831588e-10,  6.79940229e-11,  1.83178064e-10,
         1.00000000e+00],
       [-7.05782491e-11,  3.82506821e-10, -1.04369485e-10,
         1.17633008e-11],
       [-1.67505156e-11,  1.03618471e-10,  1.73866409e-12,
        -3.01842257e-11],
       [-8.63928421e-11, -4.17016010e-11, -2.87835298e-11,
         9.18702336e-11],
       [-1.51992243e-11,  3.41715304e-11,  6.07682149e-11,
        -2.00296456e-11],
       [ 4.66919037e-12,  4.99943176e-11,  1.21791600e-10,
        -6.15018396e-11],
       [-4.16915678e-11, -5.17379371e-12, -3.72537076e-11,
         5.12713952e-11],
       [ 1.23577878e-10,  4.50833147e-11, -9.70121714e-11,
         6.04569639e-11],
       [ 1.45145140e-11,  6.95727806e-11,  7.55450069e-11,
        -5.60178977e-11],
       [-2.35587002e-12,  6.59392982e-11,  3.58122202e-11,
        -4.65169238e-11],
       [ 7.69595260e-12,  1.09849370e-10,  4.45353838e-11,
        -5.46680746e-11],
       [ 1.10634693e-10,  3.02377940e-10,  1.29795302e-10,
      

## 各 p に対して，教師データの数が 100 個

In [61]:
data_dir_path = './inputs/winston_100'

# os.listdir returns entries in arbitrary order; sort them so the order of the
# data sets handed to Setup is identical on every run and on every machine.
file_list = sorted(os.listdir(data_dir_path))

# Supervised files start with 'L', unsupervised files with 'U'. Setup receives
# the names without the '.csv' extension, so strip it with os.path.splitext
# (robust even if '.csv' happens to appear elsewhere in the name).
L_files = [os.path.splitext(filename)[0] for filename in file_list
           if filename.startswith('L') and filename.endswith('.csv')]

U_files = [os.path.splitext(filename)[0] for filename in file_list
           if filename.startswith('U') and filename.endswith('.csv')]

file_names_dict = {
    'supervised': L_files,
    'unsupervised': U_files,
    'rule': ['rules_2']
}

# Build the CVXPY objective and constraints for this data set.
problem_instance = Setup(data_dir_path, file_names_dict)
objective, constraints = problem_instance.main(c1=10, c2=10)

# Time problem construction + solve with a monotonic, high-resolution clock;
# time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
problem = cp.Problem(objective, constraints)
result = problem.solve(verbose=True)
end_time = time.perf_counter()
print()
print(f'学習時間: {end_time - start_time} 秒')

Loading data ...
Done in 0.06169414520263672 seconds! 

Loading rules ...
Done in 0.00022840499877929688 seconds! 

Identifying predicates ...
Done in 0.0018243789672851562 seconds! 

Constructing objective function ...
Done in 0.33126401901245117 seconds! 

Constructing constraints ...
Done in 3.79935622215271 seconds! 

All done. 

                                     CVXPY                                     
                                     v1.3.2                                    




(CVXPY) Oct 28 04:43:16 PM: Your problem has 3288 variables, 11820 constraints, and 0 parameters.
(CVXPY) Oct 28 04:43:18 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Oct 28 04:43:18 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 28 04:43:18 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Oct 28 04:43:19 PM: Compiling problem (target solver=ECOS).
(CVXPY) Oct 28 04:43:19 PM: Reduction chain: Dcp2Cone -> CvxAttr2Constr -> ConeMatrixStuffing -> ECOS
(CVXPY) Oct 28 04:43:19 PM: Applying reduction Dcp2Cone
(CVXPY) Oct 28 04:43:23 PM: Applying reduction CvxAttr2Constr
(CVXPY) Oct 28 04:43:26 PM: Ap

In [62]:
# Test data.
# NOTE(review): './inputs/winston_100' is the same directory that was passed to
# Setup as the training data directory for this section, so the metrics below
# are presumably measured on the training samples -- confirm this is intended.
test_data_dir_path = './inputs/winston_100'
test_animals = ['albatross', 'cheetah', 'ostrich', 'penguin', 'zebra']

test_data = problem_instance.prepare_test_data(test_data_dir_path, test_animals)

def test_trained_predicate(problem_instance, test_data):
    result_dict = {}
    p_dict = problem_instance.predicates_dict

    for p_name, p_data in test_data.items():
        pred_vals = []
        preds = []

        p = p_dict[p_name]
        cnt = 0

        for data in p_data:
            x, ans = data[:-1], data[-1]
            pred_val = p(x).value

            pred_vals.append(pred_val)

            if (pred_val >= 0.5 and ans == 1) or (pred_val < 0.5 and ans == -1):
                cnt += 1

            pred = (pred_val >= 0.5 and ans == 1) or (pred_val < 0.5 and ans == -1)
            
            preds.append(pred)

        p_arr = np.hstack([p_data, np.array(pred_vals).reshape(-1, 1), np.array(preds).reshape(-1, 1)])
        p_df = pd.DataFrame(p_arr, columns=['r', 'g', 'b', 'Ans', 'pred_val', 'pred'])

        result_dict[p_name] = p_df
        
        print(cnt)
        print(f'Accuracy of {p_name}: {cnt / len(p_data)}')

    return result_dict


res_dict = test_trained_predicate(problem_instance, test_data)

80
Accuracy of albatross(x): 0.8
80
Accuracy of cheetah(x): 0.8
80
Accuracy of ostrich(x): 0.8
80
Accuracy of penguin(x): 0.8
80
Accuracy of zebra(x): 0.8


In [64]:
score_dict = {}

# Iterating a dict yields its keys, i.e. the predicate names.
for predicate in res_dict:
    # Pull the answer and score columns out as plain lists first.
    tmp1 = res_dict[predicate]['Ans'].tolist()
    tmp2 = res_dict[predicate]['pred_val'].tolist()

    # (label + 1) / 2 rescales the -1/+1 answers to 0/1.
    arr1 = (np.array(tmp1) + 1) / 2
    arr2 = np.array(tmp2)
    y, scores = arr1, arr2

    # ROC AUC of the raw predicate outputs against the 0/1 answers.
    roc_auc = metrics.roc_auc_score(y, scores)
    score_dict[predicate] = roc_auc

score_dict

{'albatross(x)': 0.2075,
 'cheetah(x)': 0.33375,
 'ostrich(x)': 0.47937500000000005,
 'penguin(x)': 0.58,
 'zebra(x)': 0.64375}

In [65]:
np.array(list(score_dict.values())).mean()

0.44887499999999997

In [74]:
problem_instance.L

array([[[ 0.55059907,  0.52967695,  0.48787809, -1.        ],
        [ 0.47657505,  0.42794515,  0.35911005, -1.        ],
        [ 0.56701418,  0.49737742,  0.39688024, -1.        ],
        ...,
        [ 0.455731  ,  0.495668  ,  0.49332179,  1.        ],
        [ 0.37638097,  0.39618622,  0.43324492,  1.        ],
        [ 0.40472473,  0.37958853,  0.35013862,  1.        ]],

       [[ 0.515991  ,  0.48633653,  0.43161938,  1.        ],
        [ 0.58406314,  0.51277585,  0.40726625,  1.        ],
        [ 0.53864228,  0.49065586,  0.42627378,  1.        ],
        ...,
        [ 0.57286635,  0.54406587,  0.50548818, -1.        ],
        [ 0.45651196,  0.43114004,  0.34140452, -1.        ],
        [ 0.45922861,  0.46966501,  0.43414376, -1.        ]],

       [[ 0.55953692,  0.46860704,  0.35988031, -1.        ],
        [ 0.48022717,  0.38520612,  0.28788724, -1.        ],
        [ 0.50619979,  0.50675476,  0.38912861, -1.        ],
        ...,
        [ 0.30138667,  0.29

In [76]:
problem_instance.w_j.value

array([[-6.22194326e-10, -1.67215930e-10,  4.83517404e-10,
         1.00000000e+00],
       [ 6.24614160e-10,  2.03726727e-10, -3.25341291e-10,
         2.39991175e-11],
       [ 1.05886824e-10,  3.73121621e-11,  6.89195289e-11,
        -6.91145758e-11],
       [ 7.77525917e-11,  6.21715395e-11, -6.37975124e-11,
        -1.35195124e-11],
       [ 1.85608191e-10, -2.89058670e-11,  6.78827251e-11,
        -7.06985236e-11],
       [-1.94752526e-11, -9.93869495e-11,  4.00668699e-10,
        -6.31791870e-11],
       [ 1.35949015e-10, -1.83556676e-11, -5.34086023e-11,
        -8.49233437e-12],
       [ 4.81520210e-10, -2.36733644e-11, -2.64315559e-10,
         8.98691499e-11],
       [ 1.19832127e-10, -3.84207516e-11, -3.68458714e-11,
         2.01170036e-12],
       [ 5.11561393e-11, -5.05152970e-11,  9.58008199e-11,
        -5.10788208e-11],
       [ 1.74441167e-10, -1.45884540e-10,  1.90239842e-10,
        -6.72439015e-11],
       [ 4.89699298e-11, -1.38864802e-10,  1.75209126e-11,
      

In [40]:
# Test data: evaluate against the full data set directory.
test_data_dir_path = './inputs/winston_full'
test_animals = ['albatross', 'cheetah', 'ostrich', 'penguin', 'zebra']

test_data = problem_instance.prepare_test_data(test_data_dir_path, test_animals)

def test_trained_predicate(problem_instance, test_data):
    p_dict = problem_instance.predicates_dict
    
    for p_name, p_data in test_data.items():
        p = p_dict[p_name]
        cnt = 0

        for data in p_data:
            x, ans = data[:-1], data[-1]
            pred = p(x).value

            if (pred >= 0.5 and ans == 1) or (pred < 0.5 and ans == -1):
                cnt += 1
        
        print(cnt)
        print(f'Accuracy of {p_name}: {cnt / len(p_data)}')


test_trained_predicate(problem_instance, test_data)

5200
Accuracy of albatross(x): 0.8
5200
Accuracy of cheetah(x): 0.8
5200
Accuracy of ostrich(x): 0.8
5200
Accuracy of penguin(x): 0.8
5200
Accuracy of zebra(x): 0.8


In [86]:
# Toy problem: three supervised predicate files, one unsupervised file, one rule file.
data_dir_path = './inputs/toy_data'
file_names_dict = {
    'supervised': ['L_p1(x)', 'L_p2(x)', 'L_p3(x)'],
    'unsupervised': ['U'],
    'rule': ['rules'],
}

# Build the CVXPY objective and constraints, then solve with solver logging on.
problem_instance = Setup(data_dir_path, file_names_dict)
objective, constraints = problem_instance.main(c1=2.5, c2=100)
problem = cp.Problem(objective, constraints)
result = problem.solve(verbose=True)

Loading data ...
Done in 0.0071048736572265625 seconds! 

Loading rules ...
Done in 0.00010180473327636719 seconds! 

Identifying predicates ...
Done in 0.0003237724304199219 seconds! 

Constructing objective function ...
Done in 0.0020737648010253906 seconds! 

Constructing constraints ...
Done in 0.02717280387878418 seconds! 

All done. 

                                     CVXPY                                     
                                     v1.3.2                                    
(CVXPY) Oct 28 05:16:19 PM: Your problem has 25 variables, 96 constraints, and 0 parameters.
(CVXPY) Oct 28 05:16:19 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Oct 28 05:16:19 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 28 05:16:19 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
------------------------------------------------------------

In [95]:
# Test data.
# NOTE(review): './inputs/toy_data' is the same directory that was passed to
# Setup as the training data directory for the toy problem, so the metrics below
# are presumably measured on the training samples -- confirm this is intended.
test_data_dir_path = './inputs/toy_data'
test_p_names = ['p1', 'p2', 'p3']

test_data = problem_instance.prepare_test_data(test_data_dir_path, test_p_names)


def test_trained_predicate(problem_instance, test_data):
    result_dict = {}
    p_dict = problem_instance.predicates_dict

    for p_name, p_data in test_data.items():
        pred_vals = []
        preds = []

        p = p_dict[p_name]
        cnt = 0

        for data in p_data:
            x, ans = data[:-1], data[-1]
            pred_val = p(x).value

            pred_vals.append(pred_val)

            if (pred_val >= 0.5 and ans == 1) or (pred_val < 0.5 and ans == -1):
                cnt += 1

            pred = (pred_val >= 0.5 and ans == 1) or (pred_val < 0.5 and ans == -1)
            
            preds.append(pred)

        p_arr = np.hstack([p_data, np.array(pred_vals).reshape(-1, 1), np.array(preds).reshape(-1, 1)])
        p_df = pd.DataFrame(p_arr, columns=['r', 'g', 'Ans', 'pred_val', 'pred'])

        result_dict[p_name] = p_df
        
        print(cnt)
        print(f'Accuracy of {p_name}: {cnt / len(p_data)}')

    return result_dict


res_dict = test_trained_predicate(problem_instance, test_data)


score_dict = {}

# Iterating a dict yields its keys, i.e. the predicate names.
for predicate in res_dict:
    # Pull the answer and score columns out as plain lists first.
    tmp1 = res_dict[predicate]['Ans'].tolist()
    tmp2 = res_dict[predicate]['pred_val'].tolist()

    # (label + 1) / 2 rescales the -1/+1 answers to 0/1.
    arr1 = (np.array(tmp1) + 1) / 2
    arr2 = np.array(tmp2)
    y, scores = arr1, arr2

    # ROC AUC of the raw predicate outputs against the 0/1 answers.
    roc_auc = metrics.roc_auc_score(y, scores)
    score_dict[predicate] = roc_auc

score_dict

3
Accuracy of p1(x): 0.75
4
Accuracy of p2(x): 1.0
4
Accuracy of p3(x): 1.0


{'p1(x)': 1.0, 'p2(x)': 1.0, 'p3(x)': 1.0}

In [93]:
res_dict

{'p1(x)':      r    g  Ans      pred_val  pred
 0  0.1  0.5 -1.0  1.897741e-10   1.0
 1  0.4  0.4 -1.0  2.202232e-02   1.0
 2  0.3  0.8  1.0  4.965737e-01   0.0
 3  0.9  0.7  1.0  6.720435e-01   1.0,
 'p2(x)':      r    g  Ans  pred_val  pred
 0  0.1  0.3 -1.0  0.081905   1.0
 1  0.6  0.4 -1.0  0.276013   1.0
 2  0.2  0.8  1.0  0.817760   1.0
 3  0.7  0.6  1.0  0.576222   1.0,
 'p3(x)':      r    g  Ans  pred_val  pred
 0  0.4  0.2 -1.0  0.173206   1.0
 1  0.9  0.3 -1.0  0.173945   1.0
 2  0.2  0.6  1.0  0.872630   1.0
 3  0.5  0.7  1.0  0.936685   1.0}

In [92]:
score_dict

{'p1(x)': 1.0, 'p2(x)': 1.0, 'p3(x)': 1.0}