In [6]:
import numpy as np
import h5py
from tqdm import tqdm
import pandas as pd
import random
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams.update({"text.usetex": True, "font.family": "serif", "font.serif": ["Computer Modern Roman"]})

from utils import global_monotonicity_violation, global_convexity_violation, peaking_detection

In [7]:
anchor_list_lcdb10 = np.ceil(16 * 2 ** ((np.arange(35)) / 2)).astype(int)
learner_zoo_LCDB10 = [ 'SVC_linear', 'SVC_poly', 'SVC_rbf', 'SVC_sigmoid', 'sklearn.tree.DecisionTreeClassifier', 'sklearn.tree.ExtraTreeClassifier', 'sklearn.linear_model.LogisticRegression', 'sklearn.linear_model.PassiveAggressiveClassifier', 'sklearn.linear_model.Perceptron', 'sklearn.linear_model.RidgeClassifier', 'sklearn.linear_model.SGDClassifier', 'sklearn.neural_network.MLPClassifier', 'sklearn.discriminant_analysis.LinearDiscriminantAnalysis', 'sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis', 'sklearn.naive_bayes.BernoulliNB', 'sklearn.naive_bayes.MultinomialNB', 'sklearn.neighbors.KNeighborsClassifier', 'sklearn.ensemble.ExtraTreesClassifier', 'sklearn.ensemble.RandomForestClassifier', 'sklearn.ensemble.GradientBoostingClassifier' ]
# claim data id
data_ids_LCDB10_github = [3, 6, 12, 14, 16, 18, 21, 22, 23, 24, 26, 28, 30, 31, 36, 38, 44, 46, 54, 57, 60, 179, 180, 181, 182, 183, 184, 185, 188, 273, 293, 300, 351, 354, 357, 389, 390, 391, 392, 393, 395, 396, 398, 399, 401, 485, 554, 679, 715, 718, 720, 722, 723, 727, 728, 734, 735, 737, 740, 741, 743, 751, 752, 761, 772, 797, 799, 803, 806, 807, 813, 816, 819, 821, 822, 823, 833, 837, 843, 845, 846, 847, 849, 866, 871, 881, 897, 901, 903, 904, 910, 912, 913, 914, 917, 923, 930, 934, 953, 958, 959, 962, 966, 971, 976, 977, 978, 979, 980, 991, 993, 995, 1000, 1002, 1018, 1019, 1020, 1021, 1036, 1037, 1039, 1040, 1041, 1042, 1049, 1050, 1053, 1059, 1067, 1068, 1069, 1111, 1116, 1119, 1120, 1128, 1130, 1134, 1138, 1139, 1142, 1146, 1161, 1166, 1216, 1242, 1457, 1461, 1464, 1468, 1475, 1485, 1486, 1487, 1489, 1494, 1501, 1515, 1569, 1590, 4134, 4135, 4136, 4137, 4534, 4538, 4541, 4552, 23380, 23512, 23517, 40497, 40498, 40668, 40670, 40685, 40691, 40701, 40900, 40926, 40971, 40975, 40978, 40981, 40982, 40983, 40984, 40996, 41026, 41027, 41064, 41065, 41066, 41138, 41142, 41143, 41144, 41145, 41146, 41147, 41150, 41156, 41157, 41158, 41159, 41161, 41162, 41163, 41164, 41165, 41166, 41167, 41168, 41169, 41946, 42732, 42733, 42734]
# real lcdb 1.0
data_ids_LCDB10 = [3, 6, 12, 14, 16, 18, 21, 22, 23, 24, 26, 28, 30, 31, 36, 38, 44, 46, 54, 57, 60, 179, 180, 181, 182, 184, 185, 188, 273, 293, 300, 351, 354, 357, 389, 390, 391, 392, 393, 395, 396, 398, 399, 401, 554, 679, 715, 718, 720, 722, 723, 727, 728, 734, 735, 737, 740, 741, 743, 751, 752, 761, 772, 797, 799, 803, 806, 807, 813, 816, 819, 821, 822, 823, 833, 837, 843, 845, 846, 847, 849, 866, 871, 881, 897, 901, 903, 904, 910, 912, 913, 914, 917, 923, 930, 934, 953, 958, 959, 962, 966, 971, 976, 977, 978, 979, 980, 991, 993, 995, 1000, 1002, 1018, 1019, 1020, 1021, 1036, 1040, 1041, 1042, 1049, 1050, 1053, 1067, 1068, 1069, 1111, 1116, 1119, 1120, 1128, 1130, 1134, 1138, 1139, 1142, 1146, 1161, 1166, 1216, 1457, 1461, 1464, 1468, 1475, 1485, 1486, 1487, 1489, 1494, 1515, 1590, 4134, 4135, 4137, 4534, 4538, 4541, 23512, 23517, 40498, 40668, 40670, 40685, 40701, 40900, 40971, 40975, 40978, 40981, 40982, 40983, 40984, 40996, 41027, 41142, 41143, 41144, 41145, 41146, 41150, 41156, 41157, 41158, 41159, 41161, 41163, 41164, 41165, 41166, 41167, 41168, 41169, 42732, 42733, 42734]


In [8]:
df = pd.read_csv('database-accuracy.csv') 
array = np.full((len(data_ids_LCDB10), len(learner_zoo_LCDB10), 5, 5, len(anchor_list_lcdb10), 3), np.nan)

# mapping dictionary
id_map = {v: i for i, v in enumerate(data_ids_LCDB10)}
learner_map = {v: i for i, v in enumerate(learner_zoo_LCDB10)}
size_map = {v: i for i, v in enumerate(anchor_list_lcdb10)}


for _, row in tqdm(df.iterrows(), total=len(df)):
    if (
        row['openmlid'] in id_map and
        row['learner'] in learner_map and
        row['size_train'] in size_map and
        row['outer_seed'] <= 4 and
        row['inner_seed'] <= 4
    ):
        i = id_map[row['openmlid']]
        j = learner_map[row['learner']]
        k = int(row['outer_seed'])
        l = int(row['inner_seed'])
        m = size_map[row['size_train']]
        array[i, j, k, l, m, 1] = 1 - row['score_valid']


100%|██████████| 1953371/1953371 [00:45<00:00, 42797.32it/s]


In [9]:
results = []
mono_matrix_y, _ = global_monotonicity_violation(array, flat_filter = True, bonferroni=True, dipping=False, anchor_list = anchor_list_lcdb10)
conv_matrix, _, _, _ = global_convexity_violation(array, flat_filter = True, bonferroni = True, anchor_list = anchor_list_lcdb10)
dipping_matrix_y, _ = global_monotonicity_violation(array, flat_filter = True, bonferroni=True, dipping=True, anchor_list = anchor_list_lcdb10)
peak_matrix, _, _, _ = peaking_detection(array, flat_filter = True, bonferroni = True, anchor_list = anchor_list_lcdb10)
results.append({
    "mono_matrix_y": mono_matrix_y,
    "conv_matrix": conv_matrix,
    "dipping_matrix_y": dipping_matrix_y,
    "peak_matrix": peak_matrix,
})



100%|██████████| 196/196 [00:01<00:00, 187.00it/s]
100%|██████████| 196/196 [00:02<00:00, 95.14it/s] 
100%|██████████| 196/196 [00:00<00:00, 268.37it/s]
100%|██████████| 196/196 [00:02<00:00, 71.38it/s]


In [16]:
mono_matrix_y = results[0]["mono_matrix_y"]
conv_matrix = results[0]["conv_matrix"]
missing = (np.isnan(mono_matrix_y).sum() / mono_matrix_y.size) * 100
print(f"Missing Ratio: {missing:.2f}%") 

print(f"Flat: {(np.sum(mono_matrix_y == -1) / mono_matrix_y.size) * 100:.2f}%")

print(f"Monotone (M): {(np.sum(mono_matrix_y == 0) / mono_matrix_y.size) * 100:.2f}%")
print(f"Convex (C): {(np.sum(conv_matrix == 0) / conv_matrix.size) * 100:.2f}%")

print(f"Well-behaved (M+C): {(np.sum((conv_matrix == 0) & (mono_matrix_y == 0)) / mono_matrix_y.size) * 100:.2f}%")

print(f"Monotonicity Violation: {(np.sum(mono_matrix_y > 0) / mono_matrix_y.size) * 100:.2f}%")
print(f"Convexity Violation: {(np.sum(conv_matrix > 0) / conv_matrix.size) * 100:.2f}%")

dipping_matrix_y = results[0]["dipping_matrix_y"]
peak_matrix = results[0]["peak_matrix"]

print(f"Peaking: {(np.sum(peak_matrix > 0) / peak_matrix.size) * 100:.2f}%")
print(f"Dipping: {(np.sum(dipping_matrix_y > 0) / dipping_matrix_y.size) * 100:.2f}%")
print("-----------------------")

Missing Ratio: 11.89%
Flat: 5.10%
Monotone (M): 75.79%
Convex (C): 75.61%
Well-behaved (M+C): 73.14%
Monotonicity Violation: 7.22%
Convexity Violation: 7.40%
Peaking: 3.95%
Dipping: 5.15%
-----------------------


### interpolate into denser version for maintaining the same bonferroni correction

In [None]:
from scipy.interpolate import interp1d

anchor_list_denser = np.ceil(16 * 2 ** (np.arange(137) / 8)).astype(int)

denser_array = np.full((196, 20, 5, 5, len(anchor_list_denser), 3), np.nan, dtype=array.dtype)

for a in range(196):
    for b in range(20):
        for c in range(5):
            for d in range(5):
                curve = array[a, b, c, d, :, 1]  # shape: (35,)
                
                valid_mask = ~np.isnan(curve)
                valid_curve = curve[valid_mask]
                if valid_curve.shape[0] < 2:
                    continue

                x_old = anchor_list_lcdb10[valid_mask]
                y_old = valid_curve

                # with in exist anchors
                x_min, x_max = x_old[0], x_old[-1]
                x_new = anchor_list_denser[(anchor_list_denser >= x_min) & (anchor_list_denser <= x_max)]

                if len(x_new) < 2:
                    continue  

                # interpolation
                interp_func = interp1d(x_old, y_old, kind='linear', bounds_error=False)
                y_new = interp_func(x_new)

                denser_array[a, b, c, d, :len(y_new), 1] = y_new


In [39]:
results = []
mono_matrix_y, _ = global_monotonicity_violation(denser_array, flat_filter = True, bonferroni=True, dipping=False, anchor_list = anchor_list_denser)
conv_matrix, _, _, _ = global_convexity_violation(denser_array, flat_filter = True, bonferroni = True, anchor_list = anchor_list_denser)
dipping_matrix_y, _ = global_monotonicity_violation(denser_array, flat_filter = True, bonferroni=True, dipping=True, anchor_list = anchor_list_denser)
peak_matrix, _, _, _ = peaking_detection(denser_array, flat_filter = True, bonferroni = True, anchor_list = anchor_list_denser)
results.append({
    "mono_matrix_y": mono_matrix_y,
    "conv_matrix": conv_matrix,
    "dipping_matrix_y": dipping_matrix_y,
    "peak_matrix": peak_matrix,
})



100%|██████████| 196/196 [00:03<00:00, 58.28it/s]
100%|██████████| 196/196 [03:40<00:00,  1.13s/it]
100%|██████████| 196/196 [00:04<00:00, 43.95it/s]
100%|██████████| 196/196 [04:43<00:00,  1.45s/it]


In [40]:
mono_matrix_y = results[0]["mono_matrix_y"]
conv_matrix = results[0]["conv_matrix"]
missing = (np.isnan(mono_matrix_y).sum() / mono_matrix_y.size) * 100
print(f"Missing Ratio: {missing:.2f}%") 

print(f"Flat: {(np.sum(mono_matrix_y == -1) / mono_matrix_y.size) * 100:.2f}%")

print(f"Monotone (M): {(np.sum(mono_matrix_y == 0) / mono_matrix_y.size) * 100:.2f}%")
print(f"Convex (C): {(np.sum(conv_matrix == 0) / conv_matrix.size) * 100:.2f}%")

print(f"Well-behaved (M+C): {(np.sum((conv_matrix == 0) & (mono_matrix_y == 0)) / mono_matrix_y.size) * 100:.2f}%")

print(f"Monotonicity Violation: {(np.sum(mono_matrix_y > 0) / mono_matrix_y.size) * 100:.2f}%")
print(f"Convexity Violation: {(np.sum(conv_matrix > 0) / conv_matrix.size) * 100:.2f}%")

dipping_matrix_y = results[0]["dipping_matrix_y"]
peak_matrix = results[0]["peak_matrix"]

print(f"Peaking: {(np.sum(peak_matrix > 0) / peak_matrix.size) * 100:.2f}%")
print(f"Dipping: {(np.sum(dipping_matrix_y > 0) / dipping_matrix_y.size) * 100:.2f}%")
print("-----------------------")

Missing Ratio: 11.89%
Flat: 5.20%
Monotone (M): 78.32%
Convex (C): 78.62%
Well-behaved (M+C): 76.48%
Monotonicity Violation: 4.59%
Convexity Violation: 4.29%
Peaking: 2.30%
Dipping: 4.01%
-----------------------
