In [1]:
import numpy as np
import h5py
from tqdm import tqdm
import pandas as pd
import random
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams.update({"text.usetex": True, "font.family": "serif", "font.serif": ["Computer Modern Roman"]})

from utils import global_monotonicity_violation, global_convexity_violation, peaking_detection
from meta_feature import dataset_ids_CC18, anchor_list_denser, feature_num_CC18, class_num_CC18, learner_zoo

In [None]:
file_paths = [  Path.cwd() / '../dataset/LCDB11_ER_CC18_noFS_raw.hdf5',
                Path.cwd() / '../dataset/LCDB11_ER_CC18_minmaxFS_raw.hdf5',
                Path.cwd() / '../dataset/LCDB11_ER_CC18_standardFS_raw.hdf5']
# file_paths = [  Path.cwd() / '../dataset/LCDB11_ER_265_noFS_raw_compress.hdf5',
#                 Path.cwd() / '../dataset/LCDB11_ER_265_minmaxFS_raw_compress.hdf5',
#                 Path.cwd() / '../dataset/LCDB11_ER_265_standardFS_raw_compress.hdf5']
dataset_nofs, dataset_minmaxfs, dataset_standardfs = [h5py.File(fp, 'r')['error_rate'][...] for fp in file_paths]
datasets = [dataset_nofs, dataset_minmaxfs, dataset_standardfs]

### Monotonicity and Convexity

In [3]:
results = []
for DATASET in datasets: 
    mono_matrix_y, mono_matrix_x = global_monotonicity_violation(DATASET, flat_filter = True)
    conv_matrix, conv_h_matrix, conv_i_matrix, conv_j_matrix = global_convexity_violation(DATASET, flat_filter = True)
    results.append({
        "mono_matrix_y": mono_matrix_y,
        "mono_matrix_x": mono_matrix_x,
        "conv_matrix": conv_matrix,
        "conv_h_matrix": conv_h_matrix,
        "conv_i_matrix": conv_i_matrix,
        "conv_j_matrix": conv_j_matrix,
    })

In [4]:
for result in results:
    mono_matrix_y = result["mono_matrix_y"]
    conv_matrix = result["conv_matrix"]
    missing = (np.isnan(mono_matrix_y).sum() / mono_matrix_y.size) * 100
    print(f"Missing Ratio: {missing:.2f}%") 

    flat_percentage_learner = (np.sum(mono_matrix_y == -1) / mono_matrix_y.size) * 100
    print(f"Flat: {flat_percentage_learner:.2f}%")

    mono_viola_percentage = (np.sum(mono_matrix_y == 0) / mono_matrix_y.size) * 100
    print(f"Monotone: {mono_viola_percentage:.2f}%")
    conv_viola_percentage = (np.sum(conv_matrix == 0) / conv_matrix.size) * 100
    print(f"Convex: {conv_viola_percentage:.2f}%")

    both_no_viola_percentage = (np.sum((conv_matrix == 0) & (mono_matrix_y == 0)) / mono_matrix_y.size) * 100
    print(f"Well-behaved: {both_no_viola_percentage:.2f}%")
    print("-----------------------")

Missing Ratio: 2.89%
Flat: 11.05%
Monotone: 77.01%
Convex: 77.45%
Well-behaved: 74.51%
-----------------------
Missing Ratio: 0.38%
Flat: 10.49%
Monotone: 79.25%
Convex: 80.71%
Well-behaved: 76.95%
-----------------------
Missing Ratio: 8.33%
Flat: 8.00%
Monotone: 74.81%
Convex: 75.71%
Well-behaved: 72.89%
-----------------------


In [7]:
all_learner_stats = []

for result in results:
    mono_matrix_y = result["mono_matrix_y"]
    conv_matrix = result["conv_matrix"]
    learner_stats = [{"learner": learner_zoo[i]} for i in range(24)]

    for i in range(24):
        mono_y_learner = mono_matrix_y[i, :]
        conv_learner = conv_matrix[i, :]

        missing_learner = (np.isnan(mono_y_learner).sum() / mono_y_learner.size) * 100
        flat_percentage_learner = (np.sum(mono_y_learner == -1) / mono_y_learner.size) * 100
        mono_viola_percentage_learner = (np.sum(mono_y_learner == 0) / mono_y_learner.size) * 100
        conv_viola_percentage_learner = (np.sum(conv_learner == 0) / conv_learner.size) * 100
        both_no_viola_percentage_learner = (np.sum((conv_learner == 0) & (mono_y_learner == 0)) / mono_y_learner.size) * 100

        learner_stats[i].update({
            "missing": missing_learner,
            "flat": flat_percentage_learner,
            "monotone": mono_viola_percentage_learner,
            "convex": conv_viola_percentage_learner,
            "well_behaved": both_no_viola_percentage_learner,
        })

    all_learner_stats.append({
        "learner_stats": learner_stats
    })

for dataset_result in all_learner_stats:
    for learner_stat in dataset_result["learner_stats"]:
        print(
            f"Learner {learner_stat['learner']}: "
            # f"Missing={learner_stat['missing']:.2f}%, "
            f"Flat={learner_stat['flat']:.2f}%, "
            f"Monotone={learner_stat['monotone']:.2f}%, "
            # f"Convex={learner_stat['convex']:.2f}%, "
            f"Well-behaved={learner_stat['well_behaved']:.2f}%"
        )
    print("\n")


Learner SVC_linear: Flat=0.00%, Monotone=95.83%, Well-behaved=94.44%
Learner SVC_poly: Flat=19.44%, Monotone=80.56%, Well-behaved=79.17%
Learner SVC_rbf: Flat=19.44%, Monotone=80.56%, Well-behaved=76.39%
Learner SVC_sigmoid: Flat=11.11%, Monotone=30.56%, Well-behaved=27.78%
Learner Decision Trees: Flat=2.78%, Monotone=97.22%, Well-behaved=97.22%
Learner ExtraTrees: Flat=2.78%, Monotone=95.83%, Well-behaved=95.83%
Learner LogisticRegression: Flat=2.78%, Monotone=97.22%, Well-behaved=93.06%
Learner PassiveAggressive: Flat=1.39%, Monotone=93.06%, Well-behaved=93.06%
Learner Perceptron: Flat=0.00%, Monotone=95.83%, Well-behaved=95.83%
Learner RidgeClassifier: Flat=5.56%, Monotone=76.39%, Well-behaved=73.61%
Learner SGDClassifier: Flat=0.00%, Monotone=97.22%, Well-behaved=97.22%
Learner MLP: Flat=2.78%, Monotone=79.17%, Well-behaved=68.06%
Learner LDA: Flat=2.78%, Monotone=54.17%, Well-behaved=48.61%
Learner QDA: Flat=2.78%, Monotone=54.17%, Well-behaved=45.83%
Learner BernoulliNB: Flat=19.

In [24]:
table_data = []
for i in range(24):  
    row = [learner_zoo[i]]  
    for dataset_result in all_learner_stats:
        learner_stat = dataset_result["learner_stats"][i]
        row.append(f"{learner_stat['flat']:.2f}")  # Flat
        row.append(f"{learner_stat['monotone']:.2f}")  # Monotone
        row.append(f"{learner_stat['well_behaved']:.2f}")  # Mono & Conv
    table_data.append(row)


latex_code = r"""
\begin{sidewaystable}
\caption{Statistics for LCDB 1.1}
\label{tab:learner_stats}
\begin{tabular*}{\textheight}{@{\extracolsep\fill}lcccccccccccccccccc}
\toprule%
\multirow{2}{*}{Shapes/Database} & \multicolumn{3}{@{}c@{}}{LCDB 1.1 CC-18 noFS}& \multicolumn{3}{@{}c@{}}{LCDB 1.1 CC-18 minmaxFS} & \multicolumn{3}{@{}c@{}}{LCDB 1.1 CC-18 standardFS} 
\\
\cmidrule{2-4}\cmidrule{5-7}\cmidrule{8-10}
 & Flat & Monotone & Mono \& Conv & Flat & Monotone & Mono \& Conv & Flat & Monotone & Mono \& Conv \\
\midrule
"""

for row in table_data:
    latex_code += " & ".join(row) + r" \\" + "\n"

latex_code += r"""
\bottomrule
\end{tabular*}
\end{sidewaystable}
"""

print(latex_code)



\begin{sidewaystable}
\caption{Statistics for LCDB 1.1}
\label{tab:learner_stats}
\begin{tabular*}{\textheight}{@{\extracolsep\fill}lcccccccccccccccccc}
\toprule%
\multirow{2}{*}{Shapes/Database} & \multicolumn{3}{@{}c@{}}{LCDB 1.1 CC-18 noFS}& \multicolumn{3}{@{}c@{}}{LCDB 1.1 CC-18 minmaxFS} & \multicolumn{3}{@{}c@{}}{LCDB 1.1 CC-18 standardFS} 
\\
\cmidrule{2-4}\cmidrule{5-7}\cmidrule{8-10}
 & Flat & Monotone & Mono \& Conv & Flat & Monotone & Mono \& Conv & Flat & Monotone & Mono \& Conv \\
\midrule
SVC_linear & 0.00 & 95.83 & 94.44 & 5.56 & 93.06 & 93.06 & 1.39 & 98.61 & 97.22 \\
SVC_poly & 19.44 & 80.56 & 79.17 & 2.78 & 97.22 & 95.83 & 6.94 & 93.06 & 88.89 \\
SVC_rbf & 19.44 & 80.56 & 76.39 & 15.28 & 84.72 & 79.17 & 12.50 & 87.50 & 83.33 \\
SVC_sigmoid & 11.11 & 30.56 & 27.78 & 11.11 & 31.94 & 30.56 & 6.94 & 51.39 & 48.61 \\
Decision Trees & 2.78 & 97.22 & 97.22 & 0.00 & 98.61 & 98.61 & 0.00 & 98.61 & 98.61 \\
ExtraTrees & 2.78 & 95.83 & 95.83 & 0.00 & 98.61 & 98.61 & 0.00 & 98.

In [14]:
learner_stats = [{"learner": learner_zoo[i]} for i in range(24)]
for i in range(24):
    mono_y_learner = mono_matrix_y[i, :]
    conv_learner = conv_matrix[i, :]

    missing_learner = (np.isnan(mono_matrix_y).sum() / mono_matrix_y.size) * 100
    flat_percentage_learner = (np.sum(mono_y_learner == -1) / mono_y_learner.size) * 100
    mono_viola_percentage_learner = (np.sum(mono_y_learner == 0) / mono_y_learner.size) * 100
    conv_viola_percentage_learner = (np.sum(conv_learner == 0) / conv_learner.size) * 100
    both_no_viola_percentage_learner = ( np.sum((conv_learner == 0) & (mono_y_learner == 0)) / mono_y_learner.size ) * 100
    
    # update
    learner_stats[i].update({
        "missing": missing_learner, 
        "flat": flat_percentage_learner,
        "monotone": mono_viola_percentage_learner,
        "convex": conv_viola_percentage_learner,
        "well_behaved": both_no_viola_percentage_learner,
    })

for stat in learner_stats:
    print(
        f"Learner {stat['learner']}: "
        f"Missing={stat['missing']:.2f}%, "
        f"Flat={stat['flat']:.2f}%, "
        f"Monotone={stat['monotone']:.2f}%, "
        f"Convex={stat['convex']:.2f}%, "
        f"Well-behaved={stat['well_behaved']:.2f}%"
    )

Learner SVC_linear: Missing=1.85%, Flat=0.00%, Monotone=95.83%, Convex=95.83%, Well-behaved=94.44%
Learner SVC_poly: Missing=1.85%, Flat=19.44%, Monotone=80.56%, Convex=79.17%, Well-behaved=79.17%
Learner SVC_rbf: Missing=1.85%, Flat=19.44%, Monotone=80.56%, Convex=76.39%, Well-behaved=76.39%
Learner SVC_sigmoid: Missing=1.85%, Flat=11.11%, Monotone=30.56%, Convex=56.94%, Well-behaved=27.78%
Learner Decision Trees: Missing=1.85%, Flat=2.78%, Monotone=97.22%, Convex=97.22%, Well-behaved=97.22%
Learner ExtraTrees: Missing=1.85%, Flat=2.78%, Monotone=95.83%, Convex=97.22%, Well-behaved=95.83%
Learner LogisticRegression: Missing=1.85%, Flat=2.78%, Monotone=97.22%, Convex=93.06%, Well-behaved=93.06%
Learner PassiveAggressive: Missing=1.85%, Flat=1.39%, Monotone=93.06%, Convex=98.61%, Well-behaved=93.06%
Learner Perceptron: Missing=1.85%, Flat=0.00%, Monotone=95.83%, Convex=98.61%, Well-behaved=95.83%
Learner RidgeClassifier: Missing=1.85%, Flat=5.56%, Monotone=76.39%, Convex=73.61%, Well-be

In [16]:
latex_table = r"""
\begin{table}[h!]
\caption{Statistics for LCDB 1.1 noFS.}
\label{tab:learner_stats}
\centering
\begin{tabular}{lccccc}
\hline
Learner & Missing & Flat  & Monotone  & Convex  & Monotone \& Convex  \\ \hline
"""
for stat in learner_stats:
    learner = str(stat["learner"]).replace("_", r"\_")  # SVC_linear... need replace
    missing = f"{stat['missing']:.2f}"
    flat = f"{stat['flat']:.2f}"
    monotone = f"{stat['monotone']:.2f}"
    convex = f"{stat['convex']:.2f}"
    both = f"{stat['well_behaved']:.2f}"
    # write the number
    latex_table += f"{learner} & {missing} & {flat}\% & {monotone}\% & {convex}\% & {both}\% \\\\ \n"

latex_table += r"""
\hline
\end{tabular}
\end{table}
"""

print(latex_table)



\begin{table}[h!]
\caption{Statistics for LCDB 1.1 noFS.}
\label{tab:learner_stats}
\centering
\begin{tabular}{lccccc}
\hline
Learner & Missing & Flat  & Monotone  & Convex  & Monotone \& Convex  \\ \hline
SVC\_linear & 1.85 & 0.00\% & 95.83\% & 95.83\% & 94.44\% \\ 
SVC\_poly & 1.85 & 19.44\% & 80.56\% & 79.17\% & 79.17\% \\ 
SVC\_rbf & 1.85 & 19.44\% & 80.56\% & 76.39\% & 76.39\% \\ 
SVC\_sigmoid & 1.85 & 11.11\% & 30.56\% & 56.94\% & 27.78\% \\ 
Decision Trees & 1.85 & 2.78\% & 97.22\% & 97.22\% & 97.22\% \\ 
ExtraTrees & 1.85 & 2.78\% & 95.83\% & 97.22\% & 95.83\% \\ 
LogisticRegression & 1.85 & 2.78\% & 97.22\% & 93.06\% & 93.06\% \\ 
PassiveAggressive & 1.85 & 1.39\% & 93.06\% & 98.61\% & 93.06\% \\ 
Perceptron & 1.85 & 0.00\% & 95.83\% & 98.61\% & 95.83\% \\ 
RidgeClassifier & 1.85 & 5.56\% & 76.39\% & 73.61\% & 73.61\% \\ 
SGDClassifier & 1.85 & 0.00\% & 97.22\% & 100.00\% & 97.22\% \\ 
MLP & 1.85 & 2.78\% & 79.17\% & 70.83\% & 68.06\% \\ 
LDA & 1.85 & 2.78\% & 54.17\% & 50.00

### Dipping and Peaking

In [6]:
results = []
for DATASET in datasets: 
    mono_matrix_y, mono_matrix_x = global_monotonicity_violation(DATASET, flat_filter = True, dipping = True)
    conv_matrix, conv_h_matrix, conv_i_matrix, conv_j_matrix = peaking_detection(DATASET, flat_filter = True)
    results.append({
        "mono_matrix_y": mono_matrix_y,
        "mono_matrix_x": mono_matrix_x,
        "conv_matrix": conv_matrix,
        "conv_h_matrix": conv_h_matrix,
        "conv_i_matrix": conv_i_matrix,
        "conv_j_matrix": conv_j_matrix,
    })

100%|██████████| 72/72 [00:00<00:00, 78.10it/s]
100%|██████████| 72/72 [01:31<00:00,  1.28s/it]
100%|██████████| 72/72 [00:01<00:00, 62.88it/s]
100%|██████████| 72/72 [01:49<00:00,  1.52s/it]
100%|██████████| 72/72 [00:03<00:00, 19.76it/s]
100%|██████████| 72/72 [01:46<00:00,  1.48s/it]


In [7]:
for result in results:
    mono_matrix_y = result["mono_matrix_y"]
    conv_matrix = result["conv_matrix"]

    dipping_percentage = (np.sum(mono_matrix_y > 0) / mono_matrix_y.size) * 100
    print(f"Dipping: {dipping_percentage:.2f}%")

    peaking_percentage = (np.sum(conv_matrix > 0) / conv_matrix.size) * 100
    print(f"Peaking: {peaking_percentage:.2f}%")

    print("-----------------------")

Dipping: 6.42%
Peaking: 6.02%
-----------------------
Dipping: 7.52%
Peaking: 6.08%
-----------------------
Dipping: 5.79%
Peaking: 5.32%
-----------------------
