In [1]:
import numpy as np
import h5py
from tqdm import tqdm
import pandas as pd
import random
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams.update({"text.usetex": True, "font.family": "serif", "font.serif": ["Computer Modern Roman"]})

from utils import global_monotonicity_violation, global_convexity_violation, peaking_detection
from meta_feature import learner_zoo

In [6]:
# Location of the LCDB 1.1 error-rate archive, resolved relative to the
# current working directory.
file_paths = Path.cwd() / '../dataset/LCDB11_ER_265_24.hdf5'

# Open the archive in a context manager so the HDF5 handle is released as
# soon as the data has been read; `[...]` copies the whole 'error rate'
# dataset into an in-memory array, so nothing below needs the open file.
with h5py.File(file_paths, 'r') as lcdb_file:
    dataset_CC18 = lcdb_file['error rate'][...]

# dataset_nofs, dataset_minmaxfs, dataset_standardfs
# (indices 0, 1, 2 on the second-to-last axis; the last axis is fixed at 0 --
# NOTE(review): the meaning of that last axis is not visible here, confirm.)
datasets = [dataset_CC18[..., 0, 0], dataset_CC18[..., 1, 0], dataset_CC18[..., 2, 0]]

In [7]:
# Run the four curve-shape analyses on every preprocessing variant in
# `datasets` and keep only the first return value of each helper.
# The call order is kept as-is: each helper shows its own progress bar and
# the analyses are slow, so the log order stays comparable between runs.
results = []
for DATASET in datasets:
    mono_matrix_y, _ = global_monotonicity_violation(DATASET, flat_filter=True)
    conv_matrix, _, _, _ = global_convexity_violation(DATASET, flat_filter=True)
    # Same helper as the monotonicity check, switched to its dipping mode.
    dipping_matrix_y, _ = global_monotonicity_violation(DATASET, flat_filter=True, dipping=True)
    peak_matrix, _, _, _ = peaking_detection(DATASET, flat_filter=True)

    results.append(dict(
        mono_matrix_y=mono_matrix_y,
        conv_matrix=conv_matrix,
        dipping_matrix_y=dipping_matrix_y,
        peak_matrix=peak_matrix,
    ))


100%|██████████| 265/265 [00:11<00:00, 22.51it/s]
100%|██████████| 265/265 [08:00<00:00,  1.81s/it]
100%|██████████| 265/265 [00:11<00:00, 23.79it/s]
100%|██████████| 265/265 [08:04<00:00,  1.83s/it]
100%|██████████| 265/265 [00:17<00:00, 15.49it/s]
100%|██████████| 265/265 [08:35<00:00,  1.95s/it]
100%|██████████| 265/265 [00:11<00:00, 22.41it/s]
100%|██████████| 265/265 [06:56<00:00,  1.57s/it]
100%|██████████| 265/265 [00:11<00:00, 22.68it/s]
100%|██████████| 265/265 [05:33<00:00,  1.26s/it]
100%|██████████| 265/265 [00:07<00:00, 35.78it/s]
100%|██████████| 265/265 [05:30<00:00,  1.25s/it]


In [8]:
# Dataset-level summary: for every entry in `results`, print the share of
# matrix cells in each category. As labelled by the prints below, NaN cells
# are reported as missing, -1 as flat, 0 as no violation and values > 0 as a
# violation / detected peaking / detected dipping. Every percentage is taken
# over all cells of the respective matrix, NaN cells included.
for result in results:
    mono_matrix_y = result["mono_matrix_y"]
    conv_matrix = result["conv_matrix"]
    dipping_matrix_y = result["dipping_matrix_y"]
    peak_matrix = result["peak_matrix"]

    mono_total = mono_matrix_y.size
    conv_total = conv_matrix.size

    # Compute every figure first, then print them in the reporting order.
    missing = (np.isnan(mono_matrix_y).sum() / mono_total) * 100
    flat_pct = (np.sum(mono_matrix_y == -1) / mono_total) * 100
    monotone_pct = (np.sum(mono_matrix_y == 0) / mono_total) * 100
    convex_pct = (np.sum(conv_matrix == 0) / conv_total) * 100
    # Well-behaved = no convexity violation AND no monotonicity violation.
    well_behaved_pct = (np.sum((conv_matrix == 0) & (mono_matrix_y == 0)) / mono_total) * 100
    mono_violation_pct = (np.sum(mono_matrix_y > 0) / mono_total) * 100
    conv_violation_pct = (np.sum(conv_matrix > 0) / conv_total) * 100
    peaking_pct = (np.sum(peak_matrix > 0) / peak_matrix.size) * 100
    dipping_pct = (np.sum(dipping_matrix_y > 0) / dipping_matrix_y.size) * 100

    print(f"Missing Ratio: {missing:.2f}%")

    print(f"Flat: {flat_pct:.2f}%")

    print(f"Monotone (M): {monotone_pct:.2f}%")
    print(f"Convex (C): {convex_pct:.2f}%")

    print(f"Well-behaved (M+C): {well_behaved_pct:.2f}%")

    print(f"Monotonicity Violation: {mono_violation_pct:.2f}%")
    print(f"Convexity Violation: {conv_violation_pct:.2f}%")

    print(f"Peaking: {peaking_pct:.2f}%")
    print(f"Dipping: {dipping_pct:.2f}%")
    print("-----------------------")

Missing Ratio: 2.89%
Flat: 11.04%
Monotone (M): 77.03%
Convex (C): 77.50%
Well-behaved (M+C): 74.53%
Monotonicity Violation: 9.04%
Convexity Violation: 8.57%
Peaking: 5.36%
Dipping: 6.57%
-----------------------
Missing Ratio: 0.38%
Flat: 10.49%
Monotone (M): 79.25%
Convex (C): 80.71%
Well-behaved (M+C): 76.95%
Monotonicity Violation: 9.89%
Convexity Violation: 8.43%
Peaking: 5.33%
Dipping: 8.18%
-----------------------
Missing Ratio: 8.33%
Flat: 8.00%
Monotone (M): 74.81%
Convex (C): 75.71%
Well-behaved (M+C): 72.89%
Monotonicity Violation: 8.85%
Convexity Violation: 7.96%
Peaking: 5.02%
Dipping: 6.32%
-----------------------


In [6]:
# Per-learner breakdown of the curve-shape statistics. For each entry in
# `results`, row i of every matrix is paired with learner_zoo[i] and reduced
# to percentages over the cells of that row.
all_learner_stats = []

for result in results:
    mono_matrix_y = result["mono_matrix_y"]
    conv_matrix = result["conv_matrix"]
    dipping_matrix_y = result["dipping_matrix_y"]
    peak_matrix = result["peak_matrix"]

    learner_stats = []
    # Only the first 24 rows / learner names are summarised.
    for row_idx in range(24):
        mono_row = mono_matrix_y[row_idx, :]
        conv_row = conv_matrix[row_idx, :]
        dip_row = dipping_matrix_y[row_idx, :]
        peak_row = peak_matrix[row_idx, :]

        # A value of 0 is what this notebook reports as "no violation".
        is_monotone = mono_row == 0
        is_convex = conv_row == 0

        learner_stats.append({
            "learner": learner_zoo[row_idx],
            "missing": (np.isnan(mono_row).sum() / mono_row.size) * 100,
            "flat": (np.sum(mono_row == -1) / mono_row.size) * 100,
            "monotone": (np.sum(is_monotone) / mono_row.size) * 100,
            "convex": (np.sum(is_convex) / conv_row.size) * 100,
            "well_behaved": (np.sum(is_convex & is_monotone) / mono_row.size) * 100,
            "peaking": (np.sum(peak_row > 0) / peak_row.size) * 100,
            "dipping": (np.sum(dip_row > 0) / dip_row.size) * 100,
        })

    all_learner_stats.append({"learner_stats": learner_stats})

# Plain-text dump: one header per entry of `results`, one line per learner.
for dataset_idx, dataset_result in enumerate(all_learner_stats):
    print(f"=== Dataset {dataset_idx + 1} ===")
    for learner_stat in dataset_result["learner_stats"]:
        summary_line = (
            f"Learner {learner_stat['learner']}: "
            f"Missing={learner_stat['missing']:.2f}%, "
            f"Flat={learner_stat['flat']:.2f}%, "
            f"Monotone={learner_stat['monotone']:.2f}%, "
            f"Convex={learner_stat['convex']:.2f}%, "
            f"Well-behaved={learner_stat['well_behaved']:.2f}%, "
            f"Peaking={learner_stat['peaking']:.2f}%, "
            f"Dipping={learner_stat['dipping']:.2f}%"
        )
        print(summary_line)
    print("\n")


=== Dataset 1 ===
Learner SVM_Linear: Missing=0.38%, Flat=3.40%, Monotone=93.21%, Convex=92.83%, Well-behaved=91.70%, Peaking=1.51%, Dipping=2.64%
Learner SVM_Poly: Missing=0.38%, Flat=17.36%, Monotone=80.75%, Convex=79.62%, Well-behaved=78.49%, Peaking=0.38%, Dipping=1.89%
Learner SVM_RBF: Missing=0.38%, Flat=18.49%, Monotone=80.00%, Convex=73.58%, Well-behaved=73.21%, Peaking=0.00%, Dipping=0.38%
Learner SVM_Sigmoid: Missing=0.38%, Flat=19.25%, Monotone=36.98%, Convex=49.81%, Well-behaved=32.45%, Peaking=23.40%, Dipping=42.26%
Learner Decision Tree: Missing=0.38%, Flat=4.53%, Monotone=94.72%, Convex=94.34%, Well-behaved=94.34%, Peaking=0.38%, Dipping=0.75%
Learner ExtraTree: Missing=0.38%, Flat=3.77%, Monotone=94.72%, Convex=95.85%, Well-behaved=94.72%, Peaking=0.00%, Dipping=0.38%
Learner LogisticRegression: Missing=0.38%, Flat=6.79%, Monotone=91.32%, Convex=89.06%, Well-behaved=88.68%, Peaking=1.13%, Dipping=1.51%
Learner PassiveAggressive: Missing=0.38%, Flat=5.66%, Monotone=86.42

In [8]:
def _latex_escape(text):
    """Return `text` with LaTeX text-mode special characters escaped.

    Learner names such as ``SVM_Linear`` contain an underscore, which LaTeX
    rejects outside math mode; escaping keeps the generated table compilable.
    """
    replacements = {
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
    }
    return "".join(replacements.get(ch, ch) for ch in str(text))


# One row per learner: the learner name followed by seven formatted
# statistics for every entry of `all_learner_stats`.
table_data = []

# 24 matches the number of per-learner records built by the statistics cell.
for i in range(24):
    row = [learner_zoo[i]]
    for dataset_result in all_learner_stats:
        learner_stat = dataset_result["learner_stats"][i]
        row.extend([
            f"{learner_stat['missing']:.2f}",
            f"{learner_stat['flat']:.2f}",
            f"{learner_stat['monotone']:.2f}",
            f"{learner_stat['convex']:.2f}",
            f"{learner_stat['well_behaved']:.2f}",
            f"{learner_stat['peaking']:.2f}",
            f"{learner_stat['dipping']:.2f}",
        ])
    table_data.append(row)


# Table preamble: a sideways table with three 7-column groups, one per
# feature-scaling variant named in the column headers.
latex_code = r"""
\begin{sidewaystable}
\caption{Statistics of each learner in LCDB 1.1 (no Data Leakage version)}
\label{tab:learner_stats}
\resizebox{\textwidth}{!}{ 
\begin{tabular}{lccccccccccccccccccccc}
\toprule
\multirow{2}{*}{Learner} 
& \multicolumn{7}{c}{LCDB 1.1 FULL (265) no FS}               
& \multicolumn{7}{c}{LCDB 1.1 FULL (265) min-max FS}           
& \multicolumn{7}{c}{LCDB 1.1 FULL (265) standardization FS}       
\\
\cmidrule(lr){2-8} \cmidrule(lr){9-15} \cmidrule(lr){16-22}
& Missing & Flat & Monotone & Convex & Mono \& Conv & Peaking & Dipping
& Missing & Flat & Monotone & Convex & Mono \& Conv & Peaking & Dipping
& Missing & Flat & Monotone & Convex & Mono \& Conv & Peaking & Dipping
\\
\midrule
"""

# Escape only the learner name at render time; the numeric cells are plain
# digits and `table_data` itself keeps the raw names.
for row in table_data:
    cells = [_latex_escape(row[0])] + row[1:]
    latex_code += " & ".join(cells) + r" \\" + "\n"


latex_code += r"""
\bottomrule
\end{tabular}
} 
\end{sidewaystable}
"""


print(latex_code)



\begin{sidewaystable}
\caption{Statistics of each learner in LCDB 1.1 (no Data Leakage version)}
\label{tab:learner_stats}
\resizebox{\textwidth}{!}{ 
\begin{tabular}{lccccccccccccccccccccc}
\toprule
\multirow{2}{*}{Learner} 
& \multicolumn{7}{c}{LCDB 1.1 FULL (265) no FS}               
& \multicolumn{7}{c}{LCDB 1.1 FULL (265) min-max FS}           
& \multicolumn{7}{c}{LCDB 1.1 FULL (265) standardization FS}       
\\
\cmidrule(lr){2-8} \cmidrule(lr){9-15} \cmidrule(lr){16-22}
& Missing & Flat & Monotone & Convex & Mono \& Conv & Peaking & Dipping
& Missing & Flat & Monotone & Convex & Mono \& Conv & Peaking & Dipping
& Missing & Flat & Monotone & Convex & Mono \& Conv & Peaking & Dipping
\\
\midrule
SVM_Linear & 0.38 & 3.40 & 93.21 & 92.83 & 91.70 & 1.51 & 2.64 & 0.38 & 7.92 & 89.81 & 90.19 & 89.43 & 0.38 & 0.75 & 0.00 & 2.26 & 94.34 & 93.96 & 92.83 & 1.51 & 2.26 \\
SVM_Poly & 0.38 & 17.36 & 80.75 & 79.62 & 78.49 & 0.38 & 1.89 & 0.38 & 6.04 & 92.83 & 91.70 & 90.94 & 0.00 & 1.13 & 0