In [2]:
import sys, subprocess, importlib, os
# Where the Dataset2Vec copy lives remotely and where it is checked out locally.
REPO_URL = "https://github.com/khayhamz31/d2v_copy"
REPO_DIR = "Dataset2Vec"

# Clone only when the checkout path does not exist yet, so re-running this
# cell does not attempt a second clone.
if os.path.exists(REPO_DIR):
    print("dataset2vec already present in project root.")
else:
    # Shallow clone (history depth 1); check=True raises if git exits non-zero.
    clone_command = ["git", "clone", "--depth", "1", REPO_URL, REPO_DIR]
    subprocess.run(clone_command, check=True)
    print(f"Cloned repository into ./{REPO_DIR}")

Cloning into 'Dataset2Vec'...


Cloned repository into ./Dataset2Vec


### Environment Imports  
Core libraries, OpenML access, widgets for UI, and project modules for:
- downloading datasets  
- extracting traditional & D2V meta-features  
- aggregating runs  
- training meta-classifier and meta-regressor models  

In [None]:
import warnings
import openml
import ipywidgets as widgets
import time
import pandas as pd
from IPython.display import clear_output, display
# Silence RuntimeWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# OpenML API key.
# The key is read from the OPENML_APIKEY environment variable so the secret
# never has to be typed into (and saved with) the notebook. Export the
# variable before starting Jupyter; when it is unset the key stays an empty
# string, exactly as before.
import os
openml.config.apikey = os.environ.get("OPENML_APIKEY", "")
from datasets import (
    download_benchmark_suite,
    download_and_process_dataset,
    download_datasets_from_df
)

from qualities import extract_metafeatures_from_local_datasets

from d2v_qualities import extract_metafeatures_from_datasets

from runs import run_pipeline

from metaclassifier import run_meta_classifier

from metaregressor import run_meta_regressor_multioutput

from regressor_comparison import plot_regressor_as_classifier_results

# Random seed for reproducibility. It is passed as the `seed` argument to the
# meta-classifier and meta-regressor calls further down in this notebook.
random_seed = 411

### Benchmark Suite Selection  
Fetch OpenML benchmark suites with retry logic, display a dropdown selector, and download datasets from the chosen suite 

In [None]:
# Retry policy for listing the OpenML benchmark suites.
max_retries = 3   # total number of attempts
delay = 5         # seconds to wait between attempts

attempt = 0
while True:
    try:
        suites_df = openml.study.list_suites(output_format="dataframe", status="all")
    except openml.exceptions.OpenMLServerException:
        attempt += 1
        # Give up once every allowed attempt has failed; otherwise pause and retry.
        if attempt >= max_retries:
            raise RuntimeError("Failed to fetch benchmark suites after multiple retries.")
        time.sleep(delay)
    else:
        # Listing succeeded; stop retrying.
        break

# Suites without an alias get a placeholder so every dropdown entry has a label.
suites_df["alias"] = suites_df["alias"].fillna("unnamed")

# Dropdown entries are (label, value) pairs: the label reads "<alias> (<id>)",
# the value is the suite id.
options = []
for suite_id, suite_alias in zip(suites_df["id"], suites_df["alias"]):
    options.append((f"{suite_alias} ({suite_id})", suite_id))

suite_layout = widgets.Layout(width='60%')
dropdown = widgets.Dropdown(options=options, description='Suite:', layout=suite_layout)

# Output area used by the change handler, and a record of the last selection.
output = widgets.Output()
selected_suite_info = dict(id=None, alias=None)

def on_dropdown_change(change):
    """Download the benchmark suite that was just picked in the dropdown.

    Only change events of the widget's ``value`` trait are handled; every
    other event is ignored. For a handled event the output area is cleared,
    the selection is recorded in ``selected_suite_info`` and the suite is
    downloaded with ``download_benchmark_suite``; a one-line summary of the
    download result is printed.

    Parameters
    ----------
    change : dict
        Event dictionary passed by the widget. The keys ``"type"``,
        ``"name"`` and ``"new"`` are read; ``"new"`` is the selected suite id.
    """
    if change["type"] != "change" or change["name"] != "value":
        return

    with output:
        clear_output()
        sid = change["new"]
        # Labels are built as "<alias> (<sid>)". Split on the LAST " (" so an
        # alias that itself contains " (" is recovered in full instead of
        # being cut at its first parenthesis.
        alias = next(label.rsplit(" (", 1)[0] for label, val in options if val == sid)
        selected_suite_info.update({"id": sid, "alias": alias})
        print(f"Downloading: {alias} ({sid})...")
        res = download_benchmark_suite(sid)
        print(f"{res['successful']}/{res['total']} downloaded, {res['failed']} failed.")

# Register the handler on the dropdown; the handler itself filters for
# change events of the "value" trait and ignores everything else.
dropdown.observe(on_dropdown_change)
# Show the selector together with the output area the handler prints into.
display(dropdown, output)

### Meta-feature extraction 
1. Extracts meta-features from OpenML for the downloaded datasets
2. Extracts Dataset2Vec meta-features from the datasets downloaded

In [None]:
# Extract the traditional meta-features for the datasets under "test_datasets".
# NOTE(review): the return type is not visible here; the name suggests a DataFrame — confirm in qualities.py.
df_local = extract_metafeatures_from_local_datasets("test_datasets")
# Extract the Dataset2Vec (D2V) meta-features. Called without arguments, so the
# dataset location comes from the function's own defaults — TODO confirm in d2v_qualities.py.
stats = extract_metafeatures_from_datasets()

### Meta-target extraction
1. Downloads and samples run data from OpenML for the datasets downloaded

In [None]:
# Settings for the run-download pipeline, collected in one place so they are
# easy to tweak before launching it.
pipeline_settings = {
    "mapping_path": "test_datasets/id_task_mapping.json",
    "flow_map_path": "flows/filtered_flow_algorithm_mapping_v2.json",
    "sample_size": 50,
    "batch_size": 50,
    "base_dir": "runs",
}

run_pipeline(**pipeline_settings)

### Meta-classifier
1. Trains meta-classifier 
2. Evaluates meta-classifier performance (LOOCV with n_repeats)
3. Visualises with boxplot and simple statistics (mean accuracy ± std)

In [None]:
# Meta-classifier experiment settings.
classifier_settings = {
    # Performance metric: "accuracy" or "f1".
    "metric_name": "accuracy",
    # Algorithm subset, chosen from ['decision_tree', 'random_forest', 'xgboost',
    # 'linear_models', 'support_vector_machine'], or None for all of them.
    "algorithms": ["decision_tree", "random_forest", "xgboost"],
    # Number of repeats (number of meta-models).
    "n_repeats": 10,
    # Random seed for reproducibility.
    "seed": random_seed,
    # Title shown on the plot.
    "plot_title": "Benchmark Suite 99 (Classifier)",
}

classifier_outcome = run_meta_classifier(**classifier_settings)
results, majority_acc, summary, (fig, ax) = classifier_outcome

### Meta-Regressor 
1. Trains the meta-regressor  
2. Evaluates meta-regressor performance (LOO-CV with n_repeats)
3. Saves simple statistics and predictions to the results folder


In [None]:
# Meta-regressor experiment settings.
regressor_settings = {
    # Performance metric: "accuracy" or "f1".
    "metric_name": "accuracy",
    # Algorithm subset, chosen from ['decision_tree', 'random_forest', 'xgboost',
    # 'linear_models', 'support_vector_machine'], or None for all of them.
    "algorithms": None,
    # Number of repeats (number of meta-models).
    "n_repeats": 10,
    # Random seed for reproducibility.
    "seed": random_seed,
    # Folder in which the regressor results are saved.
    "output_dir": "meta_regressor_results",
}

regressor_outcome = run_meta_regressor_multioutput(**regressor_settings)
results, baseline_mae, summary_df = regressor_outcome

### Meta-regressor as classifier  
Visualises the meta-regressor outputs in the same format as the meta-classifier for direct comparison.

In [None]:
# Settings for plotting the meta-regressor results in the meta-classifier format.
comparison_settings = {
    # CSV with the stored meta-regressor analysis.
    "analysis_csv": "meta_regressor_results/analysis/top1_accuracy_analysis.csv",
    # Folder in which the regressor-as-classifier output is saved.
    "output_dir": "regresscompress",
    # Title and y-axis label shown on the plot.
    "plot_title": "Benchmark Suite 99 (Regressor)",
    "ylabel": "Mean Accuracy",
}

comparison_outcome = plot_regressor_as_classifier_results(**comparison_settings)
accuracies, majority_accuracy, summary_df, (fig, ax) = comparison_outcome