In [2]:
#!/usr/bin/env python
# coding: utf-8

import os
import json
import random
import warnings
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import openml
from openml import config
openml.config.apikey = "3ba6ade3f263806d251fb5641e1b2793"

In [None]:
output_dir = "test_datasets"  # adjust if you used a different output_dir
mapping_path = os.path.join(output_dir, "id_task_mapping.json")

# === Load JSON ===
with open(mapping_path, "r") as f:
    id_task_map = json.load(f)

print(f"Loaded mapping from {mapping_path}")
print(f"Total mappings: {len(id_task_map)}\n")

# === Display as DataFrame for convenience ===
df_map = pd.DataFrame([
    {"dataset_id": int(did), "task_id": int(tid)}
    for did, tid in id_task_map.items()
])

path = "flows/filtered_flow_algorithm_mapping_v2.json"
with open(path, "r") as f:
    flow_map = json.load(f)

Loaded mapping from test_datasets/id_task_mapping.json
Total mappings: 54



In [46]:
df_map.head(5)

Unnamed: 0,dataset_id,task_id
0,6,6
1,11,11
2,12,12
3,14,14
4,16,16


In [48]:
valid_flow_ids = set(map(int, flow_map.keys()))

In [None]:
task_ids = [6,11,12,14,16]

os.makedirs("runs/filtered", exist_ok=True)

for task_id in tqdm(task_ids, desc="Filtering runs"):
    try:
        runs_df = openml.runs.list_runs(task=[task_id], output_format='dataframe')
        
        filtered_df = runs_df[runs_df['flow_id'].isin(valid_flow_ids)]
        
        output_path = os.path.join("runs/filtered", f"task_{task_id}_runs.csv")
        filtered_df.to_csv(output_path, index=False)
        
        print(f"Task {task_id}: {len(runs_df)} -> {len(filtered_df)} runs | Saved to {output_path}")
        
    except Exception as e:
        print(f"Task {task_id}: Error - {e}")

Filtering runs:  20%|██        | 1/5 [00:03<00:13,  3.47s/it]

Task 6: 34712 -> 24081 runs | Saved to runs/filtered/task_6_runs.csv


Filtering runs:  40%|████      | 2/5 [00:06<00:08,  2.97s/it]

Task 11: 24518 -> 20817 runs | Saved to runs/filtered/task_11_runs.csv


Filtering runs:  60%|██████    | 3/5 [00:08<00:05,  2.62s/it]

Task 12: 23084 -> 18192 runs | Saved to runs/filtered/task_12_runs.csv


Filtering runs:  80%|████████  | 4/5 [00:10<00:02,  2.46s/it]

Task 14: 23736 -> 19522 runs | Saved to runs/filtered/task_14_runs.csv


Filtering runs: 100%|██████████| 5/5 [00:12<00:00,  2.54s/it]

Task 16: 24209 -> 18844 runs | Saved to runs/filtered/task_16_runs.csv





In [51]:
os.makedirs("runs/sampled", exist_ok=True)

sample_size = 50

for task_id in tqdm(task_ids, desc="Sampling runs"):
    try:
        input_path = os.path.join("runs/filtered", f"task_{task_id}_runs.csv")
        filtered_df = pd.read_csv(input_path)
        
        k = min(sample_size, len(filtered_df))
        if k == 0:
            continue
        
        sampled_df = filtered_df.sample(n=k, random_state=42)
        
        output_path = os.path.join("runs/sampled", f"task_{task_id}_sampled.csv")
        sampled_df.to_csv(output_path, index=False)
        
        print(f"Task {task_id}: Sampled {k} runs | Saved to {output_path}")
        
    except Exception as e:
        print(f"Task {task_id}: Error - {e}")

Sampling runs: 100%|██████████| 5/5 [00:00<00:00, 32.93it/s]

Task 6: Sampled 50 runs | Saved to runs/sampled/task_6_sampled.csv
Task 11: Sampled 50 runs | Saved to runs/sampled/task_11_sampled.csv
Task 12: Sampled 50 runs | Saved to runs/sampled/task_12_sampled.csv
Task 14: Sampled 50 runs | Saved to runs/sampled/task_14_sampled.csv
Task 16: Sampled 50 runs | Saved to runs/sampled/task_16_sampled.csv





In [None]:
os.makedirs("runs/accuracies", exist_ok=True)
os.makedirs("runs/f1_scores", exist_ok=True)

batch_size = 50

for task_id in tqdm(task_ids, desc="Fetching accuracies and f1"):
    try:
        input_path = os.path.join("runs/sampled", f"task_{task_id}_sampled.csv")
        sampled_df = pd.read_csv(input_path)
        
        run_ids = sampled_df['run_id'].tolist()
        accuracy_map = {}
        f1_map = {}
        
        for i in range(0, len(run_ids), batch_size):
            batch = run_ids[i:i + batch_size]
            
            evals_acc = openml.evaluations.list_evaluations(
                function='predictive_accuracy',
                runs=[int(rid) for rid in batch],
                output_format='dataframe'
            )
            if not evals_acc.empty:
                accuracy_map.update(dict(zip(evals_acc['run_id'], evals_acc['value'])))
            
            evals_f1 = openml.evaluations.list_evaluations(
                function='f_measure',
                runs=[int(rid) for rid in batch],
                output_format='dataframe'
            )
            if not evals_f1.empty:
                f1_map.update(dict(zip(evals_f1['run_id'], evals_f1['value'])))
        
        acc_df = sampled_df[['run_id']].copy()
        acc_df['predictive_accuracy'] = sampled_df['run_id'].map(lambda x: accuracy_map.get(int(x)))
        acc_path = os.path.join("runs/accuracies", f"task_{task_id}_accuracies.csv")
        acc_df.to_csv(acc_path, index=False)
        
        f1_df = sampled_df[['run_id']].copy()
        f1_df['f1_score'] = sampled_df['run_id'].map(lambda x: f1_map.get(int(x)))
        f1_path = os.path.join("runs/f1_scores", f"task_{task_id}_f1.csv")
        f1_df.to_csv(f1_path, index=False)
        
        print(f"Task {task_id}: Saved accuracies and f1 scores")
        
    except Exception as e:
        print(f"Task {task_id}: Error - {e}")

Fetching accuracies and f1:  20%|██        | 1/5 [00:00<00:02,  1.43it/s]

Task 6: Saved accuracies and f1 scores


Fetching accuracies and f1:  40%|████      | 2/5 [00:01<00:01,  1.53it/s]

Task 11: Saved accuracies and f1 scores


Fetching accuracies and f1:  60%|██████    | 3/5 [00:02<00:01,  1.49it/s]

Task 12: Saved accuracies and f1 scores


Fetching accuracies and f1:  80%|████████  | 4/5 [00:02<00:00,  1.47it/s]

Task 14: Saved accuracies and f1 scores


Fetching accuracies and f1: 100%|██████████| 5/5 [00:03<00:00,  1.50it/s]

Task 16: Saved accuracies and f1 scores





In [55]:
os.makedirs("runs/statistics", exist_ok=True)

for task_id in tqdm(task_ids, desc="Generating statistics"):
    try:
        acc_path = os.path.join("runs/accuracies", f"task_{task_id}_accuracies.csv")
        acc_df = pd.read_csv(acc_path)
        accs = acc_df['predictive_accuracy'].dropna()
        
        f1_path = os.path.join("runs/f1_scores", f"task_{task_id}_f1.csv")
        f1_df = pd.read_csv(f1_path)
        f1s = f1_df['f1_score'].dropna()
        
        stats = {
            'task_id': task_id,
            'acc_count': len(accs),
            'acc_mean': accs.mean(),
            'acc_std': accs.std(),
            'acc_median': accs.median(),
            'f1_count': len(f1s),
            'f1_mean': f1s.mean(),
            'f1_std': f1s.std(),
            'f1_median': f1s.median()
        }
        
        stats_path = os.path.join("runs/statistics", f"task_{task_id}_stats.csv")
        pd.DataFrame([stats]).to_csv(stats_path, index=False)
        
        print(f"Task {task_id}: Statistics saved")
        
    except Exception as e:
        print(f"Task {task_id}: Error - {e}")

# Build algorithm performance matrix
os.makedirs("final", exist_ok=True)

merged = []
for task_id in task_ids:
    row = {'task_id': task_id}
    
    acc_path = os.path.join("runs/accuracies", f"task_{task_id}_accuracies.csv")
    acc_df = pd.read_csv(acc_path)
    accs = acc_df['predictive_accuracy'].dropna()
    
    f1_path = os.path.join("runs/f1_scores", f"task_{task_id}_f1.csv")
    f1_df = pd.read_csv(f1_path)
    f1s = f1_df['f1_score'].dropna()
    
    row['mean_accuracy'] = accs.mean()
    row['median_accuracy'] = accs.median()
    row['mean_f1'] = f1s.mean()
    row['median_f1'] = f1s.median()
    
    merged.append(row)

perf_df = pd.DataFrame(merged).set_index("task_id")
perf_df.to_csv("final/algorithm_performances.csv")

print("Algorithm performance matrix saved to final/algorithm_performances.csv")

Generating statistics: 100%|██████████| 5/5 [00:00<00:00, 174.81it/s]

Task 6: Statistics saved
Task 11: Statistics saved
Task 12: Statistics saved
Task 14: Statistics saved
Task 16: Statistics saved
Algorithm performance matrix saved to final/algorithm_performances.csv





In [3]:
import json 

In [6]:
with open("flows/filtered_flow_algorithm_mapping_v2.json", "r") as f:
        flow_map = json.load(f)
valid_flow_ids = set(map(int, flow_map.keys()))
flow_to_algo = {int(fid): entry["algorithm_type"] for fid, entry in flow_map.items()}

print(valid_flow_ids)
print(flow_to_algo)

{8192, 8193, 8194, 8196, 8198, 8200, 16392, 8202, 8203, 8204, 8209, 8210, 8211, 8212, 8213, 8214, 16407, 16410, 8219, 8220, 8224, 8226, 16419, 8230, 8231, 8233, 8234, 8238, 8239, 8240, 8242, 16434, 16437, 8246, 8248, 8249, 59, 60, 61, 62, 63, 64, 65, 8254, 8255, 8256, 8257, 8258, 8259, 8261, 8262, 74, 8264, 8268, 8269, 16461, 80, 8272, 8273, 8274, 8275, 8276, 8277, 8278, 8281, 8282, 92, 8285, 94, 95, 8286, 8287, 8288, 8289, 100, 101, 102, 16479, 16488, 106, 107, 108, 109, 112, 118, 8311, 8312, 123, 8315, 8317, 126, 8318, 8320, 131, 132, 133, 8323, 8324, 136, 137, 8327, 8328, 8329, 141, 8330, 144, 145, 149, 150, 151, 8342, 153, 154, 155, 156, 157, 158, 159, 160, 8348, 162, 163, 164, 165, 8351, 8353, 168, 8354, 8355, 8356, 8359, 8360, 8361, 8365, 8366, 8367, 8369, 8371, 16560, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 8374, 192, 8376, 8377, 8378, 8379, 8380, 8382, 8384, 8385, 8392, 202, 8393, 204, 205, 206, 207, 208, 209, 210, 8399, 212, 213, 8401, 8402, 8403, 8405, 8408, 223, 84

In [9]:
#!/usr/bin/env python
# coding: utf-8

"""Print shapes of meta features and targets"""

import pandas as pd
from typing import Tuple, Dict, Optional, List

# Load metatargets
metatarget_path = "meta_targets/accuracy.csv"
metatarget_df = pd.read_csv(metatarget_path, index_col=0)

print("=" * 70)
print("METATARGET (Performance Matrix)")
print("=" * 70)
print(f"Shape: {metatarget_df.shape}")
print(f"  Rows (datasets): {metatarget_df.shape[0]}")
print(f"  Columns (algorithms): {metatarget_df.shape[1]}")
print(f"Algorithms: {list(metatarget_df.columns)}")
print()

# Load metafeatures
d2v = pd.read_csv("qualities/d2v/metafeatures.csv", index_col=0)
traditional = pd.read_csv("qualities/traditional/metafeatures.csv", index_col=0)
hybrid = pd.concat([traditional, d2v], axis=1, join='inner')

print("=" * 70)
print("META-FEATURES")
print("=" * 70)
print(f"\nD2V:")
print(f"  Shape: {d2v.shape}")
print(f"    Rows (datasets): {d2v.shape[0]}")
print(f"    Columns (features): {d2v.shape[1]}")

print(f"\nTraditional:")
print(f"  Shape: {traditional.shape}")
print(f"    Rows (datasets): {traditional.shape[0]}")
print(f"    Columns (features): {traditional.shape[1]}")

print(f"\nHybrid:")
print(f"  Shape: {hybrid.shape}")
print(f"    Rows (datasets): {hybrid.shape[0]}")
print(f"    Columns (features): {hybrid.shape[1]}")
print()

# Alignment
print("=" * 70)
print("AFTER ALIGNMENT (Inner Join)")
print("=" * 70)

for name, features in [("d2v", d2v), ("traditional", traditional), ("hybrid", hybrid)]:
    common_idx = features.index.intersection(metatarget_df.index)
    X = features.loc[common_idx]
    y = metatarget_df.loc[common_idx]
    
    print(f"\n{name.upper()}:")
    print(f"  X (features) shape:  {X.shape}")
    print(f"    Rows (datasets): {X.shape[0]}")
    print(f"    Columns (features): {X.shape[1]}")
    print(f"  y (targets) shape:   {y.shape}")
    print(f"    Rows (datasets): {y.shape[0]}")
    print(f"    Columns (algorithms): {y.shape[1]}")

METATARGET (Performance Matrix)
Shape: (54, 5)
  Rows (datasets): 54
  Columns (algorithms): 5
Algorithms: ['random_forest', 'support_vector_machine', 'linear_models', 'xgboost', 'decision_tree']

META-FEATURES

D2V:
  Shape: (54, 32)
    Rows (datasets): 54
    Columns (features): 32

Traditional:
  Shape: (54, 93)
    Rows (datasets): 54
    Columns (features): 93

Hybrid:
  Shape: (54, 125)
    Rows (datasets): 54
    Columns (features): 125

AFTER ALIGNMENT (Inner Join)

D2V:
  X (features) shape:  (54, 32)
    Rows (datasets): 54
    Columns (features): 32
  y (targets) shape:   (54, 5)
    Rows (datasets): 54
    Columns (algorithms): 5

TRADITIONAL:
  X (features) shape:  (54, 93)
    Rows (datasets): 54
    Columns (features): 93
  y (targets) shape:   (54, 5)
    Rows (datasets): 54
    Columns (algorithms): 5

HYBRID:
  X (features) shape:  (54, 125)
    Rows (datasets): 54
    Columns (features): 125
  y (targets) shape:   (54, 5)
    Rows (datasets): 54
    Columns (algorit