In [1]:
import openml
import pandas as pd
import os
import json
import time
from tqdm.notebook import tqdm

# Root directory that receives one sub-folder per downloaded dataset.
base_folder = "openml_datasets"
os.makedirs(base_folder, exist_ok=True)

# Meta-feature name patterns per category (based on the paper).
# The order of categories and of the names inside each list is significant
# for any lookup that stops at the first match, so it is kept as-is.
# NOTE(review): OpenML appears to publish J48 landmarkers with a
# confidence-factor infix (e.g. "J48.00001.AUC"); confirm that "J48AUC" and
# "J48ErrRate" actually match the quality names returned by the server.
META_FEATURE_CATEGORIES = {
    "Simple": [
        "NumberOfInstances",
        "NumberOfFeatures",
        "NumberOfClasses",
        "PercentageOfBinaryFeatures",
        "PercentageOfSymbolicFeatures",
        "PercentageOfNumericFeatures",
        "PercentageOfMissingValues",
        "PercentageOfInstancesWithMissingValues",
        "MajorityClassPercentage",
        "MinorityClassPercentage",
        "Dimensionality",
    ],
    "Statistical": [
        "MeanMeansOfNumericAtts",
        "MeanStdDevOfNumericAtts",
        "MeanKurtosisOfNumericAtts",
        "MeanSkewnessOfNumericAtts",
        "MinMeansOfNumericAtts",
        "MinStdDevOfNumericAtts",
        "MinKurtosisOfNumericAtts",
        "MinSkewnessOfNumericAtts",
        "MaxMeansOfNumericAtts",
        "MaxStdDevOfNumericAtts",
        "MaxKurtosisOfNumericAtts",
        "MaxSkewnessOfNumericAtts",
    ],
    "Information_Theoretic": [
        "ClassEntropy",
        "MeanAttributeEntropy",
        "MinAttributeEntropy",
        "MaxAttributeEntropy",
        "MeanMutualInformation",
        "MinMutualInformation",
        "MaxMutualInformation",
        "EquivalentNumberOfAtts",
        "MeanNoiseToSignalRatio",
    ],
    "Landmarking": [
        "NaiveBayesAUC",
        "NaiveBayesErrRate",
        "kNN1NAUC",
        "kNN1NErrRate",
        "DecisionStumpAUC",
        "DecisionStumpErrRate",
        "J48AUC",
        "J48ErrRate",
        "RandomTreeDepth1AUC",
        "RandomTreeDepth1ErrRate",
        "REPTreeDepth1AUC",
        "REPTreeDepth1ErrRate",
    ],
}

def get_dataset_metafeatures(dataset_id, verbose=False):
    """
    Fetch the OpenML qualities (meta-features) of one dataset, grouped by category.

    Only the dataset description is requested (``download_data=False``); the
    data file itself is not downloaded by this function.

    Args:
        dataset_id: OpenML dataset identifier handed to
            ``openml.datasets.get_dataset``.
        verbose: If True, print how many meta-features were kept, or the
            error message when something goes wrong.

    Returns:
        A dict with the keys ``"dataset_id"`` and ``"name"`` plus one sub-dict
        per category of ``META_FEATURE_CATEGORIES`` and a final ``"Other"``
        sub-dict, each mapping quality name to its value. Qualities without a
        value (None or NaN) are left out. Returns None if any exception is
        raised while fetching or organizing the qualities.
    """
    try:
        dataset = openml.datasets.get_dataset(dataset_id, download_data=False)

        # A dataset may come back without any qualities; treat that as
        # "nothing to categorize" rather than as a fetch error.
        qualities = dataset.qualities or {}

        # One empty bucket per known category (same order as the constant),
        # followed by the catch-all bucket.
        categorized_features = {
            "dataset_id": dataset_id,
            "name": dataset.name,
            **{category: {} for category in META_FEATURE_CATEGORIES},
            "Other": {},
        }

        for key, value in qualities.items():
            # openml-python reports a quality without a server-side value as
            # NaN, not None. Skip both: NaN is not a usable meta-feature and
            # json.dump would emit it as the non-standard token ``NaN``.
            if pd.isna(value):
                continue

            # First category that lists a pattern contained in the quality
            # name wins; anything unmatched goes to "Other".
            category = next(
                (
                    name
                    for name, features in META_FEATURE_CATEGORIES.items()
                    if any(feature in key for feature in features)
                ),
                "Other",
            )
            categorized_features[category][key] = value

        if verbose:
            total = sum(
                len(group)
                for name, group in categorized_features.items()
                if name not in ("dataset_id", "name")
            )
            print(f"Found {total} meta-features for dataset {dataset_id}")

        return categorized_features
    except Exception as e:
        # Deliberate best-effort: callers interpret None as "could not fetch".
        if verbose:
            print(f"Error fetching dataset {dataset_id}: {str(e)}")
        return None

# No custom meta-feature calculation needed

def _process_dataset(dataset_id, min_qualities, save_data):
    """Fetch, filter and store one dataset; return 'successful', 'failed' or 'skipped'."""
    # Fetch meta-features first (lightweight operation)
    metafeatures = get_dataset_metafeatures(dataset_id)
    if metafeatures is None:
        return "failed"

    # Count the qualities across all category sub-dicts.
    total_qualities = sum(
        len(group)
        for key, group in metafeatures.items()
        if key not in ("dataset_id", "name")
    )
    if total_qualities < min_qualities:
        return "skipped"

    # Create a valid folder name
    dataset_name = metafeatures["name"].replace(" ", "_").replace("/", "_")
    dataset_folder = os.path.join(base_folder, f"{dataset_id}_{dataset_name}")
    os.makedirs(dataset_folder, exist_ok=True)

    metafeatures_file = os.path.join(dataset_folder, "metafeatures.json")
    with open(metafeatures_file, "w", encoding="utf-8") as f:
        json.dump(metafeatures, f, indent=2)

    if save_data:
        # Download the actual dataset (more heavyweight)
        dataset = openml.datasets.get_dataset(dataset_id)
        X, y, _, feature_names = dataset.get_data(
            target=dataset.default_target_attribute
        )

        df = pd.DataFrame(X, columns=feature_names)
        if y is not None:
            df["target"] = y

        csv_file = os.path.join(dataset_folder, f"{dataset_name}.csv")
        df.to_csv(csv_file, index=False)

    return "successful"


def download_datasets_with_metafeatures(num_datasets=100, min_qualities=20, save_data=False):
    """
    Download datasets from OpenML and save their meta-features

    For each of the first ``num_datasets`` listed datasets, the meta-features
    are written to ``<base_folder>/<id>_<name>/metafeatures.json``; with
    ``save_data=True`` the data is additionally written as a CSV file in the
    same folder. Progress and a final summary are printed.

    Args:
        num_datasets: Number of datasets to download (set high to get all)
        min_qualities: Minimum number of qualities required for a dataset
        save_data: Whether to save the actual dataset CSV

    Returns:
        None. Results are written to disk and reported via ``print``.
    """
    print("Fetching list of datasets from OpenML...")
    listing = openml.datasets.list_datasets()
    # Depending on the openml-python version the listing is either a dict
    # keyed by dataset id or a DataFrame with a "did" column; only the ids
    # are needed here. ``tolist()`` yields plain ints that json can serialize.
    if isinstance(listing, pd.DataFrame):
        dataset_ids = listing["did"].tolist()
    else:
        dataset_ids = list(listing)

    print(f"Found {len(dataset_ids)} datasets")

    selected_ids = dataset_ids[:num_datasets]
    counts = {"successful": 0, "failed": 0, "skipped": 0}

    for position, dataset_id in enumerate(
        tqdm(selected_ids, desc="Processing datasets"), start=1
    ):
        try:
            outcome = _process_dataset(dataset_id, min_qualities, save_data)
        except Exception as e:
            # Best-effort batch job: report the problem and keep going.
            print(f"Error processing dataset {dataset_id}: {str(e)}")
            outcome = "failed"
        counts[outcome] += 1

        # Be nice to the API. This runs for every dataset, including failed
        # and skipped ones, because those hit the server as well.
        time.sleep(0.2)

        # Print progress every 20 datasets
        if position % 20 == 0:
            print(f"Progress: {position}/{len(selected_ids)} datasets processed")
            print(f"  - Successful: {counts['successful']}")
            print(f"  - Failed: {counts['failed']}")
            print(f"  - Skipped (too few qualities): {counts['skipped']}")

    print("\nCompleted downloading meta-features:")
    print(f"  - Successful: {counts['successful']} datasets")
    print(f"  - Failed: {counts['failed']} datasets")
    print(f"  - Skipped (too few qualities): {counts['skipped']} datasets")
    print(f"Meta-features saved to {os.path.abspath(base_folder)}")

# Example usage in a Jupyter notebook.
#   num_datasets  - set to a large number to download many datasets
#   min_qualities - minimum number of meta-features required
#   save_data     - set to True to also save the actual CSV data
download_datasets_with_metafeatures(num_datasets=75, min_qualities=10, save_data=True)

Fetching list of datasets from OpenML...


  datasets_dict = openml.datasets.list_datasets()


Found 6024 datasets


Processing datasets:   0%|          | 0/75 [00:00<?, ?it/s]

Progress: 20/75 datasets processed
  - Successful: 20
  - Failed: 0
  - Skipped (too few qualities): 0
Progress: 40/75 datasets processed
  - Successful: 40
  - Failed: 0
  - Skipped (too few qualities): 0
Progress: 60/75 datasets processed
  - Successful: 60
  - Failed: 0
  - Skipped (too few qualities): 0

Completed downloading meta-features:
  - Successful: 75 datasets
  - Failed: 0 datasets
  - Skipped (too few qualities): 0 datasets
Meta-features saved to /Users/anukhayri/Desktop/mtlas/openml_datasets
