Cell 1: Imports
This cell imports all necessary libraries:

Standard libraries (os, pandas, numpy, json)
Scikit-learn packages for model training, evaluation, and preprocessing
XGBoost for gradient boosting
Wittgenstein for the RIPPER algorithm
SimpleImputer for handling missing values
Warnings suppression

In [37]:
# Standard libraries
import os
import json
import warnings

# Third-party libraries
import numpy as np
import pandas as pd
import xgboost as xgb
from wittgenstein import RIPPER

# Scikit-learn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    accuracy_score, classification_report, precision_recall_fscore_support,
    precision_score, recall_score, f1_score, roc_auc_score, log_loss
)

# ML models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# Only suppress warnings we expect
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

Cell 2: Dataset Loading
This cell defines and executes a function load_openml_datasets() that:

Scans a directory called "openml_datasets" for subdirectories
Loads CSV data files and their corresponding JSON metadata files
Stores them in a dictionary with dataset names as keys
Prints a summary of all 75 loaded datasets, including their shapes and available meta-features

In [38]:
import os
import glob
import json
import pandas as pd
from tqdm.notebook import tqdm  # For progress tracking

def load_openml_datasets(root_dir, lazy_load=False):
    """
    Load datasets from the OpenML dataset directory structure.
    
    Args:
        root_dir: Path to the root directory containing dataset folders
        lazy_load: If True, only load metadata and load data on demand
        
    Returns:
        Dictionary mapping dataset names to their data and metadata
    """
    datasets = {}
    
    # Get all subdirectories (dataset folders)
    dataset_dirs = [d for d in glob.glob(os.path.join(root_dir, "*")) 
                   if os.path.isdir(d) and os.path.basename(d) != os.path.basename(root_dir)]
    
    # Process each dataset directory with progress bar
    for dataset_dir in tqdm(dataset_dirs, desc="Loading datasets"):
        dataset_name = os.path.basename(dataset_dir)
        
        # Find CSV and JSON files
        csv_files = glob.glob(os.path.join(dataset_dir, "*.csv"))
        json_files = glob.glob(os.path.join(dataset_dir, "*.json"))
        
        if csv_files and json_files:
            try:
                # Get paths
                data_path = csv_files[0]  # Assuming the first CSV file is the main one
                meta_path = json_files[0]  # Assuming the first JSON file has the metadata
                
                # Create dataset entry
                datasets[dataset_name] = {
                    'data_file': os.path.basename(data_path),
                    'meta_file': os.path.basename(meta_path),
                    'path': dataset_dir
                }
                
                # Load JSON metadata
                with open(meta_path, 'r') as f:
                    datasets[dataset_name]['meta_features'] = json.load(f)
                
                # Load data if not lazy loading
                if not lazy_load:
                    datasets[dataset_name]['data'] = pd.read_csv(data_path)
                
            except Exception as e:
                print(f"Error loading dataset {dataset_name}: {str(e)}")
    
    return datasets

# Function to get data for a dataset (used for lazy loading)
def get_dataset_data(datasets, dataset_name):
    """Get the actual data for a dataset, loading it if needed."""
    if dataset_name not in datasets:
        return None
        
    if 'data' not in datasets[dataset_name]:
        data_path = os.path.join(datasets[dataset_name]['path'], 
                                datasets[dataset_name]['data_file'])
        try:
            datasets[dataset_name]['data'] = pd.read_csv(data_path)
        except Exception as e:
            print(f"Error loading data for {dataset_name}: {str(e)}")
            return None
            
    return datasets[dataset_name]['data']

# Load the datasets
openml_datasets = load_openml_datasets(dataset_root, lazy_load=False)

# Print summary of loaded datasets
print(f"\nLoaded {len(openml_datasets)} datasets:")
for name, dataset in openml_datasets.items():
    print(f"\nDataset: {name}")
    print(f"Data shape: {dataset['data'].shape}")
    print(f"Meta features: {list(dataset['meta_features'].keys())[:5]}..." if dataset['meta_features'] else "No meta features")

Loading datasets:   0%|          | 0/75 [00:00<?, ?it/s]


Loaded 75 datasets:

Dataset: 118_BNG(mfeat-zernike,nominal,1000000)
Data shape: (1000000, 48)
Meta features: ['dataset_id', 'name', 'Simple', 'Statistical', 'Information_Theoretic']...

Dataset: 9_autos
Data shape: (205, 26)
Meta features: ['dataset_id', 'name', 'Simple', 'Statistical', 'Information_Theoretic']...

Dataset: 72_BNG(kr-vs-kp)
Data shape: (1000000, 37)
Meta features: ['dataset_id', 'name', 'Simple', 'Statistical', 'Information_Theoretic']...

Dataset: 55_hepatitis
Data shape: (155, 20)
Meta features: ['dataset_id', 'name', 'Simple', 'Statistical', 'Information_Theoretic']...

Dataset: 50_tic-tac-toe
Data shape: (958, 10)
Meta features: ['dataset_id', 'name', 'Simple', 'Statistical', 'Information_Theoretic']...

Dataset: 34_postoperative-patient-data
Data shape: (90, 9)
Meta features: ['dataset_id', 'name', 'Simple', 'Statistical', 'Information_Theoretic']...

Dataset: 62_zoo
Data shape: (101, 17)
Meta features: ['dataset_id', 'name', 'Simple', 'Statistical', 'Informatio

Cell 3: Model Training Function
This cell defines a comprehensive function train_models_on_dataset() that:

Takes a dataset name and training parameters
Preprocesses the data (handles missing values, encodes categorical features)
Scales features using StandardScaler
Trains seven different classification models:

Logistic Regression
Decision Tree
Random Forest
XGBoost
SVM
Neural Network (MLP)
Gaussian Naive Bayes


Evaluates each model and returns performance metrics

In [33]:
from joblib import Parallel, delayed
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss
)

def train_models_on_dataset(dataset_name, test_size=0.2, random_state=42, n_jobs=-1):
    """
    Train all models on a specific dataset.
    
    Args:
        dataset_name: Name of the dataset in openml_datasets dictionary
        test_size: Proportion of data to use for testing
        random_state: Random seed for reproducibility
        n_jobs: Number of parallel jobs for models that support it
        
    Returns:
        Dictionary with trained models and their performance metrics
    """
    if dataset_name not in openml_datasets:
        print(f"Dataset {dataset_name} not found")
        return None
    
    data = openml_datasets[dataset_name]['data']
    
    # Assume the target is the last column (you can modify this if needed)
    y_col = data.columns[-1]
    X = data.drop(columns=[y_col])
    y = data[y_col]
    
    # Handle missing values - separately for numeric and categorical columns
    print(f"  Handling missing values in dataset...")
    
    # Identify numeric and categorical columns
    numeric_cols = X.select_dtypes(include=['number']).columns
    categorical_cols = X.select_dtypes(exclude=['number']).columns
    
    print(f"  Found {len(numeric_cols)} numeric columns and {len(categorical_cols)} categorical columns")
    
    # Handle numeric columns with mean imputation
    if not numeric_cols.empty:
        numeric_imputer = SimpleImputer(strategy='mean')
        X[numeric_cols] = numeric_imputer.fit_transform(X[numeric_cols])
    
    # Handle categorical columns with most frequent value imputation
    if not categorical_cols.empty:
        cat_imputer = SimpleImputer(strategy='most_frequent')
        X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])
    
    # Adjust class labels to start from 0 if needed
    if pd.api.types.is_numeric_dtype(y):
        if y.min() != 0:
            print(f"  Adjusting class labels to start from 0 (original range: {y.min()}-{y.max()})")
            y = y - y.min()

    # Handle categorical features
    X = pd.get_dummies(X)
    
    # Convert categorical target to numeric if needed
    if not pd.api.types.is_numeric_dtype(y):
        y = pd.factorize(y)[0]
    
    # Check number of classes
    n_classes = len(np.unique(y))
    is_binary = n_classes == 2
    
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, 
        stratify=y if len(np.unique(y)) < 20 else None
    )
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Initialize models with better parameters
    models = {
        'LogisticRegression': LogisticRegression(
            max_iter=1000, 
            random_state=random_state,
            n_jobs=n_jobs if n_jobs > 0 else None,
            solver='saga' if X_train.shape[1] > 10000 else 'lbfgs'
        ),
        'DecisionTreeClassifier': DecisionTreeClassifier(
            random_state=random_state,
            max_depth=None,  # Allow full trees
            min_samples_split=2
        ),
        'RandomForestClassifier': RandomForestClassifier(
            n_estimators=100, 
            random_state=random_state,
            n_jobs=n_jobs if n_jobs > 0 else None,
            max_features='sqrt'  # More efficient for high-dimensional data
        ),
        'XGBClassifier': xgb.XGBClassifier(
            n_estimators=100, 
            random_state=random_state,
            n_jobs=n_jobs if n_jobs > 0 else None,
            early_stopping_rounds=10,
            eval_metric='mlogloss' if n_classes > 2 else 'logloss'
        ),
        'SVC': SVC(
            random_state=random_state, 
            max_iter=1000,
            probability=True  # Needed for ROC AUC
        ),
        'MLPClassifier': MLPClassifier(
            max_iter=500, 
            random_state=random_state,
            early_stopping=True,
            validation_fraction=0.1
        ),
        'GaussianNB': GaussianNB()
    }
    
    results = {}
    print(f"Training models on dataset: {dataset_name}")
    
    # Define a function to train a single model
    def train_single_model(name, model, X_train, y_train, X_test, y_test, is_binary, n_classes):
        try:
            print(f"  Training {name}...")
            
            # Special handling for XGBoost with early stopping
            if name == 'XGBClassifier':
                X_tr, X_val, y_tr, y_val = train_test_split(
                    X_train_scaled, y_train, test_size=0.2, random_state=random_state
                )
                model.fit(
                    X_tr, y_tr,
                    eval_set=[(X_val, y_val)],
                    verbose=False
                )
            else:
                model.fit(X_train_scaled, y_train)
                
            y_pred = model.predict(X_test_scaled)
            
            # Calculate metrics based on problem type
            accuracy = accuracy_score(y_test, y_pred)
            
            # For multiclass problems
            precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
            recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
            f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
            
            # ROC AUC - handle binary vs multiclass
            roc_auc = None
            if hasattr(model, "predict_proba"):
                y_proba = model.predict_proba(X_test_scaled)
                if is_binary:
                    # For binary classification
                    roc_auc = roc_auc_score(y_test, y_proba[:, 1])
                elif n_classes > 2:
                    # For multiclass, use OVR approach
                    try:
                        roc_auc = roc_auc_score(
                            np.eye(n_classes)[y_test], 
                            y_proba,
                            multi_class='ovr'
                        )
                    except:
                        roc_auc = None
            
            # Log loss if available
            log_loss_value = None
            if hasattr(model, "predict_proba"):
                try:
                    log_loss_value = log_loss(y_test, y_proba)
                except:
                    log_loss_value = None
            
            result = {
                'model': model,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'roc_auc': roc_auc,
                'log_loss': log_loss_value,
                'predictions': y_pred
            }
            
            print(f"    Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}, Log Loss: {log_loss_value:.4f if log_loss_value else 'N/A'}")
            return name, result
            
        except Exception as e:
            print(f"    Error training {name}: {str(e)}")
            return name, {'error': str(e)}
    
    # Train models (sequentially for better notebook output)
    for name, model in models.items():
        model_name, model_results = train_single_model(
            name, model, X_train_scaled, y_train, X_test_scaled, y_test, is_binary, n_classes
        )
        results[model_name] = model_results
    
    return results

Cell 4: Results Saving Function
This cell defines a function save_results_to_csv() that:

Takes model performance results and a dataset name
Formats the results into a DataFrame
Saves the DataFrame to a CSV file named "[dataset_name]_model_results.csv"

In [34]:
import datetime

def save_results_to_csv(results, dataset_name, include_timestamp=True):
    """
    Save model performance results to a CSV file.

    Args:
        results (dict): Dictionary with trained models and their performance metrics.
        dataset_name (str): Name of the dataset used for training.
        include_timestamp (bool): Whether to include a timestamp in the filename.
    """
    try:
        # Create data rows
        data = []
        for model_name, metrics in results.items():
            if 'error' in metrics:  # If model training failed
                data.append([model_name, 'Error', 'Error', 'Error', 'Error', 'Error', 'Error'])
            else:
                # Use `.get()` to handle missing keys safely
                data.append([
                    model_name, 
                    metrics.get('accuracy', 'N/A'), 
                    metrics.get('precision', 'N/A'), 
                    metrics.get('recall', 'N/A'), 
                    metrics.get('f1_score', 'N/A'), 
                    metrics.get('roc_auc', 'N/A'),
                    metrics.get('log_loss', 'N/A')
                ])

        # Create a DataFrame
        df = pd.DataFrame(data, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'Log Loss'])

        # Add timestamp if requested
        timestamp = ""
        if include_timestamp:
            timestamp = f"_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
        
        # Save to CSV
        filename = f"{dataset_name}{timestamp}_model_results.csv"
        df.to_csv(filename, index=False)
        
        print(f"Results saved to {filename}")
        return filename
        
    except Exception as e:
        print(f"Error saving results: {str(e)}")
        return None

Cell 5: Single Dataset Training and Saving
This cell:

Runs the training function on a specific dataset ('16_mfeat-karhunen')
Saves the results to a CSV file

In [35]:
import matplotlib.pyplot as plt
import seaborn as sns

# Choose a specific dataset by name
test_dataset = "16_mfeat-karhunen"
print(f"Testing with dataset: {test_dataset}")

# Run the training function
results = train_models_on_dataset(test_dataset)

# Save results to CSV
csv_filename = save_results_to_csv(results, test_dataset)

# Print summary for this dataset
print(f"\n=== SUMMARY FOR {test_dataset} ===")
accuracies = {name: res['accuracy'] for name, res in results.items() if 'accuracy' in res}

if accuracies:
    best_model = max(accuracies.items(), key=lambda x: x[1])
    print(f"  Best model: {best_model[0]} (Accuracy: {best_model[1]:.4f})")
    
    print("  All models:")
    for model_name, accuracy in sorted(accuracies.items(), key=lambda x: x[1], reverse=True):
        print(f"    {model_name}: {accuracy:.4f}")

    # Visualize the results
    plt.figure(figsize=(12, 6))
    
    # Accuracy comparison
    sorted_models = sorted(accuracies.items(), key=lambda x: x[1], reverse=True)
    plt.barh([name for name, _ in sorted_models], [acc for _, acc in sorted_models])
    plt.xlabel('Accuracy')
    plt.ylabel('Model')
    plt.title(f'Model Accuracy Comparison - {test_dataset}')
    plt.xlim(0, 1)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    for i, (_, acc) in enumerate(sorted_models):
        plt.text(acc + 0.01, i, f'{acc:.4f}', va='center')
    
    plt.tight_layout()
    plt.show()

Testing with dataset: 16_mfeat-karhunen
  Handling missing values in dataset...
  Found 64 numeric columns and 0 categorical columns
  Adjusting class labels to start from 0 (original range: 1-10)
Training models on dataset: 16_mfeat-karhunen
  Training LogisticRegression...
    Error training LogisticRegression: Invalid format specifier '.4f if log_loss_value else 'N/A'' for object of type 'float'
  Training DecisionTreeClassifier...
    Error training DecisionTreeClassifier: Invalid format specifier '.4f if log_loss_value else 'N/A'' for object of type 'float'
  Training RandomForestClassifier...
    Error training RandomForestClassifier: Invalid format specifier '.4f if log_loss_value else 'N/A'' for object of type 'float'
  Training XGBClassifier...
    Error training XGBClassifier: Invalid format specifier '.4f if log_loss_value else 'N/A'' for object of type 'float'
  Training SVC...
    Error training SVC: Invalid format specifier '.4f if log_loss_value else 'N/A'' for object of

Cell 6: Test Run with Performance Summary
This cell:

Runs the training function again on the same dataset ('16_mfeat-karhunen')
Prints a summary of the results, showing each model's accuracy
Identifies the best performing model (in this case, SVC and MLPClassifier tied at 97% accuracy)

In [36]:
# Choose a specific dataset by name
test_dataset = "BNG(lymph,nominal,1000000)"  # Replace "iris" with the actual name of your dataset
print(f"Testing with dataset: {test_dataset}")

# Run the training function on just this dataset
results = train_models_on_dataset(test_dataset)

# Print summary for this dataset
print(f"\n=== SUMMARY FOR {test_dataset} ===")
accuracies = {name: res['accuracy'] for name, res in results.items() if 'accuracy' in res}

if accuracies:
    best_model = max(accuracies.items(), key=lambda x: x[1])
    print(f"  Best model: {best_model[0]} (Accuracy: {best_model[1]:.4f})")
    
    print("  All models:")
    for model_name, accuracy in sorted(accuracies.items(), key=lambda x: x[1], reverse=True):
        print(f"    {model_name}: {accuracy:.4f}")

Testing with dataset: BNG(lymph,nominal,1000000)
Dataset BNG(lymph,nominal,1000000) not found

=== SUMMARY FOR BNG(lymph,nominal,1000000) ===


AttributeError: 'NoneType' object has no attribute 'items'

Cell 7: Batch Processing (Commented Out)
This cell contains code to:

Train models on all datasets in the collection
Generate a comprehensive summary of results across all datasets
The cell is not executed (no output shown) and likely would take a long time to run given the large number of datasets

In [None]:
all_results = {}
for dataset_name in openml_datasets.keys():
    print(f"\n=== Dataset: {dataset_name} ===")
    all_results[dataset_name] = train_models_on_dataset(dataset_name)

print("\n=== SUMMARY OF RESULTS ===")
for dataset_name, results in all_results.items():
    print(f"\nDataset: {dataset_name}")
    accuracies = {name: res['accuracy'] for name, res in results.items() if 'accuracy' in res}
    
    if accuracies:
        best_model = max(accuracies.items(), key=lambda x: x[1])
        print(f"  Best model: {best_model[0]} (Accuracy: {best_model[1]:.4f})")
        
        print("  All models:")
        for model_name, accuracy in sorted(accuracies.items(), key=lambda x: x[1], reverse=True):
            print(f"    {model_name}: {accuracy:.4f}")