In [2]:
# Suppress warnings: install an "ignore" filter so that warnings raised
# after this point are not shown in the cell output.
import warnings
warnings.filterwarnings("ignore")

# Import libraries
from pmlb import dataset_names, classification_dataset_names, regression_dataset_names, fetch_data
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb
import pickle
import sys

# Import SK-learn and AutoSK-Learn
import autosklearn.classification
import autosklearn.regression
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

  from collections import Mapping, defaultdict


In [3]:
import argparse

# Command-line style options controlling the benchmark run.
parser = argparse.ArgumentParser(description='Run Auto-SkLearn on PMLB datasets')

# Range of datasets to benchmark (1-based, both ends inclusive).
parser.add_argument('-min', '--minset', type=int, metavar='', required=False, default=1, help = 'Min dataset number (default 1)')
parser.add_argument('-max', '--maxset', type=int, metavar='', required=False, default=166, help = '# Max dataset number (default is 166 for classifcation, 120 for regression)')
# Memory limit (MB) handed to Auto-sklearn.
parser.add_argument('-mem', '--memory', type=int, metavar='', required=False, default=3072, help = '# Memory capacity for the AutoSklean script (default 3072MB)')
# Flag: when given, XGBoost is excluded from the estimators Auto-sklearn may use.
parser.add_argument('-noxg', '--no_xgboost', action='store_true', help = '# Remove XGBoost library from being used in Auto-SkLearn')
# Upper bound (seconds) of the doubling time budget. The default is 3600 as
# stated in the help text; a default of 1 is below the 36 s starting budget
# of the benchmark loop, which would therefore never execute.
parser.add_argument('-t', '--maxtime', type=int, metavar='', required=False, default=3600, help = 'Maximum time to run the model for in seconds(default 3600)')

# Regression and classification benchmarks cannot be requested together.
class_group = parser.add_mutually_exclusive_group()
class_group.add_argument('-r', '--regre_sets', action='store_true', help='Benchmark on regression sets')
class_group.add_argument('-c', '--class_sets', action='store_true', help='Benchmark on classification sets (default)')

_StoreTrueAction(option_strings=['-c', '--class_sets'], dest='class_sets', nargs=0, const=True, default=False, type=None, choices=None, help='Benchmark on classification sets (default)', metavar=None)

In [10]:
# Parse an explicit argument list (instead of sys.argv) so the options can be
# set from inside the notebook.
args = parser.parse_args([
    '-min', '1',
    '-max', '10',
    '-noxg',
    '-t', '36',
    '-c',
])

In [11]:
# Copy the parsed options into the plain variables used by the later cells
minset, maxset = args.minset, args.maxset
max_time, memory_cap = args.maxtime, args.memory
regre_sets, class_sets = args.regre_sets, args.class_sets
no_xgboost = args.no_xgboost

In [12]:
# Fall back to the classification benchmark when no set type was requested
if not (regre_sets or class_sets):
    class_sets = True

# Bring the requested dataset range within the valid boundaries
if minset > maxset:
    # The bounds arrived in reverse order, so exchange them
    minset, maxset = maxset, minset
if minset < 1:
    print('Minset provided is less than 1, changed to 1.')
    minset = 1
if class_sets and maxset > 166:
    print('Maxset provided is greater than 166, changed to 166.')
    maxset = 166
if regre_sets and maxset > 120:
    print('Maxset provided is greater than 120, changed to 120.')
    maxset = 120

In [13]:
# Echo the effective settings, one value per line
print(minset, maxset, max_time, regre_sets, class_sets, no_xgboost, memory_cap, sep='\n')

1
10
36
False
True
True
3072


In [14]:
# Create a dictionary mapping every selected dataset name to the tuple
# (num_instances, num_features, num_classes, dataset_number).
# num_classes is -1 when benchmarking regression sets.
# Potentially look into including number of binary, integer, and float features in the future

datasets = []
dataset_props = {}

# NOTE: this rebinds the name `dataset_names` imported from pmlb to the
# selected slice; the later cells iterate over this rebound list.
if class_sets:
    dataset_names = classification_dataset_names[minset-1: maxset]
if regre_sets:
    dataset_names = regression_dataset_names[minset-1: maxset]

# dataset_number is the 1-based position of the dataset in the full PMLB list,
# so numbering starts at minset and still advances past skipped datasets.
for dataset_number, dataset in enumerate(dataset_names, start=minset):
    X, y = fetch_data(dataset, return_X_y=True)
    num_instances, num_features = X.shape
    # Datasets with more than 500000 instances are left out of the benchmark
    if num_instances > 500000:
        continue
    num_classes = np.unique(y).size if class_sets else -1
    dataset_props[dataset] = (num_instances, num_features, num_classes, dataset_number)

# Drop the skipped datasets from the list as well: every name that is iterated
# later must have an entry in dataset_props, otherwise the lookup raises KeyError.
dataset_names = [name for name in dataset_names if name in dataset_props]

In [None]:
# Benchmark Auto-sklearn on every selected dataset with a doubling time budget
# (36s, 72s, ... capped at max_time). One result row is collected per
# (dataset, time budget) pair and the accumulated table is rewritten to disk
# after every run.
df_rows_list = []

# The output file does not change during the run, so build its name once.
set_type_string = 'c' if class_sets else 'r'
file_name = 'PMLB_benchmark_results/' + set_type_string + '_' + str(minset) + '_' + str(maxset) + '_' + 'maxtime' + '_' + str(max_time) + '.csv'

# Pick the estimator that matches the task type. Regression sets always get a
# regressor, whether or not XGBoost is excluded.
if regre_sets:
    estimator_class = autosklearn.regression.AutoSklearnRegressor
else:
    estimator_class = autosklearn.classification.AutoSklearnClassifier

# Start at 36 seconds, or directly at max_time when that is smaller, so a
# max_time below 18 still yields one run instead of an empty benchmark.
time_cap = min(36, max_time)
while time_cap < max_time * 2:
    # Cap the time cap at the max time when the benchmarker is on its final iteration
    if time_cap > max_time:
        time_cap = max_time
    print('CURRENT TIME IS ', time_cap)

    for dataset in dataset_names:
        # Datasets without recorded properties were excluded when the
        # properties were collected; skip them instead of raising KeyError.
        if dataset not in dataset_props:
            continue
        props = dataset_props[dataset]

        print("Auto-SKLearn, on dataset ", dataset, " | Number: ", str(props[3]), "max of ", str(maxset))
        print("Properties: ")
        print(str(props))

        # Split the data to training and test sets
        X, y = fetch_data(dataset, return_X_y=True)
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)

        # Configure the estimator for the current time budget
        automl_kwargs = {'time_left_for_this_task': time_cap,
                         'ml_memory_limit': memory_cap}
        if no_xgboost:
            # exclude_estimators is documented as a list of component names;
            # the component name carries no '.py' suffix.
            automl_kwargs['exclude_estimators'] = ['xgradient_boosting']
        automl = estimator_class(**automl_kwargs)

        # Use the fit and test with AutoSkLearn on the current data.
        # If an exception occurs, report it and continue to the next dataset.
        # Only Exception is caught so that KeyboardInterrupt still stops the run.
        try:
            print("Auto-SKLearn, fitting")
            automl.fit(X_train, y_train)
            print("Auto-SKLearn, testing")
            current_score = automl.score(X_test, y_test)
            print("Auto-SKLearn, finished testing on set ", str(props[3]))
            print("Current time Autosklearn score: ", str(current_score))
        except Exception as error:
            print("EXCEPTION: CURRENT DATASET FAILED WITH AUTOSKLEARN. CONTINUING TO NEXT DATASET.")
            print(repr(error))
            continue

        # Store the result, including the pickled model, in a dictionary
        curr_dataset_results = {
            'name': dataset,
            'number': props[3],
            'num_instances': props[0],
            'num_features': props[1],
            'num_classes': props[2],
            'time_cap': time_cap,
            'score': current_score,
            'model': pickle.dumps(automl),
        }
        print('size of model mb: ', str(sys.getsizeof(curr_dataset_results['model'])/1000000))

        # Append current dictionary to a list of dictionary
        df_rows_list.append(curr_dataset_results)

        # Create a Pandas Dataframe with the results. sort_values returns a
        # new frame, so the result has to be kept for the sort to take effect.
        autosklearn_df = pd.DataFrame(df_rows_list).sort_values(by=['number', 'time_cap'])

        # Save results into a CSV after every round; report only once written
        autosklearn_df.to_csv(file_name, sep='\t')
        print('saved to ', file_name)

    time_cap *= 2

CURRENT TIME IS  36
Auto-SKLearn, on dataset  GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_EDM-1_1  | Number:  1 max of  10
Properties: 
(1600, 1000, 2, 1)
Auto-SKLearn, fitting
Time limit for a single run is higher than total time limit. Capping the limit for a single run to the total time given to SMAC (35.591344)
Auto-SKLearn, testing
Auto-SKLearn, finished testing on set  1
Current time Autosklearn score:  0.4825
size of model mb:  5.688343
saved to  PMLB_benchmark_results/c_1_10_maxtime_36.csv
Auto-SKLearn, on dataset  GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1  | Number:  2 max of  10
Properties: 
(1600, 20, 2, 2)
Auto-SKLearn, fitting
Time limit for a single run is higher than total time limit. Capping the limit for a single run to the total time given to SMAC (35.538961)
Auto-SKLearn, testing
Auto-SKLearn, finished testing on set  2
Current time Autosklearn score:  0.6125
size of model mb:  31.558024
saved to  PMLB_benchmark_results/c_1_10_maxtime_36.csv
Auto-SKLearn, on datase