In [46]:
# Suppress warnings
# NOTE: calling filterwarnings("ignore") with no further arguments installs a
# blanket filter, so every warning category is silenced from here on.
import warnings
warnings.filterwarnings("ignore")

In [13]:
# Import libraries
from pmlb import dataset_names, fetch_data
from pmlb import classification_dataset_names, regression_dataset_names
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb

# Import SK-learn and AutoSK-Learn
import autosklearn.classification
import autosklearn.regression
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

In [35]:
# Command-line options for the benchmark run:
#
#   -min / --minset : first dataset number to benchmark (default 1)
#   -max / --maxset : last dataset number to benchmark
#                     (default is 166 for classification, 120 for regression)
#   -t   / --times  : list of training-time budgets in seconds (default [3600])
#   -r / -c         : mutually exclusive choice between regression and
#                     classification datasets

import argparse

parser = argparse.ArgumentParser(description='Run Auto-SkLearn on PMLB datasets')

parser.add_argument('-min', '--minset', type=int, metavar='', required=False, default=1, help = 'Min dataset number (default 1)')
parser.add_argument('-max', '--maxset', type=int, metavar='', required=False, default=166, help = '# Max dataset number (default is 166 for classifcation, 120 for regression)')
# nargs='+' produces a list whenever the flag is supplied, so the default has
# to be a list as well; a bare int default breaks code that iterates over the
# parsed value.
parser.add_argument('-t', '--times', type=int, nargs='+', metavar='', required=False, default=[3600], help = 'List of number of seconds to train datasets on (default is 3600)')

# Only one of -r / -c may be given; when neither is given both flags are False.
class_group = parser.add_mutually_exclusive_group()
class_group.add_argument('-r', '--regre_sets', action='store_true', help='Benchmark on regression sets')
class_group.add_argument('-c', '--class_sets', action='store_true', help='Benchmark on classification sets (default)')



_StoreTrueAction(option_strings=['-c', '--class_sets'], dest='class_sets', nargs=0, const=True, default=False, type=None, choices=None, help='Benchmark on classification sets', metavar=None)

In [68]:
# Parse a fixed argument list (the notebook stand-in for a real command line)
# and derive the run configuration used by the cells below.
args = parser.parse_args(['-min', '1', '-max', '10', '-t', '80', '120'])

# Assign variables based on arguments
minset = args.minset
maxset = args.maxset
times = args.times
regre_sets = args.regre_sets
class_sets = args.class_sets

# The cells below iterate over `times`; wrap a scalar (e.g. a scalar parser
# default) so that iteration always works.
if isinstance(times, int):
    times = [times]

# Set classification sets to default if no class was selected
if not regre_sets and not class_sets:
    class_sets = True

# Rescale dataset numbers to be within boundaries
if minset < 1:
    minset = 1
    print('Minset provided is less than 1, changed to 1.')
if class_sets and maxset > 166:
    maxset = 166
    print('Maxset provided is greater than 166, changed to 166.')
if regre_sets and maxset > 120:
    maxset = 120
    print('Maxset provided is greater than 120, changed to 120.')

# After clamping, an inverted range would select no datasets at all and the
# run would silently produce an empty result table, so fail loudly instead.
if minset > maxset:
    raise ValueError(
        'minset (' + str(minset) + ') is greater than maxset (' + str(maxset)
        + '); no datasets would be selected.'
    )

print(minset)
print(maxset)
print(times)
print(regre_sets)
print(class_sets)
        

1
10
[80, 120]
False
True


In [71]:
# Create a dictionary of the number of features, instances, and classes per
# selected dataset:
#     dataset name -> (num_instances, num_features, num_classes, dataset_number)
# Potentially look into including number of binary, integer, and float features in the future

# Not used by the visible cells; kept in case other cells reference it.
datasets = []
dataset_props = {}

# Pick the name list that matches the selected task, then keep the 1-based
# inclusive range [minset, maxset].
# NOTE: this rebinds `dataset_names`, shadowing the list imported from pmlb;
# the name is kept because the benchmark cell iterates over it.
selected_names = classification_dataset_names if class_sets else regression_dataset_names
dataset_names = selected_names[minset - 1: maxset]

# Number the datasets starting at minset so the number matches the position
# in the full pmlb list.
for dataset_number, dataset in enumerate(dataset_names, start=minset):
    X, y = fetch_data(dataset, return_X_y=True)
    num_instances, num_features = X.shape
    # Count of distinct target values (for regression targets this is just
    # the number of distinct y values, not a class count).
    num_classes = np.unique(y).size
    dataset_props[dataset] = (num_instances, num_features, num_classes, dataset_number)

In [72]:
# Display the property table built above:
# name -> (num_instances, num_features, num_classes, dataset_number)
dataset_props

{'GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_EDM-1_1': (1600, 1000, 2, 1),
 'GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1': (1600, 20, 2, 2),
 'GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1': (1600, 20, 2, 3),
 'GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1': (1600, 20, 2, 4),
 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM-2_001': (1600,
  20,
  2,
  5),
 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_75_EDM-2_001': (1600,
  20,
  2,
  6),
 'Hill_Valley_with_noise': (1212, 100, 2, 7),
 'Hill_Valley_without_noise': (1212, 100, 2, 8),
 'adult': (48842, 14, 2, 9),
 'agaricus-lepiota': (8145, 22, 2, 10)}

In [60]:
# Run Auto-SKLearn on every selected dataset for every time budget and collect
# one result dictionary per (time_cap, dataset) pair in df_rows_list.
df_rows_list = []
for time_cap in times:
    print('CURRENT TIME IS ', time_cap)

    for dataset in dataset_names:
        # Unpack once instead of indexing the tuple by position repeatedly.
        num_instances, num_features, num_classes, dataset_number = dataset_props[dataset]
        print("Auto-SKLearn, on dataset ", dataset, " | Number: ", str(dataset_number), "max of ", str(maxset))
        print("Properties: ")
        print(str(dataset_props[dataset]))

        # Split the data to training and test sets (fixed seed for a
        # reproducible split)
        X, y = fetch_data(dataset, return_X_y=True)
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)

        # Pick the estimator that matches the selected task; a classifier
        # must not be fitted on regression targets.
        if regre_sets:
            automl = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task=time_cap)
        else:
            automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=time_cap)
        print("Auto-SKLearn, fitting")
        automl.fit(X_train, y_train)
        print("Auto-SKLearn, testing")
        # NOTE(review): score() presumably reports accuracy for the classifier
        # and R^2 for the regressor (auto-sklearn defaults) -- confirm.
        current_score = automl.score(X_test, y_test)
        print("Auto-SKLearn, finished testing on set ", str(dataset_number))
        print("Current time Autosklearn score: ", str(current_score))

        # Store the result in a dictionary
        curr_dataset_results = {
            'name': dataset,
            'number': dataset_number,
            'num_instances': num_instances,
            'num_features': num_features,
            'num_classes': num_classes,
            'time_cap': time_cap,
            'score': current_score,
        }

        # Append current dictionary to a list of dictionaries
        df_rows_list.append(curr_dataset_results)
        




CURRENT TIME IS  100
Auto-SKLearn, on dataset  GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1  | Number:  2 out of  4
Properties: 
(1479, 8, 9, 2)
Auto-SKLearn, fitting
Time limit for a single run is higher than total time limit. Capping the limit for a single run to the total time given to SMAC (99.425137)
[15:43:31] /workspace/src/gbm/gbtree.cc:492: drop 0 trees, weight = 1
[15:43:31] /workspace/src/gbm/gbtree.cc:492: drop 0 trees, weight = 1
[15:43:31] /workspace/src/gbm/gbtree.cc:492: drop 0 trees, weight = 1
[15:43:31] /workspace/src/gbm/gbtree.cc:492: drop 0 trees, weight = 1
[15:43:31] /workspace/src/gbm/gbtree.cc:492: drop 0 trees, weight = 1
[15:43:31] /workspace/src/gbm/gbtree.cc:492: drop 0 trees, weight = 1
[15:43:31] /workspace/src/gbm/gbtree.cc:492: drop 0 trees, weight = 1
[15:43:31] /workspace/src/gbm/gbtree.cc:492: drop 0 trees, weight = 1
[15:43:31] /workspace/src/gbm/gbtree.cc:492: drop 0 trees, weight = 1
[15:43:31] /workspace/src/gbm/gbtree.cc:492: drop 0 trees, weigh

Time limit for a single run is higher than total time limit. Capping the limit for a single run to the total time given to SMAC (199.505947)
Auto-SKLearn, testing
Auto-SKLearn, finished testing on set  4
Current time Autosklearn score:  0.6075


In [61]:
# Turn the list of per-run result dictionaries into a single DataFrame
# (one row per dictionary in df_rows_list).
autosklearn_df = pd.DataFrame(df_rows_list)

In [62]:
# Display the collected benchmark results.
autosklearn_df

Unnamed: 0,name,num_classes,num_features,num_instances,number,score,time_cap
0,GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1,9,8,1479,2,0.6275,100
1,GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1,9,8,1479,3,0.7575,100
2,GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1,9,8,1479,4,0.55,100
3,GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1,9,8,1479,2,0.6275,200
4,GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1,9,8,1479,3,0.725,200
5,GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1,9,8,1479,4,0.6075,200


In [67]:
# Write the results table to disk. The file name encodes the run settings:
#     <c|r>_<minset>_<maxset>_times_<t1>_<t2>_....csv
if class_sets:
    set_type_string = 'c'
else:
    set_type_string = 'r'

# All time budgets, joined with underscores.
times_string = '_'.join(str(seconds) for seconds in times)

file_name = '_'.join([set_type_string, str(minset), str(maxset), 'times', times_string]) + '.csv'
print('saving to ', file_name)

# NOTE: the file is written tab-separated even though it carries a .csv
# extension; readers must pass sep='\t'.
autosklearn_df.to_csv(file_name, sep='\t')

saving to  c_2_4_times_100_200.csv
