In [9]:
import pandas as pd
import numpy as np
import os
# local
import sys
sys.path.append('lcdb_function')
from lcdb_function.lcdb import get_dataset, get_inner_split

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


Anchor list

In [2]:
# Geometric grid of training-set sizes ("anchors"): ceil(16 * 2**(k/8)) for
# k = 0..136, i.e. eight anchors per doubling, starting at 16.
# A coarser grid is kept here for reference only:
#   np.ceil(16 * 2 ** (np.arange(35) / 2)).astype(int)
anchor_list = np.ceil(
    16 * np.power(2, np.arange(137) / 8)
).astype(int)
anchor_list

array([     16,      18,      20,      21,      23,      25,      27,
            30,      32,      35,      39,      42,      46,      50,
            54,      59,      64,      70,      77,      83,      91,
            99,     108,     118,     128,     140,     153,     166,
           182,     198,     216,     235,     256,     280,     305,
           332,     363,     395,     431,     470,     512,     559,
           609,     664,     725,     790,     862,     940,    1024,
          1117,    1218,    1328,    1449,    1580,    1723,    1879,
          2048,    2234,    2436,    2656,    2897,    3159,    3445,
          3757,    4096,    4467,    4871,    5312,    5793,    6317,
          6889,    7513,    8192,    8934,    9742,   10624,   11586,
         12634,   13778,   15025,   16384,   17867,   19484,   21248,
         23171,   25268,   27555,   30049,   32768,   35734,   38968,
         42495,   46341,   50536,   55109,   60097,   65536,   71468,
         77936,   84

OpenML dataset IDs (CC-18)

In [3]:
# OpenML dataset IDs to generate jobs for (72 entries, ascending order).
dataset_ids = [
    3, 6, 11, 12, 14, 15, 16, 18,
    22, 23, 28, 29, 31, 32, 37, 38,
    44, 46, 50, 54, 151, 182, 188, 300,
    307, 458, 469, 554, 1049, 1050, 1053, 1063,
    1067, 1068, 1461, 1462, 1464, 1468, 1475, 1478,
    1480, 1485, 1486, 1487, 1489, 1494, 1497, 1501,
    1510, 1590, 4134, 4534, 4538, 6332, 23381, 23517,
    40499, 40668, 40670, 40701, 40923, 40927, 40966, 40975,
    40978, 40979, 40982, 40983, 40984, 40994, 40996, 41027,
]

Seed range

In [4]:
# Seeds 0 through 4.
seed_list = range(0, 5)
seed_list

range(0, 5)

Learners

In [5]:
# Identifiers of the learners to schedule. The order matters: job ids are
# assigned in the order the learners appear here.
learner_zoo = [
    # SVC_* identifiers (not importable sklearn paths; presumably mapped to
    # SVC kernels by the experiment runner -- confirm)
    'SVC_linear',
    'SVC_poly',
    'SVC_rbf',
    'SVC_sigmoid',
    # sklearn.tree
    'sklearn.tree.DecisionTreeClassifier',
    'sklearn.tree.ExtraTreeClassifier',
    # sklearn.linear_model
    'sklearn.linear_model.LogisticRegression',
    'sklearn.linear_model.PassiveAggressiveClassifier',
    'sklearn.linear_model.Perceptron',
    'sklearn.linear_model.RidgeClassifier',
    'sklearn.linear_model.SGDClassifier',
    # sklearn.neural_network
    'sklearn.neural_network.MLPClassifier',
    # sklearn.discriminant_analysis
    'sklearn.discriminant_analysis.LinearDiscriminantAnalysis',
    'sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis',
    # sklearn.naive_bayes
    'sklearn.naive_bayes.BernoulliNB',
    'sklearn.naive_bayes.MultinomialNB',
    'sklearn.naive_bayes.ComplementNB',
    'sklearn.naive_bayes.GaussianNB',
    # sklearn.neighbors
    'sklearn.neighbors.KNeighborsClassifier',
    'sklearn.neighbors.NearestCentroid',
    # sklearn.ensemble
    'sklearn.ensemble.ExtraTreesClassifier',
    'sklearn.ensemble.RandomForestClassifier',
    'sklearn.ensemble.GradientBoostingClassifier',
    # sklearn.dummy
    'sklearn.dummy.DummyClassifier',
]

Create CSV files

In [6]:
def get_real_anchor_list(openml_id, feature_scaling):
    """Return the anchors of the global ``anchor_list`` usable for one dataset.

    The dataset is loaded with ``get_dataset`` (``mix=False``,
    ``preprocess=False``) and split with ``get_inner_split`` using
    ``outer_seed=0`` and ``inner_seed=0``. An anchor is kept when it does not
    exceed the number of rows of the resulting training partition.

    Args:
        openml_id: OpenML dataset id, forwarded to ``get_dataset``.
        feature_scaling: Forwarded unchanged to ``get_dataset``.

    Returns:
        The sub-array of ``anchor_list`` whose values are
        ``<= X_train.shape[0]``.
    """
    features, labels = get_dataset(openml_id, feature_scaling, mix=False, preprocess=False)
    # Only the training partition (first element of the split) is needed here.
    # NOTE(review): the size is taken from the seed-0/seed-0 split only; this
    # presumably matches the training size for every other seed pair -- confirm.
    train_features, *_ = get_inner_split(features, labels, outer_seed=0, inner_seed=0)
    fits_in_train = anchor_list <= train_features.shape[0]
    return anchor_list[fits_in_train]

In [15]:
import itertools

# --- Configuration ----------------------------------------------------------
timelimit = 3  # hours; stored verbatim in every job row
num_splits = 1000  # number of jobs/experiments_job<N>.csv files tasks are spread over
counter = 0  # running job id, unique across all datasets

# The first dataset creates the job files (with header); later datasets append.
first_dataset = True
feature_scaling_list = [True, False]
mix_list = [False]
realistic_list = [True, False]

jobs_dir = 'jobs'
# One seeded generator for all shuffles so that the assignment of tasks to job
# files is reproducible under Restart & Run All.
shuffle_rng = np.random.RandomState(0)

job_columns = ['jobid', 'openmlid', 'learner', 'size_train', 'outer_seed',
               'inner_seed', 'feature_scaling', 'mix', 'realistic', 'timelimit']

one_seed = False
if one_seed:
    print('only doing 1 seed for testing purpose!')
    seed_list = [0]
else:
    seed_list = range(5)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(jobs_dir, exist_ok=True)

# --- Job generation ---------------------------------------------------------
for dataset_id in dataset_ids:
    real_anchor_list = get_real_anchor_list(dataset_id, feature_scaling=False)

    print(f"Learner Zoo {len(learner_zoo)}, Real Anchor List {len(real_anchor_list)}")

    # Derived from the very same lists that are fed to itertools.product below,
    # so the reported count cannot drift from the rows actually generated.
    combinations = (
        len(learner_zoo)
        * len(real_anchor_list)
        * len(seed_list)  # outer seed
        * len(seed_list)  # inner seed
        * len(feature_scaling_list)
        * len(mix_list)
        * len(realistic_list)
    )
    print(f"we have here {combinations} combinations.")

    param_combinations = itertools.product(
        [dataset_id],
        learner_zoo,
        real_anchor_list,
        seed_list,             # outer seed
        seed_list,             # inner seed
        feature_scaling_list,  # fs values
        mix_list,
        realistic_list,        # realistic setting
        [timelimit]
    )

    # One row per combination, prefixed with a globally increasing job id.
    data = [
        [jobid, *combination]
        for jobid, combination in enumerate(param_combinations, start=counter)
    ]
    counter += len(data)

    df = pd.DataFrame(data, columns=job_columns)
    # Full, unshuffled task list of this dataset (written with the default index column).
    df.to_csv('jobs_dataset%d.csv' % dataset_id)
    df = df.sample(frac=1, random_state=shuffle_rng).reset_index(drop=True)

    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which newer pandas deprecates.
    # Chunk sizes are the same as np.array_split(df, num_splits) would produce.
    splits = [
        df.iloc[positions]
        for positions in np.array_split(np.arange(len(df)), num_splits)
    ]

    for index, value in enumerate(splits):
        job_path = os.path.join(jobs_dir, 'experiments_job%d.csv' % index)
        if first_dataset:
            value.to_csv(job_path, index=False)
        else:
            value.to_csv(job_path, mode='a', index=False, header=False)

    first_dataset = False
    print("-----------------------------------")



Loading raw data from OpenML ID 3
Learner Zoo 24, Real Anchor List 59
we have here 141600 combinations.
-----------------------------------
Loading raw data from OpenML ID 6
Learner Zoo 24, Real Anchor List 80
we have here 192000 combinations.
-----------------------------------
Loading raw data from OpenML ID 11
Learner Zoo 24, Real Anchor List 40
we have here 96000 combinations.
-----------------------------------
Loading raw data from OpenML ID 12
Learner Zoo 24, Real Anchor List 54
we have here 129600 combinations.
-----------------------------------
Loading raw data from OpenML ID 14
Learner Zoo 24, Real Anchor List 54
we have here 129600 combinations.
-----------------------------------
Loading raw data from OpenML ID 15
Learner Zoo 24, Real Anchor List 42
we have here 100800 combinations.
-----------------------------------
Loading raw data from OpenML ID 16
Learner Zoo 24, Real Anchor List 54
we have here 129600 combinations.
-----------------------------------
Loading raw data