In [2]:
import os
import openml
import pandas as pd
import numpy as np
from scipy.io.arff import loadarff
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from fastprogress.fastprogress import master_bar, progress_bar

In [3]:
seed = 42

## TODO
- shuffle columns except class
- save dataframes with {name}\_{mean_score}\_{std_score}.csv
- initially test only on classical suite than extend to all classification datasets
- implement selfsupervised as in [TabNet](https://arxiv.org/pdf/1908.07442.pdf)

In [4]:
path = '../data/'
count = 0

os.makedirs('../samples', exist_ok=True)

clf1 = RandomForestClassifier(random_state=seed)

In [5]:
openml_list = openml.datasets.list_datasets()
suites = openml.study.list_suites(output_format="dataframe", status="all")

In [6]:
suites.head(100)

Unnamed: 0,id,alias,main_entity_type,name,status,creation_date,creator
14,14,OpenML100,task,"Collaborative, reproducible benchmarking and a...",in_preparation,2019-02-21 18:40:13,1
99,99,OpenML-CC18,task,OpenML Benchmarking Suites and the OpenML-CC18,in_preparation,2019-02-21 18:47:13,1
216,216,CoolStudy,task,A Cool Study with an Awesome name,in_preparation,2019-02-28 23:00:26,8006
217,217,Study,task,A Cool Study with an Awesome name,in_preparation,2019-02-28 23:05:11,8006
218,218,AutoML-Benchmark,task,AutoML Benchmark,in_preparation,2019-05-02 13:35:08,869
219,219,FOREX,task,Forex,in_preparation,2019-06-04 00:45:17,1
225,225,OpenML-friendly,task,OpenML100-friendly,active,2019-09-16 19:41:46,1
236,236,a9ee1f0b2a4b48b6b6da1653fe92890e,task,Item Response Theory for Classification problems,in_preparation,2020-04-06 21:38:55,64
239,239,c638a5d3d31241179f9b4853951fdb79,task,Item Response Theory for Regression problems,in_preparation,2020-04-19 22:15:30,64
240,240,e5e7f56c8655433eb2418c240ec8b8c0,task,InvestigatingDL,in_preparation,2020-04-28 02:30:38,2902


In [7]:
suite = openml.study.get_suite(271)
print(suite)

OpenML Benchmark Suite
ID..............: 271
Name............: AutoML Benchmark All Classification
Status..........: in_preparation
Main Entity Type: task
Study URL.......: https://www.openml.org/s/271
# of Data.......: 71
# of Tasks......: 71
Creator.........: https://www.openml.org/u/869
Upload Time.....: 2020-11-19 20:52:19


In [9]:
datasets = suite.data

In [13]:
for idd in progress_bar(datasets[20:]):
    dataset = openml.datasets.get_dataset(idd)
    print(f'Getting dataset id {idd}, {dataset.name}...')
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="array", target=dataset.default_target_attribute
    )
    df = pd.DataFrame(X, columns=attribute_names)
    df["class"] = y
    df.to_csv(path+dataset.name+'.csv',index=False)

Getting dataset id 1487, ozone-level-8hr...
Getting dataset id 54, vehicle...
Getting dataset id 41144, madeline...


OpenMLServerException: https://www.openml.org/api/v1/xml/data/qualities/41145 returned code 107: Database connection error. Usually due to high server load. Please wait for N seconds and try again. - None

In [23]:
mb = master_bar(os.listdir(path))

for f in mb:
    mb.main_bar.comment = f'Files'
    data, _ = loadarff(path+f)
    sampler = pd.DataFrame(data)
    print(sampler.columns)
    if sampler.shape[0] < 1000:
        continue
    for i in progress_bar(range(10), parent=mb):
        df = sampler.sample(500, replace=True, random_state=seed)
        X = df.drop(["class"], axis=1).astype(str)
        y = df["class"].astype(str)
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        scores = []
        for train_idx, test_idx in kfold.split(X, y):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
            clf1.fit(X_train, y_train)
            scores.append(clf1.score(X_test, y_test))
        df.to_csv('../samples/'+f'{f}_{np.mean(scores):.3f}.csv')
        mb.child.comment = f'Sampler'
    mb.write(f'Finished {f}')

Index(['height', 'lenght', 'area', 'eccen', 'p_black', 'p_and', 'mean_tr',
       'blackpix', 'blackand', 'wb_trans', 'class'],
      dtype='object')
Index(['oz1', 'oz2', 'oz3', 'oz4', 'oz5', 'oz6', 'oz7', 'oz8', 'oz9', 'oz10',
       'oz11', 'oz12', 'oz13', 'oz14', 'oz15', 'oz16', 'oz17', 'oz18', 'oz19',
       'oz20', 'oz21', 'oz22', 'oz23', 'oz24', 'oz25', 'class'],
      dtype='object')
Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31',
       'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41',
       'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51',
       'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61',
       'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71',
       'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80',

ValueError: could not convert string to float: "b'v3'"

In [24]:
import arff

ModuleNotFoundError: No module named 'arff'