In [2]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import robust_scale
from sklearn.metrics import f1_score
from fastprogress.fastprogress import master_bar, progress_bar
from sklearn.utils import resample

In [3]:
# One seed for the whole notebook: it is fed to Python's global `random`
# generator here and reused as `random_state` by the cells below.
seed = 42
random.seed(a=seed)

## TODO
- save dataframes with {name}\_{mean_score}\_{std_score}.csv
- initially test only on the classical suite, then extend to all classification datasets
- noise injection for "data augmentation"
- explore other data augmentation
- one-hot encoding (as applied in OpenML) could be a terrible option; try target encoding or another method
- save robust scaler to disk
- check if scaler is not biased
- check if target encoder is not biased
- shuffle all columns except class
- `get_dummies`: drop the linearly dependent dummy column (`drop_first=True`)?

In [4]:
# Directory whose entries are listed and read as CSV datasets further down.
path = '../data/'

# Create both export folders up front; an already existing folder is fine.
for sample_dir in ('../samples_train', '../samples_valid'):
    os.makedirs(sample_dir, exist_ok=True)

In [5]:
# Read the split table and keep, as a plain list, the `files` entries of the
# rows whose `train` flag is False. The export loop writes samples of the
# files listed here to ../samples_valid and everything else to ../samples_train.
split_table = pd.read_csv("../data.csv")
is_not_train = split_table["train"] == False
train_valid = split_table.loc[is_not_train, "files"].tolist()

In [6]:
# The two learners whose cross-validated F1 scores end up in every exported
# file name: a seeded decision tree and a k-nearest-neighbours classifier
# with library defaults.
clf1, clf2 = DecisionTreeClassifier(random_state=seed), KNeighborsClassifier()

In [7]:
# Settings read by the export loop.
aug_size = 128          # number of resampled copies written per input file
percentage_valid = .1   # NOTE(review): not referenced by any visible cell
fold = 5                # number of cross-validation splits
sample_size = 256       # rows drawn for every resampled copy

# os.listdir returns entries in arbitrary order. The export loop shuffles
# columns with the global `random` generator, whose stream is consumed file
# after file, so the processing order changes the output. Sorting the listing
# makes a rerun reproduce the same files on any filesystem.
mb = master_bar(sorted(os.listdir(path)))

# Running total of exported sample files, incremented by the export loop.
count = 0

In [8]:
def _cv_f1_scores(clf, X, y, splits):
    """Fit ``clf`` on every training split and return one weighted F1 per test split.

    clf    -- estimator exposing ``fit`` and ``predict``; it is refit on each split.
    X, y   -- feature matrix and label vector, indexed positionally by the splits.
    splits -- iterable of ``(train_idx, test_idx)`` index-array pairs.
    """
    scores = []
    for train_idx, test_idx in splits:
        clf.fit(X[train_idx], y[train_idx])
        scores.append(f1_score(y[test_idx], clf.predict(X[test_idx]), average='weighted'))
    return scores


# The splitter is loop-invariant: with an integer random_state it produces the
# same folds on every call to split(), so a single instance is shared.
kfold = KFold(n_splits=fold, shuffle=True, random_state=seed)

for f in mb:
    mb.main_bar.comment = 'Files'
    data = pd.read_csv(os.path.join(path, f)).dropna()
    # Skip files with fewer than 150 complete rows or more than 256 columns.
    if data.shape[0] < 150 or data.shape[1] > 256:
        continue

    # The continuous-target test must look at the raw column: after the
    # binarisation below the labels are always integers, so testing the dtype
    # of `y` could never fire. Integer-valued floats (1.0, 2.0, ...) still
    # count as class labels; only fractional values mark a continuous target.
    raw_target = data["class"]
    is_continuous = (pd.api.types.is_float_dtype(raw_target)
                     and bool((raw_target % 1 != 0).any()))

    # Turn the task into "majority class vs. rest" (1 = majority class).
    majority_class = raw_target.value_counts().sort_values().index[-1]
    data["class"] = (raw_target == majority_class).astype(int)
    y = data["class"].values

    # bincount with minlength=2 reports a count of 0 for a class that never
    # occurs, so single-class files are rejected too (np.unique would simply
    # omit the missing class and let them through).
    class_counts = np.bincount(y, minlength=2)
    if is_continuous or (class_counts < fold).any():
        print(f"{f} has a continuous y value or too imbalanced.")
        continue

    X = robust_scale(data.drop(columns=["class"]).values)

    # Validation files go to their own folder; everything else is training data.
    out_dir = '../samples_valid' if f in train_valid else '../samples_train'

    for i in progress_bar(range(aug_size), parent=mb):
        # Stratified draw of `sample_size` rows; the seed varies with `i` so
        # every copy is a different, but reproducible, resample.
        Xsample, ysample = resample(X, y, n_samples=sample_size, random_state=seed+i, stratify=y)

        # Both classifiers are scored on exactly the same folds.
        splits = list(kfold.split(Xsample, ysample))
        scores1 = _cv_f1_scores(clf1, Xsample, ysample, splits)
        scores2 = _cv_f1_scores(clf2, Xsample, ysample, splits)

        # Randomly permute the feature columns, then append the label column.
        df_X = pd.DataFrame(Xsample)
        df_X = df_X[random.sample(df_X.columns.to_list(), len(df_X.columns))]
        df = pd.concat([df_X, pd.DataFrame({'class': ysample})], axis=1)

        # File name layout: <source file>_<mean F1 clf1>_<mean F1 clf2>_<copy index>.csv
        out_name = f'{f}_{np.mean(scores1):.5f}_{np.mean(scores2):.5f}_{i}.csv'
        df.to_csv(os.path.join(out_dir, out_name), index=False)

        mb.child.comment = 'Sampler'
        count += 1

In [9]:
# Report how many sample files the export loop wrote.
print(f"Number of datasets exported: {count}")

Number of datasets exported: 36992
