In [1]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from fastprogress.fastprogress import master_bar, progress_bar
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample

In [2]:
# Single source of randomness: seeds the stdlib RNG here and is reused as the
# sklearn `random_state` by the cells below.
seed = 42
random.seed(a=seed)

## TODO
- save dataframes with {name}\_{mean_score}\_{std_score}.csv
- initially test only on the classical suite, then extend to all classification datasets
- noise injection for "data augmentation"
- explore other data augmentation
- one-hot encoding (as applied in openml) could be a terrible option; try target encoding or another method
- save robust scaler to disk
- check if scaler is not biased
- check if target encoder is not biased
- shuffle all columns except class
- get dummies drop dependent?

In [3]:
# Folder holding the input dataset CSVs.
path = '../data/'

# Destination folders for the exported samples; exist_ok keeps this cell re-runnable.
for out_dir in ('../samples_train', '../samples_valid'):
    os.makedirs(out_dir, exist_ok=True)

In [4]:
# Classifier whose mean cross-validated accuracy is written into each exported
# sample's filename by the export loop; seeded with the notebook-wide seed.
clf1 = RandomForestClassifier(random_state=seed)

In [5]:
# Export loop: for every dataset CSV in `path`, draw `aug_size` bootstrap samples,
# score each sample with 5-fold stratified cross-validation of `clf1`, and write the
# sample (features + one-hot encoded class) to the train or validation folder with
# the mean accuracy embedded in the filename.
#
# sorted(): os.listdir returns entries in arbitrary order, and the train/valid draw
# below consumes the seeded `random` stream in iteration order, so an unsorted
# listing makes the split differ between machines.
mb = master_bar(sorted(os.listdir(path)))
count = 0  # number of sample files written

aug_size = 10          # bootstrap samples exported per dataset
percentage_valid = .1  # probability that a sample lands in the validation folder

for f in mb:
    mb.main_bar.comment = f'Files'
    data = pd.read_csv(os.path.join(path, f)).dropna()
    # Skip datasets with fewer than 150 complete rows.
    if data.shape[0] < 150:
        continue

    X = data.drop(columns=["class"]).values
    y = data["class"].values
    # A float target is treated as a regression problem and skipped.
    if y.dtype == float:
        print(f"{f} has a continuous y value.")
        continue
    for i in progress_bar(range(aug_size), parent=mb):
        # Bootstrap (with replacement) a fixed-size sample; seed+i makes each of
        # the aug_size draws distinct but reproducible.
        Xsample, ysample = resample(X, y, n_samples=128, random_state=seed+i)
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        scores = []
        for train_idx, test_idx in kfold.split(Xsample, ysample):
            X_train, y_train = Xsample[train_idx], ysample[train_idx]
            X_test, y_test = Xsample[test_idx], ysample[test_idx]
            clf1.fit(X_train, y_train)
            scores.append(clf1.score(X_test, y_test))

        df = pd.DataFrame(Xsample)
        # DataFrame.join returns a new frame; the result must be kept, otherwise
        # the class columns are silently dropped from the exported CSV.
        df = df.join(pd.get_dummies(ysample, prefix="class_"))
        # Roughly `percentage_valid` of the samples go to validation, the rest to train.
        if random.random() < percentage_valid:
            df.to_csv('../samples_valid/'+f'{f}_{np.mean(scores):.3f}_{i}.csv', index=False)
        else:
            df.to_csv('../samples_train/'+f'{f}_{np.mean(scores):.3f}_{i}.csv', index=False)
        mb.child.comment = f'Sampler'
        count += 1
    #mb.write(f'Finished {f}')







1029_LEV.csv has a continuous y value.
504_analcatdata_supreme.csv has a continuous y value.










298_coil2000.csv has a continuous y value.
182_satimage.csv has a continuous y value.




294_satellite_image.csv has a continuous y value.




1099_EgyptianSkulls.csv has a continuous y value.




1028_SWD.csv has a continuous y value.






1030_ERA.csv has a continuous y value.


In [6]:
# Debug check: `y` here is whatever the export loop left behind from the last
# dataset it loaded past the row-count filter, so this cell depends on that loop having run.
y.dtype == float

False

In [7]:
# Report how many sample files the export loop wrote.
print(f"Number of datasets exported: {count}")

Number of datasets exported: 2930
