# Dataset preprocessing

In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_categorical
import json

class EmpiricalDistribution:
    def __init__(self, dtype=None, values=None, prob=None):
        self.dtype = dtype
        self.values = values
        self.prob = prob
    
    def sample(self, size):
        return np.random.choice(self.values, size=size, p=self.prob)

    def __str__(self):
        return f'dtype: {self.dtype}\nvalues: {self.values}\nprob: {self.prob}'

    @staticmethod    
    def from_series(pd_series):
        ed = EmpiricalDistribution()
        counts = pd_series.value_counts().to_numpy()
        if is_categorical(pd_series):
            ed.dtype = 'categorical'
            ed.values = list(range(len(counts)))
        elif is_numeric_dtype(pd_series):
            ed.dtype = 'numeric'
            nbins = min(20, len(counts))
            cut, bins = pd.qcut(pd_series, nbins, retbins=True, duplicates='drop')
            counts = cut.value_counts().sort_index().to_numpy()
            ed.values = ((bins[1:] + bins[:-1]) / 2).tolist()
        else:
            raise TypeError('Only categorical or numeric pandas series supported')
        ed.prob = (counts / counts.sum()).tolist()
        return ed
    
    @staticmethod
    def from_dict(d):
        return EmpiricalDistribution(**d)
    
def save_json(filename, obj, indent=None):
    with open(filename, 'w') as f:
        json.dump(obj, f, indent=indent)


## Investigating the base level distribution in the expertise space off OpenML datasets
* number of instance and number of features is not NaN
* number of instances < 1000
* number of features < 1000
* number of classes = 2
* number of missing values = 0

Without restricting number of instances and features, there are 499 binary classification problems

In [2]:
from openml import datasets

ds = datasets.list_datasets(output_format='dataframe').drop_duplicates(subset='name')
ds = ds[(ds.NumberOfFeatures < 1000) & (ds.NumberOfInstances < 1000) & (ds.NumberOfClasses == 2) & (ds.NumberOfMissingValues == 0)].dropna(subset=['NumberOfFeatures', 'NumberOfInstances'])

ds

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
37,37,diabetes,1,1,active,ARFF,500.0,2.0,268.0,2.0,9.0,768.0,0.0,0.0,8.0,1.0
40,40,sonar,1,1,active,ARFF,111.0,2.0,97.0,2.0,61.0,208.0,0.0,0.0,60.0,1.0
43,43,haberman,1,1,active,ARFF,225.0,12.0,81.0,2.0,4.0,306.0,0.0,0.0,2.0,2.0
50,50,tic-tac-toe,1,1,active,ARFF,626.0,3.0,332.0,2.0,10.0,958.0,0.0,0.0,0.0,10.0
53,53,heart-statlog,1,1,active,ARFF,150.0,2.0,120.0,2.0,14.0,270.0,0.0,0.0,13.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41521,41521,Weather,1,2,active,arff,9.0,3.0,5.0,2.0,5.0,14.0,0.0,0.0,2.0,3.0
41538,41538,conference_attendance,1,8263,active,ARFF,215.0,7.0,31.0,2.0,7.0,246.0,0.0,0.0,0.0,7.0
41976,41976,TuningSVMs,1,64,active,ARFF,102.0,2.0,54.0,2.0,81.0,156.0,0.0,0.0,80.0,1.0
42554,42554,test_hate_vects,1,11939,active,ARFF,3.0,,1.0,2.0,50.0,4.0,0.0,0.0,48.0,2.0


In [3]:
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, minmax_scale

for id in ds.index:
    ohe = OneHotEncoder(sparse=False)
    le = LabelEncoder()

    dataset = fetch_openml(data_id=id)

    categorical = dataset.data.select_dtypes(include=['category', 'object']).to_numpy()
    numeric = dataset.data.select_dtypes(include='number').to_numpy()

    if categorical.size > 0:
        categorical = ohe.fit_transform(categorical)
    if numeric.size > 0:
        numeric = minmax_scale(numeric)
    
    if categorical.size > 0 and numeric.size > 0:
        X = np.concatenate((numeric, categorical), axis=1)
    else:
        X = categorical if categorical.size > 0 else numeric

    targets = dataset.target.to_numpy()
    if len(targets.shape) == 1:
        y = le.fit_transform(targets)
        np.save(f'{id}data.npy', X)
        np.save(f'{id}label.npy', y)
    elif len(targets.shape) == 2:
        for i in range(targets.shape[1]):
            y = le.fit_transform(targets[:, i])
            np.save(f'{id}-{i}data.npy', X)
            np.save(f'{id}-{i}label.npy', y)
    else:
        print(f"OpenML dataset {id} targets has shape {targets.shape}")

## Compiling the empirical distributions

Take all active datasets from openml.org that satisfy:
* number of instance and number of features is not NaN
* number of instances < 1000
* number of features < 1000

Ignore columns of data that contain empty values

In [13]:
from IPython.display import clear_output
from sklearn.datasets import fetch_openml
from openml import datasets

ds = datasets.list_datasets(output_format='dataframe').drop_duplicates(subset='name')
ds = ds[(ds.NumberOfFeatures < 1000) & (ds.NumberOfInstances < 1000)].dropna()

# Check for NaN
# np_list[np_list.isnull().any(axis=1)]

distr_list = []
split = 100
count = 0
i = 1
for id in ds.index:
    clear_output(wait=True)
    print(f'{i}/{len(ds)}')
    i += 1
    print(id)

    # Skip api errors
    try:
        dataset = fetch_openml(data_id=id)
    except:
        continue
    
    df = dataset.data
    # Skip non-dataframes
    if not isinstance(df, pd.DataFrame):
        continue

    for col in df:
        # Skip features that have NaN values in them
        if df[col].isnull().any():
            continue
        try:
            f = EmpiricalDistribution.from_series(df[col])
        except TypeError:
            continue
        distr_list.append(vars(f))
        if len(distr_list) >= split:
            save_json(f'{count}.json', distr_list)
            distr_list = []
            count += 1


save_json(f'{count}.json', distr_list)


212/212
42186


In [6]:
ds = datasets.list_datasets(output_format='dataframe').drop_duplicates(subset='name')
ds = ds[(ds.NumberOfFeatures < 1000) & (ds.NumberOfInstances < 1000)].dropna()
ds.mean()

did                                   7505.325472
version                                  1.084906
uploader                                      inf
MajorityClassSize                      164.188679
MaxNominalAttDistinctValues             10.495283
MinorityClassSize                       61.660377
NumberOfClasses                          4.056604
NumberOfFeatures                        30.825472
NumberOfInstances                      294.551887
NumberOfInstancesWithMissingValues      27.240566
NumberOfMissingValues                  178.051887
NumberOfNumericFeatures                 22.056604
NumberOfSymbolicFeatures                 8.716981
dtype: float64

In [8]:
ds.mode(numeric_only=True)

Unnamed: 0,did,version,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
0,2,1.0,50.0,2.0,1.0,2.0,10.0,500.0,0.0,0.0,0.0,1.0
1,4,,,,,,,,,,,
2,5,,,,,,,,,,,
3,7,,,,,,,,,,,
4,9,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
207,41981,,,,,,,,,,,
208,42167,,,,,,,,,,,
209,42169,,,,,,,,,,,
210,42172,,,,,,,,,,,


# Dataset construction from empirical distributions

In [2]:
from sklearn.neighbors import NearestNeighbors

def get_np(choice):
    if choice == 0:
        return 300, 30
    if choice == 1:
        return 500, 10
    if choice == 2:
        n = np.random.choice([2 ** n for n in range(5, 10)])
        p = np.random.choice([2 ** n for n in range(1, 10)])
        return n, p

def get_c(choice):
    if choice == 0:
        return 10
    if choice == 1:
        return np.random.choice(range(2, 11))

def generate_labels(X, c):
    n, p = X.shape
    centres0 = np.random.rand(c // 2, p)
    centres1 = np.random.rand(c - c // 2, p)
    
    nn = NearestNeighbors(n_neighbors=1)
    
    nn.fit(centres0)
    dist0 = nn.kneighbors(X)[0]
    nn.fit(centres1)
    dist1 = nn.kneighbors(X)[0]

    dist = np.concatenate((dist0, dist1), axis=1)
    probs = dist / dist.sum(axis=1, keepdims=True)
    probs = 1 - probs
    
    labels = []
    for i in range(n):
        labels.append(np.random.choice(2, p=probs[i,:]))
    return np.asarray(labels)

In [3]:
import json
import numpy as np
from sklearn.preprocessing import OneHotEncoder, minmax_scale

def load_json(filename):
    with open(filename) as f:
        data = json.load(f)
        return data

split = 100
folder = 'data/distributions'

np_choices = [0, 1, 2]
c_choices = [0, 1]

for p in np_choices:
    for c in c_choices:
        save_folder = f'data/datasets_artificial/{p}{c}'
        for i in range(100):
            n_samples, n_features = get_np(p)
            n_clusters = get_c(c)

            distr_ids = np.random.randint(0, 5800, size=n_features)
            features = []
            for id in distr_ids:
                filename = id // split
                remainder = id % split
                d = load_json(f'{folder}/{filename}.json')[remainder]
                ed = EmpiricalDistribution.from_dict(d)
                new_feature = ed.sample(n_samples)[:, np.newaxis]
                if ed.dtype == 'categorical':
                    enc = OneHotEncoder(categories=[ed.values])
                    new_feature = enc.fit_transform(new_feature).toarray()
                else: # dtype == 'numeric'
                    new_feature = minmax_scale(new_feature)

                features.append(new_feature)

            dataset = np.concatenate(features, axis=1)
            labels = generate_labels(dataset, n_clusters)
            np.save(f'{save_folder}/{p}-{c}-{i}data.npy', dataset)
            np.save(f'{save_folder}/{p}-{c}-{i}label.npy', labels)
