In [1]:
import json
from io import BytesIO

import numpy as np
import pandas as pd
from requests import get
from scipy.stats import kurtosis, skew, entropy
%load_ext autotime
%matplotlib inline  

In [None]:
"""
Workflow:
- Functions to compute the dataset meta-features described in the paper "Efficient and Robust Automated Machine Learning" (Feurer et al., 2015)
- Get dataset features using paper features
- Get dataset features using OpenML features
- Combine features from both methods
"""

In [3]:
# Read in necessary data: the cached OpenML JSON dumps used throughout the notebook
def _read_json(path):
    """Parse the JSON file at ``path`` and return the decoded object."""
    with open(path) as handle:
        return json.load(handle)


sklearn_task_descriptions = _read_json('OpenML_Data/sklearn_task_descriptions.json')
sklearn_data_set_descriptions = _read_json('OpenML_Data/sklearn_data_set_descriptions.json')
sklearn_data_set_qualities = _read_json('OpenML_Data/sklearn_data_set_qualities.json')
sklearn_data_set_features = _read_json('OpenML_Data/sklearn_data_set_features.json')

time: 4.38 s


In [4]:
def _summary_stats(values):
    """Return ``(min, max, mean, std)`` of ``values``, or all zeros when ``values`` is empty."""
    if len(values) == 0:
        return 0, 0, 0, 0
    return min(values), max(values), np.mean(values), np.std(values)


def get_features(X,y,categorical):
    """
    Compute the 37 dataset meta-features described in the paper.

    Parameters
    ----------
    X : pandas.DataFrame
        The predictor variables (one column per feature).
    y : pandas.Series or array-like with ``.shape``
        The response variable.
        NOTE(review): a 2-D ``y`` is only handled for ``NumberOfClasses``;
        the class-probability features assume a 1-D response -- confirm.
    categorical : sequence of 0/1
        ``categorical[i]`` is truthy when the i-th column of ``X`` is
        categorical and falsy when it is numeric.

    Returns
    -------
    pandas.Series
        The meta-feature values, indexed by meta-feature name.

    Notes
    -----
    Only cells equal to the string ``" ?"`` are counted as missing.
    NOTE(review): NaN cells are not counted -- confirm this is intended.
    """
    MISSING_MARKER = " ?"

    # --- size of the dataset ---
    NumberOfInstances = float(X.shape[0])
    LogNumberOfInstances = np.log(NumberOfInstances)

    if len(y.shape) == 2:
        NumberOfClasses = np.mean([len(np.unique(y[:,i])) for i in range(y.shape[1])])
    else:
        NumberOfClasses = float(len(np.unique(y)))

    NumberOfFeatures = float(X.shape[1])
    LogNumberOfFeatures = np.log(NumberOfFeatures)

    # --- missing values ---
    # Inspect the X argument itself (not a notebook-level global) so the
    # function is self-contained and the target column is never scanned.
    missing_mask = X.isin([MISSING_MARKER])
    NumberOfMissingValues = int(missing_mask.values.sum())
    NumberOfInstancesWithMissingValues = int(missing_mask.any(axis=1).sum())
    NumberOfFeaturesWithMissingValues = int(missing_mask.any(axis=0).sum())

    PercentageOfInstancesWithMissingValues = NumberOfInstancesWithMissingValues / NumberOfInstances
    # Share of *features* with a missing value, so the denominator is the feature count.
    PercentageOfFeaturesWithMissingValues = NumberOfFeaturesWithMissingValues / NumberOfFeatures
    # Float denominator keeps this a true division on every Python version.
    PercentageOfMissingValues = NumberOfMissingValues / (NumberOfInstances * NumberOfFeatures)

    # --- feature types ---
    NumberOfCategoricalFeatures = np.sum(categorical)
    NumberOfNumericFeatures = len(categorical) - NumberOfCategoricalFeatures

    if NumberOfCategoricalFeatures == 0.0:
        RatioNumericalToNominal = 0.0
    else:
        RatioNumericalToNominal = NumberOfNumericFeatures / NumberOfCategoricalFeatures

    if NumberOfNumericFeatures == 0.0:
        RatioNominalToNumerical = 0.0
    else:
        RatioNominalToNumerical = NumberOfCategoricalFeatures / NumberOfNumericFeatures

    DatasetRatio = NumberOfFeatures / NumberOfInstances
    LogDatasetRatio = np.log(DatasetRatio)
    InverseDatasetRatio = NumberOfInstances / NumberOfFeatures
    LogInverseDatasetRatio = np.log(InverseDatasetRatio)

    # --- class distribution ---
    # Classes are identified by their string representation; the same counts
    # feed the probability statistics and the entropy.
    class_counts = {}
    for value in y:
        key = str(value)
        class_counts[key] = class_counts.get(key, 0) + 1

    occurences = np.array(list(class_counts.values()), dtype=np.float64)
    class_probabilities = occurences / y.shape[0]
    ClassProbabilityMin = float(class_probabilities.min())
    ClassProbabilityMax = float(class_probabilities.max())
    ClassProbabilityMean = class_probabilities.mean()
    ClassProbabilitySTD = class_probabilities.std()
    ClassEntropy = entropy(occurences, base=2)

    # --- per-column statistics ---
    symbols_per_column = []   # number of distinct values of each categorical column
    kurtosisses = []          # kurtosis of each numeric column
    skewnesses = []           # skewness of each numeric column
    for col_num, column in enumerate(X):
        if categorical[col_num]:
            symbols_per_column.append(len(np.unique(X[column])))
        else:
            kurtosisses.append(kurtosis(X[column]))
            skewnesses.append(skew(X[column]))

    # Categorical and numeric summaries are guarded independently: a dataset
    # without categorical columns still gets real kurtosis/skewness values,
    # and a dataset without numeric columns yields zeros instead of failing.
    SymbolsMin, SymbolsMax, SymbolsMean, SymbolsSTD = _summary_stats(symbols_per_column)
    SymbolsSum = sum(symbols_per_column)
    KurtosisMin, KurtosisMax, KurtosisMean, KurtosisSTD = _summary_stats(kurtosisses)
    SkewnessMin, SkewnessMax, SkewnessMean, SkewnessSTD = _summary_stats(skewnesses)

    # Names and values are kept side by side so they cannot drift out of order.
    named_values = [
        ("NumberOfInstances", NumberOfInstances),
        ("LogNumberOfInstances", LogNumberOfInstances),
        ("NumberOfClasses", NumberOfClasses),
        ("NumberOfFeatures", NumberOfFeatures),
        ("LogNumberOfFeatures", LogNumberOfFeatures),
        ("NumberOfInstancesWithMissingValues", NumberOfInstancesWithMissingValues),
        ("PercentageOfInstancesWithMissingValues", PercentageOfInstancesWithMissingValues),
        ("NumberOfFeaturesWithMissingValues", NumberOfFeaturesWithMissingValues),
        ("PercentageOfFeaturesWithMissingValues", PercentageOfFeaturesWithMissingValues),
        ("NumberOfMissingValues", NumberOfMissingValues),
        ("PercentageOfMissingValues", PercentageOfMissingValues),
        ("NumberOfNumericFeatures", NumberOfNumericFeatures),
        ("NumberOfCategoricalFeatures", NumberOfCategoricalFeatures),
        ("RatioNumericalToNominal", RatioNumericalToNominal),
        ("RatioNominalToNumerical", RatioNominalToNumerical),
        ("DatasetRatio", DatasetRatio),
        ("LogDatasetRatio", LogDatasetRatio),
        ("InverseDatasetRatio", InverseDatasetRatio),
        ("LogInverseDatasetRatio", LogInverseDatasetRatio),
        ("ClassProbabilityMin", ClassProbabilityMin),
        ("ClassProbabilityMax", ClassProbabilityMax),
        ("ClassProbabilityMean", ClassProbabilityMean),
        ("ClassProbabilitySTD", ClassProbabilitySTD),
        ("SymbolsMin", SymbolsMin),
        ("SymbolsMax", SymbolsMax),
        ("SymbolsMean", SymbolsMean),
        ("SymbolsSTD", SymbolsSTD),
        ("SymbolsSum", SymbolsSum),
        ("KurtosisMin", KurtosisMin),
        ("KurtosisMax", KurtosisMax),
        ("KurtosisMean", KurtosisMean),
        ("KurtosisSTD", KurtosisSTD),
        ("SkewnessMin", SkewnessMin),
        ("SkewnessMax", SkewnessMax),
        ("SkewnessMean", SkewnessMean),
        ("SkewnessSTD", SkewnessSTD),
        ("ClassEntropy", ClassEntropy),
    ]
    return pd.Series([value for _, value in named_values],
                     index=[name for name, _ in named_values])



time: 412 ms


In [5]:
def get_data_base(features,data):
    """
    Split an OpenML dataset into the inputs expected by ``get_features``.

    Parameters
    ----------
    features : dict
        Feature description of the dataset; ``features['data_features']['feature']``
        is iterated and each entry is read for ``'data_type'``, ``'is_target'``
        and ``'index'``.
    data : pandas.DataFrame
        The full dataset, including the target column.

    Returns
    -------
    X : pandas.DataFrame
        ``data`` without the target column.
    y : pandas.Series
        The target column.
    categorical : list of int
        One flag per column of ``X`` (0 = numeric, 1 = anything else). The
        target column's flag is excluded so the list lines up with ``X``.

    Raises
    ------
    ValueError
        If no feature is flagged with ``is_target == 'true'``.
    """
    flags_by_index = []
    target_index = None
    for position, column in enumerate(features['data_features']['feature']):
        # Fall back to the list position if an entry carries no explicit index.
        column_index = int(column.get('index', position))
        flag = 0 if column['data_type'] == 'numeric' else 1
        flags_by_index.append((column_index, flag))
        if column['is_target'] == 'true':
            # When several features are flagged, the last one wins.
            target_index = column_index

    if target_index is None:
        raise ValueError("no feature is flagged as the target (is_target == 'true')")

    X = data.drop(data.columns[target_index], axis=1)
    y = data.iloc[:, target_index]
    # Drop the target's own flag: X no longer contains that column.
    categorical = [flag for column_index, flag in flags_by_index
                   if column_index != target_index]
    return X,y,categorical

time: 7.07 ms


In [6]:
# Get all sklearn dataset ids and file ids.
# More than one task may reference the same data set, so each id is recorded
# once (first-seen order); this avoids repeated downloads and duplicate rows
# wherever data_set_ids is iterated.
data_set_ids = []
data_set_file_ids = {}
for task_description in sklearn_task_descriptions.values():
    data_set_id = task_description['task']['input'][0]['data_set']['data_set_id']
    if data_set_id in data_set_file_ids:
        continue
    data_set_file_ids[data_set_id] = (
        sklearn_data_set_descriptions[data_set_id]['data_set_description']['file_id']
    )
    data_set_ids.append(data_set_id)

time: 13.5 ms


In [7]:
# Feature names: the meta-feature labels, in the order used for the columns
# of the per-dataset feature table.
col_names = [
    "NumberOfInstances", "LogNumberOfInstances", "NumberOfClasses",
    "NumberOfFeatures", "LogNumberOfFeatures",
    "NumberOfInstancesWithMissingValues", "PercentageOfInstancesWithMissingValues",
    "NumberOfFeaturesWithMissingValues", "PercentageOfFeaturesWithMissingValues",
    "NumberOfMissingValues", "PercentageOfMissingValues",
    "NumberOfNumericFeatures", "NumberOfCategoricalFeatures",
    "RatioNumericalToNominal", "RatioNominalToNumerical",
    "DatasetRatio", "LogDatasetRatio",
    "InverseDatasetRatio", "LogInverseDatasetRatio",
    "ClassProbabilityMin", "ClassProbabilityMax",
    "ClassProbabilityMean", "ClassProbabilitySTD",
    "SymbolsMin", "SymbolsMax", "SymbolsMean", "SymbolsSTD", "SymbolsSum",
    "KurtosisMin", "KurtosisMax", "KurtosisMean", "KurtosisSTD",
    "SkewnessMin", "SkewnessMax", "SkewnessMean", "SkewnessSTD",
    "ClassEntropy",
]

time: 2.87 ms


In [4]:
# Get the dataset features for all OpenML sklearn datasets (1 Hour)
MAX_CELLS = 6900000  # data sets with instances * features at or above this are skipped

error_data_sets = []   # ids of the data sets that could not be processed
error_details = {}     # data set id -> repr of the exception, for later inspection
feature_rows = []      # one pd.Series of meta-features per processed data set
for data_set_id in data_set_ids:
    try:
        # Reset per data set so a missing quality cannot silently reuse the
        # values left over from the previous iteration.
        instances = None
        n_features = None
        for quality in sklearn_data_set_qualities[data_set_id]['data_qualities']['quality']:
            if quality['name'] == 'NumberOfInstances':
                instances = int(float(quality['value']))
            if quality['name'] == 'NumberOfFeatures':
                n_features = int(float(quality['value']))
        if instances is None or n_features is None:
            raise ValueError("NumberOfInstances / NumberOfFeatures quality not found")

        if instances * n_features < MAX_CELLS:
            url = 'https://www.openml.org/data/get_csv/' + str(data_set_file_ids[data_set_id])
            response = get(url, timeout=60)
            # Fail loudly on HTTP errors rather than parsing an error page as CSV.
            response.raise_for_status()

            # Parse straight from memory; no scratch file on disk is needed.
            data = pd.read_csv(BytesIO(response.content))

            feature_descriptions = sklearn_data_set_features[data_set_id]

            X,y,categorical = get_data_base(feature_descriptions,data)

            data_set_features = get_features(X,y,categorical)
            data_set_features.name = data_set_id
            feature_rows.append(data_set_features)

    except Exception as exc:
        # Best effort: remember the failure and continue with the next data set.
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupts through.
        error_data_sets.append(data_set_id)
        error_details[data_set_id] = repr(exc)

# Build the frame once; each Series' name (the data set id) becomes its row label.
df_data_sets = pd.DataFrame(feature_rows, columns=col_names)

# df_data_sets.to_csv("OpenML_Data/paper_data_set_features.csv")

In [50]:
# Get all the features given by OpenML
# One {quality name: value} record per data set; the frame is built in a
# single call instead of growing Series/DataFrames with .append inside the
# loops (quadratic, and removed in pandas 2.0).
df_indices = [str(data_set_id) for data_set_id in df_data_sets.index]
quality_records = [
    {
        quality['name']: quality['value']
        for quality in sklearn_data_set_qualities[data_set_id]['data_qualities']['quality']
    }
    for data_set_id in df_indices
]
# Rows are labelled by data set id; qualities absent for a data set become NaN.
openml_features = pd.DataFrame(quality_records, index=df_indices)

#openml_features.to_csv("OpenML_Data/openml_data_set_features.csv")

time: 1min 3s


In [60]:
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    The order of ``lst1`` is preserved and repeated items are kept.
    """
    shared = []
    for candidate in lst1:
        if candidate in lst2:
            shared.append(candidate)
    return shared

time: 10.2 ms


In [68]:
# Get the combined features from OpenML and the paper
# dropna returns a new frame (it does not modify in place), so keep the
# result: only OpenML qualities available for every data set are retained.
openml_features_complete = openml_features.dropna(axis=1, how='any')
# For qualities computed by both sources, the paper-based columns are kept.
same_cols = intersection(list(df_data_sets.columns), list(openml_features_complete.columns))
combined_data_set_features = pd.concat(
    [df_data_sets, openml_features_complete.drop(same_cols, axis=1)],
    axis=1,
)
# combined_data_set_features.to_csv("OpenML_Data/combined_data_set_features.csv")

time: 19.6 ms


time: 2.14 ms
