In [1]:
import warnings

from openml import tasks, datasets
from typing import Callable
from pathlib import Path

import pandas as pd
import os

# Ignore every FutureWarning raised from this point on, so they do not clutter cell output.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
# Step up one directory so that the local `engine` package and relative paths such
# as 'data' resolve (presumably the project root — confirm the repository layout).
# The guard keeps the cell idempotent: an unconditional os.chdir('..') climbs one
# level higher every time the cell is re-run without restarting the kernel.
if not os.path.isdir('engine'):
    os.chdir('..')

In [3]:
from engine.preprocessing import eliminate_correlated_values, ensure_last_target, get_generic_preprocessing

In [4]:
# How many candidate tasks of each size group are previewed when choosing datasets.
SMALL_TASKS_CNT: int = 3
BIG_TASKS_CNT: int = 6

# Tasks with more features than this are put in the "big" group.
# NOTE: the identifier keeps its original spelling because the filtering cell references it.
FEATURES_TRESHOLD: int = 10


def _meets_additional_conditions(task: dict) -> bool:
    """Return True when a task's dataset is small enough and complete.

    Args:
        task: Task metadata mapping; must contain the keys
            'NumberOfInstances' and 'NumberOfMissingValues'.

    Returns:
        True if the dataset has fewer than 10,000 instances and no missing values.

    Raises:
        KeyError: If either required key is absent from ``task``.
    """
    return task['NumberOfInstances'] < 10_000 and task['NumberOfMissingValues'] == 0


# Public name used by the filtering cell; still a plain one-argument callable.
ADDITIONAL_CONDITIONS: Callable[[dict], bool] = _meets_additional_conditions

# Directory (relative to the working directory) where the processed CSV files are written.
DATA_PATH: Path = Path('data')

# Data filtering

In [5]:
# Metadata of every supervised-classification task returned by OpenML.
classification_tasks = tasks.list_tasks(task_type=tasks.TaskType.SUPERVISED_CLASSIFICATION)

small_tasks, big_tasks = [], []

for task in classification_tasks.values():
    # The checks short-circuit in this order: class count present, more rows
    # than features, the extra size/completeness conditions, exactly two classes.
    is_candidate = (
        task.get("NumberOfClasses") is not None
        and not task['NumberOfInstances'] <= task['NumberOfFeatures']
        and ADDITIONAL_CONDITIONS(task)
        and task["NumberOfClasses"] == 2
    )
    if not is_candidate:
        continue

    # Split the surviving binary tasks by feature count.
    bucket = big_tasks if task['NumberOfFeatures'] > FEATURES_TRESHOLD else small_tasks
    bucket.append(task)

In [6]:
# Hand-picked keys into `classification_tasks` for the small group
# (presumably selected after inspecting the candidates previewed below — confirm).
chosen_small_ids = [37, 3585, 3053]

# Keep the preview as the cell's last expression; placed before the assignment
# it was evaluated and silently discarded, so nothing was rendered.
small_tasks[:SMALL_TASKS_CNT]

In [21]:
# Hand-picked keys into `classification_tasks` for the big group
# (presumably selected after inspecting the candidates previewed below — confirm).
chosen_big_ids = [3, 31, 39, 3872, 52, 57]

# Keep the preview as the cell's last expression; placed before the assignment
# it was evaluated and silently discarded, so nothing was rendered.
big_tasks[:BIG_TASKS_CNT]

In [10]:
# Task-metadata keys that are carried over into the dataset summary table.
SELECTED_FEATURES = [
    'did',
    'name',
    'NumberOfFeatures',
    'NumberOfInstances',
    'NumberOfNumericFeatures',
    'NumberOfSymbolicFeatures',
]

In [22]:
# Collect one plain-dict record per chosen task and build the frame once.
# Growing a DataFrame with pd.concat inside a loop copies the accumulated frame on
# every iteration and leaves every row with index label 0; building from records
# yields the same columns and values with a clean 0..n-1 index.
summary_records = []

for size_label, task_ids in (('Small', chosen_small_ids), ('Big', chosen_big_ids)):
    for id_ in task_ids:
        # Keep only the metadata fields listed in SELECTED_FEATURES.
        row = {
            key: value
            for key, value in classification_tasks[id_].items()
            if key in SELECTED_FEATURES
        }
        # Tag the record with the size group it was chosen for.
        row['type'] = size_label
        summary_records.append(row)

summary = pd.DataFrame(summary_records)

In [28]:
# Column headers used in the exported summary table.
column_labels = {
    'did': 'ID',
    'NumberOfFeatures': '# Attributes',
    'NumberOfInstances': '# Row',
    'NumberOfNumericFeatures': '# Numeric Attributes',
    'NumberOfSymbolicFeatures': '# Categoric Attributes',
}

# Write a renamed copy to disk; `summary` itself keeps its original column names.
summary.rename(columns=column_labels).to_csv('data_info.csv', index=False)

# Data downloading

In [8]:
# Create one output directory per size group.
# parents=True also creates DATA_PATH itself when it does not exist yet (without it
# mkdir raises FileNotFoundError on a fresh checkout); exist_ok=True keeps the cell
# safe to re-run.
for subset in ('small', 'big'):
    (DATA_PATH / subset).mkdir(parents=True, exist_ok=True)

In [9]:
# Download, preprocess and export every chosen task.
# Both size groups go through exactly the same steps; only the printed banner, the
# output sub-directory and the list of task ids differ, so a single loop replaces
# the two copy-pasted ones.
for banner, subset, task_ids in (
    ("SMALL", 'small', chosen_small_ids),
    ("BIG", 'big', chosen_big_ids),
):
    print(banner)
    for id_ in task_ids:
        task = classification_tasks[id_]
        task_name = task['name']
        print(task)
        target_name = task["target_feature"]

        # get_data() returns a tuple; its first element is used as the data table.
        data = datasets.get_dataset(task['did']).get_data()[0]
        # Presumably moves the target column to the last position — confirm in engine.preprocessing.
        data = ensure_last_target(data, target_name)

        # Everything but the last column is treated as features, the last column as the target.
        X, y = data.iloc[:, :-1], data.iloc[:, -1]

        pipeline = get_generic_preprocessing()
        X = pipeline.fit_transform(X)
        # drop_first=True drops the first dummy level; for a two-class target this
        # leaves a single 0/1 indicator column.
        y = pd.get_dummies(y, drop_first=True).astype(int)
        X = eliminate_correlated_values(pd.DataFrame(data=X))

        df = pd.DataFrame(data=X)
        df['target'] = y

        # One CSV per task, named after the task, inside the group's sub-directory.
        df.to_csv(DATA_PATH / subset / f'{task_name}.csv', index=False)

SMALL
{'tid': 37, 'ttid': <TaskType.SUPERVISED_CLASSIFICATION: 1>, 'did': 37, 'name': 'diabetes', 'task_type': 'Supervised Classification', 'status': 'active', 'estimation_procedure': '10-fold Crossvalidation', 'source_data': '37', 'target_feature': 'class', 'MajorityClassSize': 500, 'MaxNominalAttDistinctValues': 2, 'MinorityClassSize': 268, 'NumberOfClasses': 2, 'NumberOfFeatures': 9, 'NumberOfInstances': 768, 'NumberOfInstancesWithMissingValues': 0, 'NumberOfMissingValues': 0, 'NumberOfNumericFeatures': 8, 'NumberOfSymbolicFeatures': 1}
Remaining variables:
Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='object')
{'tid': 3585, 'ttid': <TaskType.SUPERVISED_CLASSIFICATION: 1>, 'did': 719, 'name': 'veteran', 'task_type': 'Supervised Classification', 'status': 'active', 'estimation_procedure': '10-fold Crossvalidation', 'evaluation_measures': 'predictive_accuracy', 'source_data': '719', 'target_feature': 'binaryClass', 'MajorityClassSize': 94, 'MaxNominalAttDistinctValues': 4, 'MinorityClassSize

  vif = 1. / (1. - r_squared_i)


Remaining variables:
Index([ 0,  3,  4,  6,  7,  9, 10, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24,
       25, 26, 27, 28, 30, 32, 33, 34, 35, 38],
      dtype='object')
{'tid': 52, 'ttid': <TaskType.SUPERVISED_CLASSIFICATION: 1>, 'did': 53, 'name': 'heart-statlog', 'task_type': 'Supervised Classification', 'status': 'active', 'estimation_procedure': '10-fold Crossvalidation', 'evaluation_measures': 'predictive_accuracy', 'source_data': '53', 'target_feature': 'class', 'MajorityClassSize': 150, 'MaxNominalAttDistinctValues': 2, 'MinorityClassSize': 120, 'NumberOfClasses': 2, 'NumberOfFeatures': 14, 'NumberOfInstances': 270, 'NumberOfInstancesWithMissingValues': 0, 'NumberOfMissingValues': 0, 'NumberOfNumericFeatures': 13, 'NumberOfSymbolicFeatures': 1}
Remaining variables:
Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype='object')
{'tid': 57, 'ttid': <TaskType.SUPERVISED_CLASSIFICATION: 1>, 'did': 59, 'name': 'ionosphere', 'task_type': 'Supervised Classification', 'status': 'act

  return 1 - self.ssr/self.centered_tss


Remaining variables:
Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 15, 16, 17, 18,
       19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33],
      dtype='object')
