In [None]:
import json
import os
from copy import copy
from pathlib import Path

import pandas as pd
from openml import datasets
from openml.exceptions import OpenMLServerException
from tqdm import tqdm

In [None]:
# Root directory for raw data; overridable through the RAW_DATADIR environment
# variable, defaulting to "../data" relative to the working directory.
raw_data_root = Path(os.environ.get("RAW_DATADIR", "../data"))

# All OpenML artifacts written by this notebook live under <root>/openml.
base_path = raw_data_root / "openml"
base_path.mkdir(parents=True, exist_ok=True)

## Data Prep

In [None]:
# Dataset ids that are excluded from the catalog before any download happens.
# Datasets with server errors
server_error_ids = [4537, 4546, 4562, 40864, 41190, 41949]
# Datasets whose profiles are too large for MongoDB (> 16MB)
oversized_profile_ids = [41147, 42706, 42708, 44538, 44539, 44540, 44541, 44542]

# Fetch the full OpenML catalog and drop both groups of excluded ids at once.
full_catalog = datasets.list_datasets(output_format="dataframe")
is_excluded = full_catalog.did.isin(server_error_ids + oversized_profile_ids)
oml_catalog = full_catalog[~is_excluded]

# Persist the filtered catalog alongside the other OpenML artifacts.
oml_catalog.to_parquet(base_path / "oml_catalog.pq", index=False)

In [None]:
# Download the dataset description for every id in the catalog.
#
# Successful downloads are collected in `ds_list`. Any failure is recorded in
# `errors` as a `(did, exception type, exception args)` tuple instead of being
# raised, so one bad dataset cannot abort the whole run.
ds_list = []
errors = []

# Server error code for quality information not being available
NO_QUALITIES_CODE = 362

for did in tqdm(oml_catalog.did):
    try:
        try:
            ds = datasets.get_dataset(did, download_qualities=True)
        except OpenMLServerException as e:
            if e.code != NO_QUALITIES_CODE:
                raise
            # Qualities are missing on the server: retry without them. This
            # retry sits inside the outer `try`, so a failure here is recorded
            # like any other error rather than terminating the loop.
            ds = datasets.get_dataset(did, download_qualities=False)
    except Exception as e:
        errors.append((did, type(e), e.args))
    else:
        ds_list.append(ds)

# Report every failure once, followed by a one-line summary.
for did, e, args in errors:
    print(f"{did}: {e}\n{args}\n")

if errors:
    print(f"{len(errors)} of {len(oml_catalog)} datasets could not be downloaded")

## Converting to JSON

In [None]:
# Write one JSON profile per downloaded dataset to <base_path>/collection/<id>.json.
# The target directory is created once up front instead of once per dataset.
collection_path = base_path / "collection"
collection_path.mkdir(exist_ok=True)

for ds in ds_list:
    # NOTE(review): values are passed to json.dump unchanged; if `qualities`
    # can contain NaN, json.dump emits the non-standard token `NaN` -- confirm
    # that the downstream consumer accepts it.
    profile = {
        "dataset_id": ds.dataset_id,
        "name": ds.name,
        "version": ds.version,
        "description": ds.description,
        "creator": ds.creator,
        "contributor": ds.contributor,
        "collection_date": ds.collection_date,
        "upload_date": ds.upload_date,
        "language": ds.language,
        # The dataset attribute is spelled `licence`; the JSON key is "license".
        "license": ds.licence,
        "default_target_attribute": ds.default_target_attribute,
        "row_id_attribute": ds.row_id_attribute,
        "ignore_attribute": ds.ignore_attribute,
        "tags": ds.tag,
        # Only the feature objects are needed; the dict keys are not used.
        "features": [
            {
                "index": feature.index,
                "name": feature.name,
                "data_type": feature.data_type,
                "nominal_values": feature.nominal_values,
                "number_missing_values": feature.number_missing_values,
            }
            for feature in ds.features.values()
        ],
        "qualities": ds.qualities,
    }

    with (collection_path / f"{ds.dataset_id}.json").open("w") as file:
        json.dump(profile, file)

## Converting to Parquet in Tabular Format

In [None]:
# Flatten the dataset objects into four record lists, one per output table:
#   md_list -> one row per dataset (scalar metadata)
#   f_list  -> one row per (dataset, feature)
#   q_list  -> one row per (dataset, quality metric)
#   t_list  -> one row per (dataset, tag)
md_list = []
f_list = []
q_list = []
t_list = []

# Attributes removed from the dataset table.
dropped_attributes = (
    # The following attributes do not contain any data
    "update_comment",
    "_dataset",
    "data_pickle_file",
    "data_feather_file",
    "feather_attribute_file",
    # The following attributes always contain the same data
    "cache_format",
    "format",
    "visibility",
)

for ds in ds_list:
    # Shallow copy of the instance attribute dict, so that popping keys below
    # does not modify the dataset object itself.
    metadata = copy(vars(ds))

    # List or dict attributes go to separate tables for normalization
    features = metadata.pop("features", {})
    qualities = metadata.pop("qualities", {})
    tags = metadata.pop("tag", [])
    # Creators and contributors are removed from the dataset table; this cell
    # does not write them to any other table.
    creators = metadata.pop("creator", [])
    contributors = metadata.pop("contributor", [])
    ignore_attributes = metadata.pop("ignore_attribute", [])

    # Normalize the ignore list: missing/empty -> [], and a single entry is
    # split on "," because it may hold several comma-separated names.
    if not ignore_attributes:
        ignore_attributes = []
    elif len(ignore_attributes) == 1:
        ignore_attributes = ignore_attributes[0].split(",")

    # `features` may be present but None; treat that like "no features".
    for v in (features or {}).values():
        f = copy(vars(v))
        f["dataset_id"] = ds.dataset_id
        f["ignore"] = f["name"] in ignore_attributes
        f_list.append(f)

    if qualities is not None:
        for k, v in qualities.items():
            q_list.append({"dataset_id": ds.dataset_id, "metric": k, "value": v})

    if tags is not None:
        # Accumulate across datasets. Rebinding `t_list` here would discard the
        # tags of every previously processed dataset.
        t_list.extend({"dataset_id": ds.dataset_id, "tag": t} for t in tags)

    for attribute in dropped_attributes:
        metadata.pop(attribute, None)

    md_list.append(metadata)

dataset_df = pd.DataFrame(md_list)
feature_df = pd.DataFrame(f_list)
quality_df = pd.DataFrame(q_list)
tags_df = pd.DataFrame(t_list)

# Move `dataset_id` to the first column of the feature table.
col = feature_df.pop("dataset_id")
feature_df.insert(0, col.name, col)

In [None]:
# Persist each normalized table as a parquet file under `base_path`,
# without writing the DataFrame index.
tables_to_write = (
    (dataset_df, "datasets.pq"),
    (feature_df, "features.pq"),
    (quality_df, "metrics.pq"),
    (tags_df, "tags.pq"),
)
for frame, filename in tables_to_write:
    frame.to_parquet(base_path / filename, index=False)

## Notes

OpenML feature data types are one of `"nominal"`, `"numeric"`, `"string"`, or `"date"`; see [`data_feature.py` in openml-python](https://github.com/openml/openml-python/blob/develop/openml/datasets/data_feature.py#L23).

## Analysis

In [None]:
# Summarize the filtered catalog: columns, dtypes, non-null counts, memory usage.
oml_catalog.info()

In [None]:
# Summarize the per-dataset metadata table: columns, dtypes, non-null counts.
dataset_df.info()

In [None]:
# Summarize the per-feature table: columns, dtypes, non-null counts.
feature_df.info()

In [None]:
# Feature names matching ANY of these regexes are removed from the analysis.
# NOTE(review): `\d{0,7}` also matches zero digits, so the "Var" and "att_"
# patterns drop every name that merely contains "Var" / "att_" -- confirm that
# this is intended.
excluded_name_patterns = [
    "^oz[1-9][0-9]?[0-9]?$",
    "^V[0-9][0-9]?[0-9]?$",
    "^col_[0-9][0-9]?[0-9]?$",
    "AFFX-",
    r"Var\d{0,7}",
    r"att_\d{0,7}",
    r"^\d+$",
    r"\d{2,7}(?:_\w*)?_at$",
]
# A single alternation is equivalent to OR-ing the individual matches.
has_excluded_name = feature_df.name.str.contains("|".join(excluded_name_patterns))

# Ids of all datasets whose name does not start with "QSAR-TID".
kept_dataset_ids = dataset_df.loc[
    ~dataset_df.name.str.startswith("QSAR-TID"), "dataset_id"
]

# Right join: the result is restricted to the kept dataset ids.
# NOTE(review): a right join also keeps dataset ids that have no remaining
# feature rows (as rows with null feature columns) -- confirm this is wanted.
features_cleansed = feature_df[~has_excluded_name].merge(
    kept_dataset_ids,
    how="right",
    on="dataset_id",
)

In [None]:
# Number of rows in `features_cleansed` per feature name. `count` only counts
# non-null `dataset_id` values, and `groupby` skips rows whose name is null
# (pandas default `dropna=True`).
feature_count = features_cleansed.groupby("name")["dataset_id"].count()

In [None]:
# Export the name frequencies, most frequent first, with the value column
# labelled "count".
feature_count_desc = feature_count.sort_values(ascending=False)
feature_count_desc.to_csv(base_path / "feature_count.csv", header=["count"])

In [None]:
# Display the cleansed feature table for a visual check.
features_cleansed