In [2]:
import openml
import pandas as pd

# Fetch the OpenML dataset listing once and reuse it for both the sanity
# print and the id list, instead of requesting the listing a second time.
# NOTE(review): the [531] lookup and .keys() call assume list_datasets()
# returns a dict keyed by dataset id — confirm for the installed openml version.
datasets = openml.datasets.list_datasets()
print(datasets[531])

all_ids = list(datasets.keys())
print("Number of datasets: ", len(all_ids))

# Ids present in the listing that are not actual datasets.
NON_DATASET_IDS = {4537, 4546, 4562}

# Filter rather than list.remove(): remove() raises ValueError as soon as one
# of these ids is no longer part of the listing.
ids = [dataset_id for dataset_id in all_ids if dataset_id not in NON_DATASET_IDS]

# Fetch the dataset objects without downloading the data files or qualities.
# From here on `datasets` holds the get_datasets() result, not the listing.
datasets = openml.datasets.get_datasets(ids, download_data=False, download_qualities=False)

In [3]:
# One row per dataset, with one column per attribute stored on the object.
records = [vars(dataset) for dataset in datasets]
df = pd.DataFrame(records)
# openml_url is read from each dataset separately and added as its own column.
df['openml_url'] = [dataset.openml_url for dataset in datasets]

In [4]:
# Keep only the datasets that declare an original_data_url.
has_source_url = df['original_data_url'].notna()
df = df[has_source_url]
df

In [5]:
# Narrow the frame to the dataset id and its original data URL.
# .copy() makes the result an independent frame, so later in-place writes
# such as df.loc[mask, col] = value do not raise SettingWithCopyWarning.
df = df[['dataset_id', 'original_data_url']].copy()
df

In [6]:
# Fix broken links to archive.ics.uci.edu.
# Maps each outdated or malformed original_data_url (matched as a whole value)
# to the URL that should replace it. Some keys intentionally end with a
# trailing '+' because that is how the value is matched in the data.
# NOTE(review): the Census-Income (KDD) and CNNpred targets both use
# /dataset/372/ — one of the two ids is presumably a copy-paste slip; verify.
UCI_URL_FIXES = {
    'https://archive.ics.uci.edu/ml/datasets/Chess+(King-Rook+vs.+King-Pawn)':
        'https://archive.ics.uci.edu/dataset/22/chess+king+rook+vs+king+pawn',
    'https://archive.ics.uci.edu/ml/datasets/Molecular+Biology+(Splice-junction+Gene+Sequences)':
        'https://archive.ics.uci.edu/dataset/39/molecular+biology+splice+junction+gene+sequences',
    'https://archive.ics.uci.edu/ml/datasets/Tic-Tac-Toe+Endgame':
        'https://archive.ics.uci.edu/dataset/29/tic+tac+toe+endgame',
    'https://archive.ics.uci.edu/ml/datasets/Connectionist+Bench+(Vowel+Recognition+-+Deterding+Data)':
        'https://archive.ics.uci.edu/dataset/176/connectionist+bench+vowel+recognition+deterding+data',
    'http://archive.ics.uci.edu/ml/datasets/Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions':
        'https://archive.ics.uci.edu/dataset/286/smartphone+based+recognition+of+human+activities+and+postural+transitions',
    # Two comma-separated URLs in one field: keep only the UCI one.
    'http://sci2s.ugr.es/keel/dataset.php?cod=63, https://archive.ics.uci.edu/ml/datasets/LED+Display+Domain':
        'https://archive.ics.uci.edu/ml/datasets/LED+Display+Domain',
    'https://archive.ics.uci.edu/ml/datasets/Census-Income+%28KDD%29':
        'https://archive.ics.uci.edu/dataset/372/census+income+kdd',
    'https://archive.ics.uci.edu/ml/datasets/Beijing+PM2.5+Data':
        'https://archive.ics.uci.edu/dataset/435/beijing+pm2.5+data',
    'https://archive.ics.uci.edu/ml/datasets/Estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition+':
        'https://archive.ics.uci.edu/dataset/417/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition',
    'https://archive.ics.uci.edu/ml/datasets/CNNpred%3A+CNN-based+stock+market+prediction+using+a+diverse+set+of+variables':
        'https://archive.ics.uci.edu/dataset/372/cnnpred+cnn+based+stock+market+prediction+using+a+diverse+set+of+variables',
    'https://archive.ics.uci.edu/ml/datasets/haberman\'s+survival':
        'https://archive.ics.uci.edu/dataset/43/haberman+s+survival',
    'https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/':
        'https://archive.ics.uci.edu/dataset/102/thyroid+disease',
    'https://archive.ics.uci.edu/ml/datasets/Post-Operative+Patient':
        'https://archive.ics.uci.edu/dataset/82/post+operative+patient',
    'https://archive.ics.uci.edu/ml/datasets/MONK\'s+Problems':
        'https://archive.ics.uci.edu/dataset/70/monk+s+problems',
    # Stored without a scheme in the data.
    'archive.ics.uci.edu/ml/datasets/IDA2016Challenge':
        'https://archive.ics.uci.edu/dataset/414/ida2016challenge',
    'http://archive.ics.uci.edu/ml/datasets/Optical+Interconnection+Network+':
        'http://archive.ics.uci.edu/dataset/449/optical+interconnection+network',
    'https://archive.ics.uci.edu/ml/datasets/Pen-Based+Recognition+of+Handwritten+Digits':
        'https://archive.ics.uci.edu/dataset/81/pen+based+recognition+of+handwritten+digits',
    'https://archive.ics.uci.edu/ml/datasets/Electrical+Grid+Stability+Simulated+Data+':
        'https://archive.ics.uci.edu/dataset/471/electrical+grid+stability+simulated+data',
    'https://archive.ics.uci.edu/ml/datasets/airfoil+self-noise':
        'https://archive.ics.uci.edu/dataset/291/airfoil+self+noise',
}

# Series.replace with a dict swaps exact whole-value matches only (no regex).
# No replacement value is itself a key, so one pass gives the same result as
# applying the fixes one after another. assign() returns a new frame, so no
# in-place write happens on a frame that may be a slice of an earlier one.
df = df.assign(original_data_url=df['original_data_url'].replace(UCI_URL_FIXES))

In [7]:
import os
# Create the output directory; exist_ok keeps the cell safe to re-run.
os.makedirs('./datasets', exist_ok=True)

# (substring looked for in original_data_url, JSON file receiving the matching rows)
SOURCE_EXPORTS = [
    ('archive.ics.uci.edu', './datasets/archive_ics_uci_edu.json'),
    ('kaggle', './datasets/kaggle.json'),
    ('ebi.ac.uk', './datasets/ebi_ac_uk.json'),
]

for url_fragment, json_path in SOURCE_EXPORTS:
    # regex=False: the fragments are literal text, so '.' must not be treated
    # as a regex wildcard (str.contains interprets patterns as regex by default).
    from_source = df['original_data_url'].str.contains(url_fragment, regex=False)
    df[from_source].to_json(json_path, orient='records')

In [8]:
# Rows whose original URL points at the UCI archive. regex=False makes this a
# literal substring match, so the dots in the host name are not regex wildcards.
df[df['original_data_url'].str.contains('archive.ics.uci.edu', regex=False)]

In [9]:
# UCI archive rows with duplicate URLs removed (first occurrence kept).
# The mask is built from the de-duplicated frame itself so that its index
# matches the frame it filters; a mask built from the full `df` has extra
# labels and forces pandas to reindex it, which emits a warning.
unique_urls = df.drop_duplicates(subset='original_data_url')
unique_urls[unique_urls['original_data_url'].str.contains('archive.ics.uci.edu', regex=False)]

In [10]:
# Rows whose original URL contains 'kaggle'.
is_kaggle = df['original_data_url'].str.contains('kaggle')
df[is_kaggle]

In [11]:
# Rows whose original URL points at ebi.ac.uk. regex=False makes this a
# literal substring match, so the dots are not treated as regex wildcards.
df[df['original_data_url'].str.contains('ebi.ac.uk', regex=False)]

In [12]:
# Distinct original data URLs, keeping the first occurrence of each.
df['original_data_url'].drop_duplicates()

In [13]:
# Show the kernel's current working directory; the relative paths used in this
# notebook ('./datasets/...', './openml_tags/...') are resolved against it.
os.getcwd()

In [14]:
import json
# Print the number of records in the UCI JSON under the spiders' data
# directory, then in the one under ./datasets, so the two can be compared.
for json_path in (
    './openml_tags/openml_tags/spiders/data/archive_ics_uci_edu.json',
    './datasets/archive_ics_uci_edu.json',
):
    with open(json_path) as f:
        data = json.load(f)
    print(len(data))

In [16]:
# Print the number of records in the ebi.ac.uk JSON under the spiders' data
# directory, then in the one under ./datasets, so the two can be compared.
for json_path in (
    './openml_tags/openml_tags/spiders/data/ebi_ac_uk.json',
    './datasets/ebi_ac_uk.json',
):
    with open(json_path) as f:
        data = json.load(f)
    print(len(data))