# Entry 25 notebook - figuring out openml.org

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import openml

### `fetch_openml`

In [2]:
from sklearn.datasets import fetch_openml

# Pin version 1 of the dataset by name and ask for the data as pandas objects.
titanic = fetch_openml(
    name='titanic',
    version=1,
    as_frame=True,
)

The `fetch_openml` function from the `sklearn.datasets` module was throwing errors and whining about not supporting strings for the dataset name. I got it to download a version of the titanic dataset, but it was the wrong version.

Then, when I created this notebook to document the error and find a workaround, it suddenly started working.

Technically, I totally asked for this when I switched into a computer/tech based career. Doesn't make it any less annoying though.

In [3]:
# The result is dict-like; list the keys it exposes.
titanic.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

The `.frame` attribute is a single DataFrame that includes the target column (`survived`) alongside the features.

In [4]:
# Preview the first rows of the combined frame.
titanic.frame.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


The `.data` attribute doesn't include the target, so it has to be used in conjunction with the `.target` attribute.

In [5]:
# Preview the first rows of the feature-only table.
titanic.data.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### `openml` package

Another option is the `openml` package. In addition to loading data, I can also list out all of the available datasets, as well as quite a few of their attributes.

In [6]:
# Fetch the catalogue of available OpenML datasets as a DataFrame; each row
# describes one dataset.
# (To browse by name instead, filter on the 'name' column -- e.g. names
# starting with 'T' -- and sort by it.)
openml_opts = openml.datasets.list_datasets(
    output_format='dataframe',
)
openml_opts.head()

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0


Due to the kind of data I use at work, I'm interested in binary classification problems where the data is imbalanced. I can filter the available datasets to just the kinds I'm interested in.

In [8]:
# Restrict to two-class problems, then keep the heavily imbalanced ones: those
# whose minority class is under 7% of the size of the majority class.
is_binary = openml_opts['NumberOfClasses'] == 2
binary_ds = openml_opts[is_binary]
minority_to_majority = binary_ds['MinorityClassSize'] / binary_ds['MajorityClassSize']
# (To get just the matching dataset names, select the 'name' column with the
# same mask and take its unique values.)
binary_ds[minority_to_majority < 0.07].sort_values('name')

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
274,274,20_newsgroups.drift,1,1,active,ARFF,379943.0,2.0,19997.0,2.0,1002.0,399940.0,0.0,0.0,0.0,1001.0
40517,40517,20_newsgroups.drift,2,1,active,arff,379943.0,2.0,19997.0,2.0,1001.0,399940.0,0.0,0.0,0.0,1001.0
41138,41138,APSFailure,1,869,active,ARFF,74625.0,2.0,1375.0,2.0,171.0,76000.0,75244.0,1078695.0,170.0,1.0
4135,4135,Amazon_employee_access,1,2,active,ARFF,30872.0,7518.0,1897.0,2.0,10.0,32769.0,0.0,0.0,0.0,10.0
42256,42256,Asteroid_Dataset,2,10666,active,arff,125975.0,,156.0,2.0,34.0,126131.0,96.0,99.0,32.0,1.0
42252,42252,Asteroid_Dataset,1,10666,active,arff,125975.0,,156.0,2.0,34.0,126131.0,96.0,99.0,31.0,2.0
131,131,"BNG(sick,nominal,1000000)",1,1,active,ARFF,938761.0,5.0,61239.0,2.0,30.0,1000000.0,0.0,0.0,0.0,30.0
1178,1178,BNG(solar-flare),1,1,active,ARFF,648320.0,6.0,15232.0,2.0,13.0,663552.0,0.0,0.0,0.0,13.0
1217,1217,Click_prediction_small,2,2,active,ARFF,142949.0,2.0,6690.0,2.0,12.0,149639.0,0.0,0.0,11.0,1.0
1216,1216,Click_prediction_small,1,2,active,ARFF,1429610.0,2.0,66781.0,2.0,12.0,1496391.0,0.0,0.0,11.0,1.0


In [9]:
# 40945 is the OpenML dataset ID for Titanic (version 1).
# NOTE(review): this rebinds `titanic`, which held the `fetch_openml` result in
# the cells above; a distinct name would keep both objects available.
titanic = openml.datasets.get_dataset(40945)

In [10]:
# Show the dataset object's summary (name, version, size, URLs).
titanic

OpenML Dataset
Name..........: Titanic
Version.......: 1
Format........: ARFF
Upload Date...: 2017-10-16 01:17:36
Licence.......: Public
Download URL..: https://www.openml.org/data/v1/download/16826755/Titanic.arff
OpenML URL....: https://www.openml.org/d/40945
# of features.: 14
# of instances: 1309

In [11]:
# `get_data()` returns several items; keep only the first, which is the data
# table itself (previewed in the next cell).
raw_df = titanic.get_data()[0]

In [12]:
# Preview the first rows; the target column (`survived`) is included here.
raw_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
