# Inspect datasets in different versions (original / perturbed / task / statistical)

In [1]:
import tabmemcheck
import dataframe_image as dfi
import yaml
from pathlib import Path
import pandas as pd 

## Save the headers of all datasets as images

In [13]:
# Read the list of (csv file, yaml config) pairs that describes every dataset.
# NOTE(review): yaml.FullLoader can build more than plain data types; if
# datasets.yaml only contains plain lists/strings, yaml.safe_load would be the
# safer loader -- confirm before switching.
with open("datasets.yaml") as file:
    datasets = yaml.load(file, Loader=yaml.FullLoader)["datasets"]

# The dataset versions that are rendered for every dataset.
versions = ['original', 'perturbed', 'task', 'statistical']

# The images are written below figures/datasets; create the directory up front
# so the export does not depend on it already existing.
Path('figures/datasets').mkdir(parents=True, exist_ok=True)

for csv_file, yaml_file in datasets:
    # dataset name (yaml file name without directory and extension)
    fn = Path(yaml_file).stem
    for version in versions:
        df = tabmemcheck.datasets.load_dataset(csv_file, yaml_file, version, seed=0)
        # only the first 5 rows of each version are rendered into the image
        dfi.export(df.head(n=5), f'figures/datasets/{fn}_{version}.png', dpi=150, max_cols=8)

## Generate LaTeX code to include the images as figures in the paper

In [8]:
# Print one LaTeX figure environment per dataset; every dataset version
# becomes a subfigure that includes the PNG exported for it.
for csv_file, yaml_file in datasets:
    fn = Path(yaml_file).stem     # dataset name
    # opening of the figure environment (the whitespace is part of the output)
    parts = ["\\begin{figure}[ht]\n        \\centering\n        "]
    for version in versions:
        # the statistical version is included at half the line width
        width = '0.5' if version == 'statistical' else ''
        parts.append(
            "\\begin{subfigure}[b]{\\linewidth}\n"
            f"            \\includegraphics[width={width}\\linewidth]{{figures/datasets/{fn}_{version}.png}}\n"
            f"            \\caption{{{version.capitalize()}}}\n"
            f"            \\label{{fig:{fn}_{version}.png}}\n"
            "        \\end{subfigure}\n"
            "        "
        )
    # overall caption and label, then close the figure environment
    parts.append(
        f"\\caption{{Different Versions of the {fn.capitalize()} Dataset}}\n"
        f"    \\label{{fig:{fn}_dataset}}\n"
        "\\end{figure}"
    )
    latex_code = "".join(parts)
    print(latex_code)

\begin{figure}[ht]
        \centering
        \begin{subfigure}[b]{\linewidth}
            \includegraphics[width=\linewidth]{figures/datasets/iris_original.png}
            \caption{Original}
            \label{fig:iris_original.png}
        \end{subfigure}
        \begin{subfigure}[b]{\linewidth}
            \includegraphics[width=\linewidth]{figures/datasets/iris_perturbed.png}
            \caption{Perturbed}
            \label{fig:iris_perturbed.png}
        \end{subfigure}
        \begin{subfigure}[b]{\linewidth}
            \includegraphics[width=\linewidth]{figures/datasets/iris_task.png}
            \caption{Task}
            \label{fig:iris_task.png}
        \end{subfigure}
        \begin{subfigure}[b]{\linewidth}
            \includegraphics[width=0.5\linewidth]{figures/datasets/iris_statistical.png}
            \caption{Statistical}
            \label{fig:iris_statistical.png}
        \end{subfigure}
        \caption{Different Versions of the Iris Dataset}
    \label{fig:iri

# Iris

In [30]:
# Show the first rows of the 'original' version of the Iris dataset.
tabmemcheck.datasets.load_dataset('iris.csv', 'iris', 'original', seed=0).head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [31]:
# Show the first rows of the 'perturbed' version of the Iris dataset (seed 0).
tabmemcheck.datasets.load_dataset('iris.csv', 'iris', 'perturbed', seed=0).head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.2,3.4,1.3,0.2,Iris-setosa
1,5.0,2.9,1.3,0.2,Iris-setosa
2,4.8,3.3,1.2,0.2,Iris-setosa
3,4.5,3.2,1.4,0.2,Iris-setosa
4,4.9,3.5,1.3,0.2,Iris-setosa


In [16]:
# Show the 'task' version of the Iris dataset (seed 0).
tabmemcheck.datasets.load_dataset('iris.csv', 'iris', 'task', seed=0)

Unnamed: 0,Length of Sepal (cm),Width of Sepal (cm),Length of Petal (cm),Width of Petal (cm),Kind of Flower
0,5.21,3.43,1.30,0.19,Setosa
1,4.99,2.89,1.28,0.19,Setosa
2,4.81,3.30,1.17,0.19,Setosa
3,4.47,3.21,1.37,0.21,Setosa
4,4.92,3.50,1.30,0.20,Setosa
...,...,...,...,...,...
145,6.61,3.07,5.07,2.39,Virginica
146,6.22,2.59,4.92,1.80,Virginica
147,6.62,2.90,5.11,2.09,Virginica
148,6.14,3.47,5.26,2.19,Virginica


In [17]:
# Show the 'statistical' version of the Iris dataset (seed 0).
tabmemcheck.datasets.load_dataset('iris.csv', 'iris', 'statistical', seed=0)

Unnamed: 0,X1,X2,X3,X4,Y
0,2.65,-2.76,4.69,4.33,0
1,3.43,1.30,4.71,4.37,0
2,4.22,-1.73,4.91,4.40,0
3,5.54,-1.12,4.46,4.30,0
4,3.70,-3.34,4.59,4.24,0
...,...,...,...,...,...
145,-3.07,-0.14,-2.39,-5.19,1
146,-1.49,3.54,-2.13,-2.63,1
147,-3.03,1.14,-2.46,-3.96,1
148,-1.17,-3.15,-2.76,-4.29,1


# Wine

In [2]:
# Show the 'original' version of the UCI Wine dataset.
tabmemcheck.datasets.load_dataset('uci-wine.csv', 'uci-wine', 'original', seed=0)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280_od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,3
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,3


In [None]:
# Show the 'perturbed' version of the UCI Wine dataset (seed 0).
tabmemcheck.datasets.load_dataset('uci-wine.csv', 'uci-wine', 'perturbed', seed=0)

None of the perturbed rows appear in the original dataset.
Feature alcohol: 0.42% variation.
Feature malic_acid: 0.51% variation.
Feature ash: 0.42% variation.
Feature alcalinity_of_ash: 0.31% variation.
Feature magnesium: 1.02% variation.
Feature total_phenols: 0.47% variation.
Feature flavanoids: 0.72% variation.
Feature nonflavanoid_phenols: 3.08% variation.
Feature proanthocyanins: 0.73% variation.
Feature color_intensity: 0.24% variation.
Feature hue: 1.11% variation.
Feature od280_od315_of_diluted_wines: 0.41% variation.
Feature proline: 0.90% variation.
Avg. Number of Matching Features: 1.58


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280_od315_of_diluted_wines,proline,target
0,14.31,1.72,2.44,15.5,126,2.79,3.05,0.29,2.28,5.65,1.05,3.91,1063,1
1,13.23,1.79,2.13,11.2,99,2.66,2.77,0.25,1.29,4.39,1.06,3.41,1058,1
2,13.17,2.37,2.68,18.7,102,2.79,3.25,0.31,2.80,5.69,1.02,3.18,1195,1
3,14.32,1.96,2.49,16.9,114,3.86,3.48,0.23,2.19,7.81,0.85,3.44,1479,1
4,13.20,2.60,2.86,21.0,117,2.79,2.70,0.38,1.83,4.31,1.05,2.92,737,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.76,5.64,2.46,20.6,96,1.69,0.62,0.53,1.07,7.69,0.65,1.75,743,3
174,13.49,3.92,2.49,23.0,101,1.79,0.74,0.44,1.42,7.29,0.71,1.55,740,3
175,13.36,4.27,2.25,20.0,121,1.60,0.70,0.42,1.36,10.21,0.58,1.57,825,3
176,13.10,2.58,2.36,20.0,119,1.64,0.69,0.52,1.45,9.31,0.61,1.61,847,3


In [None]:
# Show the 'task' version of the UCI Wine dataset (seed 0).
tabmemcheck.datasets.load_dataset('uci-wine.csv', 'uci-wine', 'task', seed=0)

Feature alcohol: 0.90% variation.
Feature malic_acid: 0.61% variation.
Feature ash: 0.48% variation.
Feature alcalinity_of_ash: 0.58% variation.
Feature magnesium: 1.15% variation.
Feature total_phenols: 0.57% variation.
Feature flavanoids: 0.82% variation.
Feature nonflavanoid_phenols: 3.92% variation.
Feature proanthocyanins: 0.76% variation.
Feature color_intensity: 0.28% variation.
Feature hue: 1.35% variation.
Feature od280_od315_of_diluted_wines: 0.47% variation.
Feature proline: 1.35% variation.


Unnamed: 0,Alcohol,Malic Acid,Ash,Alcalinity of Ash,Magnesium,Total Phenols,Flavanoids,Nonflavanoid Phenols,Proanthocyanins,Color Intensity,Hue,OD280/OD315 of Diluted Wines,Proline,Type of Wine
0,14.28,1.72,2.44,15.39,126.06,2.78,3.08,0.28,2.29,5.66,1.02,3.92,1071.30,Type 2
1,13.13,1.77,2.15,11.37,99.73,2.65,2.75,0.25,1.29,4.38,1.06,3.40,1034.78,Type 2
2,13.18,2.35,2.66,18.62,100.53,2.81,3.26,0.28,2.82,5.67,1.04,3.18,1183.60,Type 2
3,14.30,1.96,2.53,16.82,113.08,3.84,3.47,0.25,2.18,7.78,0.84,3.45,1471.72,Type 2
4,13.01,2.59,2.86,20.94,118.56,2.80,2.68,0.40,1.80,4.35,1.02,2.92,729.25,Type 2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.94,5.67,2.48,20.65,98.07,1.68,0.62,0.52,1.05,7.70,0.65,1.75,742.52,Type 1
174,13.49,3.92,2.49,22.96,100.18,1.78,0.76,0.44,1.41,7.29,0.68,1.55,737.42,Type 1
175,13.22,4.27,2.27,20.13,118.92,1.59,0.68,0.44,1.36,10.20,0.62,1.59,825.06,Type 1
176,13.21,2.60,2.36,20.12,119.91,1.65,0.66,0.52,1.47,9.29,0.60,1.61,842.39,Type 1


In [None]:
# Show the 'statistical' version of the UCI Wine dataset (seed 0).
tabmemcheck.datasets.load_dataset('uci-wine.csv', 'uci-wine', 'statistical', seed=0)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,Y
0,-5.06,1.89,-0.93,4.17,-6.21,-2.53,-3.60,2.09,-4.13,-0.96,-0.88,-6.10,-3.38,1
1,-0.41,1.74,2.61,8.13,-0.02,-1.88,-2.39,2.87,1.79,0.91,-1.47,-3.74,-3.07,1
2,-0.64,0.00,-3.58,0.82,-0.27,-2.68,-4.10,2.12,-7.17,-0.92,-1.23,-2.71,-4.57,1
3,-5.15,1.17,-2.05,2.78,-3.06,-8.21,-4.79,2.90,-3.36,-3.91,1.72,-3.97,-7.60,1
4,0.01,-0.77,-6.04,-1.49,-4.42,-2.73,-2.24,-0.94,-1.31,1.06,-0.96,-1.41,0.20,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,-3.78,-9.98,-1.44,-1.18,0.39,3.34,4.74,-4.19,3.14,-3.79,4.57,4.01,0.15,0
174,-1.93,-4.74,-1.45,-3.47,-0.15,2.82,4.24,-2.01,0.98,-3.23,4.16,5.12,0.09,0
175,-0.79,-5.82,1.22,-0.48,-4.44,3.67,4.50,-2.17,1.36,-7.41,4.96,4.77,-0.75,0
176,-0.77,-0.78,0.19,-0.62,-4.72,3.47,4.53,-4.16,0.69,-6.19,5.29,4.74,-1.04,0


# Adult

In [160]:
# Show the 'original' version of the Adult dataset.
tabmemcheck.datasets.load_dataset('datasets/tabular/adult-train.csv', 'config/transform/adult.yaml', 'original')

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [None]:
# Show the 'perturbed' version of the Adult dataset (seed 0).
tabmemcheck.datasets.load_dataset('adult-train.csv', 'config/transform/adult.yaml', 'perturbed', seed=0)

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,40,State-gov,77118,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2209,0,39,United-States,<=50K
1,51,Self-emp-not-inc,83150,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,14,United-States,<=50K
2,39,Private,215647,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,39,United-States,<=50K
3,52,Private,235393,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,39,United-States,<=50K
4,27,Private,338710,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,39,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,28,Private,256940,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,37,United-States,<=50K
32557,41,Private,154483,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,41,United-States,>50K
32558,59,Private,151678,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,39,United-States,<=50K
32559,21,Private,200967,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,19,United-States,<=50K


In [6]:
# Show the 'task' version of the Adult dataset (seed 0).
tabmemcheck.datasets.load_dataset('adult-train.csv', 'config/transform/adult.yaml', 'task', seed=0)

Unnamed: 0,Age,Employment,Census Weight,Highest Education,Education as number,Married,Work,Relationship,Self-reported race,Self-reported gender,Investement Income,Investment Loss,Hours worked,Country of origin,Yearly Income
0,40,State government,77118,Bachelor's degree,13,Never married,Administrative clerical,Not in family,White,Male,2213.57,0.0,39,United States,Less than 50K
1,51,Self-employed and not incorporated,83150,Bachelor's degree,13,"Married, civilian spouse","Executive, managerial, or otherwise supervisor...",Husband,White,Male,0.00,0.0,14,United States,Less than 50K
2,39,Private Sector,215647,High School Graduate,9,Divorced,Handlers and cleaners,Not in family,White,Male,0.00,0.0,39,United States,Less than 50K
3,52,Private Sector,235393,11th grade,7,"Married, civilian spouse",Handlers and cleaners,Husband,Black,Male,0.00,0.0,39,United States,Less than 50K
4,27,Private Sector,338710,Bachelor's degree,13,"Married, civilian spouse",Professional with specialized knowledge,Wife,Black,Female,0.00,0.0,39,Cuba,Less than 50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,28,Private Sector,256940,Associate degree in academic studies,12,"Married, civilian spouse",Technical support role,Wife,White,Female,0.00,0.0,37,United States,Less than 50K
32557,41,Private Sector,154483,High School Graduate,9,"Married, civilian spouse",Machine operator inspector,Husband,White,Male,0.00,0.0,41,United States,More than 50K
32558,59,Private Sector,151678,High School Graduate,9,Widowed,Administrative clerical,Unmarried,White,Female,0.00,0.0,39,United States,Less than 50K
32559,21,Private Sector,200967,High School Graduate,9,Never married,Administrative clerical,Child in the family,White,Male,0.00,0.0,19,United States,Less than 50K


In [22]:
# Show the 'statistical' version of the Adult dataset.
# NOTE(review): no seed is passed here, unlike most other non-original cells --
# confirm the default still gives a reproducible result.
tabmemcheck.datasets.load_dataset('adult-train.csv', 'config/transform/adult.yaml', 'statistical')

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,Y
0,0.21,8,3.49,7,-3.87,1,9,1,1,1,-0.40,0.78,-0.17,13,1
1,-2.98,5,3.33,7,-3.84,0,1,4,1,1,0.44,0.77,7.13,13,1
2,0.36,3,-0.88,5,1.43,2,7,1,1,1,0.48,0.71,0.41,13,1
3,-3.34,3,-1.37,2,3.82,0,7,4,4,1,0.46,0.82,-0.12,13,1
4,2.24,3,-4.74,7,-3.70,0,5,3,4,0,0.39,0.69,0.34,29,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,2.62,3,-2.05,3,-2.50,0,2,3,1,0,0.50,0.74,0.36,13,1
32557,-0.61,3,1.15,5,1.37,0,10,4,1,1,0.55,0.67,-0.19,13,0
32558,-4.93,3,1.13,5,1.32,3,9,0,1,0,0.51,0.68,0.46,13,1
32559,3.80,3,-0.42,5,1.41,1,9,5,1,1,0.50,0.73,5.80,13,1


# California Housing

In [None]:
# Show the 'original' version of the California Housing dataset.
tabmemcheck.datasets.load_dataset('california-housing.csv', 'california-housing', 'original')

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY,342200.0
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,INLAND,78100.0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,INLAND,77100.0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,INLAND,92300.0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,INLAND,84700.0


In [None]:
# Show the 'perturbed' version of the California Housing dataset.
# NOTE(review): no seed is passed here, unlike most other non-original cells --
# confirm the default still gives a reproducible result.
tabmemcheck.datasets.load_dataset('california-housing.csv', 'california-housing', 'perturbed')

None of the perturbed rows appear in the original dataset.
Feature longitude: 0.01% variation.
Feature latitude: 0.03% variation.
Feature housing_median_age: 4.83% variation.
Feature total_rooms: 0.45% variation.
Feature total_bedrooms: 1.13% variation.
Feature population: 0.77% variation.
Feature households: 1.19% variation.
Feature median_income: 1.62% variation.
Avg. Number of Matching Features: 2.88


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.22,37.89,40.0,882.0,132.0,327.0,131.0,8.3982,NEAR BAY,452600.0
1,-122.23,37.87,22.0,7094.0,1108.0,2400.0,1133.0,8.3974,NEAR BAY,358500.0
2,-122.23,37.86,51.0,1457.0,193.0,494.0,181.0,7.1884,NEAR BAY,352100.0
3,-122.24,37.84,51.0,1279.0,230.0,550.0,216.0,5.6381,NEAR BAY,341300.0
4,-122.26,37.86,51.0,1632.0,279.0,574.0,255.0,3.7892,NEAR BAY,342200.0
...,...,...,...,...,...,...,...,...,...,...
20635,-121.08,39.47,24.0,1666.0,373.0,850.0,325.0,1.5383,INLAND,78100.0
20636,-121.22,39.50,19.0,698.0,154.0,360.0,119.0,2.4628,INLAND,77100.0
20637,-121.23,39.44,18.0,2244.0,484.0,1011.0,436.0,1.6690,INLAND,92300.0
20638,-121.31,39.44,17.0,1865.0,413.0,738.0,353.0,1.9522,INLAND,84700.0


In [None]:
# Show the 'task' version of the California Housing dataset.
# NOTE(review): no seed is passed here, unlike most other non-original cells --
# confirm the default still gives a reproducible result.
tabmemcheck.datasets.load_dataset('california-housing.csv', 'california-housing', 'task')

Feature longitude: 0.01% variation.
Feature latitude: 0.03% variation.
Feature housing_median_age: 4.84% variation.
Feature total_rooms: 0.43% variation.
Feature total_bedrooms: 1.11% variation.
Feature population: 0.78% variation.
Feature households: 1.21% variation.
Feature median_income: 999715.06% variation.


  "astype": lambda x, dtype, seed: x.astype(dtype),


Unnamed: 0,Longitude,Latitude,Median age,Total Rooms,Total Bedrooms,Population,Households,Median income,How close to the ocean?,Median House Value
0,-122.24,37.89,40,876,125,317,122,84222,Near San Francisco Bay,4526
1,-122.23,37.87,20,7100,1110,2398,1133,82313,Near San Francisco Bay,3585
2,-122.25,37.86,52,1472,191,499,174,73284,Near San Francisco Bay,3521
3,-122.24,37.84,52,1282,239,567,221,56391,Near San Francisco Bay,3413
4,-122.26,37.84,52,1632,284,564,254,37752,Near San Francisco Bay,3422
...,...,...,...,...,...,...,...,...,...,...
20635,-121.10,39.47,24,1671,378,851,334,15083,"Inland, far from ocean",781
20636,-121.22,39.50,17,694,145,366,115,25538,"Inland, far from ocean",771
20637,-121.21,39.44,18,2248,490,997,438,17070,"Inland, far from ocean",923
20638,-121.33,39.42,17,1863,406,735,354,17792,"Inland, far from ocean",847


In [None]:
# Show the 'statistical' version of the California Housing dataset.
# NOTE(review): no seed is passed here, unlike most other non-original cells --
# confirm the default still gives a reproducible result.
tabmemcheck.datasets.load_dataset('california-housing.csv', 'california-housing', 'statistical')

  "astype": lambda x, dtype, seed: x.astype(dtype),


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,Y
0,4.42,-3.55,-2.99,2.69,-0.32,3.34,3.17,-7.72,3,-7.14
1,4.33,-3.37,2.25,-6.83,-0.39,-2.85,-5.64,-7.75,3,-4.36
2,4.47,-3.49,-6.20,1.81,-0.24,2.76,2.87,-5.80,3,-4.26
3,4.38,-3.34,-5.98,2.06,-0.38,2.46,2.42,-3.23,3,-3.81
4,4.39,-3.50,-6.12,1.56,-0.26,2.52,2.10,0.09,3,-3.87
...,...,...,...,...,...,...,...,...,...,...
20635,2.58,-6.01,1.28,1.56,-0.26,1.61,1.46,3.96,2,3.62
20636,2.75,-6.12,2.51,3.03,-0.32,3.15,3.34,2.39,2,3.70
20637,2.73,-6.00,3.28,0.50,-0.36,1.25,0.63,3.72,2,3.26
20638,2.94,-6.00,3.13,1.21,-0.32,2.09,1.30,3.42,2,3.46


In [124]:
# Display the version of the installed pandas package.
pd.__version__

'2.1.4'

# Titanic

In [141]:
# Show the 'original' version of the Titanic dataset.
tabmemcheck.datasets.load_dataset('datasets/tabular/titanic-train.csv', 'config/transform/titanic.yaml', 'original')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S,0
887,888,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S,1
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S,0
889,890,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C,1


In [179]:
# Show the 'perturbed' version of the Titanic dataset (seed 0).
tabmemcheck.datasets.load_dataset('datasets/tabular/titanic-train.csv', 'config/transform/titanic.yaml', 'perturbed', seed=0)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1814,3,"Hawkins, Mr. Edward James",male,23,1,0,A/5 21182,7.59,,S,0
1,1815,1,"Harrington, Mrs. Charles Edward (Elizabeth Har...",female,39,1,0,PC 17521,71.66,C86,C,1
2,1816,3,"Korhonen, Miss. Sari",female,27,0,0,STON/O2. 3101249,7.91,,S,1
3,1817,1,"Montclair, Mrs. Henri Lucien (Rose Elise Dubois)",female,34,1,0,113786,53.38,C122,S,1
4,1818,3,"Bennett, Mr. Charles Edward",male,34,0,0,373422,8.02,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,2700,2,"Kazlauskas, Rev. Petras",male,26,0,0,211449,12.87,,S,0
887,2701,1,"Bennett, Miss. Eleanor Jane",female,20,0,0,112037,30.37,B41,S,1
888,2702,3,"Harrington, Miss. Elizabeth Anne ""Lizzie""",female,,1,3,W./C. 6596,23.79,,S,0
889,2703,1,"Wolff, Mr. Friedrich August",male,27,0,0,111422,29.93,C146,C,1


In [180]:
# Show the 'task' version of the Titanic dataset (seed 0).
tabmemcheck.datasets.load_dataset('datasets/tabular/titanic-train.csv', 'config/transform/titanic.yaml', 'task', seed=0)

Unnamed: 0,Passenger ID,Class,Name,Sex,Age,Number of Siblings/Spouses aboard,Number of Parents/Children aboard,Ticket Number,Fare,Cabin Number,Port of Embarkation,Survived
0,1814,Third,Mr. Edward James Hawkins,Man,23,1,0,No. 21182,7.58,Unknown,Southampton,Not Survived
1,1815,First,Mrs. Charles Edward Harrington,Woman,39,1,0,No. 17521,71.65,C86,Cherbourg,Survived
2,1816,Third,Miss. Sari Korhonen,Woman,27,0,0,No. 3101249,7.90,Unknown,Southampton,Survived
3,1817,First,Mrs. Henri Lucien Montclair,Woman,34,1,0,No. 113786,53.36,C122,Southampton,Survived
4,1818,Third,Mr. Charles Edward Bennett,Man,34,0,0,No. 373422,8.03,Unknown,Southampton,Not Survived
...,...,...,...,...,...,...,...,...,...,...,...,...
886,2700,Second,Rev. Petras Kazlauskas,Man,26,0,0,No. 211449,12.86,Unknown,Southampton,Not Survived
887,2701,First,Miss. Eleanor Jane Bennett,Woman,20,0,0,No. 112037,30.36,B41,Southampton,Survived
888,2702,Third,"Miss. Elizabeth Anne ""Lizzie"" Harrington",Woman,30,1,3,No. 6596,23.78,Unknown,Southampton,Not Survived
889,2703,First,Mr. Friedrich August Wolff,Man,27,0,0,No. 111422,29.93,C146,Cherbourg,Survived


In [181]:
# Show the 'statistical' version of the Titanic dataset (seed 0).
tabmemcheck.datasets.load_dataset('datasets/tabular/titanic-train.csv', 'config/transform/titanic.yaml', 'statistical', seed=0)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,Y
0,5.73,1,441,0,1.71,-1.36,1.50,811,1.60,144,2,0
1,5.79,0,344,1,-2.31,-1.33,1.41,660,-2.69,149,0,1
2,5.69,1,566,1,0.77,1.64,1.50,338,1.55,144,2,1
3,5.71,0,553,1,-1.05,-1.31,1.49,129,-1.47,103,2,1
4,5.84,1,229,0,-1.21,1.56,1.48,247,1.62,144,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,-5.64,2,745,0,1.00,1.48,1.51,22,1.36,144,2,0
887,-5.71,0,731,1,2.49,1.53,1.53,785,0.07,80,2,1
888,-5.79,1,736,1,0.01,-1.40,-10.18,241,0.55,144,2,0
889,-5.64,0,749,0,0.61,1.48,1.45,624,0.14,123,0,1


# OpenML Diabetes

In [2]:
# Show the 'original' version of the OpenML Diabetes dataset.
tabmemcheck.datasets.load_dataset('openml-diabetes.csv', 'openml-diabetes', 'original')

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [2]:
# Show the 'perturbed' version of the OpenML Diabetes dataset (seed 0).
tabmemcheck.datasets.load_dataset('openml-diabetes.csv', 'openml-diabetes', 'perturbed', seed=0)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,7,147,71,36,0,33.9,0.630,51,1
1,1,86,67,30,0,26.5,0.350,30,0
2,9,182,65,0,0,23.7,0.675,31,1
3,1,90,65,22,95,28.6,0.165,22,0
4,0,138,41,36,167,42.7,2.285,32,1
...,...,...,...,...,...,...,...,...,...
763,9,102,77,47,181,32.5,0.167,64,0
764,1,121,69,28,0,37.0,0.337,28,0
765,6,120,71,22,111,25.8,0.249,29,0
766,1,127,61,0,0,30.2,0.353,48,1


In [4]:
# Show the 'task' version of the OpenML Diabetes dataset (seed 0).
tabmemcheck.datasets.load_dataset('openml-diabetes.csv', 'openml-diabetes', 'task', seed=0)

Unnamed: 0,Number of pregnancies,Glucose level,Blood pressure,Skin thickness,Insulin level,Body mass index,Diabetes pedigree function,Age,Test Result
0,7,147.01,71.01,36.02,0.00,33.9,0.63,51,Positive
1,1,86.00,67.02,30.01,0.00,26.5,0.35,30,Negaitive
2,9,182.02,65.01,0.00,0.00,23.7,0.67,31,Positive
3,1,90.01,65.00,21.99,94.98,28.6,0.16,22,Negaitive
4,0,137.99,41.00,35.99,167.01,42.7,2.29,32,Positive
...,...,...,...,...,...,...,...,...,...
763,9,101.99,77.02,47.00,180.99,32.5,0.17,64,Negaitive
764,1,121.02,69.01,28.00,0.00,37.0,0.33,28,Negaitive
765,6,120.01,71.01,21.99,111.00,25.8,0.25,29,Negaitive
766,1,127.00,60.98,0.00,0.00,30.2,0.35,48,Positive


In [33]:
# Show the 'statistical' version of the OpenML Diabetes dataset (seed 0).
tabmemcheck.datasets.load_dataset('openml-diabetes.csv', 'openml-diabetes', 'statistical', seed=0)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y
0,-3.15,-2.63,-0.26,-3.26,2.24,-0.83,-1.56,-5.02,1
1,2.70,3.62,0.38,-1.91,2.33,2.26,1.22,0.93,0
2,-4.97,-6.42,0.65,4.21,2.30,3.47,-1.93,0.63,1
3,2.69,3.20,0.64,-0.24,-0.44,1.46,3.15,3.08,0
4,3.74,-1.83,4.87,-3.24,-2.57,-4.67,-18.30,0.31,1
...,...,...,...,...,...,...,...,...,...
763,-4.92,1.97,-1.34,-5.49,-2.96,-0.23,3.02,-8.76,0
764,2.89,0.01,0.06,-1.59,2.31,-2.20,1.39,1.60,0
765,-2.04,0.07,-0.27,-0.29,-0.93,2.58,2.16,1.18,0
766,2.80,-0.66,1.39,4.29,2.34,0.81,1.28,-4.19,1


# Spaceship Titanic

In [8]:
# Show the 'original' version of the Spaceship Titanic dataset.
tabmemcheck.datasets.load_dataset('datasets/tabular/spaceship-titanic-train.csv', 'config/transform/spaceship-titanic.yaml', 'original')

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [9]:
# Show the 'perturbed' version of the Spaceship Titanic dataset (seed 0).
tabmemcheck.datasets.load_dataset('datasets/tabular/spaceship-titanic-train.csv', 'config/transform/spaceship-titanic.yaml', 'perturbed', seed=0)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,9794_01,Europa,False,B/12/P,TRAPPIST-1e,40.0,False,0.0,0.0,0.0,0.0,0.0,Sadia Khatun,False
1,9795_01,Earth,False,F/12/S,TRAPPIST-1e,25.0,False,104.0,9.0,22.0,544.0,45.0,Mariana Flores,True
2,9796_01,Europa,False,A/12/S,TRAPPIST-1e,59.0,True,40.0,3580.0,0.0,6720.0,50.0,Borivik Grentov,False
3,9796_02,Europa,False,A/12/S,TRAPPIST-1e,32.0,False,0.0,1282.0,370.0,3332.0,197.0,Lorik Vansen,False
4,9797_01,Earth,False,F/13/S,TRAPPIST-1e,15.0,False,301.0,72.0,156.0,562.0,2.0,Billy Montalvo,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,19069_01,Europa,False,A/110/P,55 Cancri e,40.0,True,0.0,6824.0,0.0,1641.0,73.0,Tiberius Darkwater,False
8689,19071_01,Earth,True,G/1511/S,PSO J318.5-22,19.0,False,0.0,0.0,0.0,0.0,0.0,Lorik Vandermere,False
8690,19072_01,Earth,False,G/1512/S,TRAPPIST-1e,27.0,False,0.0,0.0,1876.0,1.0,0.0,Lila O'Reilly,True
8691,19073_01,Europa,False,E/620/S,55 Cancri e,33.0,False,0.0,1050.0,0.0,349.0,3234.0,Davion Montclaire,False


In [3]:
# Show the 'task' version of the Spaceship Titanic dataset (seed 0).
tabmemcheck.datasets.load_dataset('datasets/tabular/spaceship-titanic-train.csv', 'config/transform/spaceship-titanic.yaml', 'task', seed=0)

Unnamed: 0,ID,Home Planet,Cryo,Cabin,Destination,Age,Very Important Person,Room,Food,Shopping,Spa,Virtual Reality Deck,Name,Transported
0,9794_01,Europa,No,B-12-P,TRAPPIST,40,No,0.00,0.00,0.00,0.00,0.00,Sadia Khatun,Not Transported
1,9795_01,Earth,No,F-12-S,TRAPPIST,25,No,103.98,9.00,21.55,543.90,45.32,Mariana Flores,Transported
2,9796_01,Europa,No,A-12-S,TRAPPIST,59,Yes,39.74,3579.47,0.00,6720.41,50.31,Borivik Grentov,Not Transported
3,9796_02,Europa,No,A-12-S,TRAPPIST,32,No,0.00,1282.18,371.13,3331.96,197.25,Lorik Vansen,Not Transported
4,9797_01,Earth,No,F-13-S,TRAPPIST,15,No,300.66,72.50,156.09,562.52,2.00,Billy Montalvo,Transported
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,19069_01,Europa,No,A-110-P,CANCRI,40,Yes,0.00,6824.34,0.00,1640.73,72.74,Tiberius Darkwater,Not Transported
8689,19071_01,Earth,Yes,G-1511-S,PSO,19,No,0.00,0.00,0.00,0.00,0.00,Lorik Vandermere,Not Transported
8690,19072_01,Earth,No,G-1512-S,TRAPPIST,27,No,0.00,0.00,1875.83,1.00,0.00,Lila O'Reilly,Transported
8691,19073_01,Europa,No,E-620-S,CANCRI,33,No,0.00,1050.08,0.00,348.51,3233.39,Davion Montclaire,Not Transported


In [11]:
# Show the 'statistical' version of the Spaceship Titanic dataset (seed 0).
tabmemcheck.datasets.load_dataset('datasets/tabular/spaceship-titanic-train.csv', 'config/transform/spaceship-titanic.yaml', 'statistical', seed=0)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,Y
0,4588,0,0,2902,2,-2.56,2,1.08,0.93,1.05,0.92,0.89,5778,0
1,4441,2,0,5182,2,0.78,2,0.59,0.91,0.79,-0.68,0.74,2660,1
2,7071,0,0,860,2,-7.00,0,1.00,-6.46,0.95,-18.84,0.71,217,0
3,6546,0,0,860,2,-0.80,2,1.05,-1.70,-0.99,-8.77,0.36,204,0
4,105,2,0,4856,2,3.18,2,-0.39,0.86,0.07,-0.63,0.85,516,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,6714,0,0,458,1,-2.67,0,1.18,-13.15,0.91,-3.95,0.64,4594,0
8689,301,2,2,50,0,2.34,2,1.19,0.98,0.96,0.89,0.97,192,0
8690,4552,2,0,4870,2,0.49,2,1.03,0.93,-9.42,0.94,0.87,839,1
8691,7586,0,0,3219,1,-0.94,2,1.13,-1.28,0.87,-0.07,-8.53,3091,0


# ACS Income 2022

In [185]:
# Show the first rows of the 'original' version of the ACS Income 2022 dataset.
tabmemcheck.datasets.load_dataset('datasets/tabular/acs-income-2022.csv', 'config/transform/acs-income.yaml', 'original').head()

Unnamed: 0,Age,Class of worker,Educational attainment,Marital status,Occupation,Place of birth,Usual hours worked per week past 12 months,Sex,Recoded race,Income
0,26,Employee of a private for-profit company or bu...,"1 or more years of college credit, no degree",Never married or under 15 years old,Food Service Managers,Mexico,30,Female,Some Other Race alone,"Less than $50,000 per year."
1,38,Federal government employee,Regular high school diploma,Divorced,Photographers,Arizona/AZ,40,Female,White alone,"Less than $50,000 per year."
2,23,Federal government employee,Regular high school diploma,Never married or under 15 years old,Military Enlisted Tactical Operations And Air/...,Montana/MT,40,Male,Two or More Races,"Less than $50,000 per year."
3,20,Employee of a private for-profit company or bu...,"1 or more years of college credit, no degree",Never married or under 15 years old,Fast Food And Counter Workers,Nevada/NV,20,Male,Some Other Race alone,"Less than $50,000 per year."
4,20,Federal government employee,Regular high school diploma,Never married or under 15 years old,"Military, Rank Not Specified",Tennessee/TN,50,Female,Two or More Races,"Less than $50,000 per year."


In [186]:
# Show the 'perturbed' version of the ACS Income 2022 dataset (seed 0),
# with print_stats=False passed to the loader.
tabmemcheck.datasets.load_dataset('datasets/tabular/acs-income-2022.csv', 'config/transform/acs-income.yaml', 'perturbed', seed=0, print_stats=False)

Unnamed: 0,Age,Class of worker,Educational attainment,Marital status,Occupation,Place of birth,Usual hours worked per week past 12 months,Sex,Recoded race,Income
0,27,Employee of a private for-profit company or bu...,"1 or more years of college credit, no degree",Never married or under 15 years old,Food Service Managers,Mexico,29,Female,Some Other Race alone,"Less than $50,000 per year."
1,39,Federal government employee,Regular high school diploma,Divorced,Photographers,Arizona/AZ,41,Female,White alone,"Less than $50,000 per year."
2,24,Federal government employee,Regular high school diploma,Never married or under 15 years old,Military Enlisted Tactical Operations And Air/...,Montana/MT,39,Male,Two or More Races,"Less than $50,000 per year."
3,19,Employee of a private for-profit company or bu...,"1 or more years of college credit, no degree",Never married or under 15 years old,Fast Food And Counter Workers,Nevada/NV,19,Male,Some Other Race alone,"Less than $50,000 per year."
4,19,Federal government employee,Regular high school diploma,Never married or under 15 years old,"Military, Rank Not Specified",Tennessee/TN,51,Female,Two or More Races,"Less than $50,000 per year."
...,...,...,...,...,...,...,...,...,...,...
200572,33,Employee of a private for-profit company or bu...,Master's degree,Never married or under 15 years old,Mental Health Counselors,California/CA,41,Female,Asian alone,"More than $50,000 per year."
200573,32,"Local government employee (city, county, etc.)","1 or more years of college credit, no degree",Married,Social And Human Service Assistants,California/CA,41,Female,Two or More Races,"Less than $50,000 per year."
200574,54,Employee of a private for-profit company or bu...,Regular high school diploma,Married,First-Line Supervisors Of Office And Administr...,Mexico,41,Female,Some Other Race alone,"Less than $50,000 per year."
200575,50,Employee of a private for-profit company or bu...,Grade 8,Married,"Laborers And Freight, Stock, And Material Move...",Mexico,41,Male,Some Other Race alone,"Less than $50,000 per year."


In [None]:
# ACS Income: display the 'task' version (seed=0, print_stats disabled).
# The CSV and transform-config paths follow the layout used by the other
# cells of this notebook: 'datasets/tabular/<name>.csv' for the data and
# 'config/transform/<name>.yaml' for the transform configuration.
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/acs-income-2022.csv',
    'config/transform/acs-income.yaml',
    'task',
    seed=0,
    print_stats=False,
)

Unnamed: 0,Age,Employment,Highest Education,Married,Work,Place of birth,Hours worked,Self-reported sex,Self-reported race,Yearly Income
0,27,Private Sector,"At least 1 year of college, no degree",Never married,Food Service Managers,Mexico,29,Female,Other,Less than 50K
1,39,Federal government,High school diploma,Divorced,Photographers,Arizona,41,Female,White,Less than 50K
2,24,Federal government,High school diploma,Never married,Military Enlisted Tactical Operations And Air/...,Montana,39,Male,Two or more,Less than 50K
3,19,Private Sector,"At least 1 year of college, no degree",Never married,Fast Food And Counter Workers,Nevada,19,Male,Other,Less than 50K
4,19,Federal government,High school diploma,Never married,"Military, Rank Not Specified",Tennessee,51,Female,Two or more,Less than 50K
...,...,...,...,...,...,...,...,...,...,...
200572,33,Private Sector,Master's,Never married,Mental Health Counselors,California,41,Female,Asian,More than 50K
200573,32,Local government,"At least 1 year of college, no degree",Married,Social And Human Service Assistants,California,41,Female,Two or more,Less than 50K
200574,54,Private Sector,High school diploma,Married,First-Line Supervisors Of Office And Administr...,Mexico,41,Female,Other,Less than 50K
200575,50,Private Sector,8th grade,Married,"Laborers And Freight, Stock, And Material Move...",Mexico,41,Male,Other,Less than 50K


In [None]:
# ACS Income: display the 'statistical' version (seed=0).
# The CSV and transform-config paths follow the layout used by the other
# cells of this notebook: 'datasets/tabular/<name>.csv' for the data and
# 'config/transform/<name>.yaml' for the transform configuration.
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/acs-income-2022.csv',
    'config/transform/acs-income.yaml',
    'statistical',
    seed=0,
)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,Y
0,3.53,6,0,3,276,134,2.11,0,8,0
1,0.87,0,6,0,383,131,-0.94,0,4,0
2,4.18,0,6,3,57,191,-0.33,1,7,0
3,5.34,6,0,3,223,54,4.79,1,8,0
4,5.24,0,6,3,478,178,-3.50,0,7,0
...,...,...,...,...,...,...,...,...,...,...
200572,2.16,6,12,3,165,11,-0.92,0,5,1
200573,2.41,4,0,2,445,11,-0.90,0,7,0
200574,-2.50,6,6,2,370,134,-0.79,0,8,0
200575,-1.55,6,16,2,450,134,-0.86,1,8,0


## ACS Travel Time

In [2]:
# ACS Travel Time: show the head of the 'original' version.
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/acs-travel-2022.csv',
    'config/transform/acs-travel.yaml',
    'original',
).head()

Unnamed: 0,Age,Educational attainment,Marital status,Sex,Disability,Employment status of parents,Lived here 1 year ago,Recorded race,Living Area,State,Citizenship,Occupation,Place of Work Area,Travel Time to Work
0,26,"1 or more years of college credit, no degree",Never married or under 15 years old,Female,With a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Some Other Race alone,Santa Barbara County--South Coast Region,California/CA,U.S. citizen by naturalization,Food Service Managers,Santa Barbara County,Less than 20 minutes
1,47,Grade 11,Divorced,Female,Without a disability,"N/A (not own child of householder, and not chi...","No, different house in US or Puerto Rico",White alone,Los Angeles County (East Central)--Pomona City,California/CA,Born in the United States,"Miscellaneous Production Workers, Including Eq...",San Diego County,More than 20 minutes
2,18,Regular high school diploma,Never married or under 15 years old,Female,Without a disability,"N/A (not own child of householder, and not chi...","No, different house in US or Puerto Rico",White alone,Yolo County (North)--Woodland & Winters Cities...,California/CA,Born in the United States,Other Protective Service Workers,Orange County,Less than 20 minutes
3,21,Regular high school diploma,Never married or under 15 years old,Male,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Black or African American alone,Alameda County (Central)--Hayward City (East),California/CA,Born in the United States,Cashiers,Alameda County,Less than 20 minutes
4,23,Associate's degree,Never married or under 15 years old,Male,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",White alone,Santa Cruz County (South & Coastal)--Santa Cru...,California/CA,Born in the United States,"Mail Clerks And Mail Machine Operators, Except...",Santa Cruz County,Less than 20 minutes


In [7]:
# ACS Travel Time: show the head of the 'perturbed' version (seed=0).
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/acs-travel-2022.csv',
    'config/transform/acs-travel.yaml',
    'perturbed',
    seed=0,
).head()

Unnamed: 0,Age,Educational attainment,Marital status,Sex,Disability,Employment status of parents,Lived here 1 year ago,Recorded race,Living Area,State,Citizenship,Occupation,Place of Work Area,Travel Time to Work
0,27,"1 or more years of college credit, no degree",Never married or under 15 years old,Female,With a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Some Other Race alone,Santa Barbara County--South Coast Region,California/CA,U.S. citizen by naturalization,Food Service Managers,Santa Barbara County,Less than 20 minutes
1,48,Grade 11,Divorced,Female,Without a disability,"N/A (not own child of householder, and not chi...","No, different house in US or Puerto Rico",White alone,Los Angeles County (East Central)--Pomona City,California/CA,Born in the United States,"Miscellaneous Production Workers, Including Eq...",San Diego County,More than 20 minutes
2,19,Regular high school diploma,Never married or under 15 years old,Female,Without a disability,"N/A (not own child of householder, and not chi...","No, different house in US or Puerto Rico",White alone,Yolo County (North)--Woodland & Winters Cities...,California/CA,Born in the United States,Other Protective Service Workers,Orange County,Less than 20 minutes
3,20,Regular high school diploma,Never married or under 15 years old,Male,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Black or African American alone,Alameda County (Central)--Hayward City (East),California/CA,Born in the United States,Cashiers,Alameda County,Less than 20 minutes
4,22,Associate's degree,Never married or under 15 years old,Male,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",White alone,Santa Cruz County (South & Coastal)--Santa Cru...,California/CA,Born in the United States,"Mail Clerks And Mail Machine Operators, Except...",Santa Cruz County,Less than 20 minutes


In [24]:
# ACS Travel Time: show the head of the 'task' version (seed=0).
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/acs-travel-2022.csv',
    'config/transform/acs-travel.yaml',
    'task',
    seed=0,
).head()

Unnamed: 0,Age,Highest Education,Married,Self-reported sex,Disability,Parents employment,Lived here 1 year ago,Self-reported race,Living Area,State,Citizenship,Work,Place of Work Area,Travel time
0,27,"At least 1 year of college, no degree",Never married,Female,True,Not applicable,Yes,Other,Santa Barbara County--South Coast Region,California,Naturalized U.S. citizen,Food Service Managers,Santa Barbara County,Shorter than 20 minutes
1,48,11th grade,Divorced,Female,False,Not applicable,"No, different house in US or Puerto Rico",White,Los Angeles County (East Central)--Pomona City,California,Born in the U.S.,"Miscellaneous Production Workers, Including Eq...",San Diego County,Longer than 20 minutes
2,19,High school diploma,Never married,Female,False,Not applicable,"No, different house in US or Puerto Rico",White,Yolo County (North)--Woodland & Winters Cities...,California,Born in the U.S.,Other Protective Service Workers,Orange County,Shorter than 20 minutes
3,20,High school diploma,Never married,Male,False,Not applicable,Yes,Black,Alameda County (Central)--Hayward City (East),California,Born in the U.S.,Cashiers,Alameda County,Shorter than 20 minutes
4,22,Associate's degree,Never married,Male,False,Not applicable,Yes,White,Santa Cruz County (South & Coastal)--Santa Cru...,California,Born in the U.S.,"Mail Clerks And Mail Machine Operators, Except...",Santa Cruz County,Shorter than 20 minutes


In [22]:
# ACS Travel Time: show the head of the 'statistical' version.
# NOTE(review): no seed is passed here, unlike the other 'statistical' cells.
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/acs-travel-2022.csv',
    'config/transform/acs-travel.yaml',
    'statistical',
).head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,Y
0,3.73,5,4,1,0,6,2,3,224,0,0,176,65,1
1,-1.09,14,3,1,1,6,1,8,218,0,4,297,81,0
2,5.52,19,4,1,1,6,1,8,30,0,4,385,1,1
3,5.35,19,4,0,1,6,2,7,247,0,4,404,31,1
4,4.87,16,4,0,1,6,2,8,5,0,4,57,150,1


## ICU

In [63]:
# ICU: display the 'original' version (seed=0).
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/icu.csv',
    'config/transform/icu.yaml',
    'original',
    seed=0,
)

Unnamed: 0,Glucose,Plasma-Lyte,Time of Onset from DKA,Anion Gap,Duration of Admission Acute Kidney Injury,Delta Gap,Bicarbonate nadir,APACHE II,Bicarbonate,Age,Unit
0,82,0,0.0,30,38.0,18,9,18,9,60,0
1,143,0,0.0,17,16.0,5,15,11,15,50,1
2,93,0,0.0,21,8.0,9,11,9,16,25,1
3,91,0,0.0,38,48.0,26,7,15,7,47,1
4,84,0,0.0,34,38.0,22,11,20,11,61,1
...,...,...,...,...,...,...,...,...,...,...,...
97,143,0,0.0,35,8.0,23,7,11,7,52,1
98,115,0,0.0,20,3.0,8,12,12,12,19,1
99,71,0,0.0,29,13.0,17,11,15,11,30,1
100,133,0,25.0,28,12.0,16,6,26,6,63,0


In [66]:
# ICU: display the 'perturbed' version (seed=0).
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/icu.csv',
    'config/transform/icu.yaml',
    'perturbed',
    seed=0,
)

Unnamed: 0,Glucose,Plasma-Lyte,Time of Onset from DKA,Anion Gap,Duration of Admission Acute Kidney Injury,Delta Gap,Bicarbonate nadir,APACHE II,Bicarbonate,Age,Unit
0,83,0,0.0,31,37.0,19,8,19,8,59,0
1,144,0,0.0,18,17.0,6,14,12,16,49,1
2,94,0,0.0,20,9.0,10,10,8,15,26,1
3,90,0,0.0,39,47.0,27,8,16,6,48,1
4,83,0,0.0,35,39.0,23,12,21,10,62,1
...,...,...,...,...,...,...,...,...,...,...,...
97,142,0,0.0,34,7.0,24,8,10,8,53,1
98,114,0,0.0,19,3.0,7,13,11,13,20,1
99,72,0,0.0,28,14.0,18,12,14,10,29,1
100,132,0,24.0,27,13.0,15,5,25,5,64,0


In [70]:
# ICU: display the 'task' version (seed=0, print_stats disabled).
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/icu.csv',
    'config/transform/icu.yaml',
    'task',
    seed=0,
    print_stats=False,
)

Unnamed: 0,Glucose Level,Plasma Lyte,Time of Onset from DKA,Anion Gap (AG),Duration of Admission Acute Kidney Injury,Delta Gap,Bicarbonate Nadir,APACHE II Score,Bicarbonate,Age,Medical Unit
0,83.00,0.0,0.0,31.01,37.0,19.01,7.99,19,8.00,59,ICU
1,144.00,0.0,0.0,18.01,17.0,6.01,14.00,12,16.01,49,Intermediate Care
2,94.00,0.0,0.0,20.00,9.0,10.02,10.00,8,15.02,26,Intermediate Care
3,90.01,0.0,0.0,39.00,47.0,27.00,7.99,16,5.99,48,Intermediate Care
4,83.00,0.0,0.0,34.99,39.0,23.01,12.00,21,9.98,62,Intermediate Care
...,...,...,...,...,...,...,...,...,...,...,...
97,142.00,0.0,0.0,34.00,7.0,24.00,8.00,10,7.99,53,Intermediate Care
98,114.00,0.0,0.0,19.01,3.0,7.00,13.01,11,13.01,20,Intermediate Care
99,72.01,0.0,0.0,28.01,14.0,18.00,11.99,14,9.99,29,Intermediate Care
100,132.00,0.0,24.0,26.98,13.0,15.01,4.99,25,5.00,64,ICU


In [49]:
# ICU: display the 'statistical' version (seed=0).
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/icu.csv',
    'config/transform/icu.yaml',
    'statistical',
    seed=0,
)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y
0,1.33,0.30,0.99,-3.13,-0.58,-2.98,1.63,-1.31,2.12,-3.44,0
1,-4.19,0.25,0.91,3.36,0.67,3.31,-2.93,1.87,-2.89,-1.34,1
2,0.48,0.43,0.98,2.36,1.07,1.37,0.14,3.74,-2.29,3.19,1
3,0.79,0.26,0.98,-7.05,-1.22,-6.85,1.56,0.15,3.40,-1.24,1
4,1.38,0.31,0.98,-5.01,-0.65,-4.94,-1.48,-2.10,0.93,-4.00,1
...,...,...,...,...,...,...,...,...,...,...,...
97,-4.03,0.35,0.97,-4.62,1.12,-5.30,1.60,2.83,2.24,-2.19,1
98,-1.38,0.24,0.85,2.83,1.28,2.73,-2.12,2.36,-1.09,4.50,1
99,2.31,0.36,0.91,-1.61,0.73,-2.44,-1.31,1.03,1.00,2.63,1
100,-3.12,0.33,-0.59,-1.07,0.82,-1.03,3.91,-3.87,3.97,-4.43,0


## FICO

In [164]:
# FICO (heloc_dataset_v1.csv): display the 'original' version.
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/heloc_dataset_v1.csv',
    'config/transform/fico.yaml',
    'original',
)

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,RiskPerformance
0,55,144,4,84,20,3,0,83,2,3,...,0,0,0,33,-8,8,1,1,69,Bad
1,61,58,15,41,2,4,4,100,-7,0,...,0,0,0,0,-8,0,-8,-8,0,Bad
2,67,66,5,24,9,0,0,100,-7,7,...,0,4,4,53,66,4,2,1,86,Bad
3,66,169,1,73,28,1,1,93,76,6,...,0,5,4,72,83,6,4,3,91,Bad
4,81,333,27,132,12,0,0,100,-7,7,...,0,1,1,51,89,3,1,0,80,Bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,73,131,5,57,21,0,0,95,80,6,...,7,0,0,26,-8,5,2,0,100,Good
10455,65,147,39,68,11,0,0,92,28,6,...,1,1,1,86,53,2,2,1,80,Bad
10456,74,129,6,64,18,1,1,100,-7,6,...,3,4,4,6,-8,5,-8,0,56,Bad
10457,72,234,12,113,42,2,2,96,35,6,...,6,0,0,19,-8,4,1,0,38,Bad


In [165]:
# FICO (heloc_dataset_v1.csv): display the 'perturbed' version
# (seed=0, print_stats disabled).
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/heloc_dataset_v1.csv',
    'config/transform/fico.yaml',
    'perturbed',
    seed=0,
    print_stats=False,
)

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,RiskPerformance
0,56,145,4,85,21,3,0,82,2,3,...,0,0,0,32,-8,8,1,1,70,Bad
1,62,59,14,40,2,4,4,100,-7,0,...,0,0,0,0,-8,0,-8,-8,0,Bad
2,68,65,5,23,9,0,0,100,-7,7,...,0,4,4,54,65,4,2,1,87,Bad
3,65,168,1,72,29,1,1,92,77,6,...,0,5,4,73,82,6,4,3,90,Bad
4,80,332,26,131,11,0,0,100,-7,7,...,0,1,1,52,88,3,1,0,79,Bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,72,132,5,58,22,0,0,94,81,6,...,7,0,0,27,-8,5,2,0,99,Good
10455,66,146,40,67,10,0,0,93,29,6,...,1,1,1,87,52,2,2,1,81,Bad
10456,75,128,6,65,19,1,1,99,-7,6,...,3,4,4,6,-8,5,-8,0,55,Bad
10457,73,235,13,112,41,2,2,97,36,6,...,6,0,0,18,-8,4,1,0,37,Bad


In [166]:
# FICO (heloc_dataset_v1.csv): display the 'task' version
# (seed=0, print_stats disabled).
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/heloc_dataset_v1.csv',
    'config/transform/fico.yaml',
    'task',
    seed=0,
    print_stats=False,
)

Unnamed: 0,External Risk Estimate,Months Since Oldest Trade Open,Months Since Most Recent Trade Open,Average Months in File,Number of Satisfactory Trades,Number of Trades ever received 60 days past due date,Number of Trades ever received 90 days past due date,Percent of Trades Never Delinquent,Months Since Most Recent Delinquency,Maximum Delinquency in Last 12 Months,...,Months Since Most Recent Inquiry (excluding last 7 days),Number of Inquiries in Last 6 Months,Number of Inquiries in Last 6 Months (excluding last 7 days),Net Fraction Revolving Burden,Net Fraction Installment Burden,Number of Revolving Trades with Balance,Number of Installment Trades with Balance,Number of Bank/National Trades with High Utilization,Percent of Trades with Balance,Credit
0,56,145,4,85,21,3,0,82,2,3,...,0,0,0,32,-8,8,1,1,70,Default
1,62,59,14,40,2,4,4,100,-7,0,...,0,0,0,0,-8,0,-8,-8,0,Default
2,68,65,5,23,9,0,0,100,-7,7,...,0,4,4,54,65,4,2,1,87,Default
3,65,168,1,72,29,1,1,92,77,6,...,0,5,4,73,82,6,4,3,90,Default
4,80,332,26,131,11,0,0,100,-7,7,...,0,1,1,52,88,3,1,0,79,Default
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,72,132,5,58,22,0,0,94,81,6,...,7,0,0,27,-8,5,2,0,99,Repaid
10455,66,146,40,67,10,0,0,93,29,6,...,1,1,1,87,52,2,2,1,81,Default
10456,75,128,6,65,19,1,1,99,-7,6,...,3,4,4,6,-8,5,-8,0,55,Default
10457,73,235,13,112,41,2,2,97,36,6,...,6,0,0,18,-8,4,1,0,37,Default


In [169]:
# FICO (heloc_dataset_v1.csv): display the 'statistical' version (seed=0).
tabmemcheck.datasets.load_dataset(
    'datasets/tabular/heloc_dataset_v1.csv',
    'config/transform/fico.yaml',
    'statistical',
    seed=0,
)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,1.84,1.09,1.13,-0.93,-0.47,-3.92,-0.22,0.63,0.76,1.74,...,-0.15,0.93,0.79,-0.14,3.69,-3.63,-0.08,-0.92,-0.93,1
1,0.94,3.84,-1.38,2.85,4.55,-5.22,-5.77,-1.76,2.19,4.44,...,-0.21,0.84,0.85,3.56,3.74,2.45,7.37,7.93,7.41,1
2,-0.06,3.57,0.85,4.41,2.70,0.12,-0.21,-1.70,2.21,-1.87,...,-0.22,-3.33,-3.37,-2.47,-2.08,-0.62,-0.88,-1.00,-2.97,1
3,0.38,0.47,1.90,0.17,-2.31,-1.36,-1.55,-0.74,-11.39,-0.98,...,-0.07,-4.30,-3.32,-4.57,-3.42,-2.15,-2.45,-2.93,-3.36,1
4,-1.97,-4.42,-4.32,-4.93,2.14,0.03,-0.25,-1.75,2.16,-1.79,...,-0.16,-0.16,-0.24,-2.23,-3.92,0.14,-0.10,0.06,-1.99,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,-0.71,1.62,0.90,1.41,-0.70,-0.00,-0.26,-1.03,-12.06,-0.91,...,-3.98,0.83,0.76,0.49,3.71,-1.27,-0.87,0.05,-4.51,0
10455,0.15,1.24,-7.91,0.53,2.40,0.13,-0.13,-0.85,-3.71,-0.88,...,-0.73,-0.09,-0.22,-6.21,-1.03,0.96,-0.79,-1.01,-2.26,1
10456,-1.25,1.73,0.53,0.78,0.02,-1.29,-1.66,-1.58,2.19,-0.96,...,-1.84,-3.28,-3.34,2.78,3.72,-1.35,7.38,0.06,0.96,1
10457,-0.87,-1.55,-1.15,-3.16,-5.51,-2.72,-3.02,-1.43,-4.76,-0.89,...,-3.46,0.98,0.87,1.55,3.77,-0.70,-0.04,-0.04,3.01,1
