# Statistical predictors on the different dataset versions

In [25]:
import tabmemcheck

from sklearn.model_selection import (
    train_test_split,
)
import pandas as pd

from statutils import loo_eval, accuracy, roc_auc
import yaml

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
# Read the dataset registry from datasets.yaml; its 'datasets' entry is the list
# that the cells below iterate over as (csv_file, yaml_file) pairs.
with open('datasets.yaml') as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)
datasets = config['datasets']

# the dataset versions that are loaded and evaluated for every dataset
versions = ['original', 'perturbed', 'task', 'statistical']

In [29]:
# show the (csv_file, yaml_file) pairs that the cells below iterate over
datasets

[['datasets/tabular/spaceship-titanic-train.csv',
  'config/transform/spaceship-titanic.yaml'],
 ['datasets/tabular/acs-income-2022.csv', 'config/transform/acs-income.yaml'],
 ['datasets/tabular/acs-travel-2022.csv', 'config/transform/acs-travel.yaml'],
 ['datasets/tabular/icu.csv', 'config/transform/icu.yaml'],
 ['datasets/tabular/heloc_dataset_v1.csv', 'config/transform/fico.yaml']]

In [28]:
# Keep only the entries from index 5 onwards.
# NOTE(review): this rebinds `datasets` to a slice of itself, so the cell is not idempotent -
# every re-run drops five more entries. Re-run the cell that loads datasets.yaml before re-running this one.
datasets = datasets[5:]

## Create comparable numerical representations of the different dataset versions
#### In particular, we need to create dummy variables consistently

In [30]:
# Numeric representation of every dataset version:
# datasets_numeric[csv_file][version] == (X, y), with X a float feature matrix and y the target.
datasets_numeric = {}

for csv_file, yaml_file in datasets:
    datasets_numeric[csv_file] = {}

    # The original version decides which columns are dropped and which are dummy-encoded.
    # Both decisions are stored as column positions and re-applied to all versions below.
    df = tabmemcheck.datasets.load_dataset(csv_file, yaml_file, 'original', seed=2) # 0
    df = df.drop(df.columns[-1], axis=1)  # the last column is the target

    # on some datasets, we have to drop certain categorical columns because they would result in too many dummy variables
    # ('spaceship' is tested first because that file name also contains 'titanic')
    drop_cols = []
    if 'spaceship' in csv_file:
        drop_cols = ['PassengerId', 'Cabin', 'Name']
    elif 'titanic' in csv_file:
        drop_cols = ['Name', 'Ticket', 'Cabin']
    print(csv_file, 'drop:', drop_cols)
    drop_cols = [df.columns.get_loc(col) for col in drop_cols] # the indices of the columns
    df = df.drop(df.columns[drop_cols], axis=1)

    # use the original version to determine the features that should be transformed to dummy variables - in all versions (!)
    dummy_cols = df.select_dtypes(include=['object', 'string', 'category']).columns
    print(csv_file, 'dummies:', dummy_cols)
    dummy_cols = [df.columns.get_loc(col) for col in dummy_cols] # the indices of the columns

    for version in versions:
        # NOTE(review): unlike the load of the original version above, no seed is passed here - confirm this is intended
        df = tabmemcheck.datasets.load_dataset(csv_file, yaml_file, version)

        # the last column is the target, extract it
        y = df.iloc[:, -1]
        df = df.drop(df.columns[-1], axis=1)

        # if the target is not numeric, convert it to integer category codes
        # (is_numeric_dtype avoids comparing the dtype object against arbitrary strings)
        if not pd.api.types.is_numeric_dtype(y):
            y = y.astype('category').cat.codes

        # drop the columns that should not be used
        df = df.drop(df.columns[drop_cols], axis=1)

        # create dummy variables
        # NOTE(review): the encoding is computed separately per version, so the number of resulting
        # columns can differ between versions (see the printed shapes)
        df = pd.get_dummies(df, columns=df.columns[dummy_cols], drop_first=True)
        print(df.shape)

        # ensure all data is numeric now: values that cannot be parsed become NaN and are then filled with 0
        df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

        # store the numeric dataset; the explicit float dtype prevents an object array
        # when boolean dummy columns are mixed with float columns
        datasets_numeric[csv_file][version] = (df.to_numpy(dtype=float), y)

datasets/tabular/spaceship-titanic-train.csv drop: ['PassengerId', 'Cabin', 'Name']
datasets/tabular/spaceship-titanic-train.csv dummies: Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP'], dtype='object')
(8693, 12)
(8693, 12)
(8693, 13)
(8693, 16)
datasets/tabular/acs-income-2022.csv drop: []
datasets/tabular/acs-income-2022.csv dummies: Index(['Class of worker', 'Educational attainment', 'Marital status',
       'Occupation', 'Place of birth', 'Sex', 'Recoded race'],
      dtype='object')
(200577, 792)
(200577, 792)
(200577, 791)
(200577, 791)
datasets/tabular/acs-travel-2022.csv drop: []
datasets/tabular/acs-travel-2022.csv dummies: Index(['Educational attainment', 'Marital status', 'Sex', 'Disability',
       'Employment status of parents', 'Lived here 1 year ago',
       'Recorded race', 'Living Area', 'State', 'Citizenship', 'Occupation',
       'Place of Work Area'],
      dtype='object')
(177724, 1023)
(177724, 1023)
(177724, 1023)
(177724, 1024)
datasets/tabular/icu.csv 

## Fit Logistic Regression

In [10]:
from statutils import fit_logistic_regression_cv

# fit-predict function
def fit_predict(X_train, y_train, X_test):
    """Fit a model on the training data and return its predictions for `X_test`.

    The model is whatever `fit_logistic_regression_cv` returns for
    `(X_train, y_train)` with the fixed `random_state=2`.

    Note: the gradient boosting section further down defines another
    `fit_predict`; the definition that was executed last is the one in effect.
    """
    return fit_logistic_regression_cv(X_train, y_train, random_state=2).predict(X_test)

In [11]:
# Evaluate the current fit_predict on every version of every dataset.
for csv_file, yaml_file in datasets:
    for version in versions:
        X_data, y_data = datasets_numeric[csv_file][version]

        if X_data.shape[0] <= 2500:
            # small dataset: leave-one-out evaluation over all points
            y_test = y_data
            y_pred = loo_eval(X_data, y_data, fit_predict)
        else:
            # large dataset: a single 80/20 train/test split ...
            X_train, X_test, y_train, y_test = train_test_split(
                X_data, y_data, test_size=0.2, random_state=42
            )
            # ... capped at 15000 training and 5000 test points
            X_train = X_train[:15000]
            y_train = y_train[:15000]
            X_test = X_test[:5000]
            y_test = y_test[:5000]
            y_pred = fit_predict(X_train, y_train, X_test)

        # report the accuracy for this dataset and version
        print(f'{csv_file} {version}')
        accuracy(y_test, y_pred)

datasets/tabular/acs-travel-2022.csv original
Accuracy: 0.64, 95%-Confidence Interval: (0.63, 0.65)
datasets/tabular/acs-travel-2022.csv perturbed
Accuracy: 0.64, 95%-Confidence Interval: (0.62, 0.65)
datasets/tabular/acs-travel-2022.csv task
Accuracy: 0.64, 95%-Confidence Interval: (0.63, 0.65)
datasets/tabular/acs-travel-2022.csv statistical
Accuracy: 0.64, 95%-Confidence Interval: (0.63, 0.65)
datasets/tabular/icu.csv original
Accuracy: 0.76, 95%-Confidence Interval: (0.68, 0.84)
datasets/tabular/icu.csv perturbed
Accuracy: 0.76, 95%-Confidence Interval: (0.68, 0.84)
datasets/tabular/icu.csv task
Accuracy: 0.77, 95%-Confidence Interval: (0.69, 0.84)
datasets/tabular/icu.csv statistical
Accuracy: 0.75, 95%-Confidence Interval: (0.66, 0.82)
datasets/tabular/heloc_dataset_v1.csv original
Accuracy: 0.70, 95%-Confidence Interval: (0.67, 0.72)
datasets/tabular/heloc_dataset_v1.csv perturbed
Accuracy: 0.70, 95%-Confidence Interval: (0.68, 0.72)
datasets/tabular/heloc_dataset_v1.csv task
Ac

#### Trained on original, tested on perturbed

In [7]:
# Train on the original version and compare the accuracy on the original
# test points with the accuracy on the corresponding perturbed test points.
for csv_file, yaml_file in datasets:
    # the numeric datasets
    X_data, y_data = datasets_numeric[csv_file]['original']
    X_data_p, y_data_p = datasets_numeric[csv_file]['perturbed']

    # Both versions must line up row by row and column by column: the perturbed
    # predictions are scored against the labels of the original test points.
    assert X_data.shape == X_data_p.shape, (
        f'{csv_file}: original {X_data.shape} and perturbed {X_data_p.shape} differ in shape'
    )

    if X_data.shape[0] > 2500:
        # large dataset: a single train/test split; the same random_state and number
        # of rows select the same test rows in both versions
        X_train, X_test, y_train, y_test = train_test_split(
            X_data, y_data, test_size=0.2, random_state=42
        )
        _, X_test_p, _, _ = train_test_split(
            X_data_p, y_data_p, test_size=0.2, random_state=42
        )
        # on large datasets, reduce the number of training and test points
        X_train, y_train = X_train[:15000], y_train[:15000]
        X_test, y_test = X_test[:5000], y_test[:5000]
        X_test_p = X_test_p[:5000]
        y_pred = fit_predict(X_train, y_train, X_test)
        y_pred_p = fit_predict(X_train, y_train, X_test_p)
    else:
        # small dataset: leave-one-out evaluation, once predicting the original
        # points and once the points passed as X_test
        y_pred = loo_eval(X_data, y_data, fit_predict)
        y_pred_p = loo_eval(X_data, y_data, fit_predict, X_test=X_data_p)
        y_test = y_data

    # evaluate; the labels name the two settings explicitly instead of relying on
    # a `version` variable left over from another cell
    print(f'{csv_file} trained on original, tested on original')
    accuracy(y_test, y_pred)
    print(f'{csv_file} trained on original, tested on perturbed')
    accuracy(y_test, y_pred_p)

datasets/csv/tabular/iris.csv statistical
Accuracy: 0.97, 95%-Confidence Interval: (0.93, 0.99)
Accuracy: 0.95, 95%-Confidence Interval: (0.91, 0.98)
datasets/csv/tabular/adult-train.csv statistical
Accuracy: 0.86, 95%-Confidence Interval: (0.85, 0.87)
Accuracy: 0.86, 95%-Confidence Interval: (0.85, 0.87)
datasets/csv/tabular/openml-diabetes.csv statistical
Accuracy: 0.78, 95%-Confidence Interval: (0.75, 0.81)
Accuracy: 0.77, 95%-Confidence Interval: (0.74, 0.80)
datasets/csv/tabular/uci-wine.csv statistical
Accuracy: 0.98, 95%-Confidence Interval: (0.96, 0.99)
Accuracy: 0.98, 95%-Confidence Interval: (0.96, 0.99)
datasets/csv/tabular/titanic-train.csv statistical
Accuracy: 0.79, 95%-Confidence Interval: (0.76, 0.81)
Accuracy: 0.79, 95%-Confidence Interval: (0.76, 0.81)
datasets/csv/tabular/spaceship-titanic-train.csv statistical
Accuracy: 0.78, 95%-Confidence Interval: (0.76, 0.80)
Accuracy: 0.78, 95%-Confidence Interval: (0.76, 0.80)
datasets/csv/tabular/acs-income-2022.csv statistic

: 

## Fit gradient boosted tree

In [21]:
from statutils import fit_gbtree_cv

# fit-predict function
def fit_predict(X_train, y_train, X_test):
    """Fit a model on the training data and return its predictions for `X_test`.

    The model is whatever `fit_gbtree_cv` returns for `(X_train, y_train)`.

    Note: this replaces the `fit_predict` defined in the logistic regression
    section; cells executed afterwards use this definition.
    """
    return fit_gbtree_cv(X_train, y_train).predict(X_test)

In [31]:
# Evaluate gradient boosted trees on every version of every dataset.
for csv_file, yaml_file in datasets:
    for version in versions:
        X_data, y_data = datasets_numeric[csv_file][version]

        if X_data.shape[0] <= 2500:
            # small dataset: leave-one-out evaluation via fit_predict
            y_test = y_data
            y_pred = loo_eval(X_data, y_data, fit_predict)
        else:
            # large dataset: a single 80/20 split, scored on at most 5000 test points
            X_train, X_test, y_train, y_test = train_test_split(
                X_data, y_data, test_size=0.2, random_state=42
            )
            X_test = X_test[:5000]
            y_test = y_test[:5000]
            # gradient boosted trees with default parameters, fitted on the whole training split
            from xgboost import XGBClassifier
            clf = XGBClassifier()
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

        # report the accuracy for this dataset and version
        print(f'{csv_file} {version}')
        accuracy(y_test, y_pred)

datasets/tabular/spaceship-titanic-train.csv original
Accuracy: 0.78, 95%-Confidence Interval: (0.76, 0.80), Standard error: 0.01
datasets/tabular/spaceship-titanic-train.csv perturbed
Accuracy: 0.77, 95%-Confidence Interval: (0.75, 0.79), Standard error: 0.01
datasets/tabular/spaceship-titanic-train.csv task
Accuracy: 0.77, 95%-Confidence Interval: (0.75, 0.79), Standard error: 0.01
datasets/tabular/spaceship-titanic-train.csv statistical
Accuracy: 0.78, 95%-Confidence Interval: (0.76, 0.80), Standard error: 0.01
datasets/tabular/acs-income-2022.csv original
Accuracy: 0.80, 95%-Confidence Interval: (0.79, 0.81), Standard error: 0.01
datasets/tabular/acs-income-2022.csv perturbed
Accuracy: 0.80, 95%-Confidence Interval: (0.79, 0.81), Standard error: 0.01
datasets/tabular/acs-income-2022.csv task
Accuracy: 0.80, 95%-Confidence Interval: (0.79, 0.81), Standard error: 0.01
datasets/tabular/acs-income-2022.csv statistical
Accuracy: 0.80, 95%-Confidence Interval: (0.79, 0.81), Standard erro