In [1]:
# # Running attribute inference attacks on the Nursery data
# 
# In this tutorial we will show how to run both black-box and white-box inference attacks. This will be demonstrated on the Nursery dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/nursery).
# 
# ## Preliminaries
# 
# In the case of the nursery dataset, the sensitive feature we want to infer is the 'social' feature. In the original dataset this is a categorical feature with 3 possible values. To make the attack more successful, we reduce it to two possible values: the original value 'problematic' is assigned the new value 1, and the other original values are assigned the new value 0.
# 
# The loading code below also preprocesses the dataset so that all categorical features are one-hot encoded and the data is scaled using sklearn's StandardScaler.


In [2]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from urllib.request import urlretrieve

In [3]:
def load_nursery(test_set=0.5, transform_social=True, random_state=42):
    """
    Load and preprocess the nursery dataset.

    The raw CSV is cached as ``nursery.data`` in the current working directory
    and downloaded from the UCI repository when that file is missing. Every
    input column is one-hot encoded, the encoded matrix is standardised, and
    the result is split into train and test parts.

    :param test_set: Proportion of the dataset to include in the test split
    :param transform_social: If True, transform the social feature to binary
        (0,1): 'problematic' becomes 1, 'slightly_prob' and 'nonprob' become 0
    :param random_state: Random seed for reproducibility
    :return: (x_train, y_train), (x_test, y_test), feature_names,
        social_feature_indices -- where ``y_*`` hold the 'class' column,
        ``feature_names`` are the one-hot encoded column names and
        ``social_feature_indices`` are the positions in ``feature_names``
        (and therefore the columns of ``x_*``) whose name starts with 'social_'
    :raises ValueError: If ``transform_social`` is True and the 'social' column
        contains a value other than the three known categories
    """
    # Download the dataset if it doesn't exist
    dataset_path = 'nursery.data'
    if not os.path.exists(dataset_path):
        print("Downloading nursery dataset...")
        url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data'
        # Download to a temporary name and rename afterwards, so an interrupted
        # transfer never leaves a truncated 'nursery.data' that later runs
        # would mistake for a complete cached copy.
        partial_path = dataset_path + '.part'
        try:
            urlretrieve(url, partial_path)
            os.replace(partial_path, dataset_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Column names from the dataset description
    column_names = ['parents', 'has_nurs', 'form', 'children', 'housing', 'finance', 'social', 'health', 'class']

    # Load the dataset
    data = pd.read_csv(dataset_path, header=None, names=column_names)

    # Transform the social feature to binary if required
    if transform_social:
        # Map 'problematic' to 1, everything else to 0
        social_mapping = {'problematic': 1, 'slightly_prob': 0, 'nonprob': 0}
        social_binary = data['social'].map(social_mapping)
        # Series.map yields NaN for keys missing from the mapping; fail loudly
        # rather than letting NaN be one-hot encoded as an extra category.
        unmapped = data.loc[social_binary.isna(), 'social']
        if not unmapped.empty:
            unexpected = sorted(unmapped.astype(str).unique())
            raise ValueError(f"Unexpected values in 'social' column: {unexpected}")
        data['social'] = social_binary

    # Extract the target variable (class)
    y = data['class']
    X = data.drop('class', axis=1)

    # One-hot encode categorical features (every remaining column is treated as categorical)
    categorical_features = X.columns
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_encoded = encoder.fit_transform(X)

    # Scale the features
    # NOTE(review): the scaler is fit on the full dataset before splitting, so
    # test-set statistics influence the training features. Kept as-is to match
    # the preprocessing this notebook describes; fit on the training split only
    # if a leakage-free evaluation is needed.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_encoded)

    # Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=test_set, random_state=random_state
    )

    # Get feature names after one-hot encoding
    feature_names = encoder.get_feature_names_out(categorical_features)

    # Find the indices of the one-hot columns derived from the social feature
    social_feature_indices = [i for i, name in enumerate(feature_names) if name.startswith('social_')]

    return (x_train, y_train), (x_test, y_test), feature_names, social_feature_indices

In [4]:
# Build the train/test split (half the rows each) with the binary 'social' feature.
(
    (x_train, y_train),
    (x_test, y_test),
    feature_names,
    social_indices,
) = load_nursery(test_set=0.5, transform_social=True)

# Report the split sizes and which encoded columns belong to 'social'.
print(
    f"Training data shape: {x_train.shape}",
    f"Testing data shape: {x_test.shape}",
    f"Social feature indices: {social_indices}",
    sep="\n",
)

Downloading nursery dataset...


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'