# Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# For data preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# For classification model
# 'lr', 'rf', 'lightgbm', 'gbc', 'xgboost'
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

# For deep learning model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import AUC

# For evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
import shap

# For post-processing algorithm
# 'CalibratedEqOddsPostprocessing', 'RejectOptionClassification', 'EqOddsPostprocessing'
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.postprocessing import CalibratedEqOddsPostprocessing, RejectOptionClassification, EqOddsPostprocessing


pip install 'aif360[inFairness]'


In [2]:
# Set display options to show all columns
pd.set_option('display.max_columns', None)
# Ignore warnings
warnings.filterwarnings('ignore')

In [3]:
# set the working directory
os.chdir('C:\\Users\\h2408\\Downloads\\RA\\1_paper_LASI\\data')

# Data preparation

In [4]:
# Load data
data = pd.read_csv("derived_df.csv")

# Drop the target variables of other papers
target_vars = ['bmi_underweight', 'bmi_overweight', 'waist_circumference']
######################################
target_var = 'waist_circumference'
######################################
data.shape

(64867, 80)

In [5]:
category_col = [
    'education',
    'state',
    'region',
    'religion',
    'MPCE',
    'working_status',
    'occupation',
    'caste',
    'water',
    'alcohol',
    'activity1',
    'benefit'
    ]

# Convert Type
for col in data.columns:
  if col in category_col:
    data[col] = data[col].astype('category')
  else:
    data[col] = data[col].astype('float')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64867 entries, 0 to 64866
Data columns (total 80 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  64867 non-null  float64 
 1   gender               64867 non-null  float64 
 2   education            64866 non-null  category
 3   migration            64770 non-null  float64 
 4   state                64867 non-null  category
 5   region               64867 non-null  category
 6   residence            64867 non-null  float64 
 7   religion             64867 non-null  category
 8   martial_status       64865 non-null  float64 
 9   living_alone         64867 non-null  float64 
 10  MPCE                 64865 non-null  category
 11  working_status       64852 non-null  category
 12  occupation           64867 non-null  category
 13  pension_amount       64867 non-null  float64 
 14  retired              64774 non-null  float64 
 15  pension            

In [7]:
used_data = data.copy()
# Drop the missing values
used_data = used_data.dropna()
# Define X and y
X = used_data.drop(target_vars, axis=1)
y = used_data[target_var]
X.shape, y.shape

((55647, 77), (55647,))

In [8]:
groups = {
        'Overall': slice(None),
        'Scheduled Caste': X['caste'] == 'Scheduled caste',
        'Scheduled Tribe': X['caste'] == 'Scheduled tribe',
        'General': X['caste'] == 'General',
        'Other Backward Class': X['caste'] == 'Other backward class',
        'MPCE 1': X['MPCE'] == 'Lowest',
        'MPCE 2': X['MPCE'] == 'Lower middle',
        'MPCE 3': X['MPCE'] == 'Middle',
        'MPCE 4': X['MPCE'] == 'Upper middle',
        'MPCE 5': X['MPCE'] == 'Highest',
    }

In [9]:
# Define protected attributes
protected_attributes = {
    'caste2': ['Scheduled caste', 'Scheduled tribe', 'Other backward class'], # 'General'
    'MPCE2': ['Lowest', 'Lower middle'] # 'Middle', 'Upper middle', 'Highest'
}

In [10]:
# Function: derive protected attributes
def derive_protected_attributes(X, original_column, protected_attribute, protected_attributes):
    binary_dataset = X.copy()
    binary_dataset[protected_attribute] = binary_dataset[original_column].apply(
        lambda x: 1 if x in protected_attributes[protected_attribute] else 0)
    # drop the original column
    binary_dataset = binary_dataset.drop(original_column, axis=1)
    return binary_dataset

# Derive protected attributes
protect_caste_X = derive_protected_attributes(X, 'caste', 'caste2', protected_attributes)
protect_MPCE_X = derive_protected_attributes(X, 'MPCE', 'MPCE2', protected_attributes)

In [11]:
# Category encoding
X = pd.get_dummies(X, drop_first=True)
# dummy_col = ['education_No', 'state_Chandigarh', 'region_Central', 'religion_Others', 'MPCE_Middle', 'working_status_Never worked', 'occupation_Currently no work', 'caste_General', 'water_other', 'alcohol_abstainer', 'activity1_moderate', 'benefit_non-applicable']
# X = X.drop(dummy_col, axis=1)
X = X.astype('float32')
y = y.astype('float32')

# Standardization
scaler = StandardScaler()
X_sd = scaler.fit_transform(X)
X = pd.DataFrame(X_sd, columns=X.columns, index=X.index)

X.shape

(55647, 129)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((44517, 129), (11130, 129))

In [13]:
# Category encoding
protect_caste_X = pd.get_dummies(protect_caste_X, drop_first=True)
# dummy_col = ['education_No', 'state_Chandigarh', 'region_Central', 'religion_Others', 'MPCE_Middle', 'working_status_Never worked', 'occupation_Currently no work', 'caste_General', 'water_other', 'alcohol_abstainer', 'activity1_moderate', 'benefit_non-applicable']
# protect_caste_X = protect_caste_X.drop(dummy_col, axis=1)
protect_caste_X = protect_caste_X.astype('float32')
protect_caste_X.shape

(55647, 127)

In [14]:
protect_caste_X_train, protect_caste_X_test, _, _ = train_test_split(protect_caste_X, y, test_size=0.2, random_state=42)
protect_caste_X_train.shape, protect_caste_X_test.shape

((44517, 127), (11130, 127))

In [15]:
# Category encoding
protect_MPCE_X = pd.get_dummies(protect_MPCE_X, drop_first=True)
# dummy_col = ['education_No', 'state_Chandigarh', 'region_Central', 'religion_Others', 'MPCE_Middle', 'working_status_Never worked', 'occupation_Currently no work', 'caste_General', 'water_other', 'alcohol_abstainer', 'activity1_moderate', 'benefit_non-applicable']
# protect_caste_X = protect_caste_X.drop(dummy_col, axis=1)
protect_MPCE_X = protect_MPCE_X.astype('float32')
protect_MPCE_X.shape

(55647, 126)

In [16]:
protect_MPCE_X_train, protect_MPCE_X_test, _, _ = train_test_split(protect_MPCE_X, y, test_size=0.2, random_state=42)
protect_MPCE_X_train.shape, protect_MPCE_X_test.shape

((44517, 126), (11130, 126))

# Postprocessing Methods

1. Calibrated Equalized Odds Postprocessing (CEOP)
2. Reject Option Classification (ROC)
3. Equalized Odds Postprocessing (EOP)

In [17]:
# Postprocessing methods
postprocessing_methods = [
    'CalibratedEqOddsPostprocessing',
    'RejectOptionClassification',
    'EqOddsPostprocessing'
]

In [18]:
# Function: Apply inprocessing method to the model
def apply_postprocessing(method_name, unprivileged_groups, privileged_groups):
    if method_name == 'CalibratedEqOddsPostprocessing':
        postprocessor = CalibratedEqOddsPostprocessing(unprivileged_groups=unprivileged_groups,
                                                       privileged_groups=privileged_groups)
    elif method_name == 'RejectOptionClassification':
        postprocessor = RejectOptionClassification(unprivileged_groups=unprivileged_groups,
                                                  privileged_groups=privileged_groups)
    elif method_name == 'EqOddsPostprocessing':
        postprocessor = EqOddsPostprocessing(unprivileged_groups=unprivileged_groups,
                                            privileged_groups=privileged_groups)
    else:
        raise ValueError("Invalid inprocessing method")
    return postprocessor

# Functions

In [19]:
# Function: Evaluate the model
def calculate_metric(y_true, y_pred, y_pred_prob, metric_key):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    # Calculate the metrics
    ## Accuracy, precision, recall, f1, auc
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred_prob)
    ## Sensitivity and specificity
    sensitivity = recall_score(y_true, y_pred, pos_label=1)
    specificity = recall_score(y_true, y_pred, pos_label=0)
    ## Equal opportunity, equalized odds, disparate impact
    ### Equalized Odds: 根據真實標籤（0 或 1）計算錯誤率是否相等(tpr, fpr)
    ### Demographic Parity: 預測為陽性的比例
    tpr = tp / (tp + fn) if tp + fn > 0 else 0    # True positive rate
    tnr = tn / (tn + fp) if tn + fp > 0 else 0    # True negative rate
    equal_opportunity = tpr
    equalized_odds = tpr + tnr
    ppr = (tp + fp) / (tp + fp + tn + fn) if tp + fp + tn + fn > 0 else 0    # Predicted positive rate
    disparate_impact = ppr

    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'equal_opportunity': equal_opportunity,
        'equalized_odds': equalized_odds,
        'disparate_impact': disparate_impact
    }

    return metrics[metric_key]

In [20]:
# Function: Train the model
def model_fitted_post(method, model_name, model, X_train, y_train, X_test, y_test, metric_keys, groups, protected_attribute, unprivileged_groups, privileged_groups):
    # Create a pipeline
    pipeline = Pipeline([
        # ('scaler', StandardScaler()),
        ('model', model)
    ])
    # Train the model
    if model_name in ['DNN', 'FCN']:
        pipeline['model'].fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)
        y_pred_prob = pipeline.predict(X_test).ravel()
    else:
        pipeline.fit(X_train, y_train)
        y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    # Create a BinaryLabelDataset
    test_dataset = BinaryLabelDataset(
        df=pd.DataFrame({'label': y_test, protected_attribute: X_test[protected_attribute]}),
        label_names=['label'],
        protected_attribute_names=[protected_attribute]
    )
    pred_dataset = test_dataset.copy()
    pred_dataset.labels = (y_pred_prob >= 0.5).astype(int).reshape(-1, 1)
    pred_dataset.scores = y_pred_prob.reshape(-1, 1)

    # Apply postprocessing method
    postprocessor = apply_postprocessing(method, unprivileged_groups, privileged_groups)
    postprocessor.fit(test_dataset, pred_dataset)
    transformed_dataset = postprocessor.predict(pred_dataset)

    # Calculate the metrics
    results = pd.DataFrame()
    for group_name, group_slice in groups.items():
        # Create a Boolean mask for the group
        group_mask = (
            group_slice.loc[X_test.index]  # Use the group slice
            if group_name != 'Overall'
            else pd.Series(True, index=X_test.index)  
        )

        y_group = y_test[group_mask]
        y_pred_group = transformed_dataset.labels[group_mask].ravel()
        y_pred_prob_group = transformed_dataset.scores[group_mask].ravel()

        for metric_key in metric_keys:
            # Calculate the metric
            metric_value = calculate_metric(y_group, y_pred_group, y_pred_prob_group, metric_key)
            # Store the results
            results.loc[group_name, metric_key] = metric_value

    return results

In [21]:
# Function: Define the function to create the DNN model
def create_dnn_model(dim):
    model = Sequential()
    model.add(Dense(128, input_dim=dim, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))  # For binary classification

    # Compile the model with AUROC as a metric
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auroc')])
    return model

In [22]:
# Function: Define the function to create the Fully Connected Network (FCN) model
def create_fcn_model(dim):
    model = Sequential()
    model.add(Dense(128, input_dim=dim, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # For binary classification

    # Compile the model with AUROC as a metric
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auroc')])
    return model

# Model

In [23]:
metric_keys = [
    'accuracy', 'precision', 'recall', 'f1', 'auc',
    'sensitivity', 'specificity', 
    'equal_opportunity', 'equalized_odds', 'disparate_impact'
    ]

In [24]:
models = {
    # 'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    # 'Random Forest': RandomForestClassifier(random_state=42),
    # 'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    # 'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42, force_row_wise=True, verbose=-1),
}

In [25]:
# Train and evaluate the models
results = pd.DataFrame()

for protected_attribute in ['caste2', 'MPCE2']:
    # Derive the dataset
    if protected_attribute == 'caste2':
        unprivileged_groups = [{'caste2': 1}]
        privileged_groups = [{'caste2': 0}]
        X_train = protect_caste_X_train
        X_test = protect_caste_X_test
    elif protected_attribute == 'MPCE2':
        unprivileged_groups = [{'MPCE2': 1}]
        privileged_groups = [{'MPCE2': 0}]
        X_train = protect_MPCE_X_train
        X_test = protect_MPCE_X_test

    # Train and evaluate the models
    for method in postprocessing_methods:
        for model_name, model in models.items():
            print(f"Training {model_name} with {method} for {protected_attribute}")
            model_results = model_fitted_post(method, model_name, model, X_train, y_train, X_test, y_test, metric_keys, groups, protected_attribute, unprivileged_groups, privileged_groups)
            model_results['model'] = model_name
            model_results['method'] = method
            model_results['protected_attribute'] = protected_attribute
            results = pd.concat([results, model_results], axis=0)

# results.to_csv(f"{target_var}\\postprocessing_results.csv", index=True)
results.to_csv(f"standardized\\{target_var}\\postprocessing_results_LightGBM.csv", index=True)
results

Training LightGBM with CalibratedEqOddsPostprocessing for caste2
Training LightGBM with RejectOptionClassification for caste2
Training LightGBM with EqOddsPostprocessing for caste2
Training LightGBM with CalibratedEqOddsPostprocessing for MPCE2
Training LightGBM with RejectOptionClassification for MPCE2
Training LightGBM with EqOddsPostprocessing for MPCE2


Unnamed: 0,accuracy,precision,recall,f1,auc,sensitivity,specificity,equal_opportunity,equalized_odds,disparate_impact,model,method,protected_attribute
Overall,0.754178,0.743412,0.710881,0.726783,0.837119,0.710881,0.79105,0.710881,1.501931,0.439802,LightGBM,CalibratedEqOddsPostprocessing,caste2
Scheduled Caste,0.767419,0.645403,0.555735,0.597222,0.825743,0.555735,0.862645,0.555735,1.41838,0.267168,LightGBM,CalibratedEqOddsPostprocessing,caste2
Scheduled Tribe,0.757969,0.753602,0.65375,0.700134,0.831219,0.65375,0.837298,0.65375,1.491048,0.374932,LightGBM,CalibratedEqOddsPostprocessing,caste2
General,0.751297,0.775435,0.81181,0.793206,0.831404,0.81181,0.665094,0.81181,1.476904,0.61511,LightGBM,CalibratedEqOddsPostprocessing,caste2
Other Backward Class,0.748333,0.734613,0.689089,0.711123,0.827388,0.689089,0.796713,0.689089,1.485802,0.421667,LightGBM,CalibratedEqOddsPostprocessing,caste2
MPCE 1,0.773017,0.669811,0.523599,0.587748,0.818492,0.523599,0.884565,0.523599,1.408163,0.241568,LightGBM,CalibratedEqOddsPostprocessing,caste2
MPCE 2,0.759004,0.707751,0.640967,0.672705,0.819809,0.640967,0.833333,0.640967,1.4743,0.349933,LightGBM,CalibratedEqOddsPostprocessing,caste2
MPCE 3,0.741637,0.713307,0.71261,0.712958,0.832489,0.71261,0.765412,0.71261,1.478022,0.449824,LightGBM,CalibratedEqOddsPostprocessing,caste2
MPCE 4,0.743758,0.775961,0.753175,0.764398,0.829695,0.753175,0.73216,0.753175,1.485335,0.535699,LightGBM,CalibratedEqOddsPostprocessing,caste2
MPCE 5,0.754221,0.786947,0.813809,0.800153,0.827505,0.813809,0.663108,0.813809,1.476917,0.625235,LightGBM,CalibratedEqOddsPostprocessing,caste2
