# Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# For data preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# For classification model
# 'lr', 'rf', 'lightgbm', 'gbc', 'xgboost'
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

# For deep learning model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import AUC

# For evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.utils import resample

# For preprocessing method
# 'RUS', 'ROS', 'SMOTE', 'DIR', 'Reweighing'
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.algorithms.preprocessing import Reweighing
from aif360.datasets import BinaryLabelDataset

In [None]:
# Show every column when a DataFrame is displayed
pd.options.display.max_columns = None
# Silence all warnings for the rest of the session
warnings.simplefilter('ignore')


In [None]:
# Set the working directory so the relative data and output paths used later
# resolve. The location can be overridden with the LASI_DATA_DIR environment
# variable; the original machine-specific path is kept as the default so the
# behaviour is unchanged when the variable is not set.
DATA_DIR = os.environ.get('LASI_DATA_DIR', 'C:\\Users\\h2408\\Downloads\\RA\\1_paper_LASI\\data')
os.chdir(DATA_DIR)

# Data preparation

In [None]:
# Outcome columns contained in the file. They are only listed here; they are
# removed from the feature table when X is built.
target_vars = ['bmi_underweight', 'bmi_overweight', 'waist_circumference']
######################################
# Outcome modelled in this run
target_var = 'waist_circumference'
######################################

# Read the dataset from the current working directory
data = pd.read_csv("derived_df.csv")
data.shape

In [None]:
# Columns to be stored with pandas' 'category' dtype
category_col = [
    'education',
    'state',
    'region',
    'religion',
    'MPCE',
    'working_status',
    'occupation',
    'caste',
    'water',
    'alcohol',
    'activity1',
    'benefit',
]

# Cast the listed columns to 'category' and every other column to float
data = data.astype({
    col: 'category' if col in category_col else 'float'
    for col in data.columns
})

In [None]:
# Print the column dtypes and non-null counts to check the conversion above
data.info()

In [None]:
# Keep complete cases only.
# NOTE(review): dropna() runs while all columns of `target_vars` are still
# present, so rows missing an outcome that is not modelled here are removed
# as well -- confirm this is intended.
used_data = data.copy().dropna()
# Label: the selected outcome; features: everything except the outcome columns
y = used_data[target_var]
X = used_data.drop(columns=target_vars)
X.shape, y.shape

In [None]:
# Subgroup selectors used when metrics are reported per group. 'Overall' is a
# full slice; every other entry is a boolean Series aligned with X's index.
groups = {
    'Overall': slice(None),
    **{
        label: X['caste'] == level
        for label, level in [
            ('Scheduled Caste', 'Scheduled caste'),
            ('Scheduled Tribe', 'Scheduled tribe'),
            ('General', 'General'),
            ('Other Backward Class', 'Other backward class'),
        ]
    },
    **{
        label: X['MPCE'] == level
        for label, level in [
            ('MPCE 1', 'Lowest'),
            ('MPCE 2', 'Lower middle'),
            ('MPCE 3', 'Middle'),
            ('MPCE 4', 'Upper middle'),
            ('MPCE 5', 'Highest'),
        ]
    },
}

In [None]:
# Define protected attributes
protected_attributes = {
    'caste2': ['Scheduled caste', 'Scheduled tribe', 'Other backward class'], # 'General'
    'MPCE2': ['Lowest', 'Lower middle'] # 'Middle', 'Upper middle', 'Highest'
}

In [None]:
# Function: derive protected attributes
def derive_protected_attributes(X, original_column, protected_attribute, protected_attributes):
    """Replace a categorical column with a binary protected-attribute column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is copied, not modified.
    original_column : str
        Column to binarise; it is removed from the returned table.
    protected_attribute : str
        Name of the new binary column and the key looked up in
        ``protected_attributes``.
    protected_attributes : dict
        Maps ``protected_attribute`` to the levels that are coded as 1.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` without ``original_column`` and with an int64 column
        ``protected_attribute`` (1 for listed levels, 0 otherwise) appended.

    Raises
    ------
    KeyError
        If ``protected_attribute`` is not a key of ``protected_attributes`` or
        ``original_column`` is not a column of ``X``.
    """
    binary_dataset = X.copy()
    # `isin` is vectorised and always yields a plain integer column. A row-wise
    # `apply` on a categorical column can hand back another categorical (when
    # the level -> 0/1 mapping happens to be one-to-one), which a later
    # `pd.get_dummies` call would one-hot encode instead of keeping as 0/1.
    flagged_levels = list(protected_attributes[protected_attribute])
    binary_dataset[protected_attribute] = (
        binary_dataset[original_column].isin(flagged_levels).astype('int64')
    )
    # Drop the original column so only the binary encoding remains
    return binary_dataset.drop(columns=original_column)

# Build one feature table per protected attribute: caste first, then MPCE
protect_caste_X, protect_MPCE_X = (
    derive_protected_attributes(X, source_col, binary_col, protected_attributes)
    for source_col, binary_col in [('caste', 'caste2'), ('MPCE', 'MPCE2')]
)

In [None]:
# Category encoding (one-hot, first level of each categorical dropped)
X = pd.get_dummies(X, drop_first=True)
# dummy_col = ['education_No', 'state_Chandigarh', 'region_Central', 'religion_Others', 'MPCE_Middle', 'working_status_Never worked', 'occupation_Currently no work', 'caste_General', 'water_other', 'alcohol_abstainer', 'activity1_moderate', 'benefit_non-applicable']
# X = X.drop(dummy_col, axis=1)
X = X.astype('float32')
y = y.astype('float32')

# Standardization
# Fit the scaler on the training rows only, so that test-set means/variances
# do not leak into the features. The shuffled split depends only on the number
# of rows, test_size and random_state, so splitting row positions with the
# same settings as the train/test split cell selects the same training rows.
# Keep test_size and random_state here in sync with that cell.
train_pos = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)[0]
scaler = StandardScaler()
scaler.fit(X.iloc[train_pos])
X = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

X.shape

In [None]:
# Hold out 20% of the rows as the test set, with a fixed seed.
# The same test_size and random_state are reused when the protected-attribute
# tables are split; y_train / y_test from this split are later paired with
# those tables, so the settings must stay identical.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

In [None]:
# One-hot encode the categorical columns (first level dropped) and cast to float32.
# NOTE(review): unlike X, this table is not standardized -- confirm that is intended.
# dummy_col = ['education_No', 'state_Chandigarh', 'region_Central', 'religion_Others', 'MPCE_Middle', 'working_status_Never worked', 'occupation_Currently no work', 'caste_General', 'water_other', 'alcohol_abstainer', 'activity1_moderate', 'benefit_non-applicable']
# protect_caste_X = protect_caste_X.drop(dummy_col, axis=1)
protect_caste_X = pd.get_dummies(protect_caste_X, drop_first=True).astype('float32')
protect_caste_X.shape

In [None]:
# Split with the same settings as X; only the two feature tables are kept,
# the label halves returned by the split are discarded.
protect_caste_X_train, protect_caste_X_test = train_test_split(
    protect_caste_X, y, test_size=0.2, random_state=42
)[:2]
protect_caste_X_train.shape, protect_caste_X_test.shape

In [None]:
# One-hot encode the categorical columns (first level dropped) and cast to float32.
# NOTE(review): unlike X, this table is not standardized -- confirm that is intended.
# dummy_col = ['education_No', 'state_Chandigarh', 'region_Central', 'religion_Others', 'MPCE_Middle', 'working_status_Never worked', 'occupation_Currently no work', 'caste_General', 'water_other', 'alcohol_abstainer', 'activity1_moderate', 'benefit_non-applicable']
# protect_MPCE_X = protect_MPCE_X.drop(dummy_col, axis=1)
protect_MPCE_X = pd.get_dummies(protect_MPCE_X, drop_first=True).astype('float32')
protect_MPCE_X.shape

In [None]:
# Split with the same settings as X; only the two feature tables are kept,
# the label halves returned by the split are discarded.
protect_MPCE_X_train, protect_MPCE_X_test = train_test_split(
    protect_MPCE_X, y, test_size=0.2, random_state=42
)[:2]
protect_MPCE_X_train.shape, protect_MPCE_X_test.shape

# Preprocessing Methods

1. RUS (Random Under-Sampling)
2. ROS (Random Over-Sampling)
3. SMOTE (Synthetic Minority Over-sampling Technique)
4. Disparate Impact Remover
5. Reweighing

In [None]:
# Define preprocessing methods
# Pre-processing methods evaluated in this run; each name must be one that
# preprocess_data accepts. 'RUS' and 'ROS' are currently switched off.
preprocessing_methods = [
    # 'RUS', 'ROS', 
    'SMOTE', 
    'DIR_caste', 'DIR_MPCE',
    'Reweighing_caste', 'Reweighing_MPCE'
]

In [None]:
# Function: Apply preprocessing method to balance the data
def preprocess_data(X_train, y_train, method, random_state=42, return_weights=False):
    """Apply a resampling or fairness pre-processing method to the training data.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. For the ``*_caste`` / ``*_MPCE`` methods it must
        contain the binary column ``caste2`` / ``MPCE2`` respectively.
    y_train : pandas.Series
        Training labels. For the AIF360 methods its column name after
        concatenation must equal the module-level ``target_var``.
    method : str
        One of 'RUS', 'ROS', 'SMOTE', 'DIR_caste', 'DIR_MPCE',
        'Reweighing_caste', 'Reweighing_MPCE'.
    random_state : int, default 42
        Seed passed to the imbalanced-learn samplers.
    return_weights : bool, default False
        When True, a third value is returned: the AIF360 ``instance_weights``
        array for the DIR/Reweighing methods, or ``None`` for the samplers.
        Reweighing expresses its correction only through these weights, so
        they have to be forwarded to the estimator (for example through
        ``fit(..., sample_weight=weights)``) for the method to affect training.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``, or
        ``(X_resampled, y_resampled, weights)`` when ``return_weights`` is True.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    sample_weights = None

    if method in ('RUS', 'ROS', 'SMOTE'):
        sampler_classes = {
            'RUS': RandomUnderSampler,
            'ROS': RandomOverSampler,
            'SMOTE': SMOTE,
        }
        sampler = sampler_classes[method](random_state=random_state)
        X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

    elif method in ('DIR_caste', 'DIR_MPCE', 'Reweighing_caste', 'Reweighing_MPCE'):
        # The value 1 marks the unprivileged group, 0 the privileged group
        protected_attribute = 'caste2' if 'caste' in method else 'MPCE2'
        unprivileged_groups = [{protected_attribute: 1}]
        privileged_groups = [{protected_attribute: 0}]

        # AIF360 expects features and label in one frame; it is only needed
        # in this branch, so it is built here rather than for every method.
        df = pd.concat([X_train, y_train], axis=1)
        train_data = BinaryLabelDataset(
            df=df,
            label_names=[target_var],
            protected_attribute_names=[protected_attribute],
        )

        if 'DIR' in method:
            transformer = DisparateImpactRemover(repair_level=1.0)
        else:
            transformer = Reweighing(
                unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups,
            )
        train_data_repd = transformer.fit_transform(train_data)

        # The returned frame has a fresh 0..n-1 index (the array carries none)
        X_resampled = pd.DataFrame(train_data_repd.features, columns=X_train.columns)
        y_resampled = train_data_repd.labels.ravel()
        sample_weights = train_data_repd.instance_weights

    else:
        raise ValueError("Invalid preprocessing method")

    if return_weights:
        return X_resampled, y_resampled, sample_weights
    return X_resampled, y_resampled

# Functions

In [None]:
# Function: Evaluate the model
def calculate_metric(y_true, y_pred, y_pred_prob, metric_key):
    """Return one evaluation metric for a set of binary predictions.

    Parameters
    ----------
    y_true : array-like
        True labels, coded 0 / 1.
    y_pred : array-like
        Predicted labels, coded 0 / 1.
    y_pred_prob : array-like
        Predicted score or probability of the positive class (used for AUC).
    metric_key : str
        One of 'accuracy', 'precision', 'recall', 'f1', 'auc', 'sensitivity',
        'specificity', 'equal_opportunity', 'equalized_odds_tpr',
        'equalized_odds_fpr', 'disparate_impact'.

    Returns
    -------
    float
        The requested metric. 'auc' is NaN when ``y_true`` contains a single
        class, because ROC AUC is undefined in that case.

    Raises
    ------
    KeyError
        If ``metric_key`` is not one of the names above.

    Notes
    -----
    The fairness entries are per-group building blocks, not between-group
    comparisons: 'equal_opportunity' and 'equalized_odds_tpr' are the group's
    true positive rate, 'equalized_odds_fpr' its false positive rate, and
    'disparate_impact' its predicted positive rate (the quantity compared
    under demographic parity).
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # subgroup containing a single class yields a 1x1 matrix and the four-way
    # unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    n_positive = tp + fn
    n_negative = fp + tn
    n_total = n_positive + n_negative

    # Accuracy, precision, recall, f1
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # ROC AUC needs both classes; report NaN instead of raising otherwise
    if n_positive > 0 and n_negative > 0:
        auc_value = roc_auc_score(y_true, y_pred_prob)
    else:
        auc_value = float('nan')

    # Sensitivity and specificity
    sensitivity = recall_score(y_true, y_pred, pos_label=1)
    specificity = recall_score(y_true, y_pred, pos_label=0)

    # Equalized odds: error rates conditioned on the true label (TPR and FPR)
    tpr = tp / n_positive if n_positive > 0 else 0    # True positive rate
    fpr = fp / n_negative if n_negative > 0 else 0    # False positive rate
    # Demographic parity: share of samples predicted positive
    ppr = (tp + fp) / n_total if n_total > 0 else 0   # Predicted positive rate

    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc_value,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'equal_opportunity': tpr,
        'equalized_odds_tpr': tpr,
        'equalized_odds_fpr': fpr,
        'disparate_impact': ppr,
    }

    return metrics[metric_key]

In [None]:
# Function: Train the model
def model_fitted(model_name, model, X_train, y_train, X_test, y_test, metric_keys, groups):
    """Fit a model and evaluate it on the test set, overall and per subgroup.

    Parameters
    ----------
    model_name : str
        Display name. 'DNN' and 'FCN' select the Keras training path
        (100 epochs, batch size 32); any other name uses fit / predict /
        predict_proba.
    model : estimator
        Unfitted model instance.
    X_train, y_train : training features and labels.
    X_test : pandas.DataFrame
        Test features; its index is used to align the subgroup masks.
    y_test : array-like
        Test labels in the same row order as ``X_test``.
    metric_keys : iterable of str
        Metric names understood by ``calculate_metric``.
    groups : dict
        Maps a group label to either a ``slice`` (applied positionally to the
        test rows, e.g. ``slice(None)`` for everyone) or a boolean Series
        whose index contains every label of ``X_test.index``.

    Returns
    -------
    pandas.DataFrame
        One row per group and one column per metric. A group without any
        test row gets NaN in every column.
    """
    metric_keys = list(metric_keys)

    # Create a pipeline
    pipeline = Pipeline([
        # ('scaler', StandardScaler()),
        ('model', model)
    ])
    # Train the model
    if model_name in ['DNN', 'FCN']:
        pipeline['model'].fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)
        y_pred_prob = pipeline.predict(X_test).ravel()
        y_pred = (y_pred_prob > 0.5).astype(int)
    else:
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    # Calculate the metrics
    row_labels = []
    rows = []
    for group_name, group_slice in groups.items():
        # Build a positional boolean mask over the test rows. Slices are
        # recognised by type, so the full-sample group may carry any label.
        if isinstance(group_slice, slice):
            group_mask = np.zeros(len(X_test), dtype=bool)
            group_mask[group_slice] = True
        else:
            group_mask = group_slice.loc[X_test.index].to_numpy(dtype=bool)

        row_labels.append(group_name)
        if not group_mask.any():
            # No test sample in this group: the metrics are undefined
            rows.append([float('nan')] * len(metric_keys))
            continue

        y_group = y_test[group_mask]
        y_pred_group = y_pred[group_mask]
        y_pred_prob_group = y_pred_prob[group_mask]
        rows.append([
            calculate_metric(y_group, y_pred_group, y_pred_prob_group, metric_key)
            for metric_key in metric_keys
        ])

    # Build the table once instead of enlarging it cell by cell
    return pd.DataFrame(rows, index=row_labels, columns=metric_keys)

In [None]:
# Function: Define the function to create the DNN model
def create_dnn_model(dim):
    """Build and compile a feed-forward binary classifier with dropout.

    The network has three ReLU hidden layers (128, 64 and 32 units), each
    followed by 20% dropout, and a single sigmoid output unit.

    Parameters
    ----------
    dim : int
        Number of input features.

    Returns
    -------
    A compiled Keras ``Sequential`` model (Adam optimizer, binary
    cross-entropy loss) that tracks ROC AUC under the metric name 'auroc'.
    """
    network = Sequential([
        Dense(128, input_dim=dim, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),  # single probability for binary classification
    ])

    # AUROC is reported during training
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=[AUC(name='auroc')],
    )
    return network

In [None]:
# Function: Define the function to create the Fully Connected Network (FCN) model
def create_fcn_model(dim):
    """Build and compile a fully connected binary classifier without dropout.

    The network has three ReLU hidden layers (128, 64 and 32 units) and a
    single sigmoid output unit.

    Parameters
    ----------
    dim : int
        Number of input features.

    Returns
    -------
    A compiled Keras ``Sequential`` model (Adam optimizer, binary
    cross-entropy loss) that tracks ROC AUC under the metric name 'auroc'.
    """
    network = Sequential([
        Dense(128, input_dim=dim, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),  # single probability for binary classification
    ])

    # AUROC is reported during training
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=[AUC(name='auroc')],
    )
    return network

# Model

1. Logistic Regression
2. Random Forest
3. XGBoost
4. Gradient Boosting
5. LightGBM
6. DNN
7. FCN

In [None]:
# Metrics reported for every group; each name must be a key that
# calculate_metric knows. They become the columns of the results table.
metric_keys = [
    'accuracy', 'precision', 'recall', 'f1', 'auc',
    'sensitivity', 'specificity', 
    'equal_opportunity', 'equalized_odds_tpr', 'equalized_odds_fpr', 'disparate_impact'
    ]

In [None]:
# Models evaluated in this run, keyed by display name. Only LightGBM is
# active; uncomment an entry to include that model as well.
models = {
    # 'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    # 'Random Forest': RandomForestClassifier(random_state=42),
    # 'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    # 'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42, force_row_wise=True, verbose=-1)
}

In [None]:
# Use cross-validation to evaluate the models
results = pd.DataFrame()
for method in preprocessing_methods:
    if method in ['DIR_caste', 'Reweighing_caste']:
        X_resampled, y_resampled = preprocess_data(protect_caste_X_train, y_train, method)
        X_test_new = protect_caste_X_test
    elif method in ['DIR_MPCE', 'Reweighing_MPCE']:
        X_resampled, y_resampled = preprocess_data(protect_MPCE_X_train, y_train, method)
        X_test_new = protect_MPCE_X_test
    else:
        X_resampled, y_resampled = preprocess_data(X_train, y_train, method)
        X_test_new = X_test
    
    y_resampled = pd.Series(y_resampled)

    for model_name, model in models.items():
        print(f"Processing {model_name} with {method}...")
        model_results = model_fitted(model_name, model, X_resampled, y_resampled, X_test_new, y_test, metric_keys, groups)
        model_results['method'] = method
        model_results['model'] = model_name
        results = pd.concat([results, model_results], axis=0)

# results.to_csv(f"{target_var}\\preprocessing_results.csv", index=True)
results.to_csv(f"standardized\\{target_var}\\preprocessing_results_LightGBM.csv", index=True)
results