# Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# For data preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# For classification model
# 'lr', 'rf', 'lightgbm', 'gbc', 'xgboost'
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

# For deep learning model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import AUC

# For evaluation
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)
from sklearn.utils import resample

In [2]:
# Set display options to show all columns
pd.set_option('display.max_columns', None)
# Ignore warnings
warnings.filterwarnings('ignore')
# Seed the Python, NumPy and TensorFlow random generators so that the Keras
# models (weight initialisation, dropout, batch shuffling) start from a fixed
# state on every Restart & Run All. The scikit-learn / XGBoost / LightGBM
# estimators below already receive an explicit random_state.
tf.keras.utils.set_random_seed(42)

In [3]:
# Set the working directory: the input CSV is read from it and the result
# files are written below it (both use relative paths).
# The default is the original machine-specific folder; set the LASI_DATA_DIR
# environment variable to run the notebook on another machine without editing it.
DATA_DIR = os.environ.get('LASI_DATA_DIR', 'C:\\Users\\h2408\\Downloads\\RA\\1_paper_LASI\\data')
os.chdir(DATA_DIR)

# Data preparation

In [4]:
# Feature groups; they are combined later into the D / DS / DSH predictor sets.
# NOTE: the spellings below (e.g. 'martial_status', 'mlaria', 'chickungunya')
# are column names and presumably mirror the CSV header - do not "correct" them here.
demographic = [
    'age', 'gender', 'education', 'migration', 'state',
    'region', 'residence', 'religion', 'martial_status', 'living_alone',
]
social = [
    'MPCE', 'working_status', 'occupation', 'pension_amount', 'retired',
    'pension', 'caste', 'public_HI', 'private_HI',
]
health = [
    'hypertension', 'diabetes', 'cancer', 'chr_lung', 'chr_heart', 'stroke',
    'bone_joint', 'psychiatric', 'hi_chole', 'chr_renal', 'incontinence',
    'kidney', 'BPH', 'influenza', 'pneumococcal', 'hepa_B', 'typhoid', 'dT',
    'pain', 'sleep', 'mlaria', 'dengue', 'chickungunya', 'tuberculosis',
    'UTI', 'cog_score', 'depression', 'selfrated',
]
############################################
# Outcome column to model; one of:
# 'bmi_underweight', 'bmi_overweight', 'waist_circumference'
target_var = 'waist_circumference'
############################################
# Load data
data = pd.read_csv("derived_df.csv")
data.shape

(64867, 80)

In [5]:
# Columns stored as pandas categoricals; every other column is cast to float
category_col = [
    'education', 'state', 'region', 'religion',
    'MPCE', 'working_status', 'occupation', 'caste',
    'water', 'alcohol', 'activity1', 'benefit',
]

# Convert Type (column by column, in place on `data`)
for col in data.columns:
    data[col] = data[col].astype('category' if col in category_col else 'float')

In [6]:
# Print each column's dtype and non-null count (checks the conversion above and shows where values are missing)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64867 entries, 0 to 64866
Data columns (total 80 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  64867 non-null  float64 
 1   gender               64867 non-null  float64 
 2   education            64866 non-null  category
 3   migration            64770 non-null  float64 
 4   state                64867 non-null  category
 5   region               64867 non-null  category
 6   residence            64867 non-null  float64 
 7   religion             64867 non-null  category
 8   martial_status       64865 non-null  float64 
 9   living_alone         64867 non-null  float64 
 10  MPCE                 64865 non-null  category
 11  working_status       64852 non-null  category
 12  occupation           64867 non-null  category
 13  pension_amount       64867 non-null  float64 
 14  retired              64774 non-null  float64 
 15  pension            

In [7]:
# Complete-case analysis: drop every row that has a missing value in any column
# (dropna returns a new frame, so `data` itself is left untouched)
used_data = data.dropna()
# Define X and y
y = used_data[target_var]
X = used_data.drop(columns=[target_var])
X.shape, y.shape

((55647, 79), (55647,))

In [8]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape

((44517, 79), (11130, 79))

# Functions

In [None]:
# Function: Evaluate the model
def calculate_metric(y_true, y_pred, y_pred_prob, metric_key):
    """Return one performance or fairness metric for binary predictions.

    Every supported metric is computed on each call; the one named by
    ``metric_key`` is returned.

    Parameters
    ----------
    y_true : array-like
        True binary labels (0 / 1).
    y_pred : array-like
        Predicted binary labels (0 / 1).
    y_pred_prob : array-like
        Predicted score for the positive class; only used for the AUC.
    metric_key : str
        One of 'accuracy', 'precision', 'recall', 'f1', 'auc', 'sensitivity',
        'specificity', 'equal_opportunity', 'equalized_odds_tpr',
        'equalized_odds_fpr', 'disparate_impact'.

    Returns
    -------
    float
        The requested metric. 'auc' is NaN when ``y_true`` contains a single
        class, because the ROC AUC is undefined in that case.

    Raises
    ------
    KeyError
        If ``metric_key`` is not one of the supported names.

    Notes
    -----
    The fairness entries are per-group rates, not between-group gaps:
    'equal_opportunity' and 'equalized_odds_tpr' are the true positive rate,
    'equalized_odds_fpr' is the false positive rate and 'disparate_impact' is
    the share of samples predicted positive. Comparing them across groups is
    left to the caller.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # subgroup in which only one class occurs gives a 1x1 matrix and the
    # four-way unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # Calculate the metrics
    ## Accuracy, precision, recall, f1, auc
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # roc_auc_score raises ValueError when only one class is present in
    # y_true; report NaN instead so the remaining metrics stay available.
    if np.unique(y_true).size > 1:
        auc = roc_auc_score(y_true, y_pred_prob)
    else:
        auc = float('nan')
    ## Sensitivity and specificity
    sensitivity = recall_score(y_true, y_pred, pos_label=1)
    specificity = recall_score(y_true, y_pred, pos_label=0)
    ## Equal opportunity, equalized odds, disparate impact
    ### Equalized odds: whether the error rates (TPR, FPR) are equal given the true label (0 or 1)
    ### Demographic parity: proportion of samples predicted positive
    tpr = tp / (tp + fn) if tp + fn > 0 else 0    # True positive rate
    fpr = fp / (fp + tn) if fp + tn > 0 else 0    # False positive rate
    equal_opportunity = tpr
    equalized_odds_tpr = tpr
    equalized_odds_fpr = fpr
    ppr = (tp + fp) / (tp + fp + tn + fn) if tp + fp + tn + fn > 0 else 0    # Predicted positive rate
    disparate_impact = ppr

    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'equal_opportunity': equal_opportunity,
        'equalized_odds_tpr': equalized_odds_tpr,
        'equalized_odds_fpr': equalized_odds_fpr,
        'disparate_impact': disparate_impact
    }

    return metrics[metric_key]

In [10]:
# Function: Train the model
def model_fitted(model_name, model, X_train, y_train, X_test, y_test, metric_keys, groups,
                 epochs=100, batch_size=32):
    """Fit one model and evaluate it overall and per subgroup on the test set.

    Parameters
    ----------
    model_name : str
        Display name of the model. The names 'DNN' and 'FCN' select the Keras
        code path; any other name is treated as a scikit-learn style
        classifier exposing ``predict`` and ``predict_proba``.
    model : estimator or Keras model
        The unfitted model; it is fitted in place.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels. ``X_test`` must be a DataFrame
        whose index labels exist in the group masks, and ``y_test`` must be in
        the same row order as ``X_test``.
    metric_keys : list of str
        Metric names understood by ``calculate_metric``.
    groups : dict
        Maps a group name to a boolean Series (indexed like the full feature
        frame) marking group membership. The entry named 'Overall' is special:
        its value is ignored and all test rows are used.
    epochs, batch_size : int, optional
        Training settings for the Keras models (defaults keep the previous
        hard-coded values of 100 and 32).

    Returns
    -------
    pandas.DataFrame
        One row per group (index = group name), one column per metric key.
        A group with no test rows gets NaN in every column.
    """
    # Train the model
    if model_name in ['DNN', 'FCN']:
        # Keras models are used directly: wrapping them in a Pipeline and
        # calling Pipeline.predict without ever fitting the Pipeline itself
        # relies on the wrapper not checking its fitted state.
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
        y_pred_prob = np.asarray(model.predict(X_test, verbose=0)).ravel()
        y_pred = (y_pred_prob > 0.5).astype(int)
    else:
        # Single-step pipeline (features are already standardised by the caller)
        pipeline = Pipeline([
            ('model', model)
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    # Work on plain arrays so that every selection below is positional
    y_true = np.asarray(y_test)

    # Calculate the metrics
    rows = []
    for group_name, group_slice in groups.items():
        # Boolean mask over the test rows for this group
        if group_name == 'Overall':
            group_mask = np.ones(len(X_test), dtype=bool)
        else:
            group_mask = group_slice.loc[X_test.index].to_numpy(dtype=bool)

        if not group_mask.any():
            # No test rows in this group: the metrics are undefined
            rows.append({metric_key: np.nan for metric_key in metric_keys})
            continue

        y_group = y_true[group_mask]
        y_pred_group = y_pred[group_mask]
        y_pred_prob_group = y_pred_prob[group_mask]
        rows.append({
            metric_key: calculate_metric(y_group, y_pred_group, y_pred_prob_group, metric_key)
            for metric_key in metric_keys
        })

    # Build the frame once instead of enlarging it one cell at a time
    results = pd.DataFrame(rows, index=list(groups.keys()), columns=list(metric_keys))

    return results

In [11]:
# Function: Define the function to create the DNN model
def create_dnn_model(dim):
    """Build and compile a feed-forward binary classifier with dropout.

    Architecture: Dense(128) -> Dropout(0.2) -> Dense(64) -> Dropout(0.2)
    -> Dense(32) -> Dropout(0.2) -> Dense(1, sigmoid); hidden layers use ReLU.

    Parameters
    ----------
    dim : int
        Number of input features.

    Returns
    -------
    Sequential
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        an AUC metric named 'auroc'.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object; passing
        # `input_dim` to the first Dense layer is the deprecated way to do this.
        tf.keras.Input(shape=(dim,)),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),  # For binary classification
    ])

    # Compile the model with AUROC as a metric
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auroc')])
    return model

In [12]:
# Function: Define the function to create the Fully Connected Network (FCN) model
def create_fcn_model(dim):
    """Build and compile a fully connected binary classifier without dropout.

    Architecture: Dense(128) -> Dense(64) -> Dense(32) -> Dense(1, sigmoid);
    hidden layers use ReLU.

    Parameters
    ----------
    dim : int
        Number of input features.

    Returns
    -------
    Sequential
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        an AUC metric named 'auroc'.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object; passing
        # `input_dim` to the first Dense layer is the deprecated way to do this.
        tf.keras.Input(shape=(dim,)),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),  # For binary classification
    ])

    # Compile the model with AUROC as a metric
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auroc')])
    return model

# Model

In [None]:
# Metrics reported for every model and subgroup, in output-column order
metric_keys = [
    # overall predictive performance
    'accuracy',
    'precision',
    'recall',
    'f1',
    'auc',
    # class-wise recall
    'sensitivity',
    'specificity',
    # per-group rates used for the fairness comparison
    'equal_opportunity',
    'equalized_odds_tpr',
    'equalized_odds_fpr',
    'disparate_impact',
]

In [14]:
# Subgroups to evaluate: name -> boolean membership mask over the rows of X.
# 'Overall' is a placeholder entry; model_fitted recognises it by name and uses all test rows.
groups = {
    'Overall': slice(None),
    # Caste groups
    **{
        name: X['caste'] == level
        for name, level in (
            ('Scheduled Caste', 'Scheduled caste'),
            ('Scheduled Tribe', 'Scheduled tribe'),
            ('General', 'General'),
            ('Other Backward Class', 'Other backward class'),
        )
    },
    # MPCE levels, numbered 1 (Lowest) to 5 (Highest)
    **{
        f'MPCE {rank}': X['MPCE'] == level
        for rank, level in enumerate(
            ('Lowest', 'Lower middle', 'Middle', 'Upper middle', 'Highest'),
            start=1,
        )
    },
}

## Demographic data

In [15]:
# D
used_group = 'D'
used_columns = demographic
# Categorical variables: one-hot encode (drop_first=True drops the first level
# of each categorical column) and convert everything to float32
X_train_d = pd.get_dummies(X_train[used_columns], drop_first=True).astype('float32')
X_test_d = pd.get_dummies(X_test[used_columns], drop_first=True).astype('float32')
# Standardization: the scaler is fitted on the training split only and then
# applied unchanged to the test split
scaler = StandardScaler()
X_train_d_sd = scaler.fit_transform(X_train_d)
X_test_d_sd = scaler.transform(X_test_d)
# Put the scaled arrays back into DataFrames so column names and the original
# row index are kept (the group masks are looked up through X_test.index)
X_train_d = pd.DataFrame(X_train_d_sd, index=X_train_d.index, columns=X_train_d.columns)
X_test_d = pd.DataFrame(X_test_d_sd, index=X_test_d.index, columns=X_test_d.columns)
X_train_d.shape, X_test_d.shape

((44517, 50), (11130, 50))

In [16]:
# Candidate classifiers for the D feature set (name -> unfitted model)
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42, force_row_wise=True, verbose=-1),
}
# The two Keras networks take the number of encoded feature columns as input size
models['DNN'] = create_dnn_model(dim=X_train_d.shape[1])
models['FCN'] = create_fcn_model(dim=X_train_d.shape[1])

In [17]:
# Train and evaluate the models
model_frames = []
for model_name, model in models.items():
    print(f"Training the {model_name} model...")
    model_results = model_fitted(model_name, model, X_train_d, y_train, X_test_d, y_test, metric_keys, groups)
    model_results['model'] = model_name
    model_frames.append(model_results)
# Concatenate once instead of re-copying the growing frame on every iteration
results = pd.concat(model_frames) if model_frames else pd.DataFrame()
# Build the output path with os.path.join so it does not depend on the Windows
# "\\" separator, and create the folder first so to_csv cannot fail on a
# missing directory.
output_dir = os.path.join("standardized", target_var)
os.makedirs(output_dir, exist_ok=True)
results.to_csv(os.path.join(output_dir, f"results_{used_group}.csv"))
results

Training the Logistic Regression model...
Training the Random Forest model...
Training the XGBoost model...
Training the Gradient Boosting model...
Training the LightGBM model...
Training the DNN model...
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Training the FCN model...
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 864us/step


Unnamed: 0,accuracy,precision,recall,f1,auc,sensitivity,specificity,equal_opportunity,equalized_odds,disparate_impact,model
Overall,0.723810,0.711829,0.671225,0.690931,0.801268,0.671225,0.768591,0.671225,1.439816,0.433693,Logistic Regression
Scheduled Caste,0.747368,0.599309,0.560582,0.579299,0.795457,0.560582,0.831395,0.560582,1.391977,0.290226,Logistic Regression
Scheduled Tribe,0.723393,0.701117,0.627500,0.662269,0.795664,0.627500,0.796384,0.627500,1.423884,0.386818,Logistic Regression
General,0.721790,0.771023,0.748896,0.759798,0.798534,0.748896,0.683176,0.748896,1.432072,0.570687,Logistic Regression
Other Backward Class,0.714286,0.694131,0.651483,0.672131,0.785797,0.651483,0.765571,0.651483,1.417054,0.421905,Logistic Regression
...,...,...,...,...,...,...,...,...,...,...,...
MPCE 1,0.725615,0.553977,0.575221,0.564399,0.766699,0.575221,0.792876,0.575221,1.368097,0.320875,FCN
MPCE 2,0.697199,0.603753,0.629459,0.616338,0.752697,0.629459,0.739855,0.629459,1.369314,0.402846,FCN
MPCE 3,0.680458,0.640758,0.660802,0.650626,0.750016,0.660802,0.696557,0.660802,1.357359,0.464349,FCN
MPCE 4,0.705650,0.745000,0.709524,0.726829,0.754135,0.709524,0.700880,0.709524,1.410404,0.525624,FCN


## Demographic data + Socioeconomic data

In [18]:
# DS
used_group = 'DS'
used_columns = demographic + social
# Categorical variables: one-hot encode (drop_first=True drops the first level
# of each categorical column) and convert everything to float32
X_train_ds = pd.get_dummies(X_train[used_columns], drop_first=True).astype('float32')
X_test_ds = pd.get_dummies(X_test[used_columns], drop_first=True).astype('float32')
# Standardization: the scaler is fitted on the training split only and then
# applied unchanged to the test split
scaler = StandardScaler()
X_train_ds_sd = scaler.fit_transform(X_train_ds)
X_test_ds_sd = scaler.transform(X_test_ds)
# Put the scaled arrays back into DataFrames so column names and the original
# row index are kept (the group masks are looked up through X_test.index)
X_train_ds = pd.DataFrame(X_train_ds_sd, index=X_train_ds.index, columns=X_train_ds.columns)
X_test_ds = pd.DataFrame(X_test_ds_sd, index=X_test_ds.index, columns=X_test_ds.columns)
X_train_ds.shape, X_test_ds.shape

((44517, 67), (11130, 67))

In [19]:
# Candidate classifiers for the DS feature set (name -> unfitted model)
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42, force_row_wise=True, verbose=-1),
}
# The two Keras networks take the number of encoded feature columns as input size
models['DNN'] = create_dnn_model(dim=X_train_ds.shape[1])
models['FCN'] = create_fcn_model(dim=X_train_ds.shape[1])

In [20]:
# Train and evaluate the models
model_frames = []
for model_name, model in models.items():
    print(f"Training the {model_name} model...")
    model_results = model_fitted(model_name, model, X_train_ds, y_train, X_test_ds, y_test, metric_keys, groups)
    model_results['model'] = model_name
    model_frames.append(model_results)
# Concatenate once instead of re-copying the growing frame on every iteration
results = pd.concat(model_frames) if model_frames else pd.DataFrame()
# Build the output path with os.path.join so it does not depend on the Windows
# "\\" separator, and create the folder first so to_csv cannot fail on a
# missing directory.
output_dir = os.path.join("standardized", target_var)
os.makedirs(output_dir, exist_ok=True)
results.to_csv(os.path.join(output_dir, f"results_{used_group}.csv"))
results

Training the Logistic Regression model...
Training the Random Forest model...
Training the XGBoost model...
Training the Gradient Boosting model...
Training the LightGBM model...
Training the DNN model...
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 912us/step
Training the FCN model...
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 874us/step


Unnamed: 0,accuracy,precision,recall,f1,auc,sensitivity,specificity,equal_opportunity,equalized_odds,disparate_impact,model
Overall,0.734501,0.723461,0.684313,0.703343,0.810628,0.684313,0.777242,0.684313,1.461555,0.435040,Logistic Regression
Scheduled Caste,0.763910,0.668950,0.473344,0.554399,0.806230,0.473344,0.894622,0.473344,1.367966,0.219549,Logistic Regression
Scheduled Tribe,0.733657,0.732929,0.603750,0.662097,0.797733,0.603750,0.832540,0.603750,1.436290,0.356024,Logistic Regression
General,0.738327,0.763503,0.803532,0.783006,0.807315,0.803532,0.645440,0.803532,1.448972,0.618353,Logistic Regression
Other Backward Class,0.718095,0.691513,0.673199,0.682233,0.794822,0.673199,0.754758,0.673199,1.427957,0.437619,Logistic Regression
...,...,...,...,...,...,...,...,...,...,...,...
MPCE 1,0.705561,0.523881,0.517699,0.520772,0.725932,0.517699,0.789578,0.517699,1.307277,0.305378,FCN
MPCE 2,0.663851,0.563986,0.573072,0.568493,0.702157,0.573072,0.721014,0.573072,1.294087,0.392619,FCN
MPCE 3,0.666373,0.626070,0.643206,0.634523,0.718655,0.643206,0.685348,0.643206,1.328555,0.462588,FCN
MPCE 4,0.667105,0.708333,0.674603,0.691057,0.727727,0.674603,0.657869,0.674603,1.332472,0.525624,FCN


## Demographic data + Socioeconomic data + Health data

In [21]:
# DSH
used_group = 'DSH'
used_columns = demographic + social + health
# Categorical variables: one-hot encode (drop_first=True drops the first level
# of each categorical column) and convert everything to float32
X_train_dsh = pd.get_dummies(X_train[used_columns], drop_first=True).astype('float32')
X_test_dsh = pd.get_dummies(X_test[used_columns], drop_first=True).astype('float32')
# Standardization: the scaler is fitted on the training split only and then
# applied unchanged to the test split
scaler = StandardScaler()
X_train_dsh_sd = scaler.fit_transform(X_train_dsh)
X_test_dsh_sd = scaler.transform(X_test_dsh)
# Put the scaled arrays back into DataFrames so column names and the original
# row index are kept (the group masks are looked up through X_test.index)
X_train_dsh = pd.DataFrame(X_train_dsh_sd, index=X_train_dsh.index, columns=X_train_dsh.columns)
X_test_dsh = pd.DataFrame(X_test_dsh_sd, index=X_test_dsh.index, columns=X_test_dsh.columns)
X_train_dsh.shape, X_test_dsh.shape

((44517, 95), (11130, 95))

In [22]:
# Candidate classifiers for the DSH feature set (name -> unfitted model)
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42, force_row_wise=True, verbose=-1),
}
# The two Keras networks take the number of encoded feature columns as input size
models['DNN'] = create_dnn_model(dim=X_train_dsh.shape[1])
models['FCN'] = create_fcn_model(dim=X_train_dsh.shape[1])

In [23]:
# Train and evaluate the models
model_frames = []
for model_name, model in models.items():
    print(f"Training the {model_name} model...")
    model_results = model_fitted(model_name, model, X_train_dsh, y_train, X_test_dsh, y_test, metric_keys, groups)
    model_results['model'] = model_name
    model_frames.append(model_results)
# Concatenate once instead of re-copying the growing frame on every iteration
results = pd.concat(model_frames) if model_frames else pd.DataFrame()
# Build the output path with os.path.join so it does not depend on the Windows
# "\\" separator, and create the folder first so to_csv cannot fail on a
# missing directory.
output_dir = os.path.join("standardized", target_var)
os.makedirs(output_dir, exist_ok=True)
results.to_csv(os.path.join(output_dir, f"results_{used_group}.csv"))
results

Training the Logistic Regression model...
Training the Random Forest model...
Training the XGBoost model...
Training the Gradient Boosting model...
Training the LightGBM model...
Training the DNN model...
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 961us/step
Training the FCN model...
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 961us/step


Unnamed: 0,accuracy,precision,recall,f1,auc,sensitivity,specificity,equal_opportunity,equalized_odds,disparate_impact,model
Overall,0.748697,0.738791,0.701700,0.719768,0.826143,0.701700,0.788721,0.701700,1.490420,0.436837,Logistic Regression
Scheduled Caste,0.766416,0.665227,0.497577,0.569316,0.810806,0.497577,0.887355,0.497577,1.384931,0.232080,Logistic Regression
Scheduled Tribe,0.742301,0.742129,0.618750,0.674847,0.811230,0.618750,0.836346,0.618750,1.455096,0.360346,Logistic Regression
General,0.747406,0.771129,0.810706,0.790422,0.823006,0.810706,0.657233,0.810706,1.467939,0.617704,Logistic Regression
Other Backward Class,0.744048,0.722496,0.699153,0.710633,0.818420,0.699153,0.780709,0.699153,1.479862,0.435000,Logistic Regression
...,...,...,...,...,...,...,...,...,...,...,...
MPCE 1,0.699180,0.513433,0.507375,0.510386,0.696618,0.507375,0.784960,0.507375,1.292335,0.305378,FCN
MPCE 2,0.661627,0.562500,0.559264,0.560877,0.698847,0.559264,0.726087,0.559264,1.285350,0.384171,FCN
MPCE 3,0.670335,0.624319,0.672532,0.647529,0.726646,0.672532,0.668535,0.672532,1.341067,0.485035,FCN
MPCE 4,0.664477,0.705150,0.673810,0.689123,0.723257,0.673810,0.652981,0.673810,1.326791,0.527376,FCN
