# Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# For data preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# For classification model
# 'lr', 'rf', 'lightgbm', 'gbc', 'xgboost'
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

# For deep learning model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import AUC

# For evaluation
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)
from sklearn.utils import resample

In [2]:
# Set display options to show all columns
pd.set_option('display.max_columns', None)
# Ignore warnings
warnings.filterwarnings('ignore')

In [3]:
# set the working directory
os.chdir('C:\\Users\\h2408\\Downloads\\RA\\1_paper_LASI\\data')

# Data Preparation

In [4]:
# Load data
data = pd.read_csv("derived_df.csv")

# Drop the target variables of other papers
target_vars = ['bmi_underweight', 'bmi_overweight', 'waist_circumference']
######################################
target_var = 'waist_circumference'
######################################
data.shape

(64867, 80)

In [5]:
category_col = [
    'education',
    'state',
    'region',
    'religion',
    'MPCE',
    'working_status',
    'occupation',
    'caste',
    'water',
    'alcohol',
    'activity1',
    'benefit'
    ]

# Convert Type
for col in data.columns:
  if col in category_col:
    data[col] = data[col].astype('category')
  else:
    data[col] = data[col].astype('float')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64867 entries, 0 to 64866
Data columns (total 80 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   age                  64867 non-null  float64 
 1   gender               64867 non-null  float64 
 2   education            64866 non-null  category
 3   migration            64770 non-null  float64 
 4   state                64867 non-null  category
 5   region               64867 non-null  category
 6   residence            64867 non-null  float64 
 7   religion             64867 non-null  category
 8   martial_status       64865 non-null  float64 
 9   living_alone         64867 non-null  float64 
 10  MPCE                 64865 non-null  category
 11  working_status       64852 non-null  category
 12  occupation           64867 non-null  category
 13  pension_amount       64867 non-null  float64 
 14  retired              64774 non-null  float64 
 15  pension            

In [7]:
used_data = data.copy()
# Drop the missing values
used_data = used_data.dropna()
# Define X and y
X = used_data.drop(target_vars, axis=1)
y = used_data[target_var]
X.shape, y.shape

((55647, 77), (55647,))

# Functions

In [8]:
# Function: Evaluate the model
def calculate_metric(y_true, y_pred, y_pred_prob, metric_key):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    # Calculate the metrics
    ## Accuracy, precision, recall, f1, auc
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred_prob)
    ## Sensitivity and specificity
    sensitivity = recall_score(y_true, y_pred, pos_label=1)
    specificity = recall_score(y_true, y_pred, pos_label=0)
    ## Equal opportunity, equalized odds, disparate impact
    ### Equalized Odds: 根據真實標籤（0 或 1）計算錯誤率是否相等(tpr, fpr)
    ### Demographic Parity: 預測為陽性的比例
    tpr = tp / (tp + fn) if tp + fn > 0 else 0    # True positive rate
    tnr = tn / (tn + fp) if tn + fp > 0 else 0    # True negative rate
    equal_opportunity = tpr
    equalized_odds = tpr + tnr
    ppr = (tp + fp) / (tp + fp + tn + fn) if tp + fp + tn + fn > 0 else 0    # Predicted positive rate
    disparate_impact = ppr

    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'equal_opportunity': equal_opportunity,
        'equalized_odds': equalized_odds,
        'disparate_impact': disparate_impact
    }

    return metrics[metric_key]

In [9]:
# Function: Train the model
def model_fitted(model_name, model, X_train, y_train, X_test, y_test, metric_keys, groups):
    # Create a pipeline
    pipeline = Pipeline([
        # ('scaler', StandardScaler()),
        ('model', model)
    ])
    # Train the model
    if model_name in ['DNN', 'FCN']:
        pipeline['model'].fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)
        y_pred_prob = pipeline.predict(X_test).ravel()
        y_pred = (y_pred_prob > 0.5).astype(int)
    else:
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
    # Calculate the metrics
    results = pd.DataFrame()
    for group_name, group_slice in groups.items():
        # Create a Boolean mask for the group
        group_mask = (
            group_slice.loc[X_test.index]  # Use the group slice
            if group_name != 'Overall'
            else pd.Series(True, index=X_test.index)  
        )

        y_group = y_test[group_mask]
        y_pred_group = y_pred[group_mask]
        y_pred_prob_group = y_pred_prob[group_mask]
        
        for metric_key in metric_keys:
            # Calculate the metric
            metric_value = calculate_metric(y_group, y_pred_group, y_pred_prob_group, metric_key)
            # Store the results
            results.loc[group_name, metric_key] = metric_value
            
    return results

In [10]:
# Function: Define the function to create the DNN model
def create_dnn_model(dim):
    model = Sequential()
    model.add(Dense(128, input_dim=dim, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))  # For binary classification

    # Compile the model with AUROC as a metric
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auroc')])
    return model

In [11]:
# Function: Define the function to create the Fully Connected Network (FCN) model
def create_fcn_model(dim):
    model = Sequential()
    model.add(Dense(128, input_dim=dim, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # For binary classification

    # Compile the model with AUROC as a metric
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auroc')])
    return model

# Models

1. Logistic Regression
2. Random Forest
3. XGBoost
4. Gradient Boosting
5. LightGBM
6. DNN
7. FCN

In [12]:
metric_keys = [
    'accuracy', 'precision', 'recall', 'f1', 'auc',
    'sensitivity', 'specificity', 
    'equal_opportunity', 'equalized_odds', 'disparate_impact'
    ]

In [13]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42, force_row_wise=True, verbose=-1),
    'DNN': None,
    'FCN': None
}

In [14]:
groups = {
    'Overall': slice(None)
}

## Caste Subgroup

In [15]:
# Define different data splits
data_splits = {
    'Scheduled Caste': X['caste'] == 'Scheduled caste',
    'Scheduled Tribe': X['caste'] == 'Scheduled tribe',
    'General': X['caste'] == 'General',
    'Other Backward Class': X['caste'] == 'Other backward class',
}

In [16]:
# Use the data splits to train the models
results = pd.DataFrame()
for split_name, split_mask in data_splits.items():
    for model_name, model in models.items():
        print(f'Training {model_name} on {split_name}...')
        # Split the data
        sub_X = X[split_mask]
        sub_y = y[split_mask]
        # Category encoding
        sub_X = pd.get_dummies(sub_X, drop_first=True)
        print(sub_X.shape, sub_y.shape)
        # Convert to float32
        sub_X = sub_X.astype('float32')
        sub_y = sub_y.astype('float32')
        # Standardization
        scaler = StandardScaler()
        sub_X_sd = scaler.fit_transform(sub_X)
        sub_X = pd.DataFrame(sub_X_sd, columns=sub_X.columns, index=sub_X.index)
        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(sub_X, sub_y, test_size=0.2, random_state=42)
        # Train the model
        if model_name in ['DNN', 'FCN']:
            dim = X_train.shape[1]
            model = create_dnn_model(dim) if model_name == 'DNN' else create_fcn_model(dim)
        model_results = model_fitted(model_name, model, X_train, y_train, X_test, y_test, metric_keys, groups)
        model_results['Model'] = model_name
        model_results['Subgroup'] = split_name
        results = pd.concat([results, model_results], axis=0)

# Save the results to a CSV file
results = pd.DataFrame(results)
results.to_csv(f"standardized\\{target_var}\\results_subgroup_caste.csv")
results

Training Logistic Regression on Scheduled Caste...
(9817, 129) (9817,)
Training Random Forest on Scheduled Caste...
(9817, 129) (9817,)
Training XGBoost on Scheduled Caste...
(9817, 129) (9817,)
Training Gradient Boosting on Scheduled Caste...
(9817, 129) (9817,)
Training LightGBM on Scheduled Caste...
(9817, 129) (9817,)
Training DNN on Scheduled Caste...
(9817, 129) (9817,)
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Training FCN on Scheduled Caste...
(9817, 129) (9817,)
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Training Logistic Regression on Scheduled Tribe...
(9393, 129) (9393,)
Training Random Forest on Scheduled Tribe...
(9393, 129) (9393,)
Training XGBoost on Scheduled Tribe...
(9393, 129) (9393,)
Training Gradient Boosting on Scheduled Tribe...
(9393, 129) (9393,)
Training LightGBM on Scheduled Tribe...
(9393, 129) (9393,)
Training DNN on Scheduled Tribe...
(9393, 129) (9393,)
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[

Unnamed: 0,accuracy,precision,recall,f1,auc,sensitivity,specificity,equal_opportunity,equalized_odds,disparate_impact,Model,Subgroup
Overall,0.770876,0.701826,0.533128,0.605954,0.827184,0.533128,0.888213,0.533128,1.421341,0.251018,Logistic Regression,Scheduled Caste
Overall,0.762729,0.729323,0.448382,0.555344,0.817878,0.448382,0.917871,0.448382,1.366253,0.203157,Random Forest,Scheduled Caste
Overall,0.740326,0.634951,0.503852,0.561856,0.801399,0.503852,0.857034,0.503852,1.360886,0.26222,XGBoost,Scheduled Caste
Overall,0.778004,0.726115,0.526965,0.610714,0.836621,0.526965,0.901901,0.526965,1.428866,0.239817,Gradient Boosting,Scheduled Caste
Overall,0.769857,0.697395,0.53621,0.606272,0.822664,0.53621,0.885171,0.53621,1.421381,0.254073,LightGBM,Scheduled Caste
Overall,0.736762,0.61828,0.531587,0.571665,0.768307,0.531587,0.838023,0.531587,1.36961,0.284114,DNN,Scheduled Caste
Overall,0.715886,0.576728,0.526965,0.550725,0.753434,0.526965,0.809125,0.526965,1.33609,0.301935,FCN,Scheduled Caste
Overall,0.745609,0.70692,0.66539,0.685526,0.816724,0.66539,0.80292,0.66539,1.468309,0.39223,Logistic Regression,Scheduled Tribe
Overall,0.730176,0.706587,0.60281,0.650586,0.8004,0.60281,0.821168,0.60281,1.423978,0.355508,Random Forest,Scheduled Tribe
Overall,0.729111,0.687671,0.641124,0.663582,0.791105,0.641124,0.791971,0.641124,1.433095,0.388505,XGBoost,Scheduled Tribe


## MPCE Subgroup

In [17]:
# Define different data splits
data_splits = {
    'MPCE 1': used_data['MPCE'] == 'Lowest',
    'MPCE 2': used_data['MPCE'] == 'Lower middle',
    'MPCE 3': used_data['MPCE'] == 'Middle',
    'MPCE 4': used_data['MPCE'] == 'Upper middle',
    'MPCE 5': used_data['MPCE'] == 'Highest',
}

In [18]:
# Use the data splits to train the models
results = pd.DataFrame()
for split_name, split_mask in data_splits.items():
    for model_name, model in models.items():
        print(f'Training {model_name} on {split_name}...')
        # Split the data
        sub_X = X[split_mask]
        sub_y = y[split_mask]
        # Category encoding
        sub_X = pd.get_dummies(sub_X, drop_first=True)
        print(sub_X.shape, sub_y.shape)
        # Convert to float32
        sub_X = sub_X.astype('float32')
        sub_y = sub_y.astype('float32')
        # Standardization
        scaler = StandardScaler()
        sub_X_sd = scaler.fit_transform(sub_X)
        sub_X = pd.DataFrame(sub_X_sd, columns=sub_X.columns, index=sub_X.index)
        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(sub_X, sub_y, test_size=0.2, random_state=42)
        # Train the model
        if model_name in ['DNN', 'FCN']:
            dim = X_train.shape[1]
            model = create_dnn_model(dim) if model_name == 'DNN' else create_fcn_model(dim)
        # Train the model
        model_results = model_fitted(model_name, model, X_train, y_train, X_test, y_test, metric_keys, groups)
        model_results['Model'] = model_name
        model_results['Subgroup'] = split_name
        results = pd.concat([results, model_results], axis=0)

# Save the results to a CSV file
results = pd.DataFrame(results)
results.to_csv(f"standardized\\{target_var}\\results_subgroup_MPCE.csv")
results

Training Logistic Regression on MPCE 1...
(11285, 129) (11285,)
Training Random Forest on MPCE 1...
(11285, 129) (11285,)
Training XGBoost on MPCE 1...
(11285, 129) (11285,)
Training Gradient Boosting on MPCE 1...
(11285, 129) (11285,)
Training LightGBM on MPCE 1...
(11285, 129) (11285,)
Training DNN on MPCE 1...
(11285, 129) (11285,)
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Training FCN on MPCE 1...
(11285, 129) (11285,)
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Training Logistic Regression on MPCE 2...
(11313, 129) (11313,)
Training Random Forest on MPCE 2...
(11313, 129) (11313,)
Training XGBoost on MPCE 2...
(11313, 129) (11313,)
Training Gradient Boosting on MPCE 2...
(11313, 129) (11313,)
Training LightGBM on MPCE 2...
(11313, 129) (11313,)
Training DNN on MPCE 2...
(11313, 129) (11313,)
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Training FCN on MPCE 2...
(11313, 129) (11313,)
[1m71/71[

Unnamed: 0,accuracy,precision,recall,f1,auc,sensitivity,specificity,equal_opportunity,equalized_odds,disparate_impact,Model,Subgroup
Overall,0.789544,0.674952,0.536474,0.597798,0.832553,0.536474,0.893684,0.536474,1.430158,0.231724,Logistic Regression,MPCE 1
Overall,0.76872,0.661137,0.424012,0.516667,0.816576,0.424012,0.910569,0.424012,1.334581,0.186974,Random Forest,MPCE 1
Overall,0.76296,0.614953,0.5,0.551551,0.797242,0.5,0.871169,0.5,1.371169,0.23704,XGBoost,MPCE 1
Overall,0.782455,0.689342,0.462006,0.55323,0.828296,0.462006,0.914321,0.462006,1.376328,0.195392,Gradient Boosting,MPCE 1
Overall,0.776252,0.654545,0.492401,0.562012,0.820882,0.492401,0.893058,0.492401,1.385459,0.219318,LightGBM,MPCE 1
Overall,0.762074,0.604853,0.530395,0.565182,0.77623,0.530395,0.857411,0.530395,1.387806,0.255649,DNN,MPCE 1
Overall,0.71821,0.516871,0.512158,0.514504,0.74889,0.512158,0.803002,0.512158,1.31516,0.288879,FCN,MPCE 1
Overall,0.752541,0.707169,0.645949,0.675174,0.812849,0.645949,0.823054,0.645949,1.469003,0.363677,Logistic Regression,MPCE 2
Overall,0.737517,0.711724,0.572697,0.634686,0.79466,0.572697,0.846549,0.572697,1.419246,0.320371,Random Forest,MPCE 2
Overall,0.722492,0.66426,0.612653,0.637413,0.790058,0.612653,0.795154,0.612653,1.407807,0.367212,XGBoost,MPCE 2
