# Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# For data preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# For classification model
# 'lr', 'rf', 'lightgbm', 'gbc', 'xgboost'
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

# For deep learning model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.metrics import AUC

# For evaluation
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)
from sklearn.utils import resample


In [None]:
# Silence every warning for the rest of the session
warnings.simplefilter('ignore')
# Let pandas print all columns of a DataFrame instead of truncating them
pd.set_option('display.max_columns', None)

In [None]:
# Set the working directory.
# NOTE(review): this is an absolute, machine-specific Windows path. The
# relative paths used below (e.g. "derived_df.csv") resolve against it, so it
# must be adjusted before running on another machine.
os.chdir('C:\\Users\\h2408\\Downloads\\RA\\1_paper_LASI\\data')

# Data Preparation

In [None]:
# Load data (path is relative to the current working directory)
data = pd.read_csv("derived_df.csv")

# Candidate outcome columns in the dataset. Nothing is dropped here; the list
# is used further down to remove all of them from the feature matrix.
target_vars = ['bmi_underweight', 'bmi_overweight', 'waist_circumference']
######################################
# Outcome modelled in this run. It should be one of `target_vars` so that it
# is excluded from the features. It also names the output folder for results.
target_var = 'waist_circumference'
######################################
data.shape

In [None]:
# Columns to be stored as pandas categoricals; every other column is cast to float
category_col = [
    'education', 'state', 'region', 'religion',
    'MPCE', 'working_status', 'occupation', 'caste',
    'water', 'alcohol', 'activity1', 'benefit',
]

# Cast each column in place to its intended dtype
for column_name in data.columns:
    wanted_dtype = 'category' if column_name in category_col else 'float'
    data[column_name] = data[column_name].astype(wanted_dtype)

In [None]:
# Print column dtypes and non-null counts to check the dtype conversion
data.info()

In [None]:
# Keep only rows without any missing value; dropna() returns a new frame,
# so `data` itself is left untouched
used_data = data.dropna()
# Features: every column except the candidate outcomes; label: the chosen outcome
y = used_data[target_var]
X = used_data.drop(columns=target_vars)
X.shape, y.shape

In [None]:
# Subgroups used when reporting metrics: display name -> row selector.
# 'Overall' selects every row; all other entries are boolean masks over X,
# built before one-hot encoding while 'caste' and 'MPCE' are still single columns.
caste_levels = [
    ('Scheduled Caste', 'Scheduled caste'),
    ('Scheduled Tribe', 'Scheduled tribe'),
    ('General', 'General'),
    ('Other Backward Class', 'Other backward class'),
]
mpce_levels = [
    ('MPCE 1', 'Lowest'),
    ('MPCE 2', 'Lower middle'),
    ('MPCE 3', 'Middle'),
    ('MPCE 4', 'Upper middle'),
    ('MPCE 5', 'Highest'),
]

groups = {'Overall': slice(None)}
for display_name, level in caste_levels:
    groups[display_name] = X['caste'] == level
for display_name, level in mpce_levels:
    groups[display_name] = X['MPCE'] == level

In [None]:
# One dummy column per categorical variable is removed after encoding
# (presumably the reference level of each variable — confirm with the analysis plan)
dummy_col = [
    'education_No',
    'state_Chandigarh',
    'region_Central',
    'religion_Others',
    'MPCE_Middle',
    'working_status_Never worked',
    'occupation_Currently no work',
    'caste_General',
    'water_other',
    'alcohol_abstainer',
    'activity1_moderate',
    'benefit_non-applicable',
]

# One-hot encode the categorical columns, then drop the columns listed above
X = pd.get_dummies(X).drop(columns=dummy_col)
X.shape

In [None]:
X = X.astype('float32')
y = y.astype('float32')

# Split the data first. The scaler must only ever see the training fold;
# fitting it on the full dataset would leak test-set means and variances
# into the features the models are trained on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardization: learn mean/std on the training fold, apply to both folds.
# The results are wrapped back into DataFrames so that column names and the
# original row index (used later to look up subgroup masks) are preserved.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
# NOTE: `X` itself is left unscaled. Any later use of the full matrix (e.g.
# cross-validation) should scale inside each fold, for instance with a Pipeline.
X_train.shape, X_test.shape

# Functions

In [None]:
# Function: Evaluate the model
def calculate_metric(y_true, y_pred, y_pred_prob, metric_key):
    """Return one evaluation or fairness metric for binary predictions.

    Labels are expected to be coded 0 (negative) and 1 (positive); any label
    other than 1 is counted as negative. All count-based metrics are derived
    from a single pass over the confusion counts, so the function also works
    when a sample (e.g. a bootstrap replicate of a small subgroup) contains
    only one class. A ratio whose denominator is zero is reported as 0.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    y_pred_prob : array-like
        Predicted score/probability of the positive class. Only read when
        ``metric_key == 'auc'``.
    metric_key : str
        One of 'accuracy', 'precision', 'recall', 'f1', 'auc', 'sensitivity',
        'specificity', 'equal_opportunity', 'equalized_odds',
        'disparate_impact'.

    Returns
    -------
    float
        The requested metric. 'auc' is NaN when ``y_true`` does not contain
        both classes, because the ROC curve is undefined in that case.

    Raises
    ------
    KeyError
        If ``metric_key`` is not one of the supported names.
    """
    def safe_ratio(numerator, denominator):
        # An undefined ratio (empty denominator) is reported as 0
        return numerator / denominator if denominator > 0 else 0.0

    # Plain arrays avoid any pandas index alignment (bootstrap samples carry
    # duplicated index labels)
    actual_positive = np.asarray(y_true).ravel() == 1
    predicted_positive = np.asarray(y_pred).ravel() == 1

    # Confusion counts
    tp = int(np.count_nonzero(actual_positive & predicted_positive))
    fp = int(np.count_nonzero(~actual_positive & predicted_positive))
    fn = int(np.count_nonzero(actual_positive & ~predicted_positive))
    tn = int(np.count_nonzero(~actual_positive & ~predicted_positive))
    total = tp + fp + fn + tn

    # AUC is the only metric that needs the scores, and the only expensive
    # one, so it is computed on demand
    if metric_key == 'auc':
        if tp + fn == 0 or tn + fp == 0:
            return float('nan')
        return roc_auc_score(np.asarray(y_true).ravel(), np.asarray(y_pred_prob).ravel())

    ## Sensitivity and specificity
    tpr = safe_ratio(tp, tp + fn)    # True positive rate
    tnr = safe_ratio(tn, tn + fp)    # True negative rate
    ## Equal opportunity, equalized odds, disparate impact
    ### Equalized odds: whether the error rates conditional on the true label
    ### (0 or 1) are equal across groups; summarised here as tpr + tnr
    ### Demographic parity: the proportion of samples predicted positive
    ppr = safe_ratio(tp + fp, total)    # Predicted positive rate

    metrics = {
        'accuracy': safe_ratio(tp + tn, total),
        'precision': safe_ratio(tp, tp + fp),
        'recall': tpr,
        'f1': safe_ratio(2 * tp, 2 * tp + fp + fn),
        'sensitivity': tpr,
        'specificity': tnr,
        'equal_opportunity': tpr,
        'equalized_odds': tpr + tnr,
        'disparate_impact': ppr
    }

    return metrics[metric_key]

In [None]:
# Function: Calculate the confidence interval
def calculate_ci(y_true, y_pred, y_pred_prob, metric_keys, n_resamples=1000, ci=0.95, seed=42):
    """Bootstrap percentile confidence intervals for the requested metrics.

    Each of the ``n_resamples`` iterations draws ONE bootstrap replicate
    (sampling rows with replacement) and evaluates every metric on that same
    replicate. Metrics that are mathematically identical therefore receive
    identical intervals, and the resampling cost does not grow with the
    number of metrics.

    Parameters
    ----------
    y_true, y_pred, y_pred_prob : array-like
        True labels, predicted labels and predicted positive-class scores,
        all of the same length.
    metric_keys : iterable of str
        Metric names understood by ``calculate_metric``.
    n_resamples : int, default 1000
        Number of bootstrap replicates.
    ci : float, default 0.95
        Confidence level; the interval spans the ``(1 - ci) / 2`` and
        ``(1 + ci) / 2`` quantiles of the bootstrap distribution.
    seed : int, default 42
        Seed of the private random generator used for resampling.

    Returns
    -------
    dict
        ``{metric_key: (bootstrap_mean, lower_bound, upper_bound)}``.
    """
    # A private generator keeps the result reproducible without reseeding
    # NumPy's global RNG, which would affect unrelated code
    rng = np.random.RandomState(seed)

    # Bootstrap resampling
    resample_metrics = {metric_key: [] for metric_key in metric_keys}
    for _ in range(n_resamples):
        # Resample the data with replacement, once per iteration
        resampled_y_true, resampled_y_pred, resampled_y_pred_prob = resample(
            y_true, y_pred, y_pred_prob, random_state=rng
        )
        for metric_key in metric_keys:
            # Calculate the metric for the resampled data
            resampled_metric = calculate_metric(
                resampled_y_true, resampled_y_pred, resampled_y_pred_prob, metric_key
            )
            resample_metrics[metric_key].append(resampled_metric)

    # Calculate the confidence interval
    lower_percentile = (1 - ci) / 2 * 100
    upper_percentile = (1 + ci) / 2 * 100
    ci_metrics = {}
    for metric_key in metric_keys:
        resample_metric = resample_metrics[metric_key]
        # mean, lower_bound, upper_bound
        ci_metrics[metric_key] = (
            np.mean(resample_metric),
            np.percentile(resample_metric, lower_percentile),
            np.percentile(resample_metric, upper_percentile),
        )

    return ci_metrics

In [None]:
# Function: Train the model
def model_fitted(model_name, model, X_train, y_train, X_test, y_test, metric_keys, groups):
    """Fit ``model`` and report metrics with bootstrap CIs for each subgroup.

    Parameters
    ----------
    model_name : str
        Display name. The names 'DNN' and 'FCN' select the Keras-style code
        path (``fit`` with epochs/batch size, ``predict`` returning scores);
        any other name uses the scikit-learn style ``predict`` /
        ``predict_proba`` path.
    model : estimator
        Unfitted model; it is fitted in place.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels. ``X_test.index`` is used
        to look up subgroup membership.
    metric_keys : list of str
        Metric names understood by ``calculate_metric``.
    groups : dict
        Maps a group name to a row selector: either a ``slice`` applied
        positionally to the test rows (``slice(None)`` means all rows), or a
        boolean Series indexed like the full dataset.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty group. For every metric there are four columns:
        the point estimate, ``<metric>_cilow``, ``<metric>_ciup`` and a
        formatted ``<metric>_ci`` string "mean (low-high)".
    """
    # Train the model
    if model_name in ['DNN', 'FCN']:
        model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)
        y_pred_prob = model.predict(X_test).ravel()
        y_pred = (y_pred_prob > 0.5).astype(int)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_prob = model.predict_proba(X_test)[:, 1]

    # Work with positional arrays so labels, predictions and masks are all
    # indexed the same way
    y_test_values = np.asarray(y_test)

    # Calculate the metrics
    rows = {}
    for group_name, group_selector in groups.items():
        # Build a positional Boolean mask over the test rows. The selector is
        # recognised by its type, not by the group's display name.
        if isinstance(group_selector, slice):
            group_mask = np.zeros(len(X_test), dtype=bool)
            group_mask[group_selector] = True
        else:
            group_mask = np.asarray(group_selector.loc[X_test.index], dtype=bool)

        # An empty subgroup has no defined metrics; skip it rather than
        # failing after the model has already been trained
        if not group_mask.any():
            print(f"Skipping group '{group_name}': no samples in the test set.")
            continue

        y_group = y_test_values[group_mask]
        y_pred_group = y_pred[group_mask]
        y_pred_prob_group = y_pred_prob[group_mask]

        # Calculate the CI
        ci_metrics = calculate_ci(y_group, y_pred_group, y_pred_prob_group, metric_keys)
        row = {}
        for metric_key in metric_keys:
            # Calculate the metric
            metric_value = calculate_metric(y_group, y_pred_group, y_pred_prob_group, metric_key)
            ci_mean, ci_lower_bound, ci_upper_bound = ci_metrics[metric_key]
            # Store the results
            row[metric_key] = metric_value
            row[f'{metric_key}_cilow'] = ci_lower_bound
            row[f'{metric_key}_ciup'] = ci_upper_bound
            row[f'{metric_key}_ci'] = f'{ci_mean:.2f} ({ci_lower_bound:.2f}-{ci_upper_bound:.2f})'
        rows[group_name] = row

    # Assemble the frame once instead of growing it cell by cell
    return pd.DataFrame(list(rows.values()), index=list(rows))

In [None]:
# Function: Define the function to create the DNN model
def create_dnn_model(dim):
    """Build and compile the dropout-regularised network for binary classification.

    ``dim`` is the number of input features. The network stacks Dense layers
    of 128, 64 and 32 ReLU units, with batch normalisation after the first
    Dense layer, a dropout of 0.2 before each of the next three Dense layers,
    and a single sigmoid output unit.
    """
    layer_stack = [
        Dense(128, input_dim=dim, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),  # one sigmoid unit: binary classification
    ]
    network = Sequential(layer_stack)

    # Adam optimiser, binary cross-entropy loss, AUROC tracked as a metric
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[AUC(name='auroc')],
    )
    return network

In [None]:
# Function: Define the function to create the Fully Connected Network (FCN) model
def create_fcn_model(dim):
    """Build and compile the plain fully connected network for binary classification.

    ``dim`` is the number of input features. The network stacks Dense layers
    of 128, 64 and 32 ReLU units with batch normalisation after the first one,
    no dropout, and a single sigmoid output unit.
    """
    layer_stack = [
        Dense(128, input_dim=dim, activation='relu'),
        BatchNormalization(),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),  # one sigmoid unit: binary classification
    ]
    network = Sequential(layer_stack)

    # Adam optimiser, binary cross-entropy loss, AUROC tracked as a metric
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[AUC(name='auroc')],
    )
    return network

# Models

1. Logistic Regression
2. Random Forest
3. XGBoost
4. Gradient Boosting
5. LightGBM
6. DNN
7. FCN

In [None]:
# Metrics reported for every model and subgroup, in output-column order
metric_keys = [
    # Classification performance
    'accuracy',
    'precision',
    'recall',
    'f1',
    'auc',
    # Diagnostic accuracy
    'sensitivity',
    'specificity',
    # Fairness
    'equal_opportunity',
    'equalized_odds',
    'disparate_impact',
]

In [None]:
# Registry of the models to compare: display name -> unfitted estimator.
# Insertion order is the training order and the row order of the results.
n_input_features = X_train.shape[1]

models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Random Forest'] = RandomForestClassifier(random_state=42)
models['XGBoost'] = XGBClassifier(eval_metric='logloss', random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)
models['LightGBM'] = LGBMClassifier(random_state=42, force_row_wise=True, verbose=-1)
# The two neural networks need the input width to build their first layer
models['DNN'] = create_dnn_model(dim=n_input_features)
models['FCN'] = create_fcn_model(dim=n_input_features)

In [None]:
# Train and evaluate the models
# Create the output folder before training, so a missing directory cannot
# discard the results after every model has been fitted
output_dir = target_var
os.makedirs(output_dir, exist_ok=True)

per_model_results = []
for model_name, model in models.items():
    print(f"Training the {model_name} model...")
    model_results = model_fitted(model_name, model, X_train, y_train, X_test, y_test, metric_keys, groups)
    model_results['model'] = model_name
    per_model_results.append(model_results)

# Concatenate once at the end instead of re-copying the frame on every iteration
results = pd.concat(per_model_results) if per_model_results else pd.DataFrame()
# os.path.join uses the platform's separator instead of a hard-coded backslash
results.to_csv(os.path.join(output_dir, "results_standardized.csv"), index=True)
results

In [None]:
# # Use cross-validation to evaluate the models
# results = pd.DataFrame()
# for model_name, model in models.items():
#     cv_scores = model_cv(model_name, model, X, y, groups)
#     cv_scores['model'] = model_name
#     results = pd.concat([results, cv_scores])
# results = pd.DataFrame(results)
# results.to_csv(f"{target_var}\\cvresults.csv")
# results