In [1]:
import numpy as np
import pandas as pd
from typing import List
import os
import pickle
from sklearn import metrics
from dataclasses import dataclass
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

In [2]:
CHARLS_PATH = '../Doc/external-validation/CHARLS'

In [3]:
def keys_with_substring(df: pd.DataFrame, pre: str) -> List[str]:
    cols = df.columns
    return list(filter(lambda x : x.find(pre) != -1, cols))

## 0. Read the raw data

In [4]:
# demographic data
demographic_2011 = pd.read_csv(CHARLS_PATH + "/2011/CHARLS2011r/demographic_background.csv", index_col=0)
demographic_2013 = pd.read_csv(CHARLS_PATH + "/2013/CHARLS2013r/demographic_background.csv", index_col=0)
demographic_2015 = pd.read_csv(CHARLS_PATH + "/2015/CHARLS2015r/demographic_background.csv", index_col=0)
demographic_2018 = pd.read_csv(CHARLS_PATH + "/2018/CHARLS2018r/demographic_background.csv", index_col=0)
hm_dataset = pd.read_stata(CHARLS_PATH + "/harmonized/data.dta")

# baseline data
biomarker_2011 = pd.read_stata(CHARLS_PATH + "/2011/CHARLS2011r/biomarker.dta")
biomarker_2013 = pd.read_stata(CHARLS_PATH + "/2013/CHARLS2013r/Biomarker.dta")
biomarker_2015 = pd.read_stata(CHARLS_PATH + "/2015/CHARLS2015r/Biomarker.dta")
# biomarker_2018 = pd.read_stata(CHARLS_PATH + "/2011/CHARLS2011r/biomarker.dta")
# diagnosis
diag_2011 = pd.read_stata(CHARLS_PATH + "/2011/CHARLS2011r/health_status_and_functioning.dta")
diag_2013 = pd.read_stata(CHARLS_PATH + "/2013/CHARLS2013r/health_status_and_functioning.dta")
diag_2015 = pd.read_stata(CHARLS_PATH + "/2015/CHARLS2015r/health_status_and_functioning.dta")
diag_2018 = pd.read_stata(CHARLS_PATH + "/2018/CHARLS2018r/health_status_and_functioning.dta")


In [5]:
demographic_2011

Unnamed: 0,ID,householdID,communityID,ba001,ba002_1,ba002_2,ba002_3,ba003,ba004,ba005,...,bf005_2,bf006_1,bf006_2,bf007_1,bf007_2,bf008,bf009,bf010,proxy,rgender
0,1010410101,10104101,101041,6 Snake,1965.0,5.0,19.0,2 Lunar calendar,,2 No,...,,,,,,1 Never,,,0 No,2 Female
1,1010410102,10104101,101041,4 Rabbit,1963.0,5.0,12.0,1 Solar calendar,,2 No,...,,,,,,1 Never,,,0 No,1 Male
2,1010410201,10104102,101041,7 Horse,1954.0,12.0,23.0,2 Lunar calendar,,2 No,...,,,,,,1 Never,,,0 No,2 Female
3,1010410202,10104102,101041,4 Rabbit,1951.0,9.0,9.0,2 Lunar calendar,,2 No,...,,,,,,1 Never,,,0 No,1 Male
4,1010410301,10104103,101041,4 Rabbit,1963.0,11.0,28.0,2 Lunar calendar,,2 No,...,,,,,,1 Never,,,0 No,1 Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17700,34776331001,347763310,3477633,5 Dragon,1940.0,7.0,12.0,1 Solar calendar,,2 No,...,,,,,,1 Never,,,0 No,1 Male
17701,34776331002,347763310,3477633,12 Pig,1950.0,8.0,15.0,1 Solar calendar,,2 No,...,,,,,,1 Never,,,0 No,2 Female
17702,34776331101,347763311,3477633,3 Tiger,1962.0,11.0,25.0,2 Lunar calendar,,2 No,...,,,,,,1 Never,,,0 No,2 Female
17703,34776331201,347763312,3477633,4 Rabbit,1963.0,4.0,28.0,1 Solar calendar,,2 No,...,,,,,,1 Never,,,0 No,2 Female


In [7]:
biomarker_2011.shape[0]

11847

In [6]:
demographic_2011 = pd.read_csv(CHARLS_PATH + "/2011/CHARLS2011r/demographic_background.csv", index_col=0)

In [7]:
demographic_2011.rename(columns={"ID":"ID_old"}, inplace=True)
demographic_2011["ID_old"] = demographic_2011["ID_old"].astype("str")
demographic_2011 = pd.merge(right=demographic_2011, left=hm_dataset[["ID", "ID_w1", "ragender"]], right_on="ID_old", left_on="ID_w1")
demographic_2011.drop(columns=["ID_w1"], inplace=True)

## 1. Baseline Analysis

In [8]:
demo_list = ["ba002_1", "ba002_2", "ba002_3", "ragender"]

In [17]:
merged = pd.merge(left=demographic_2011, right=biomarker_2011, left_on='ID_old', right_on="ID", how="inner").rename(columns={"ID_x":"ID"})

clean the gender and the calculate the age, ""6. The first wave of CHARLS was conducted between June 2011 and March 2012", we take the median, 10 Oct 2011

In [18]:
merged["age"] = 2011 - merged["ba002_1"]
merged["sex"] = merged["ragender"].astype("str").str.split(".").apply(lambda x : 2-int(x[0]))
merged = merged[["ID", "ID_old", "sex", "age", "newcho", "newhdl", "newldl", "newhba1c", "newtg", "newglu", "newcrea"]]

In [24]:
baseline_dataset = pd.merge(left=merged, right=diag_2011[["ID", "da007_3_"]].rename(columns={"ID": "id"}), left_on="ID_old", right_on="id")
baseline_dataset = baseline_dataset[["ID", "ID_old", "sex", "age", "newcho", "newhdl", "newldl", "newhba1c", "newtg", "newglu", "newcrea", "da007_3_"]]

In [22]:
CONVERT_CONST = {
    "creatinine": 88.4,
    "choles": 0.02585983966,
    "glucose": 0.0555,
    "HDL": 0.0555,
    "LDL": 0.0555,
    "triglycerides": 0.01129050468,
}

In [26]:
# exclude the diabetes individual
baseline_dataset = baseline_dataset.query("da007_3_ != '1 Yes'")
# convert the unit
tests = ["newcho", "newhdl", "newldl", "newhba1c", "newtg", "newglu", "newcrea"]


name_map = {
    "newcho": "choles",
    "newhdl": 'HDL',
    "newldl": 'LDL',
    "newhba1c": "HbA1c",
    "newtg": "triglycerides",
    "newglu": "glucose",
    "newcrea": "creatinine",
    "age": "baseline_age"
}

# rename the tests to comply with the original ones
baseline_dataset.rename(columns=name_map, inplace=True)
for name in name_map.values():
    if name  == "HbA1c" or name == "baseline_age":
        continue
    baseline_dataset[name] = baseline_dataset[name] * CONVERT_CONST[name]

In [27]:
# find the pre-diabetes patients
predm = baseline_dataset.query("HbA1c > 5.6 and HbA1c < 6.4 or glucose > 5.6 and glucose < 6.9")

In [137]:
predm.shape

(3649, 12)

## 2 Validation

### 2.1 2-year spectrum
The diagnosis and HbA1c is missing, so we dig out the information from the diabetes questions.



If respondents have diabetes, then answer DA014-DA016
[reference]("https://charls.charlsdata.com/Public/ashelf/public/uploads/document/2015-charls-wave4/application/CHARLS_2015_Questionnaire.pdf")

In [134]:
CHECK_POINT_PATH = r"../Output/A98_20230627_output/"
TIME_SPECTRUM = 10

wave = {
    2 : diag_2013,
    5: diag_2015,
    10: diag_2018
}

df = wave[TIME_SPECTRUM]

In [135]:
# load the model from the folder
def load_model(time_slot: int):
    model_path = os.path.join(CHECK_POINT_PATH, f"spec-{time_slot}year", "models", "weighted_model")
    scaler_path = os.path.join(CHECK_POINT_PATH, f"spec-{time_slot}year", "models","scaler.pkl")
    scaler = pickle.load(open(scaler_path, 'rb'))
    model: tf.keras.Model = tf.keras.models.load_model(model_path)
    return scaler, model

# if one of the question is not null, we know the person had diabetes
def diagnose(df: pd.DataFrame, diab_questions: List[str]) -> None:
    df["diabetes"] = False
    for question in diab_questions:
        df["diabetes"] += df[question].notnull()
    df["diabetes"] = df["diabetes"] > 0

In [136]:
diab_questions = keys_with_substring(df, "da014") + keys_with_substring(df, "da015") + keys_with_substring(df, "da016")
diagnose(df, diab_questions)
ds = pd.merge(left=predm, right=df[["ID", "diabetes"]], on="ID")
ds.drop(columns="da007_3_", inplace=True)
# fill the null value with mean value since the missing is small
ds.fillna(ds.mean(), inplace=True)
n_pre2dm = ds[ds["diabetes"]].shape[0]
n_pre = ds.shape[0]
print("the incidence rate for {}-year is {:.2%}, pos is {}, total is {}".format(TIME_SPECTRUM, (n_pre2dm / n_pre), n_pre2dm, n_pre))

X_scaled = pd.DataFrame({})
VALID_FEATURES = {
    2: ['creatinine', 'glucose', 'HbA1c', 'baseline_age'],
    5: [ 'HDL', 'creatinine', 'glucose','triglycerides','LDL','potassium_serum_plasma','HbA1c', 'baseline_age', 'sex'],
    10: ['creatinine', 'glucose', 'triglycerides', 'potassium_serum_plasma', 'HbA1c', 'baseline_age', 'sex']
}
VALID_TESTS = {
    2: ['HDL','choles','creatinine','glucose','triglycerides','LDL','potassium_serum_plasma','HbA1c'],
    5: ['HDL','choles','creatinine','glucose','triglycerides','LDL','potassium_serum_plasma','HbA1c'],
    10: ["choles", "creatinine", "glucose", "triglycerides", "potassium_serum_plasma", "HbA1c"]
}
for t in VALID_TESTS[TIME_SPECTRUM]:
    if t in ds.columns:
        X_scaled[t] = ds[t]
        assert ds[t].isnull().sum() == 0
    else:
        X_scaled[t] = np.zeros(ds.shape[0])

the incidence rate for 10-year is 11.40%, pos is 341, total is 2990


In [114]:
X_scaled.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HDL,3280.0,0.156135,0.047437,0.021435,0.122655,0.150044,0.182196,0.482284
choles,3280.0,0.131702,0.025714,0.055067,0.114529,0.129524,0.146846,0.321613
creatinine,3280.0,69.323647,19.681026,25.97192,57.937359,66.927643,77.915764,642.305603
glucose,3280.0,6.1467,0.790301,3.34665,5.75424,5.994,6.35364,20.959021
triglycerides,3280.0,1.555238,1.046054,0.309755,0.899289,1.268996,1.878514,15.307891
LDL,3280.0,0.369842,0.1104,0.007145,0.297706,0.363202,0.438223,1.188443
potassium_serum_plasma,3280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HbA1c,3280.0,5.232975,0.475144,3.5,4.9,5.2,5.5,9.2


In [121]:
scaler, model = load_model(TIME_SPECTRUM)
X_scaled = pd.DataFrame(scaler.transform(X_scaled), columns=VALID_TESTS[TIME_SPECTRUM])
X_scaled["sex"] = ds["sex"]
X_scaled["baseline_age"] = ds["baseline_age"]
X_scaled["potassium_serum_plasma"] = 0
y = ds["diabetes"].astype("int")

# prediction
pred = model.predict(X_scaled[VALID_FEATURES[TIME_SPECTRUM]].to_numpy())
print(f"time spectra:  {TIME_SPECTRUM}")
eva = model.evaluate(X_scaled[VALID_FEATURES[TIME_SPECTRUM]].to_numpy(), y.to_numpy())





time spectra:  5


In [122]:
print("For {}-year cohort, the model AUC is {:.2%}, recall rate is {:.2%}, precision is {:.2%} and accuracy is {:.2%}".format(TIME_SPECTRUM, eva[-2], eva[-3], eva[-4], eva[-5]))

For 5-year cohort, the model AUC is 69.54%, recall rate is 89.14%, precision is 5.97% and accuracy is 22.02%


For 2-year cohort, the model AUC is 71.92%, recall rate is 83.90%, precision is 4.79% and accuracy is 39.42%, the incidence rate for 2-year is 3.60%
For 5-year cohort, the model AUC is 69.54%, recall rate is 89.14%, precision is 5.97% and accuracy is 22.02%, the incidence rate for 5-year is 5.51%
For 10-year cohort, the model AUC is 72.29%, recall rate is 73.90%, precision is 17.46% and accuracy is 57.19%, the incidence rate for 10-year is 11.40%