In [None]:
#import libraries
import pandas as pd
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

import util.cleaning_tools as tools


import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import RandomOverSampler, SMOTE

import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import balanced_accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay,\
precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score
from sklearn.feature_selection import SelectKBest, f_classif, chi2,f_regression
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier

from random import sample
import time
import warnings

warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

In [None]:
# Lookup table: lab-test term id -> short, code-friendly test name.
name_dict = {
    5200279: "creatinineRenal",
    5200289: "cholesHDL",
    5200290: "choles",
    5200295: "creatinine",
    5200305: "glucose",
    5200306: "fastingGlucose",
    5200325: "triglyceride",
    5200406: "cholesLDL_1",
    5201215: "glucoseInBlood",
    5203289: "proteinCreatinineRatio",
    5200345: "albumin",
    5200346: "albumin24h",
    5200387: "potassiumSerumOrPlasma",
    5200393: "proteinUrine",
    5200394: "proteinUrine24h",
    5200402: "albuminCreatinineRatio",
    5200485: "HBA1C",
    5200547: "albuminUnspecifiedTime",
    5200679: "cholesLDL_2",
    5200715: "creatinineRenalClearance",
    5200935: "microalbuminCreatinineRatio",
    5201051: "proteinCreatinineMassRatio",
    5204348: "glomerularFiltrationRate",
}
# Every test name, in the insertion order of name_dict.
test = [*name_dict.values()]


In [None]:
# Lab tests used as model features: term id -> column name.
tests_dict = {
    5200289: "cholesHDL",
    5200290: "choles",
    5200295: "creatinine",
    5200306: "fastingGlucose",
    5200325: "triglyceride",
    5200485: "HBA1C",
}
# Ids and names as parallel lists, in the same order as tests_dict.
tests_id = [*tests_dict]
tests_name = [*tests_dict.values()]
# Column names kept for reference; not read by any of the visible cells.
demo_info = ["pseudo_patient_key", "pre_dtm", "pre_diff_hour", "sex", "pre_age"]

In [None]:
# Cohort table; the first CSV column becomes the index.
patients = pd.read_csv(r'../tables/output/group_patient_age.csv', index_col=0)

# Root folder of the raw extracts and the table names read from it.
file_path = r'../DATAFILE'
tid_to_eid_path = r'iams_entity_concept'
labresult_cps_path = 'lis_cps_result_data'
labresult_hms_path = 'lis_hms_result_data'

# Only these lab-result columns are requested from the reader.
usecols = [
    "pseudo_patient_key",
    "reference_dtm",
    "diff_in_hour_reference_dtm",
    "result_str",
    "entity_id",
    "si_unit",
    "si_numeric",
]

# tools.fileReader presumably reads a table's fragment files and concatenates
# them (project helper, not shown here) — TODO confirm.
lab_parts = [
    tools.fileReader(file_path, table_name, usecols=usecols)
    for table_name in (labresult_cps_path, labresult_hms_path)
]
tid_to_eid = tools.fileReader(file_path, tid_to_eid_path)

# CPS and HMS results share the same columns, so they are stacked into one frame.
labresult = pd.concat(lab_parts)
# Drop the remaining reference to the per-source frames so they can be garbage-collected.
del lab_parts

In [None]:
# NOTE(review): this repeats the tid_to_eid load done in the data-loading cell,
# with the same path and arguments. It looks redundant — confirm before removing.
tid_to_eid_path = r'iams_entity_concept'
tid_to_eid = tools.fileReader(file_path, tid_to_eid_path)

In [None]:
# Drop every patient whose label is 2.
patients = patients.loc[patients["label"] != 2]
# Keep patients whose diab_age is missing or at least 18.
patients = patients.loc[patients["diab_age"].isnull() | (patients["diab_age"] >= 18.0)]

In [None]:
# entity ids registered under one of the selected test term ids
eid = tid_to_eid.loc[tid_to_eid["term_id"].isin(tests_id), "entity_id"]

In [None]:
# Columns carried from the joined dataset into the pivot step.
features = [
    "pseudo_patient_key",
    "age",
    "sex",
    "test_name",
    "si_numeric",
    "diff_in_hour_reference_dtm",
]

In [None]:
# Inner join: cohort patients together with their results for the selected tests
# (patients without any such result are dropped).
dataset = pd.merge(
    left=patients,
    right=labresult[labresult.entity_id.isin(eid)],
    how='inner',
    on="pseudo_patient_key",
)
# The full lab-result table is not needed any more; release the reference.
del labresult

# Age at each test = year of the test minus year of birth. Both columns are
# treated as strings whose first four characters are the year.
dataset = dataset.assign(
    age=dataset["reference_dtm"].str[:4].astype(int) - dataset["dob_Y"].str[:4].astype(int)
)

# Attach the term id of every result. The lookup table is restricted to the
# selected tests first: an entity_id that is also listed under a term id
# outside tests_dict would otherwise add extra rows and break the name lookup.
dataset = pd.merge(
    left=dataset,
    right=tid_to_eid[tid_to_eid["term_id"].isin(list(tests_dict))],
    how='inner',
    on="entity_id",
)
# Translate term ids into test names (every remaining term id is a key of tests_dict).
dataset["test_name"] = dataset["term_id"].map(tests_dict)

# Keep only results taken no later than pre_dtm (the prediabetes reference moment).
# NOTE(review): these are string comparisons, so they are only correct for
# ISO-like, zero-padded timestamps. If pre_dtm carries a time-of-day part,
# `<= '2016-12-31'` excludes 2016-12-31 itself — confirm the column format.
dataset = dataset.query("reference_dtm <= pre_dtm")
# Drop patients whose pre_dtm is later than 2016-12-31.
dataset = dataset.query("pre_dtm <= '2016-12-31'")

Pivot the table to build a sparse feature matrix (one column per test)

In [None]:
# Numeric encoding of the sex column; any other value raises a KeyError.
sex_mapper = {'F': 0, 'M': 1}

ds = (
    dataset[features]
    # encode sex as 0/1
    .assign(sex=lambda frame: frame["sex"].apply(lambda code: sex_mapper[code]))
    # cells holding exactly the two-character string "" are treated as missing
    .replace(r'""', np.nan)
    .assign(si_numeric=lambda frame: frame["si_numeric"].astype(np.float32))
    # one row per (patient, age, sex, time offset) and one column per test;
    # repeated measurements are averaged (pivot_table's default aggregation)
    # and tests not taken at that time are filled with 0
    .pivot_table(
        index=["pseudo_patient_key", "age", 'sex', "diff_in_hour_reference_dtm"],
        columns="test_name",
        values="si_numeric",
        fill_value=0,
    )
    .reset_index()
)

In [None]:
# Preview the first rows of the pivoted table.
ds.head()

In [None]:
# Distinct (patient, label) pairs, then the label is attached to every pivoted row.
right = dataset.loc[:, ["pseudo_patient_key", "label"]].drop_duplicates()
ds = ds.merge(right, on="pseudo_patient_key")

In [None]:
# Fixed number of time steps allocated per patient by to_X (a constant, not computed from the data)
MAX_LEN = 60
# n_patient = ds.pseudo_patient_key.nunique()
# ds = ds.sort_values(["pseudo_patient_key","age","diff_in_hour_reference_dtm"])


# NOTE(review): the four module-level names below are not read by to_X, which
# works on its own locals; they look like leftovers — confirm before removing.
patient_id = ds["pseudo_patient_key"].unique()
head = 0
tail = 0
n = ds.shape[0]
def to_X(df, max_len=None, feature_cols=None):
    """Turn the long per-visit table into zero-padded per-patient sequences.

    Rows are ordered by patient and then by ``diff_in_hour_reference_dtm``.
    For every patient the first ``max_len`` rows are copied into a
    zero-initialised ``(max_len, n_features)`` slab, so shorter histories stay
    zero-padded at the end and longer ones are truncated.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pseudo_patient_key``, ``diff_in_hour_reference_dtm``,
        ``label`` and every column named in ``feature_cols``.
    max_len : int, optional
        Number of time steps kept per patient. Defaults to the module-level
        ``MAX_LEN``.
    feature_cols : sequence of str, optional
        Feature columns, in output order. Defaults to
        ``["age", "sex"] + tests_name``.

    Returns
    -------
    X : numpy.ndarray of shape (n_patients, max_len, len(feature_cols))
        Padded feature sequences, patients in ascending key order.
    y : numpy.ndarray of shape (n_patients,)
        ``label`` taken from each patient's first row.
    """
    # Resolve defaults at call time so later changes to the globals are honoured.
    if max_len is None:
        max_len = MAX_LEN
    if feature_cols is None:
        feature_cols = ["age", "sex"] + tests_name
    feature_cols = list(feature_cols)

    df = df.sort_values(["pseudo_patient_key", "diff_in_hour_reference_dtm"])
    # Groups come out in ascending key order; rows keep their sorted order inside each group.
    per_patient = df.groupby("pseudo_patient_key")
    n_patient = per_patient.ngroups

    # Array sizes follow max_len and the feature list instead of literal 60 / 8.
    X = np.zeros((n_patient, max_len, len(feature_cols)))
    y = np.zeros((n_patient,))
    for i, (_, visits) in enumerate(per_patient):
        y[i] = visits["label"].iloc[0]
        # Keep the earliest max_len visits; the remaining slots stay zero.
        window = visits[feature_cols].to_numpy()[:max_len]
        X[i, :len(window), :] = window
        if i % 1000 == 0:
            print(f"finished {i}/{n_patient}")
    return X, y
# split the data to test and train set
# train_df, test_df = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

In [None]:
# Build the padded per-patient feature array X and the label vector y from the pivoted table.
X, y = to_X(ds)

In [None]:
# Hold out 10% of the patients, stratified on the label, with a fixed seed.
# NOTE(review): there is no separate validation split; the training cell passes
# this same held-out set as validation_data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Metrics reported by Keras for every training and validation epoch.
# The four confusion-matrix counts each carry their own name: previously the
# TrueNegatives metric was registered as 'fn' and FalseNegatives was missing.
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),  # recall is the metric of main interest
    keras.metrics.AUC(name='auc'),
    keras.metrics.AUC(name='prc', curve='PR'),  # area under the precision-recall curve
]

In [None]:
# Network hyper-parameters.
EMB_DIM = 8      # features per time step; matches the last axis of X built by to_X
LSTM1_DIM = 128  # units of the first (sequence-returning) LSTM
LSTM2_DIM = 64   # units of the second LSTM

# Causal 1-D convolution -> two stacked LSTMs -> dense head with a sigmoid output.
model_lstm = keras.Sequential()
model_lstm.add(
    keras.layers.Conv1D(
        filters=64,
        kernel_size=3,
        strides=1,
        activation='relu',
        padding='causal',
        input_shape=[MAX_LEN, EMB_DIM],
    )
)
model_lstm.add(keras.layers.LSTM(LSTM1_DIM, return_sequences=True))
model_lstm.add(keras.layers.LSTM(LSTM2_DIM))
model_lstm.add(keras.layers.Dense(64, activation='relu'))
model_lstm.add(keras.layers.Dropout(0.5))  # regularisation against overfitting
model_lstm.add(keras.layers.Dense(1, activation='sigmoid'))

model_lstm.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    metrics=METRICS,
)
model_lstm.summary()

In [None]:
# Spot-check one patient: print the label and the id at position n, then show
# that patient's padded feature sequence as a table.
n = 599  # arbitrary position; indexing below needs at least 600 patients
print(y[n])
# presumably unique() on ds yields patients in the same order to_X used — TODO confirm
p = ds["pseudo_patient_key"].unique()
print(p[n])
pd.DataFrame(X[n], columns=["age", "sex"]+tests_name)

In [None]:
# Training configuration.
EPOCHS = 100
weight_minor, weight_major = 1, 10
# Loss weight per class: class 0 counts ten times as much as class 1.
# NOTE(review): the rarer class usually gets the larger weight — confirm that
# class 0 is the one meant to be up-weighted.
class_weight = {0: weight_major, 1: weight_minor}

# NOTE(review): the held-out test split doubles as validation data here.
weighted_history = model_lstm.fit(
    x=X_train,
    y=y_train,
    batch_size=1000,
    epochs=EPOCHS,
    verbose=1,
    validation_data=(X_test, y_test),
    class_weight=class_weight,
)

In [None]:
# Stand-alone Keras metric objects; of these, only prc is called in the visible cells.
prec = keras.metrics.Precision(name='precision')
recall = keras.metrics.Recall(name='recall')
# NOTE(review): rebinding `auc` hides the `auc` function imported from sklearn.metrics in the first cell.
auc = keras.metrics.AUC(name='auc')
prc = keras.metrics.AUC(name='prc', curve='PR')

In [None]:
# Predicted probabilities for the held-out split. The original referenced
# `temp_x_test`, which no cell in this notebook defines (NameError on a fresh
# kernel); X_test is the held-out feature array.
y_pred = model_lstm.predict(X_test)

In [None]:
# Area under the precision-recall curve for the predictions. The original
# referenced `temp_y_test`, which no cell in this notebook defines (NameError
# on a fresh kernel); y_test holds the held-out labels.
# auc(y_test, y_pred)
prc(y_test, y_pred)