In [None]:
library(poLCA)
library(reticulate)

source("utils/LCA_analysis.R")
source_python("utils/data_processing.py")

In [2]:

# Raw data produced by the SQL queries: the filtered patients together with
# their demographic information and morbidity conditions.
RAW_DATA_PATH <- "data/raw_data/LCA_raw_data.csv"

# Before the data can be used for LCA it must be converted to the format poLCA
# expects; that conversion is done in utils/data_processing.py and its output
# is written here.
LCA_PREPROCESSED_DATA_PATH <- "data/processed_data/LCA_preprocessed_data.csv"

# Posterior probabilities for all latent classes, stored so that small classes
# can later be combined into larger ones.
LCA_POSTERIOR_PROBABILITIES_PATH <- "data/processed_data/LCA_posterior_probabilities.csv"

# Patients together with the subgroup (latent class) that LCA assigned to them.
LCA_LATENT_CLASS_DATA_PATH <- "data/processed_data/LCA_latent_class_data.csv"

# Directory for the plots of the LCA results and of each group's age and
# multi-morbidity count.
PLOT_DIR_PATH <- "plots"

In [None]:
# Columns that have to be factors before the data is passed to the LCA model.
factor_columns <- c("admission_type", "gender", "age_bucket")

# Preprocess the raw data. The returned list carries two parts:
#   df                     - the preprocessed data frame (input for LCA)
#   morbidity_distribution - the distribution of the multi-morbidity count
results <- preprocess_lca_data(RAW_DATA_PATH, LCA_PREPROCESSED_DATA_PATH)
morbidity_distribution <- results$morbidity_distribution

# Pull out the preprocessed data frame and convert its categorical columns
# to factors in a single step.
df <- convert_to_factors(results$df, factor_columns)

print(morbidity_distribution)

In [None]:
# Model specification for poLCA: all manifest variables are bound together on
# the left-hand side, and `~ 1` means no covariates are used.
# NOTE(review): the formula uses `age_at_admission`, while the column converted
# to a factor during preprocessing is `age_bucket` - confirm which is intended.
formula <- cbind(
  admission_type, gender, age_at_admission,
  congestive_heart_failure, cardiac_arrhythmias, valvular_disease,
  pulmonary_circulation, peripheral_vascular, hypertension,
  paralysis, other_neurological, chronic_pulmonary,
  diabetes_uncomplicated, diabetes_complicated, hypothyroidism,
  renal_failure, liver_disease, peptic_ulcer,
  aids, lymphoma, metastatic_cancer,
  solid_tumor, rheumatoid_arthritis, coagulopathy,
  obesity, weight_loss, fluid_electrolyte,
  blood_loss_anemia, deficiency_anemias, alcohol_abuse,
  drug_abuse, psychoses, depression
) ~ 1

# Run the LCA model search; plots are written to PLOT_DIR_PATH.
best_models <- find_best_lca_model(
  df,
  formula,
  class_range = 6,
  plot_dir = PLOT_DIR_PATH
)

# Display the selected models, one per selection criterion.
best_models$best_model_bic                # best model by BIC
best_models$best_model_aic                # best model by AIC
best_models$best_model_aic_bic_combined   # best model by combined AIC + BIC


# Check which latent class each patient falls into

In [None]:

# Work with the model chosen by the combined AIC + BIC criterion.
selected_model <- best_models$best_model_aic_bic_combined
predicted_class <- selected_model$predclass

# Guard against a row mismatch before attaching the class labels. Assigning a
# shorter vector to a data frame column is silently recycled when its length
# divides the row count, which would give patients the wrong class.
# NOTE(review): a mismatch would typically mean rows were dropped while
# fitting (poLCA removes incomplete cases when na.rm = TRUE) - confirm how
# find_best_lca_model handles missing values.
if (length(predicted_class) != nrow(df)) {
  stop(
    "LCA returned ", length(predicted_class), " class assignments for ",
    nrow(df), " rows in df; the model was not fitted on every row.",
    call. = FALSE
  )
}

# Assign the predicted class for each observation to a new column in df
df$class_assignment <- predicted_class

# Use table to summarize the number of individuals in each class
table(df$class_assignment)

# Convert the posterior probabilities to a data frame (one row per observation)
posterior_df <- as.data.frame(selected_model$posterior)
if (nrow(posterior_df) != nrow(df)) {
  stop(
    "LCA returned ", nrow(posterior_df), " posterior rows for ",
    nrow(df), " rows in df; the model was not fitted on every row.",
    call. = FALSE
  )
}

# Save the posterior probabilities; they are used to combine small classes
# into larger classes if necessary
write.csv(posterior_df, LCA_POSTERIOR_PROBABILITIES_PATH, row.names = FALSE)

# Save the patients together with their assigned latent class
write.csv(df, LCA_LATENT_CLASS_DATA_PATH, row.names = FALSE)
