In [12]:
library(poLCA)
library(reticulate)

source("utils/LCA_analysis.R")
source_python("utils/data_processing.py")

In [13]:
# File paths
# All paths are relative, so they resolve against the current working directory.
# Raw input handed to preprocess_lca_data() as its first argument.
RAW_DATA_PATH <- "data/raw_data/poLCA_35128.csv"
# Second argument to preprocess_lca_data(); presumably where the preprocessed
# data is written -- confirm against the helper's implementation.
LCA_PREPROCESSED_DATA_PATH <- "data/processed_data/LCA_preprocessed_data.csv"
# CSV that receives the posterior class-membership probabilities of the selected model.
LCA_POSTERIOR_PROBABILITIES_PATH <- "data/processed_data/LCA_posterior_probabilities.csv"
# CSV that receives the data frame after the class_assignment column is added.
LCA_LATENT_CLASS_DATA_PATH <- "data/processed_data/LCA_latent_class_data.csv"
# Passed as plot_dir to find_best_lca_model().
PLOT_DIR_PATH <- "plots"


In [None]:
# Columns that are converted to factors before the data reaches the LCA model.
factor_columns <- c("admission_type", "gender", "age_bucket")

# Preprocess the raw data. Two elements of the returned object are used below:
# the processed data frame (`df`) and `morbidity_distribution`.
results <- preprocess_lca_data(RAW_DATA_PATH, LCA_PREPROCESSED_DATA_PATH)
morbidity_distribution <- results$morbidity_distribution

# Extract the processed data frame and convert the categorical columns in one step.
df <- convert_to_factors(results$df, factor_columns)

# Show the morbidity distribution.
print(morbidity_distribution)

In [None]:
# Latent class model specification: every variable inside cbind() is a manifest
# (indicator) variable, and `~ 1` means no covariates are used.
# A formula literal is already a formula object, so no as.formula() wrapper is needed.
# NOTE(review): `age_at_admission` is used here, while the column converted to a
# factor in the preprocessing cell is `age_bucket` -- confirm which one is intended.
formula <- cbind(
  admission_type, gender, age_at_admission,
  congestive_heart_failure, cardiac_arrhythmias, valvular_disease,
  pulmonary_circulation, peripheral_vascular, hypertension,
  paralysis, other_neurological, chronic_pulmonary,
  diabetes_uncomplicated, diabetes_complicated, hypothyroidism,
  renal_failure, liver_disease, peptic_ulcer, aids,
  lymphoma, metastatic_cancer, solid_tumor, rheumatoid_arthritis,
  coagulopathy, obesity, weight_loss, fluid_electrolyte,
  blood_loss_anemia, deficiency_anemias, alcohol_abuse, drug_abuse,
  psychoses, depression
) ~ 1

# Search for the best-fitting model on the preprocessed data.
# NOTE(review): a single value is passed as `class_range`; confirm whether the
# helper treats it as a maximum number of classes or as a fixed class count.
best_models <- find_best_lca_model(
  df,
  formula,
  class_range = 7,
  plot_dir = PLOT_DIR_PATH
)

# Display the selected models, one per criterion.
best_models$best_model_bic                # selected by BIC
best_models$best_model_aic                # selected by AIC
best_models$best_model_aic_bic_combined   # selected by combined AIC+BIC


# Check which latent class each patient falls into

In [None]:

# All outputs below come from the model selected by the combined AIC+BIC criterion.
selected_model <- best_models$best_model_aic_bic_combined

# Make sure there is exactly one predicted class per row of df. Without this
# check, `df$col <- value` silently recycles `value` whenever nrow(df) is an
# exact multiple of its length (e.g. if the model was fitted on a subset of
# rows), which would attach wrong classes to patients and leave the posterior
# CSV out of step with df.
n_predicted <- length(selected_model$predclass)
if (n_predicted != nrow(df)) {
  stop(
    "Number of predicted classes (", n_predicted,
    ") does not match the number of rows in df (", nrow(df),
    "); the model may have been fitted on a subset of the rows.",
    call. = FALSE
  )
}

# Assign the predicted class for each observation to a new column in df
df$class_assignment <- selected_model$predclass

# Use table to summarize the number of individuals in each class
table(df$class_assignment)

# Convert the posterior probabilities to a data frame
posterior_df <- as.data.frame(selected_model$posterior)

# Save the posterior probabilities to a CSV file; posterior_df is used to
# combine small classes into larger ones if necessary
write.csv(posterior_df, LCA_POSTERIOR_PROBABILITIES_PATH, row.names = FALSE)

# Save the data together with the class assignment of every observation
write.csv(df, LCA_LATENT_CLASS_DATA_PATH, row.names = FALSE)
