In [84]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_curve, auc


In [None]:
# Number of most-populated latent classes retained for the analysis.
N_TOP_CLASSES = 6

# Latent-class input data (the "X" column is dropped) and the matching
# posterior class-membership probabilities.
df = pd.read_csv('data/LCA_latent_class_data.csv').drop(columns=["X"])
df_prob = pd.read_csv('data/LCA_posterior_probabilities.csv')

# value_counts() is already sorted by descending frequency, so the first
# N_TOP_CLASSES index entries are the most popular classes.
keep_classes = df["class_assignment"].value_counts().index[:N_TOP_CLASSES]
# Class labels start at 1 while column positions start at 0, hence the shift.
# NOTE(review): assumes column k-1 of df_prob holds class k -- confirm against the LCA export.
target_prob_class = keep_classes - 1

# Drop the first character of every probability column name so only the
# remainder of the name is left as the class label.
classes_mapping = {col: col[1:] for col in df_prob.columns}
df_prob = df_prob.rename(columns=classes_mapping)

# Keep only the probability columns of the retained classes. The explicit copy
# makes the column assignment below operate on an independent frame instead of
# a slice of the original (avoids SettingWithCopyWarning).
df_prob = df_prob.iloc[:, target_prob_class].copy()

# Among the retained classes, pick the one with the highest posterior probability.
df_prob["class_assignment"] = df_prob.idxmax(axis=1)

# Replace the original assignment (which covers more than the retained classes)
# with the assignment restricted to the retained classes.
df = df.drop(columns=["class_assignment"])
df["class_assignment"] = df_prob["class_assignment"]
df

In [None]:

# Locate the first Elixhauser comorbidity column; every column from there up to
# (but excluding) the last column is treated as a comorbidity indicator.
start_index = df.columns.get_loc("congestive_heart_failure")
print(f"congestive_heart_failure's index is {start_index}")

# The indicators were shifted by +1 for the latent class analysis (to avoid 0),
# so shift them back before counting.
elixhauser = df.iloc[:, start_index:-1] - 1
df.iloc[:, start_index:-1] = elixhauser

# Row-wise comorbidity count. skipna=False keeps a missing indicator as a
# missing count, so such rows are removed by the filter below.
df["count_morbidity"] = elixhauser.sum(axis=1, skipna=False)
# .copy() so the following column assignments write to an independent frame
# rather than to a filtered view (avoids SettingWithCopyWarning).
df = df[df["count_morbidity"].notna()].copy()

# Collapse counts of 8 or more into a single '>=8' label; other counts become strings.
df["count_morbidity"] = df["count_morbidity"].map(lambda n: '>=8' if n >= 8 else str(n))

# Ages of 95 and above are recoded to 85.
# NOTE(review): recoding to 85 (not 95) looks surprising -- confirm this is the intended cap.
df["age_at_admission"] = df["age_at_admission"].mask(df["age_at_admission"] >= 95, 85)
df

In [None]:
# Frequency of each admission age strictly above 85.
df.loc[df["age_at_admission"] > 85, "age_at_admission"].value_counts()

In [None]:
# How many rows fall into each multimorbidity-count label.
df.loc[:, "count_morbidity"].value_counts()

In [None]:
# Group the subject ids once per class and once per (class, morbidity count) pair.
subjects_by_class = df.groupby(["class_assignment"])["subject_id"]
subjects_by_class_and_count = df.groupby(["class_assignment", "count_morbidity"])["subject_id"]

# Share of each morbidity count inside its class, as a fraction.
print(subjects_by_class_and_count.count() / subjects_by_class.count())

# Same share expressed as a percentage and broadcast back onto every row.
df['percent'] = (
    subjects_by_class_and_count.transform('count')
    / df.groupby("class_assignment")["subject_id"].transform('count')
    * 100
)
df

In [None]:
# Relabel the remaining classes as consecutive integers 1..k, following the
# sorted order of the current labels.
unique_values = sorted(df['class_assignment'].unique())
value_map = dict(zip(unique_values, range(1, len(unique_values) + 1)))
df['class_assignment'] = df['class_assignment'].map(value_map)

# Order rows by the multimorbidity label (ascending).
df = df.sort_values("count_morbidity")
df

In [None]:

# Bubble plot: subgroup on x, multimorbidity count on y, bubble area ~ percent.
# 'percent' is identical for every row of a (class, count) pair, so one row per
# pair is enough. Plotting every row would stack many identical markers on top
# of each other, which cancels the transparency and slows rendering.
bubbles = df.drop_duplicates(subset=["class_assignment", "count_morbidity"])

plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    x=bubbles['class_assignment'],
    y=bubbles['count_morbidity'],
    s=bubbles['percent'] * 10,  # Scale bubble size (adjust multiplier as needed)
    alpha=0.6,
    c=bubbles['class_assignment'],  # Color by subgroup
    cmap='Set1'  # Choose a color map
)

# Customize the plot
plt.xlabel("Subgroup")
plt.ylabel("Multimorbidity count")
plt.title("Subgroup Characteristics by Multimorbidity Count and Percentage")

# Legend for bubble sizes: empty scatters drawn with the same size scaling.
for size in [10, 20, 30]:  # Adjust sizes to match your `percent` range
    plt.scatter([], [], s=size * 10, color='gray', alpha=0.5, label=str(size) + '%')
plt.legend(
    title="Percent",
    loc="upper left",
    bbox_to_anchor=(1.05, 1),
    scatterpoints=1,
    frameon=True,
    labelspacing=1.2,
    borderpad=1.2
)

plt.show()

In [None]:
# Box plot of admission age, one box per subgroup.
plt.figure(figsize=(8, 10))
sns.boxplot(x="class_assignment", y="age_at_admission", data=df, palette="Set1")

# Title and axis labels
plt.title("Boxplot of Age Distribution in Subgroups")
plt.ylabel("Age (years)")
plt.xlabel("Subgroup")

plt.show()

In [None]:
# How many rows fall into each age bucket.
df.loc[:, 'age_bucket'].value_counts()

In [None]:
# Ordinal codes for the age buckets, youngest to oldest.
age_bucket_order = {'16-24': 1, "25-44": 2, "45-64": 3, "65-84": 4, "85-95": 5}

# Column -> {label: numeric code}. Labels missing from a mapping become NaN.
column_codes = {
    "admission_type": {"Non-elective": 0, "Elective": 1},
    "gender": {"M": 0, "F": 1},
    'age_bucket': age_bucket_order,
}
for column_name, codes in column_codes.items():
    df[column_name] = df[column_name].map(codes)

df

In [None]:
# Variables that were used as inputs to the latent class analysis.
cols_used_LCA = [
    "admission_type", "gender", "age_bucket",
    "congestive_heart_failure", "cardiac_arrhythmias", "valvular_disease",
    "pulmonary_circulation", "peripheral_vascular", "hypertension", "paralysis",
    "other_neurological", "chronic_pulmonary", "diabetes_uncomplicated",
    "diabetes_complicated", "hypothyroidism", "renal_failure", "liver_disease",
    "peptic_ulcer", "aids", "lymphoma", "metastatic_cancer", "solid_tumor",
    "rheumatoid_arthritis", "coagulopathy", "obesity", "weight_loss",
    "fluid_electrolyte", "blood_loss_anemia", "deficiency_anemias",
    "alcohol_abuse", "drug_abuse", "psychoses", "depression",
]

# One-vs-rest check: for every class, measure how well a logistic regression on
# the LCA variables separates that class from all the others (10-fold CV AUC).
for i in sorted(df["class_assignment"].unique()):
    print(f"check class {i} with all other classes")
    # 1 for rows of the current class, 0 for every other row.
    df["dichotomized_class"] = (df["class_assignment"] == i).astype(int)
    # Define the features (X) and target (y)
    X = df[cols_used_LCA]
    y = df['dichotomized_class']

    # Initialize the logistic regression model
    log_reg = LogisticRegression(max_iter=1000)

    # Cross-validation to get AUC-ROC scores
    auc_scores = cross_val_score(log_reg, X, y, cv=10, scoring='roc_auc')
    # f-string so the class label is interpolated instead of printing "{i}" literally.
    print(f"Average AUC-ROC for class {i} with other classes:", auc_scores.mean())
df

In [None]:

plt.figure(figsize=(10, 8))

# Define the color map for each class
colors = ['black', 'red', 'green', 'blue', 'cyan', 'magenta']

# Iterate through each unique class for the one-vs-all approach
for i, class_label in enumerate(sorted(df["class_assignment"].unique())):
    print(f"Check class {class_label} with all other classes")

    # Create a binary target for the class vs. all others
    df["dichotomized_class"] = (df["class_assignment"] == class_label).astype(int)

    # Define features and target
    X = df[cols_used_LCA]
    y = df["dichotomized_class"]

    # Cross-validated AUC, reported with the class label (not the loop index).
    log_reg = LogisticRegression(max_iter=1000)
    auc_scores = cross_val_score(log_reg, X, y, cv=10, scoring='roc_auc')
    print(f"Average AUC-ROC for class {class_label} with other classes:", auc_scores.mean())

    # Train logistic regression on the full data and draw its ROC curve.
    # This curve is in-sample; the printed cross-validated AUC is the
    # out-of-sample estimate.
    log_reg.fit(X, y)
    positive_scores = log_reg.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y, positive_scores)
    plt.plot(
        fpr,
        tpr,
        color=colors[i % len(colors)],  # wrap around if there are more classes than colors
        lw=2,
        label=f"Subgroup {class_label} (AUC = {auc(fpr, tpr):.2f})",
    )


# Plot the diagonal line representing random guessing
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)

# Customize the plot
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1 - Specificity (False Positive Rate)')
plt.ylabel('Sensitivity (True Positive Rate)')
plt.title('ROC Curves for Each Subgroup')
plt.legend(loc="lower right")
plt.grid()

# Show the plot
plt.show()

