## File imports and settings


In [None]:


import os
import psutil

# On Windows with hybrid CPUs, Python may be scheduled only on the efficiency
# ("E") cores, so pin this process to the first MAX_PINNED_CPUS logical CPUs.
MAX_PINNED_CPUS = 20
available_cpus = psutil.cpu_count(logical=True) or 1
# Never request a CPU index the machine does not have (that raises ValueError).
desired_cpus = list(range(min(MAX_PINNED_CPUS, available_cpus)))

p = psutil.Process(os.getpid())

# Process.cpu_affinity is not implemented on every platform (e.g. macOS).
if hasattr(p, "cpu_affinity"):
    p.cpu_affinity(desired_cpus)
    print("CPU affinity set to:", p.cpu_affinity())
else:
    print("CPU affinity is not supported on this platform; leaving it unchanged.")
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import pandas as pd
import numpy as np
from IPython.display import clear_output
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed

#import psutil
import os
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' ### this hides some of the annoying tflow errors
#### this depends on the hardware
from pathlib import Path ####Required for save checks
import prince ####Required for MCA functions

import j_process ####Required to load custom modeling functions
import j_clustertuner####Required for best K functions
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print("GPU devices:", tf.config.list_physical_devices('GPU'))
# Notebook-wide switches read by later cells.
# skip: True -> skip the MCA factorial plots, load saved cluster assignments
#       from data/ instead of re-clustering, and do not re-save them.
skip = False
# subset: True -> balance the data by downsampling ALL_CHRONIC2 == 'No' rows.
subset=False
# redundant: True -> also run the per-feature Random Forest / TensorFlow baselines.
redundant=False

In [None]:
df = pd.read_parquet("data/filtered.parquet")

In [None]:
df.head()

In [None]:
#Just a function to help me visualize each feature

def i_graph(colname):
    """Print how often each value (NaN included) occurs in `df[colname]`.

    Reads the notebook-level frame `df`; returns nothing.
    """
    tally = df[colname].value_counts(dropna=False)

    print("Value counts for", colname)
    print(tally)
    


In [None]:
def repval(colname, stringstorepl):
    """Turn non-answers in one column of the notebook-level `df` into NaN.

    Every value of ``df[colname]`` listed in ``stringstorepl`` is replaced with
    ``np.nan`` and the cleaned column is assigned back onto ``df``.

    Args:
        colname: Name of the column to clean.
        stringstorepl: Value, or list of values, to treat as missing.

    Returns:
        None. ``df`` is modified as a side effect.
    """
    df[colname] = df[colname].replace(stringstorepl, np.nan)





In [None]:


def basic_plot(colname):
    """Draw, save and show a labelled bar chart of the value counts of `df[colname]`.

    NaN is counted as its own category. The figure is written at 300 dpi to
    ``images/<colname>.jpg`` (spaces become ``_`` and ``/`` becomes ``-``);
    the ``images`` directory is created when it does not exist.
    """
    tally = df[colname].value_counts(dropna=False)
    bar_axes = tally.plot(kind='bar')
    plt.title('Counts per Category')
    plt.xlabel('Category')
    plt.ylabel('Count')

    # Write each count just above its bar, lifted by 1% of the tallest bar.
    for position, count in enumerate(tally):
        label_height = count + max(tally) * 0.01
        bar_axes.text(position, label_height, str(count), ha='center', va='bottom')

    plt.tight_layout()

    # The output folder may not exist yet on a fresh checkout.
    os.makedirs("images", exist_ok=True)

    image_name = colname.replace(" ", "_").replace("/", "-") + ".jpg"
    plt.savefig(os.path.join("images", image_name), format='jpg', dpi=300)
    plt.show()

## Cleaned up variables

In [None]:
# Answer strings treated as "no usable answer" for most questions.
default = ["Don't know/Not sure","Refused","Missing","Don't know / Not sure","Don't know/Not Sure"]

# --- Chronic-condition questions (later combined into the ALL_* targets) ---
repval("CVDINFR4", ["Don't know/Not sure", "Refused", "Missing"])
repval("CVDCRHD4", ["Don't know/Not sure", "Refused", "Missing"])
repval("CVDSTRK3", default)
# Some answers carry a questionnaire skip instruction ("Go to Section ...");
# strip it from the real answers and blank out the non-answers.
df["ASTHMA3"] = df["ASTHMA3"].replace(["No -  Go to Section 07.06 CHCSCNC1"], "No")
repval("ASTHMA3",
       ["Don't know/Not Sure -  Go to Section 07.06 CHCSCNC1", "Refused -  Go to Section 07.06 CHCSCNC1", "Missing"])
repval("CHCSCNC1", default)
repval("CHCOCNC1", default)
repval("CHCCOPD3", default)
repval("CHCKDNY2", default)
repval("HAVARTH4", default)
df["DIABETE4"]=df["DIABETE4"].replace(["No -  Go to Section 08.01 AGE"],"No")
# NOTE(review): pre-diabetes and pregnancy-only answers become NaN here, so those
# respondents are removed by the later dropna(); "Missing" is not in this list -- confirm both are intended.
repval("DIABETE4", ["No, pre-diabetes or borderline diabetes -  Go to Section 08.01 AGE", "Yes, but female told only during pregnancy -  Go to Section 08.01 AGE", "Don't know/Not Sure -  Go to Section 08.01 AGE", "Refused -  Go to Section 08.01 AGE"])

# --- Education: shorten the category labels ---
df["EDUCA"]=df["EDUCA"].replace(["College 4 years or more (College graduate)"],"Bachelors degree or Higher")
df["EDUCA"]=df["EDUCA"].replace(["College 1 year to 3 years (Some college or technical school)"],"Some College or Associates")
df["EDUCA"]=df["EDUCA"].replace(["Grade 12 or GED (High school graduate)"],"High School Graduate")
df["EDUCA"]=df["EDUCA"].replace(["Grades 9 through 11 (Some high school)"],"Some High School")
df["EDUCA"]=df["EDUCA"].replace(["Grades 1 through 8 (Elementary)"],"Elementary Only")
df["EDUCA"]=df["EDUCA"].replace(["Never attended school or only kindergarten"],"None or Kindergarten Only")
repval("EDUCA", default)

# --- Disability questions ---
repval("DEAF", default)
repval("BLIND", default)
repval("DECIDE", default)
repval("DIFFWALK", default)
repval("DIFFDRES", default)
repval("DIFFALON", default)

# --- Tobacco / e-cigarettes / vaccination ---
# Skip-instruction variants are first mapped onto strings contained in `default`
# so that the following repval(..., default) call turns them into NaN.
df["SMOKE100"]=df["SMOKE100"].replace(["No -  Go to Section 12.03 USENOW3"],"No")
df["SMOKE100"]=df["SMOKE100"].replace(["Don't know/Not Sure -  Go to Section 12.03 USENOW3"],"Don't know/Not Sure")
df["SMOKE100"]=df["SMOKE100"].replace(["Refused -  Go to Section 12.03 USENOW3"],"Refused")
repval("SMOKE100", default)
df["ECIGNOW2"]=df["ECIGNOW2"].replace(["Never used e-cigarettes in your entire life"],"Never")
df["ECIGNOW2"]=df["ECIGNOW2"].replace(["Not at all (right now)"],"Not Currently")
df["ECIGNOW2"]=df["ECIGNOW2"].replace(["Use them some days"],"Some Days")
df["ECIGNOW2"]=df["ECIGNOW2"].replace(["Use them every day"],"Every Day")
df["FLUSHOT7"]=df["FLUSHOT7"].replace(["No -  Go to Section 13.03 PNEUVAC4"],"No")
df["FLUSHOT7"]=df["FLUSHOT7"].replace(["Don't know/Not Sure -  Go to Section 13.03 PNEUVAC4"],"Don't know/Not sure")
df["FLUSHOT7"]=df["FLUSHOT7"].replace(["Refused -  Go to Section 13.03 PNEUVAC4"],"Refused")
repval("ECIGNOW2", default)
repval("FLUSHOT7", default)
repval("PNEUVAC4", default)
# NOTE(review): respondents who never drive or ride in a car are relabelled
# "Refused" and therefore become NaN below -- confirm this exclusion is intended.
df["SEATBELT"]=df["SEATBELT"].replace(["Never drive or ride in a car -  Go to Section 16.1 COVIDPO1"],"Refused")

repval("SEATBELT", default)
df["COVIDPO1"]=df["COVIDPO1"].replace(["No -  Go to Modules or Closing Statement"],"No")
df["COVIDPO1"]=df["COVIDPO1"].replace(["Refused -  Go to Modules or Closing Statement"],"Refused")
df["COVIDPO1"]=df["COVIDPO1"].replace(["Don't know/Not Sure -  Go to Modules or Closing Statement"],"Don't know/Not sure")

repval("COVIDPO1", default)

# --- General health, exercise, cholesterol ---
repval("GENHLTH", default)
df["EXERANY2"]=df["EXERANY2"].replace(["No -  Go to Section 04.08 STRENGTH"],"No")
df["EXERANY2"]=df["EXERANY2"].replace(["Don't know/Not Sure -  Go to Section 04.08 STRENGTH"],"Don't know/Not sure")
df["EXERANY2"]=df["EXERANY2"].replace(["Refused -  Go to Section 04.08 STRENGTH"],"Refused")

repval("EXERANY2", default)
repval("TOLDHI3", default)

In [None]:
j_process.resp_tally2(df)

In [None]:
# Report the number of missing values per column and the frame size, then keep
# only rows with an answer in every column (complete-case analysis).
print(df.isna().sum())
print(df.shape)
df.dropna(inplace=True)  # modifies the notebook-level frame in place
print(df.shape)


In [None]:
# The ten individual condition columns. `chronic_col` keeps a reference to this
# list; a later cell uses it to exclude these columns from the feature list.
columns_to_check = ['CVDINFR4', 'CVDCRHD4', 'CVDSTRK3','ASTHMA3','CHCSCNC1','CHCOCNC1','CHCCOPD3','CHCKDNY2','HAVARTH4','DIABETE4']
chronic_col = columns_to_check
# ALL_CHRONIC is 'Yes' when at least one of the listed conditions is 'Yes', otherwise 'No'.
df['ALL_CHRONIC'] = df[columns_to_check].eq('Yes').any(axis=1).map({True: 'Yes', False: 'No'})


### Derived target variable information

In [None]:
# The images folder is otherwise only created inside basic_plot(); create it
# here so savefig cannot fail on a fresh checkout.
os.makedirs("images", exist_ok=True)

# Only the last expression of a cell is rendered, so show the raw counts explicitly.
display(df['ALL_CHRONIC'].value_counts(dropna=False))
df['ALL_CHRONIC'].value_counts().plot(kind='bar')
plt.title('Any Chronic Condition')
plt.xlabel('Response')
plt.ylabel('Count')
# The image file name is derived from the plot title.
title = plt.gca().get_title()
filename = title.replace(" ", "_").replace("/", "-") + ".jpg"
filename = os.path.join("images",filename)
plt.savefig(filename, format='jpg', dpi=300)
plt.show()
# Class balance as proportions (rendered as the cell output).
df['ALL_CHRONIC'].value_counts(normalize=True, dropna=False)

In [None]:
# Alternative chronic target: same condition list as ALL_CHRONIC but without HAVARTH4.
columns_to_check = ['CVDINFR4', 'CVDCRHD4', 'CVDSTRK3','ASTHMA3','CHCSCNC1','CHCOCNC1','CHCCOPD3','CHCKDNY2','DIABETE4']

# 'Yes' when at least one listed condition is 'Yes', otherwise 'No'.
df['ALL_CHRONIC2'] = df[columns_to_check].eq('Yes').any(axis=1).map({True: 'Yes', False: 'No'})

In [None]:
# Cardiac target built from the three CVD* columns.
columns_to_check = ['CVDINFR4', 'CVDCRHD4', 'CVDSTRK3']

# 'Yes' when at least one listed condition is 'Yes', otherwise 'No'.
df['ALL_CARDIAC'] = df[columns_to_check].eq('Yes').any(axis=1).map({True: 'Yes', False: 'No'})

In [None]:
# The images folder is otherwise only created inside basic_plot(); create it
# here so savefig cannot fail on a fresh checkout.
os.makedirs("images", exist_ok=True)

# Only the last expression of a cell is rendered, so show the raw counts explicitly.
display(df['ALL_CHRONIC2'].value_counts(dropna=False))
df['ALL_CHRONIC2'].value_counts().plot(kind='bar')
plt.title('Any Chronic (alt) Condition')
plt.xlabel('Response')
plt.ylabel('Count')
# The image file name is derived from the plot title.
title = plt.gca().get_title()
filename = title.replace(" ", "_").replace("/", "-") + ".jpg"
filename = os.path.join("images",filename)
plt.savefig(filename, format='jpg', dpi=300)
plt.show()
# Class balance as proportions (rendered as the cell output).
df['ALL_CHRONIC2'].value_counts(normalize=True, dropna=False)

In [None]:
# The images folder is otherwise only created inside basic_plot(); create it
# here so savefig cannot fail on a fresh checkout.
os.makedirs("images", exist_ok=True)

# Only the last expression of a cell is rendered, so show the raw counts explicitly.
display(df['ALL_CARDIAC'].value_counts(dropna=False))
df['ALL_CARDIAC'].value_counts().plot(kind='bar')
plt.title('Any Cardiac Condition')
plt.xlabel('Response')
plt.ylabel('Count')
# The image file name is derived from the plot title.
title = plt.gca().get_title()
filename = title.replace(" ", "_").replace("/", "-") + ".jpg"
filename = os.path.join("images",filename)
plt.savefig(filename, format='jpg', dpi=300)
plt.show()
# Class balance as proportions (rendered as the cell output).
df['ALL_CARDIAC'].value_counts(normalize=True, dropna=False)

In [None]:
# Cancer target built from the two CHC*CNC1 columns.
columns_to_check = ['CHCSCNC1','CHCOCNC1']

# 'Yes' when at least one listed condition is 'Yes', otherwise 'No'.
df['ALL_CANCER'] = df[columns_to_check].eq('Yes').any(axis=1).map({True: 'Yes', False: 'No'})

In [None]:
# The images folder is otherwise only created inside basic_plot(); create it
# here so savefig cannot fail on a fresh checkout.
os.makedirs("images", exist_ok=True)

# Only the last expression of a cell is rendered, so show the raw counts explicitly.
display(df['ALL_CANCER'].value_counts(dropna=False))
df['ALL_CANCER'].value_counts().plot(kind='bar')
plt.title('Any Cancer Response')
plt.xlabel('Response')
plt.ylabel('Count')
# The image file name is derived from the plot title.
title = plt.gca().get_title()
filename = title.replace(" ", "_").replace("/", "-") + ".jpg"
filename = os.path.join("images",filename)
plt.savefig(filename, format='jpg', dpi=300)
plt.show()
# Class balance as proportions (rendered as the cell output).
df['ALL_CANCER'].value_counts(normalize=True, dropna=False)

In [None]:
# Pulmonary target built from the asthma and COPD columns.
columns_to_check = ['ASTHMA3','CHCCOPD3']

# 'Yes' when at least one listed condition is 'Yes', otherwise 'No'.
df['ALL_PUL'] = df[columns_to_check].eq('Yes').any(axis=1).map({True: 'Yes', False: 'No'})

In [None]:
# The images folder is otherwise only created inside basic_plot(); create it
# here so savefig cannot fail on a fresh checkout.
os.makedirs("images", exist_ok=True)

# Only the last expression of a cell is rendered, so show the raw counts explicitly.
display(df['ALL_PUL'].value_counts(dropna=False))
df['ALL_PUL'].value_counts().plot(kind='bar')
plt.title('Any Pulmonary Response')
plt.xlabel('Response')
plt.ylabel('Count')
# The image file name is derived from the plot title.
title = plt.gca().get_title()
filename = title.replace(" ", "_").replace("/", "-") + ".jpg"
filename = os.path.join("images",filename)
plt.savefig(filename, format='jpg', dpi=300)
plt.show()
# Class balance as proportions (rendered as the cell output).
df['ALL_PUL'].value_counts(normalize=True, dropna=False)

### Chronic target statistics

In [None]:


# All ten individual condition columns.
columns_to_check = [
    'CVDINFR4', 'CVDCRHD4', 'CVDSTRK3',
    'ASTHMA3', 'CHCSCNC1', 'CHCOCNC1',
    'CHCCOPD3', 'CHCKDNY2', 'HAVARTH4', 'DIABETE4'
]
# Percentage of rows answering "Yes" for each condition.
yes_rates = {
    col: (df[col] == "Yes").mean() * 100  
    for col in columns_to_check
}


# Most prevalent condition first.
yes_df = pd.Series(yes_rates).sort_values(ascending=False)

ax = yes_df.plot(kind='bar')
plt.title('Percentage of Yes Responses by Condition')
plt.ylabel('Percentage (%)')
plt.xlabel('Condition')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# The image file name is derived from the plot title.
title = plt.gca().get_title()
filename = title.replace(" ", "_").replace("/", "-") + ".jpg"
filename = os.path.join("images",filename)
plt.savefig(filename, format='jpg', dpi=300)
plt.show()


In [None]:
# Candidate features = every column except the ten raw condition columns
# (`chronic_col` was bound when ALL_CHRONIC was built).
column_list = df.columns.tolist()
print(column_list)
feature_list = [col for col in column_list if col not in chronic_col]
# Derived target columns; they are removed from feature_list in the next cell.
deriv_col = ['ALL_CHRONIC','ALL_CARDIAC','ALL_CANCER','ALL_PUL','ALL_CHRONIC2']

In [None]:
# Remove the derived targets so that no target column is used as a predictor.
feature_list = [col for col in feature_list if col not in deriv_col]
print(feature_list)

In [None]:
# One bar chart per feature, saved as images/Distribution_of_<col>.jpg.
# NOTE(review): relies on the images/ directory already existing -- it is only
# created inside basic_plot().
for col in feature_list:
    counts = df[col].value_counts(dropna=False)

    # Print counts for reference
    print(f"\nValue counts for {col}:\n{counts}")

    # Plot
    ax = counts.plot(kind='bar')
    plt.title(f'Distribution of {col}')
    plt.xlabel('Response')
    plt.ylabel('Count')
    # The image file name is derived from the plot title.
    title = plt.gca().get_title()
    filename = title.replace(" ", "_").replace("/", "-") + ".jpg"
    filename = os.path.join("images",filename)
    plt.savefig(filename, format='jpg', dpi=300)

    #plt.tight_layout()
    plt.show()

In [None]:
# Features compared on their 'Yes' / 'No' answers in the heatmaps below.
bool_feat = ['DEAF', 'BLIND', 'DECIDE', 'DIFFWALK', 'DIFFDRES',
              'DIFFALON', 'SMOKE100', 'FLUSHOT7', 'PNEUVAC4',
               'COVIDPO1', 'EXERANY2', 'TOLDHI3']
# Features with more than two answer categories.
cat_feat=['GENHLTH', 'SEATBELT','ECIGNOW2','EDUCA']

In [None]:
lifestyle_factors = bool_feat
heat_data = {}

# For each factor, take the respondents who answered 'Yes' and record the
# share of them with / without any chronic condition.
for col in lifestyle_factors:
    yes_rate = df[df[col] == 'Yes']['ALL_CHRONIC'].value_counts(normalize=True)
    heat_data[col] = yes_rate

heat_df = pd.DataFrame(heat_data).T.fillna(0)
# Fix the column order so this figure reads the same way as the "without risk
# factor" heatmap; a missing outcome column is filled with 0 instead of failing.
heat_df = heat_df.reindex(columns=['Yes', 'No'], fill_value=0)

sns.heatmap(heat_df, annot=True, cmap='Blues')
plt.title("Chronic Condition Rates Among Respondents With Each Reported Risk Factor")
plt.ylabel("Risk Factor")
plt.xlabel("Chronic Condition")
plt.tight_layout()
filename="chronic_condition_heatmap.jpg"
filename = os.path.join("images",filename)
plt.savefig(filename, format='jpg', dpi=300, bbox_inches='tight')

plt.show()


In [None]:
heat_data = {}

# Same breakdown as the ALL_CHRONIC heatmap, for the ALL_CHRONIC2 target.
for col in lifestyle_factors:
    yes_rate = df[df[col] == 'Yes']['ALL_CHRONIC2'].value_counts(normalize=True)
    heat_data[col] = yes_rate

heat_df = pd.DataFrame(heat_data).T.fillna(0)
# Fix the column order so this figure reads the same way as the "without risk
# factor" heatmap; a missing outcome column is filled with 0 instead of failing.
heat_df = heat_df.reindex(columns=['Yes', 'No'], fill_value=0)

sns.heatmap(heat_df, annot=True, cmap='Blues')
plt.title("Chronic2 Condition Rates Among Respondents With Each Reported Risk Factor")
plt.ylabel("Risk Factor")
plt.xlabel("Chronic Condition")
plt.tight_layout()
filename="chronic2_condition_heatmap.jpg"
filename = os.path.join("images",filename)
plt.savefig(filename, format='jpg', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Counterpart of the first heatmap: ALL_CHRONIC shares among respondents who
# answered 'No' to each factor.
heat_data={}
for col in lifestyle_factors:
    yes_rate = df[df[col] == 'No']['ALL_CHRONIC'].value_counts(normalize=True)
    heat_data[col] = yes_rate

heat_df = pd.DataFrame(heat_data).T.fillna(0)
# Fixed column order; raises KeyError if either outcome is absent from every row.
heat_df = heat_df[['Yes', 'No']]
sns.heatmap(heat_df, annot=True, cmap='Blues')
plt.title("Chronic Condition Rates Among Respondents Without Each Reported Risk Factor")
plt.ylabel("Risk Factor")
plt.xlabel("Chronic Condition")
plt.tight_layout()
filename="neg_chronic_condition_heatmap.jpg"
filename = os.path.join("images",filename)
plt.savefig(filename, format='jpg', dpi=300,bbox_inches='tight')
plt.show()

In [None]:
#######USED IN EDA#####

# Share of respondents with / without any chronic condition per education level.
edu_chronic = df.groupby('EDUCA')['ALL_CHRONIC'].value_counts(normalize=True).unstack().fillna(0)
# Highest to lowest attainment. "Some High School" is one of the labels created
# during cleaning and must be listed, otherwise that group is dropped from the chart.
ordered_levels = [
    "Bachelors degree or Higher",
    "Some College or Associates",
    "High School Graduate",
    "Some High School",
    "Elementary Only",
    "None or Kindergarten Only",
]
# Select only levels that are actually present so an empty group cannot raise KeyError.
present_levels = [level for level in ordered_levels if level in edu_chronic.index]
edu_chronic = edu_chronic.loc[present_levels]

edu_chronic.plot(kind='bar', stacked=True)
plt.title('Chronic Condition Distribution by Education Level')
plt.xlabel('Education Level')
plt.ylabel('Proportion')
plt.legend(title='Has Chronic Condition')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
filename ="educationvschronic.jpg"
filename = os.path.join("images",filename)

plt.savefig(filename, format='jpg', dpi=300,bbox_inches='tight')
plt.show()

In [None]:
##### USED IN EDA#####

# Multi-category features to compare against ALL_CHRONIC.
features = ['ECIGNOW2', 'SEATBELT', 'GENHLTH']

# Friendly labels for x-axis
x_labels = {
    'ECIGNOW2': 'E-Cigarette Use',
    'SEATBELT': 'Seatbelt Use Frequency',
    'GENHLTH': 'General Health'
}

#Fix the order of the labels
# NOTE(review): reindex() silently drops any category not listed here and adds
# an all-NaN row for a listed category that is absent -- confirm the SEATBELT
# and GENHLTH spellings against the data.
category_orders = {
    'ECIGNOW2': ['Every Day', 'Some Days', 'Not Currently', 'Never'],  
    'SEATBELT': ['Always', 'Nearly always', 'Sometimes', 'Seldom', 'Never'],
    'GENHLTH': ['Excellent', 'Very good', 'Good', 'Fair', 'Poor']
}

for col in features:

    # Row-normalised cross-tab: each category's Yes/No shares sum to 1.
    ctab = pd.crosstab(df[col], df['ALL_CHRONIC'], normalize='index').fillna(0)
    if col in category_orders:
        ctab = ctab.reindex(category_orders[col])
    ax = ctab.plot(kind='bar', stacked=True, figsize=(8, 4))
    plt.title(f'Chronic Condition Distribution by {x_labels[col]}')
    plt.xlabel(x_labels[col])
    plt.ylabel('Proportion')
    plt.legend(title='Has Chronic Condition')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Saved as images/chronic_vs_<column name in lower case>.jpg
    filename = f"chronic_vs_{col}.jpg".replace(" ", "_").lower()
    filename = os.path.join("images",filename)

    plt.savefig(filename, format='jpg', dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
####Check to see if the filtered data was saved

from pathlib import Path

# Save the cleaned frame once; later runs find the file and skip the write.
file_path = Path("data/rq3_filtered.parquet")
if file_path.exists():
    print("File exists.")
else:
    print("File not found.")
    # Write to the SAME path that is checked above (the data/ folder), so the
    # existence check succeeds on the next run.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(
        file_path,
        engine="pyarrow",
        compression="BROTLI",
        compression_level=11,
        index=False
    )

In [None]:
df['ALL_CARDIAC'].value_counts()

In [None]:
# Optional class balancing on ALL_CHRONIC2: keep every 'Yes' row and an equally
# sized random sample of 'No' rows (seeded for reproducibility).
# NOTE(review): sampling without replacement presumably fails when there are
# fewer 'No' rows than 'Yes' rows -- verify the class counts before enabling.
if subset==True:
    df_yes=df[df['ALL_CHRONIC2']=='Yes']
    df_no=df[df['ALL_CHRONIC2']=='No']
    df_no=df_no.sample(n=len(df_yes),random_state=42)
    df=pd.concat([df_yes,df_no])
else:
    print("skipping subsample...")

In [None]:
# Class counts of the targets after the optional subsampling step.
print(df['ALL_CARDIAC'].value_counts())
print(df['ALL_CHRONIC2'].value_counts())
print(df['ALL_CHRONIC'].value_counts())
print(df['ALL_PUL'].value_counts())


In [None]:


# NOTE: train_test_split returns (train_part, test_part) in that order. With
# test_size=0.70 the first result is the 30% part and the second the 70% part,
# so the unpacking below gives validation 30% and training 70%. The split is
# stratified on ALL_CHRONIC2 and seeded for reproducibility.
df_val, df_train = train_test_split(
    df, test_size=0.70, random_state=42, stratify=df["ALL_CHRONIC2"]
)




print(f"Train shape: {df_train.shape}")
print(f"Validation shape: {df_val.shape}")



Testing for the optimal number of K-modes clusters

In [None]:
#### Check whether K-modes clustering cost results are already saved.
#### Re-running the tuner takes a long time; place "rq3_all_kresults.parquet"
#### in data/ to skip it.

file_path = Path("data/rq3_all_kresults.parquet")
if file_path.exists():
    print("File exists.")
    all_kresults = pd.read_parquet(file_path)
    print(all_kresults.head())
    
else:
    print("Save not found, running tuner...May take a while...")
    # Tuning runs on the validation split. n_cluster, n_trials and cores are
    # hard-coded here; NOTE(review): confirm their meaning in
    # j_clustertuner.kmode_tune and that `cores` suits the machine.
    all_kresults = j_clustertuner.kmode_tune(df_val,feature_list,n_cluster=256,n_trials=10,cores=15)
    j_process.save_if_changed(all_kresults,"data/rq3_all_kresults.parquet")




In [None]:
j_clustertuner.plot_kmode_elbow(all_kresults)

Testing for the optimal number of TensorFlow clusters

In [None]:
##### Check whether silhouette score data is already saved.
##### Re-running the tuner takes a long time; place "rq3_all_tresults.parquet"
##### in data/ to skip it.

file_path = Path("data/rq3_all_tresults.parquet")
if file_path.exists():
    print("File exists.Loading file.")
    all_tresults = pd.read_parquet(file_path)
    print(all_tresults.head())

else:
    print("Save not found, running tuner...May take a while...")

    # The notebook defines only df_train and df_val; the previous first argument
    # (`df_test`) was never defined and raised NameError on a cold cache.
    # NOTE(review): confirm the argument order expected by j_clustertuner.tflow_tune.
    all_tresults = j_clustertuner.tflow_tune(df_train,df_val,feature_list)
    j_process.save_if_changed(all_tresults,"data/rq3_all_tresults.parquet")

In [None]:

j_clustertuner.analyze_silhouette_scores(all_tresults)

In [None]:


# Ensure both are lists, then combine
# Ensure both are lists, then combine
bool_feat = list(bool_feat)
cat_feat = list(cat_feat)
combined_feat = bool_feat + cat_feat

# Restrict both splits to the model features used for MCA.
mca_set = df_train.copy()
mca_set = mca_set[combined_feat]
mca_val = df_val.copy()
mca_val = mca_val[combined_feat]

# Fit MCA on the training features only.
mca = prince.MCA( n_components=27,random_state=42)
mca = mca.fit(mca_set)

X_reduced = mca.transform(mca_set)

# Share of total inertia explained by each component, and its running total.
eigen = mca.eigenvalues_
total = mca.total_inertia_
explained = eigen / total
cumulative = np.cumsum(explained)

# The images folder is otherwise only created inside basic_plot().
os.makedirs("images", exist_ok=True)

plt.plot(range(1, len(cumulative)+1), cumulative, marker='o')
plt.axhline(y=0.8, color='r', linestyle='--', label='80% threshold')
plt.title("Cumulative Explained Inertia (MCA)")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Inertia")
plt.grid(True)
plt.legend()
plt.savefig("images/mca_inertia.jpg", format='jpg', dpi=300)
plt.show()


def _components_to_reach(threshold):
    """Return the 1-based number of components at which the cumulative inertia
    first reaches `threshold`, or None when it is never reached.

    np.argmax on an all-False mask returns 0, which would wrongly report
    "1 component", so the not-reached case is handled explicitly.
    """
    reached = np.flatnonzero(np.asarray(cumulative) >= threshold)
    return int(reached[0]) + 1 if reached.size else None


# Find minimum number of components to reach 80%
k = _components_to_reach(0.80)
if k is None:
    print(f"80% inertia is not reached with {len(cumulative)} components")
else:
    print(f"Components to reach 80% inertia: {k}")

# Same check for the 95% level.
k = _components_to_reach(0.95)
if k is None:
    print(f"95% inertia is not reached with {len(cumulative)} components")
else:
    print(f"Max intertia at: {k}")


In [None]:
# Scree plot: inertia share of each individual MCA component (`explained` comes
# from the previous cell).
plt.plot(range(1, len(explained)+1), explained, marker='o')
plt.title("Scree Plot (Explained Inertia per Component)")
plt.xlabel("Component")
plt.ylabel("Explained Inertia")
plt.grid(True)
plt.savefig("images/screeplot_mca.jpg", format='jpg', dpi=300)
plt.show()

In [None]:
if skip == True:
    print("skipping")
else:
    # Fit only when the plots are wanted, so skip=True avoids the cost of the fit.
    # NOTE(review): this fits on every column of df_train, including the raw
    # condition columns and the derived ALL_* targets -- confirm that is
    # intended rather than the feature-only frame used for the inertia analysis.
    mca = prince.MCA(n_components=2, random_state=42)
    mca = mca.fit(df_train)

    # Individuals (rows) projected on the first two dimensions.
    mca_row_coords = mca.row_coordinates(df_train)
    plt.scatter(mca_row_coords[0], mca_row_coords[1], alpha=0.5)
    plt.title("Individuals Factorial Plane (Dim 1 vs Dim 2)")
    plt.xlabel("Dim 1")
    plt.ylabel("Dim 2")
    plt.grid(True)
    plt.show()

    # For variables
    mca_col_coords = mca.column_coordinates(df_train)
    plt.scatter(mca_col_coords[0], mca_col_coords[1])
    plt.title("Variable Factorial Plane (Dim 1 vs Dim 2)")
    plt.xlabel("Dim 1")
    plt.ylabel("Dim 2")
    plt.grid(True)
    plt.show()

    # Each category's squared coordinate as a share of its squared coordinates
    # summed over both dimensions.
    cos2 = (mca_col_coords ** 2).div((mca_col_coords ** 2).sum(axis=1), axis=0)
    # An expression inside an if/else block is not rendered automatically.
    display(cos2.head())

In [None]:
####future: save ram and clean this up

###create model datasets

####Raw set




# Independent copies of the split, one pair per modelling track, so a track can
# add columns without affecting the others.
# Raw (unclustered) baseline data.
df_rawtrain = df_train.copy()
df_rawval = df_val.copy()


###Kmodes
km_train = df_train.copy()
km_val = df_val.copy()





###Cluster
mc_train=df_train.copy()
mc_val=df_val.copy()


## Baseline and Feature Importance Results


In [None]:

# Your targets
# Targets evaluated by every baseline model.
target_labels = ["ALL_CHRONIC", "ALL_CARDIAC", "ALL_PUL", "ALL_CHRONIC2"]

# Run all models in parallel
# One logistic model per target, each in its own joblib worker; each call is
# also asked to save a feature-importance plot to images/<target>_importance.jpg.
baseline_scores = Parallel(n_jobs=4)(
    delayed(j_process.run_logistic_model)(
        df_rawtrain,
        df_rawval,
        feature_list,
        target,
        plot_importance=True,
        importance_filename=f"images/{target.lower()}_importance.jpg"
    )
    for target in target_labels
)


# Results come back in the same order as target_labels.
all_chronic_scores, all_cardiac_scores, all_pul_scores, all_chronic2_scores = baseline_scores
print(baseline_scores)
# presumably each result is a dict of metric name -> fraction in [0, 1]; verify
# against j_process.run_logistic_model
df_scores = pd.DataFrame(baseline_scores, index=target_labels)


# Convert metrics to percent and round to 3 decimals
df_percent = df_scores.copy() * 100
df_percent = df_percent.round(3)

# Reset index to make target names a column
df_percent.insert(0, "Target", df_percent.index)

# Plot as a table
fig, ax = plt.subplots(figsize=(10, 3))
ax.axis('tight')
ax.axis('off')
table = ax.table(
    cellText=df_percent.values,
    colLabels=df_percent.columns,
    cellLoc='center',
    loc='center'
)

# Add a title
ax.set_title("Baseline Model Performance Metrics (%)", fontsize=14, fontweight="bold", pad=20)

plt.tight_layout()
plt.savefig("images/baseline_scores_table.jpg", dpi=300)
plt.show()

### Logistic Scores

In [None]:
def summarize_model_results(name, score_dict):
    """Build one summary-table row for a model.

    Args:
        name: Label stored under ``'Model'``.
        score_dict: Mapping that may contain ``'accuracy'``, ``'precision'``,
            ``'recall'`` and ``'f1_score'``; a missing metric becomes ``None``.

    Returns:
        dict with the keys ``'Model'``, ``'Accuracy'``, ``'Precision'``,
        ``'Recall'`` and ``'F1 Score'``, in that order.
    """
    # (column label shown in the table, key looked up in score_dict)
    metric_columns = (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1 Score', 'f1_score'),
    )
    row = {'Model': name}
    for column_label, score_key in metric_columns:
        row[column_label] = score_dict.get(score_key)
    return row

# One row per target, built from the logistic baseline scores of the previous cell.
summary_df = pd.DataFrame([
    summarize_model_results('ALL_CHRONIC', all_chronic_scores),
    summarize_model_results('ALL_CARDIAC', all_cardiac_scores),
    summarize_model_results('ALL_PUL', all_pul_scores),
    summarize_model_results('ALL_CHRONIC2', all_chronic2_scores)
])


print(summary_df)


In [None]:
# Per-target result stores: feature name -> scores of a logistic model trained
# on that single feature.
all_chronic_results_dict = {}
all_cardiac_results_dict = {}
all_pul_results_dict = {}
all_chronic2_results_dict = {}

# Targets are processed one after another, each over the full feature list.
per_target_results = (
    ("ALL_CHRONIC", all_chronic_results_dict),
    ("ALL_CARDIAC", all_cardiac_results_dict),
    ("ALL_PUL", all_pul_results_dict),
    ("ALL_CHRONIC2", all_chronic2_results_dict),
)

for target_name, result_store in per_target_results:
    for feature in feature_list:
        results = j_process.run_logistic_model(df_rawtrain, df_rawval, [feature], target_name)
        result_store[feature] = results

In [None]:
# Render one results table per target: display the frame, then draw it as a
# matplotlib table and save it as an image.
single_feature_frames = {}

table_jobs = (
    ("ALL_CHRONIC", all_chronic_results_dict, "images/all_chronic_results.jpg"),
    ("ALL_CARDIAC", all_cardiac_results_dict, "images/all_cardiac_results.jpg"),
    ("ALL_PUL", all_pul_results_dict, "images/all_pul_results.jpg"),
    ("ALL_CHRONIC2", all_chronic2_results_dict, "images/all_chronic2_results.jpg"),
)

for target_name, scores_by_feature, image_path in table_jobs:
    # Metrics as percentages rounded to 3 decimals, with the feature as a column.
    frame = pd.DataFrame.from_dict(scores_by_feature, orient='index') * 100
    frame = frame.round(3)
    frame.index.name = 'Feature'
    frame.reset_index(inplace=True)
    display(frame)

    # Figure height grows with the number of rows.
    fig, ax = plt.subplots(figsize=(10, len(frame) * 0.35))
    ax.axis('tight')
    ax.axis('off')
    table = ax.table(cellText=frame.values,
                     colLabels=frame.columns,
                     cellLoc='center',
                     loc='center')
    ax.set_title(f"{target_name}: Single-Feature Logistic Regression Results (%)", fontsize=14, fontweight="bold", pad=20)
    plt.tight_layout()
    plt.savefig(image_path, dpi=300)
    plt.show()

    single_feature_frames[target_name] = frame

# Keep the per-target frames available under their notebook-level names.
df_all_chronic = single_feature_frames["ALL_CHRONIC"]
df_all_cardiac = single_feature_frames["ALL_CARDIAC"]
df_all_pul = single_feature_frames["ALL_PUL"]
df_all_chronic2 = single_feature_frames["ALL_CHRONIC2"]

In [None]:
###RANDOM FOREST BASELINE

# Run Random Forest on all features at once
# One model per target, trained on df_rawtrain and scored on df_rawval.
rf_all_chronic_scores = j_process.run_rf_model(df_rawtrain, df_rawval, feature_list, "ALL_CHRONIC")
rf_all_cardiac_scores = j_process.run_rf_model(df_rawtrain, df_rawval, feature_list, "ALL_CARDIAC")
rf_all_pul_scores = j_process.run_rf_model(df_rawtrain, df_rawval, feature_list, "ALL_PUL")
rf_chronic2_scores = j_process.run_rf_model(df_rawtrain,df_rawval,feature_list,"ALL_CHRONIC2")

# One summary row per target (summarize_model_results is defined in an earlier cell).
rfsummary_df = pd.DataFrame([
    summarize_model_results('ALL_CHRONIC', rf_all_chronic_scores),
    summarize_model_results('ALL_CARDIAC', rf_all_cardiac_scores),
    summarize_model_results('ALL_PUL', rf_all_pul_scores),
    summarize_model_results('ALL_CHRONIC2', rf_chronic2_scores)
])


print(rfsummary_df)

In [None]:
# Scale the metric columns to percentages and round to 3 decimals.
rf_table = rfsummary_df.copy()
for col in rf_table.columns:
    if col != 'Model':
        rf_table[col] = (rf_table[col] * 100).round(3)

# Create figure
fig, ax = plt.subplots(figsize=(12, 3))  # Adjust width as needed
ax.axis('off')

# Draw the frame as a table filling the whole axes.
table = ax.table(
    cellText=rf_table.values,
    colLabels=rf_table.columns,
    cellLoc='center',
    loc='center',
    bbox=[0, 0, 1, 1]
)
table.auto_set_font_size(False)
table.set_fontsize(12)

# Let matplotlib size every column to its content.
for i in range(len(rf_table.columns)):
    table.auto_set_column_width(i)

# Title
ax.set_title("Random Forest Validation Metrics (%)", fontsize=14, fontweight="bold", pad=20)

# Save
plt.tight_layout()
plt.savefig("images/rf_validation_results_fixed.jpg", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
#### Optional section: per-feature Random Forest baselines.
#### It only runs when the `redundant` flag from the settings cell is True.

if redundant == False:
    print("skipping redundant tests")
else:
    
    
    

    # Per-target stores: feature name -> scores of a Random Forest trained on
    # that single feature.
    rf_all_chronic_results_dict = {}
    rf_all_cardiac_results_dict = {}
    rf_all_pul_results_dict = {}
    rf_all_chronic2_results_dict={}
    
    
    
    
    # Lets the loop below pick the right store by target label.
    rf_result_dicts = {
        "ALL_CHRONIC": rf_all_chronic_results_dict,
        "ALL_CARDIAC": rf_all_cardiac_results_dict,
        "ALL_PUL": rf_all_pul_results_dict,
        "ALL_CHRONIC2": rf_all_chronic2_results_dict
    }
    
    for label in target_labels:
        for feature in feature_list:
            # Replace the previous progress line instead of flooding the output.
            clear_output(wait=True)
            print(f"\n---Running RF baseline on feature: {feature} [{label}] ---")
            results = j_process.run_rf_model(df_rawtrain, df_rawval, [feature], label)
            rf_result_dicts[label][feature] = results
        

In [None]:
#####TENSOR FLOW BASELINE

# One TensorFlow model per target, trained on df_rawtrain and scored on df_rawval.
# NOTE(review): only the first call passes verbose=0; the other three use
# run_tf_model's default verbosity -- confirm whether that is intended.
tf_all_chronic_scores = j_process.run_tf_model(df_rawtrain, df_rawval, feature_list, "ALL_CHRONIC",verbose=0)
tf_all_cardiac_scores = j_process.run_tf_model(df_rawtrain, df_rawval, feature_list, "ALL_CARDIAC")
tf_all_pul_scores = j_process.run_tf_model(df_rawtrain, df_rawval, feature_list, "ALL_PUL")
tf_chronic2_scores = j_process.run_tf_model(df_rawtrain,df_rawval,feature_list,"ALL_CHRONIC2")
# One summary row per target (summarize_model_results is defined in an earlier cell).
tfsummary_df = pd.DataFrame([
    summarize_model_results('ALL_CHRONIC', tf_all_chronic_scores),
    summarize_model_results('ALL_CARDIAC', tf_all_cardiac_scores),
    summarize_model_results('ALL_PUL', tf_all_pul_scores),
    summarize_model_results('ALL_CHRONIC2', tf_chronic2_scores)
])


print(tfsummary_df)

In [None]:
# Prepare table: metric columns as percentages rounded to 3 decimals.
tf_table = tfsummary_df.copy()
for col in tf_table.columns:
    if col != 'Model':
        tf_table[col] = (tf_table[col] * 100).round(3)

# Plot table as image
fig, ax = plt.subplots(figsize=(12, 3))  # Widen as needed
ax.axis('off')

# The bbox makes the table fill the whole axes.
table = ax.table(
    cellText=tf_table.values,
    colLabels=tf_table.columns,
    cellLoc='center',
    loc='center',
    bbox=[0, 0, 1, 1]
)

# Font and layout fixes
table.auto_set_font_size(False)
table.set_fontsize(12)
# Let matplotlib size every column to its content.
for i in range(len(tf_table.columns)):
    table.auto_set_column_width(i)

ax.set_title("TensorFlow Validation Metrics (%)", fontsize=14, fontweight="bold", pad=20)
plt.tight_layout()
plt.savefig("images/tf_validation_results.jpg", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
###### Optional section: per-feature TensorFlow baselines.
###### It only runs when the `redundant` flag from the settings cell is True.


if redundant == False:
    print("skipping redundant tests")
else:

    ####Tensor flow output
    # Prepare per-feature result dicts
    # (feature name -> scores of a model trained on that single feature)
    tf_all_chronic_results_dict = {}
    tf_all_cardiac_results_dict = {}
    tf_all_pul_results_dict = {}
    tf_all_chronic2_results_dict={}
    # Individual feature runs for ALL_CHRONIC
    for feature in feature_list:
        print(f"\n---Running baseline on feature: {feature} [ALL_CHRONIC] ---")
        results = j_process.run_tf_model(df_rawtrain, df_rawval, [feature], "ALL_CHRONIC")
        tf_all_chronic_results_dict[feature] = results
    
    # Individual feature runs for ALL_CARDIAC
    for feature in feature_list:
        print(f"\n---Running baseline on feature: {feature} [ALL_CARDIAC] ---")
        results = j_process.run_tf_model(df_rawtrain, df_rawval, [feature], "ALL_CARDIAC")
        tf_all_cardiac_results_dict[feature] = results
    
    # Individual feature runs for ALL_PUL
    for feature in feature_list:
        print(f"\n---Running baseline on feature: {feature} [ALL_PUL] ---")
        results = j_process.run_tf_model(df_rawtrain, df_rawval, [feature], "ALL_PUL")
        tf_all_pul_results_dict[feature] = results
    # Individual feature runs for ALL_CHRONIC2
    for feature in feature_list:
        print(f"\n---Running baseline on feature: {feature} [ALL_CHRONIC2] ---")
        results = j_process.run_tf_model(df_rawtrain, df_rawval, [feature], "ALL_CHRONIC2")
        tf_all_chronic2_results_dict[feature] = results

## Cluster with selected K's

### Kmodes

In [None]:
from joblib import Parallel, delayed

# Candidate cluster counts (K); also consumed by the TensorFlow clustering cell.
k_trials = [2,3,4,5,6,7,8,9,10,15,20,25,35,40,50,100]

# Names of the cluster-assignment columns. The modelling cells further down
# iterate over this list, so it must be populated on BOTH branches below.
cluster_columns = []

if skip == True:
    # Reuse cluster assignments saved by a previous run instead of re-clustering.
    cluster_df_train = pd.read_parquet("data/cluster_df_train.parquet")
    cluster_df_val = pd.read_parquet("data/cluster_df_val.parquet")
    # Rebuild the column list from the loaded frame. Without this the list
    # stays empty and every downstream loop over cluster_columns silently
    # does nothing. Cluster columns are the ones that are not part of
    # df_train and carry the 'kmode'/'tf' prefix this notebook uses when
    # selecting cluster columns for plotting.
    cluster_columns = [
        col for col in cluster_df_train.columns
        if col not in df_train.columns and str(col).startswith(("kmode", "tf"))
    ]
    print("Data loaded...")
    print(cluster_df_train.head())
else:
    # Work on copies so df_train / df_val stay untouched.
    cluster_df_train = df_train.copy()
    cluster_df_val = df_val.copy()

    def run_kmodes_parallel(trial, df, feature_list):
        """Cluster ``df`` into ``trial`` K-modes clusters.

        Runs ``j_process.run_kmodes_cluster`` on a copy of ``df`` and returns
        ``(cluster_label_series, column_name)`` so the caller can attach the
        labels to its own frame. The third value returned by the helper is
        ignored.
        """
        print(f"[K-Modes] {trial} clusters")
        updated_df, col_name, _ = j_process.run_kmodes_cluster(
            df.copy(), feature_cols=feature_list, n_clusters=trial
        )
        return updated_df[col_name], col_name

    # Train set: one joblib task per K, using all available cores.
    results_train = Parallel(n_jobs=-1)(
        delayed(run_kmodes_parallel)(trial, cluster_df_train, feature_list)
        for trial in k_trials
    )

    for col_series, col_name in results_train:
        cluster_df_train[col_name] = col_series
        cluster_columns.append(col_name)

    # Val set in parallel.
    # NOTE(review): the validation set is clustered independently of the
    # training set, so cluster ids are presumably not comparable between the
    # two frames — confirm this is intended.
    results_val = Parallel(n_jobs=-1)(
        delayed(run_kmodes_parallel)(trial, cluster_df_val, feature_list)
        for trial in k_trials
    )

    for col_series, col_name in results_val:
        cluster_df_val[col_name] = col_series


### TensorFlow clustering

In [None]:
# TensorFlow-based clustering for every K in k_trials. Skipped when `skip`
# is True, because the K-modes cell then loads saved cluster frames instead.
if skip == True:
    print("skipping...")

else:
    for trial in k_trials:
        print(f"[TF Clustering] {trial} clusters")
        print("Train Set")
        # The helper returns the updated frame and the name of the new
        # cluster-assignment column.
        cluster_df_train, col_train = j_process.run_tf_clustering(
            cluster_df_train, feature_cols=feature_list, n_clusters=trial
        )
        # Register the column once only: re-running this cell must not add
        # duplicate names (duplicates would repeat every downstream model run).
        if col_train not in cluster_columns:
            cluster_columns.append(col_train)
        print("Val Set")
        # NOTE(review): validation rows are clustered separately from the
        # training rows — confirm cluster ids are meant to be comparable.
        cluster_df_val, _ = j_process.run_tf_clustering(
            cluster_df_val, feature_cols=feature_list, n_clusters=trial
        )


In [None]:
# Persist the clustered frames unless they were just loaded from disk.
if skip == True:
    print("No need to save....")
else:
    # Validation frame first, then training frame.
    for _frame, _target_path in (
        (cluster_df_val, "data/cluster_df_val.parquet"),
        (cluster_df_train, "data/cluster_df_train.parquet"),
    ):
        j_process.p_save(_frame, _target_path)

In [None]:
### Sanity check: list the cluster columns that the modelling loop will use.

# Accumulator filled by the modelling cell that follows.
clustering_only = []

for km in cluster_columns:
    print(km)

In [None]:

# Fit logistic, TensorFlow and random-forest models on the base features plus
# one cluster-assignment column at a time, for every target.
#
# Start from an empty list so re-running this cell does not append a second
# copy of every result.
clustering_only = []

for km in cluster_columns:
    for target in target_labels:
        # Single progress line per (cluster column, target) pair.
        print(f"{km} -> {target}")
        results = j_process.run_logistic_model(cluster_df_train,cluster_df_val,feature_list+[km],target)
        clustering_only.append(results)
        results = j_process.run_tf_model(cluster_df_train,cluster_df_val,feature_list+[km],target,verbose=1)
        clustering_only.append(results)
        results = j_process.run_rf_model(cluster_df_train,cluster_df_val,feature_list+[km],target)
        clustering_only.append(results)

# NOTE(review): results are stored in call order (logistic, TF, RF) without an
# explicit model / target / cluster-column label — confirm the helpers record
# that information themselves.

In [None]:
# Debug output: first rows of the clustered training frame and the feature
# names passed to the models.
print(cluster_df_train.head())
print(feature_list)

In [None]:

# Inspect one cluster: pick the cluster-assignment column and the cluster id.
cluster_col = 'tf_n4_d8_e50'
cluster_val = 3

# Target columns to summarise (also reused by the interactive widget cell).
target_cols = ['ALL_CHRONIC', 'ALL_CARDIAC', 'ALL_PUL', 'ALL_CANCER','ALL_CHRONIC2']

# Step 1: keep only the training rows assigned to the chosen cluster.
# Deliberately NOT named `subset`: that name is a boolean flag defined in the
# settings cell and must not be replaced by a DataFrame.
cluster_rows = cluster_df_train[cluster_df_train[cluster_col] == cluster_val]

# Step 2: count 'Yes' / 'No' answers per target (0 when a value never occurs).
yes_counts = []
no_counts = []

for col in target_cols:
    vc = cluster_rows[col].value_counts()
    yes_counts.append(vc.get('Yes', 0))
    no_counts.append(vc.get('No', 0))

# Step 3: stacked bar chart, 'No' stacked on top of 'Yes'.
x_pos = list(range(len(target_cols)))

fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(x_pos, yes_counts, label='Yes', color='steelblue')
ax.bar(x_pos, no_counts, bottom=yes_counts, label='No', color='lightgray')

ax.set_xticks(x_pos)
ax.set_xticklabels(target_cols, rotation=45)
ax.set_ylabel('Count')
ax.set_title(f'"Yes" and "No" Counts for {cluster_col} = {cluster_val}')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Every cluster-assignment column in the training frame (K-modes and TF prefixes).
plot_cols = [
    name for name in cluster_df_train.columns
    if name.startswith(('kmode', 'tf'))
]

# One bar chart per cluster column showing how many rows fall in each cluster.
for col in plot_cols:
    counts = cluster_df_train[col].value_counts().sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(12, 8))
    counts.plot(kind='bar', ax=ax)
    ax.set_title(f'Value Counts for {col}')
    ax.set_xlabel('Cluster Label')
    ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()

## MCA Dimensionality Reduction

In [None]:
# Feature-only views of the train / validation frames for MCA.
print(feature_list)
mca_df_train = df_train.loc[:, feature_list].copy()
mca_df_val = df_val.loc[:, feature_list].copy()
print(mca_df_train.head())

### MCA analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Result tables are written to images/, so make sure the folder exists.
os.makedirs("images", exist_ok=True)

# One dict of metrics per (component count, target, model) combination.
mca_results = []

for n_components in sorted(mca_set.keys()):
    # Step 1: Run MCA on dropped-column (feature-only) data.
    # NOTE(review): MCA is run separately on the validation frame, so the
    # validation components are presumably not in the same basis as the
    # training components — confirm against j_process.run_mca.
    train_mca = j_process.run_mca(mca_set[n_components], n_components=n_components)
    val_mca = j_process.run_mca(mca_val[n_components], n_components=n_components)

    # Rename MCA columns to ensure uniqueness across component counts.
    mca_feat_cols = [f"mca_{n_components}_{i}" for i in range(n_components)]
    train_mca.columns = mca_feat_cols
    val_mca.columns = mca_feat_cols

    # Step 2: Reattach targets by row position. BOTH sides get a fresh
    # RangeIndex: if the MCA output keeps the input's index while only the
    # targets are reset, concat would align on mismatched labels and pad
    # with NaN rows.
    train_combined = pd.concat(
        [train_mca.reset_index(drop=True), df_train[target_labels].reset_index(drop=True)],
        axis=1,
    )
    val_combined = pd.concat(
        [val_mca.reset_index(drop=True), df_val[target_labels].reset_index(drop=True)],
        axis=1,
    )

    # Step 3: Run models and tag each result with its configuration.
    for target in target_labels:
        print(f"Running MCA={n_components}, Target={target}...")

        log_results = j_process.run_logistic_model(train_combined, val_combined, mca_feat_cols, target)
        log_results.update({
            "Model": "Logistic",
            "Target": target,
            "MCA Components": n_components
        })
        mca_results.append(log_results)

        tf_results = j_process.run_tf_model(train_combined, val_combined, mca_feat_cols, target, verbose=0)
        tf_results.update({
            "Model": "TensorFlow",
            "Target": target,
            "MCA Components": n_components
        })
        mca_results.append(tf_results)

# Step 4: Create summary table. `.copy()` makes the column selection an
# independent frame so the in-place percentage conversion below is safe.
metric_cols = ["accuracy", "precision", "recall", "f1_score"]
df_mca_all = pd.DataFrame(mca_results)
df_mca_all = df_mca_all[["MCA Components", "Target", "Model"] + metric_cols].copy()

# Metrics are converted from fractions to percentages, rounded to 3 decimals.
df_mca_all[metric_cols] = (df_mca_all[metric_cols] * 100).round(3)

display(df_mca_all)

# Step 5: Save one image per MCA component count.
for comp in sorted(df_mca_all["MCA Components"].unique()):
    # Not named `subset`: that name is a boolean flag from the settings cell.
    comp_rows = df_mca_all[df_mca_all["MCA Components"] == comp]

    # Figure height grows with the number of table rows.
    fig, ax = plt.subplots(figsize=(12, 0.5 + len(comp_rows) * 0.5))
    ax.axis('tight')
    ax.axis('off')

    table = ax.table(
        cellText=comp_rows.values,
        colLabels=comp_rows.columns,
        cellLoc='center',
        loc='center',
        bbox=[0, 0, 1, 1]
    )

    table.auto_set_font_size(False)
    table.set_fontsize(12)

    ax.set_title(f"MCA-{comp} Model Results (%)", fontsize=14, fontweight="bold", pad=20)

    fig.tight_layout()
    fig.savefig(f"images/mca_{comp}_results.jpg", dpi=300, bbox_inches='tight')
    # Close explicitly so figures do not accumulate across loop iterations.
    plt.close(fig)


In [None]:
# Debug output: show the feature names currently in use.
print(feature_list)

In [None]:
# Disabled: keep only columns starting with 'mca_' or 'ALL_'.
#mca2_df_train = mca2_df_train[[col for col in mca2_df_train.columns if col.startswith("mca_") or col.startswith("ALL_")]]
#mca2_df_val = mca2_df_val[[col for col in mca2_df_val.columns if col.startswith("mca_") or col.startswith("ALL_")]]


In [None]:
# Render (and re-save) one results table per MCA component count, this time
# also showing each figure inline.
for comp in sorted(df_mca_all["MCA Components"].unique()):
    # Not named `subset`: that name is a boolean flag from the settings cell.
    comp_rows = df_mca_all[df_mca_all["MCA Components"] == comp]

    # Same sizing rule as the MCA analysis cell that writes these same files,
    # so re-running this cell does not change the saved image dimensions.
    fig, ax = plt.subplots(figsize=(12, 0.5 + len(comp_rows) * 0.5))
    ax.axis('tight')
    ax.axis('off')
    table = ax.table(
        cellText=comp_rows.values,
        colLabels=comp_rows.columns,
        cellLoc='center',
        loc='center',
        bbox=[0, 0, 1, 1]
    )
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    ax.set_title(f"MCA-{comp} Model Results (%)", fontsize=14, fontweight="bold", pad=20)

    fig.tight_layout()
    fig.savefig(f"images/mca_{comp}_results.jpg", dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure once it has been displayed.
    plt.close(fig)

In [None]:
# Debug output.
# NOTE(review): `mca_only` is not assigned in the nearby cells — confirm the
# cell that defines it runs before this one on a fresh kernel.
print(mca_only)


In [None]:
# Debug output; same statement as the preceding cell.
# NOTE(review): looks like a leftover duplicate — confirm whether it is still needed.
print(mca_only)

In [None]:
# Debug output for the MCA inputs.
# The MCA analysis cell indexes `mca_set` by component count
# (`mca_set[n_components]`), i.e. it is used as a mapping of frames. A plain
# dict has no `.head()`, so preview each entry in that case; a DataFrame is
# still previewed directly.
if isinstance(mca_set, dict):
    for _n_comp, _mca_frame in mca_set.items():
        print(f"mca_set[{_n_comp}]")
        print(_mca_frame.head())
else:
    print(mca_set.head())
# NOTE(review): `combined_feat` is not assigned in the nearby cells — confirm
# its defining cell runs first.
print(combined_feat)

In [None]:

import ipywidgets as widgets

# NOTE(review): this rebinds the notebook-wide `df` (the raw frame loaded at
# the top, which the early helper functions read) to a copy of the clustered
# training frame. It is kept because the following widget cell reads `df`.
df = cluster_df_train.copy()
# The callbacks below use this private alias, so they keep working even if
# another cell rebinds `df` later.
cluster_counts_df = df

# Split column names into kmode and tf lists
kmode_cols = [col for col in cluster_counts_df.columns if col.startswith('kmode')]
tf_cols = [col for col in cluster_counts_df.columns if col.startswith('tf')]

# Dropdown widget: which family of cluster columns to browse
col_type = widgets.Dropdown(
    options=['kmode', 'tf'],
    description='Type:'
)

# Dropdown widget for specific column (updated dynamically)
col_dropdown = widgets.Dropdown(description='Column:')

# Output area for the plot. It has its own name because callbacks look up
# globals when they fire: with the generic name `output`, a later cell that
# also assigns `output` would redirect this cell's plots into its widget.
counts_output = widgets.Output()

def update_col_dropdown(*args):
    """Fill the column dropdown with the columns of the selected type."""
    if col_type.value == 'kmode':
        col_dropdown.options = kmode_cols
    else:
        col_dropdown.options = tf_cols

def plot_column_counts(change):
    """Redraw the bar chart of cluster sizes for the selected column.

    ``change`` is the traitlets change notification; it is not inspected,
    the current dropdown value is read instead.
    """
    with counts_output:
        counts_output.clear_output()
        col = col_dropdown.value
        # No column selected (e.g. an empty option list): draw nothing.
        if col:
            counts = cluster_counts_df[col].value_counts().sort_index()
            fig, ax = plt.subplots(figsize=(6, 4))
            counts.plot(kind='bar', ax=ax)
            ax.set_title(f'Value Counts for {col}')
            ax.set_xlabel('Value')
            ax.set_ylabel('Count')
            fig.tight_layout()
            plt.show()

col_type.observe(update_col_dropdown, names='value')
col_dropdown.observe(plot_column_counts, names='value')

# Initialize dropdown
update_col_dropdown()

# Display widgets
display(widgets.VBox([col_type, col_dropdown, counts_output]))

In [None]:
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display


# Two selectable column sets: the model features and the target columns.
feature_list_1 = feature_list
feature_list_2 = target_cols

# Read the cluster assignments straight from the clustered training frame
# rather than from the generic global `df`, so this cell does not depend on
# another cell having rebound `df` first.
cluster_source_df = cluster_df_train

cluster_cols = [
    col for col in cluster_source_df.columns
    if col.startswith("kmode") or col.startswith("tf")
]

cluster_col_dropdown = widgets.Dropdown(options=cluster_cols, description='Cluster Column:')
cluster_val_dropdown = widgets.Dropdown(description='Cluster Value:')
feature_set_dropdown = widgets.Dropdown(options=['Feature Set 1', 'Feature Set 2'], description='Feature Set:')

# Dedicated output area (not the generic name `output`, which another widget
# cell also assigns and whose callbacks resolve at call time).
stacked_output = widgets.Output()

# Preferred stacking order and colours for the known responses.
response_order = ["No", "Yes", "Refused", "Don't know", "Missing"]
color_map = {
    "No": "#636EFA",
    "Yes": "#00CC96",
    "Refused": "#FFA15A",
    "Don't know": "#AB63FA",
    "Missing": "#B6E880"
}


def update_cluster_vals(*args):
    """Refresh the cluster-value dropdown for the selected cluster column."""
    selected_col = cluster_col_dropdown.value
    if selected_col:
        cluster_val_dropdown.options = sorted(cluster_source_df[selected_col].dropna().unique())


def plot_stacked_bar(*args):
    """Draw per-feature response counts for the selected cluster as a stacked bar chart."""
    with stacked_output:
        stacked_output.clear_output()
        cluster_col = cluster_col_dropdown.value
        cluster_val = cluster_val_dropdown.value

        # Nothing selected yet (e.g. no cluster columns available).
        if cluster_col is None or cluster_val is None:
            print("Select a cluster column and a cluster value to plot.")
            return

        if feature_set_dropdown.value == 'Feature Set 1':
            features = feature_list_1
        else:
            features = feature_list_2

        filtered = cluster_source_df[cluster_source_df[cluster_col] == cluster_val]

        records = []
        for col in features:
            counts = filtered[col].value_counts(dropna=False)
            # Denominator covers everything that is counted (including NaN),
            # so the percentages of one feature add up to 100%.
            total = counts.sum()
            for val, count in counts.items():
                pct = 100 * count / total if total else 0
                records.append({
                    'Feature': col,
                    # NaN is reported under the "Missing" label instead of
                    # the string 'nan', which is not a known response.
                    'Response': "Missing" if pd.isna(val) else str(val),
                    'Count': count,
                    'Percentage': f"{pct:.1f}%"
                })

        if not records:
            print("No rows to plot for this selection.")
            return

        plot_df = pd.DataFrame(records)

        # Values outside a Categorical's categories become NaN and would
        # vanish from the chart, so append any response that is not in the
        # preferred order after the known ones.
        extra_responses = sorted(set(plot_df["Response"]) - set(response_order))
        all_responses = response_order + extra_responses

        plot_df["Response"] = pd.Categorical(plot_df["Response"], categories=all_responses, ordered=True)
        plot_df = plot_df.sort_values("Response")  # Enforce consistent stacking

        fig = px.bar(
            plot_df,
            x='Feature',
            y='Count',
            color='Response',
            text='Percentage',
            hover_data={'Count': True, 'Percentage': True, 'Response': True},
            color_discrete_map=color_map
        )

        fig.update_layout(
            barmode='stack',
            title=f'Stacked Bar: Cluster {cluster_val} in {cluster_col}',
            xaxis_title='Feature',
            yaxis_title='Count',
            showlegend=False
        )

        fig.update_traces(textposition='inside')
        fig.show()


cluster_col_dropdown.observe(update_cluster_vals, names='value')
cluster_val_dropdown.observe(plot_stacked_bar, names='value')
feature_set_dropdown.observe(plot_stacked_bar, names='value')
update_cluster_vals()
display(widgets.VBox([
    cluster_col_dropdown,
    cluster_val_dropdown,
    feature_set_dropdown,
    stacked_output
]))


In [None]:

from PIL import Image
from fpdf import FPDF
from datetime import datetime

# Image placement on the page, in the PDF's user units.
# NOTE(review): FPDF() is called with its defaults — presumably millimetres
# on A4 portrait; confirm against the installed fpdf version.
IMAGE_X = 15
IMAGE_Y = 40
MAX_IMAGE_WIDTH = 180
BOTTOM_MARGIN = 15

# PDF setup
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=BOTTOM_MARGIN)

# Vertical room left for the image below the two header lines.
max_image_height = pdf.h - IMAGE_Y - BOTTOM_MARGIN

# Folder with images
image_dir = 'images/'

# Get all image files with their os.path.getctime timestamp.
# NOTE(review): getctime is the creation time on Windows but the last
# metadata change on Unix — confirm it is the ordering you want.
image_files = [
    (f, os.path.getctime(os.path.join(image_dir, f)))
    for f in os.listdir(image_dir)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
]

# Sort by that timestamp, oldest first
image_files.sort(key=lambda x: x[1])

# Add each image to the PDF, one page per image
for i, (filename, ctime) in enumerate(image_files, start=1):
    filepath = os.path.join(image_dir, filename)

    pdf.add_page()

    title = os.path.splitext(filename)[0]
    date_str = datetime.fromtimestamp(ctime).strftime("%Y-%m-%d %H:%M:%S")

    pdf.set_font("Arial", size=14)
    pdf.cell(0, 10, f"Fig {i}: {title}", ln=True)
    pdf.set_font("Arial", size=10)
    pdf.cell(0, 10, f"Created: {date_str}", ln=True)

    # Read the pixel size only, and close the file handle straight away.
    with Image.open(filepath) as im:
        width_px, height_px = im.size
    aspect = width_px / height_px

    # Fit inside BOTH limits: use the full width unless the resulting height
    # would run past the bottom margin, in which case shrink the width so
    # the height fits. fpdf keeps the aspect ratio when only `w` is given.
    draw_width = min(MAX_IMAGE_WIDTH, max_image_height * aspect)
    pdf.image(filepath, x=IMAGE_X, y=IMAGE_Y, w=draw_width)

# Save PDF
pdf.output("output.pdf")
