### Persistence images from distance matrix

In [17]:
# import libraries
import numpy as np
import re
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import traceback
import pandas as pd


from ripser import Rips
from persim import PersistenceImager

import glob
import pickle
import geopandas as gpd
import dcor
import pandas as pd

In [4]:
# remove warnings
import warnings
warnings.filterwarnings("ignore")

In [8]:
import ripser
import persim

def diagram_sizes(dgms):
    return ", ".join([f"|$H_{i}$|={len(d)}" for i, d in enumerate(dgms)])

In [21]:
# Define constant variables
DATA_DIR = "/Users/h6x/ORNL/git/modeling-ideas/overdose modeling for entire country/data/processed data/selected coordinates for each state - percentiles(below 90th)- all variables"
RESULTS_DIR = "/Users/h6x/ORNL/git/modeling-ideas/overdose modeling for entire country/results/persistence images/below 90th percentile/h1h0/distance matrix 16 channels npy"
VARIABLES = ['EP_POV','EP_UNEMP','EP_PCI','EP_NOHSDP','EP_UNINSUR','EP_AGE65','EP_AGE17','EP_DISABL','EP_SNGPNT','EP_LIMENG','EP_MINRTY','EP_MUNIT','EP_MOBILE','EP_CROWD','EP_NOVEH','EP_GROUPQ']
PERSISTENCE_IMAGE_SHAPE = (310, 310)
NUMBER_OF_VARIABLES = 16
PERSISTENCE_IMAGE_PARAMS = {
    'pixel_size': 0.001,
    'birth_range': (0.0, 0.31),
    'pers_range': (0.0, 0.31),
    'kernel_params': {'sigma': 0.00004}
}

In [26]:
# imoporting SVI data for the entire US(county level) 
us_svi = gpd.read_file('/Users/h6x/ORNL/git/modeling-ideas/overdose modeling for entire country/data/processed data/svi with hepvu/2018/SVI 2018 with HepVu census tracts/SVI2018_US_census_with_opioid_indicators.shp')

In [11]:
# Function to get the list of folders in a specified location
def get_folders(location):
    return [name for name in os.listdir(location) if os.path.isdir(os.path.join(location, name))]

# Get the list of state folders
states = get_folders(DATA_DIR)

In [12]:
# Create a folder for each variable if it does not exist
for variable in VARIABLES:
    os.makedirs(os.path.join(RESULTS_DIR, variable), exist_ok=True)
print('Done creating folders for each variable')

Done creating folders for each variable


In [37]:
# Loop through each state
for state in tqdm(states, desc="Processing states"):
    print('Processing:', state)

    try:
        # Load data from pickle files into a dictionary
        data = {}

        for file in glob.glob(os.path.join(DATA_DIR, state, '*.pkl')):
            with open(file, 'rb') as f:
                
                # Extract the last 20 characters of the file name
                extracted_words = file[-20:]

                # Search for numbers in the extracted string
                match = re.search(r'(\d+)', extracted_words)
                if match:
                    extracted_number = match.group(1)
                    # Load the pickle file data into the dictionary
                    data[extracted_number] = pd.read_pickle(f)
                else:
                    print("No number found in the string.")

                 # Process each county (FIPS) in the data
        for fips, dictionary in data.items():
            # Dictionary where the key is the county code (FIPS) and the value is another dictionary
            # print('Processing:', fips)

            for key, value in dictionary.items():

                # If the value is not empty, process it
                if len(value) > 0:
                    distance_matrix = np.zeros((NUMBER_OF_VARIABLES, NUMBER_OF_VARIABLES))

                    # get the selected fips realated to the specific variable
                    selected_fips = value['FIPS'].tolist()
     
                    # filter the dataframe svi_df to only include the selected fips
                    svi_df_selected_fips = us_svi[us_svi['FIPS'].isin(selected_fips)]
                    svi_df_selected_fips = svi_df_selected_fips[VARIABLES]

                    # create a matrix
                    data_matrix = svi_df_selected_fips.to_numpy()

                    # compute the distance matrix
                    for i in range(NUMBER_OF_VARIABLES):
                        for j in range(NUMBER_OF_VARIABLES):
                            distance_matrix[i, j] = dcor.distance_correlation(data_matrix[:, i], data_matrix[:, j]) #Distance Correlations 

                    distance_matrix = distance_matrix.T + np.eye(NUMBER_OF_VARIABLES)
                    distance_matrix = 1 - distance_matrix



                else:
                    # If there is no data to compute persistence image, save an empty image
                    peristence_image = np.zeros(PERSISTENCE_IMAGE_SHAPE)
                    # np.save(os.path.join(RESULTS_DIR, key, fips), peristence_image)

                    # break
                # break
                

            # break
        
        # break

    except Exception as e:
        print(f"Error processing {state}: {e}")
        traceback.print_exc()
        continue  # Continue to the next iteration if an error occurs

print('All states processed.')

Processing states:   0%|          | 0/50 [00:00<?, ?it/s]

Processing: VT


Processing states:   2%|▏         | 1/50 [00:03<02:52,  3.53s/it]

Processing: VA


Processing states:   4%|▍         | 2/50 [00:37<17:10, 21.46s/it]

Processing: SD


Processing states:   6%|▌         | 3/50 [00:53<14:52, 19.00s/it]

Processing: SC


Processing states:   8%|▊         | 4/50 [01:06<12:45, 16.65s/it]

Processing: UT


Processing states:  10%|█         | 5/50 [01:14<10:12, 13.62s/it]

Processing: GA


Processing states:  12%|█▏        | 6/50 [01:57<17:14, 23.51s/it]

Processing: MS


Processing states:  14%|█▍        | 7/50 [04:50<51:54, 72.43s/it]

Processing: MT


Processing states:  16%|█▌        | 8/50 [05:04<37:31, 53.60s/it]

Processing: MO


Processing states:  18%|█▊        | 9/50 [05:34<31:46, 46.50s/it]

Processing: MA


Processing states:  20%|██        | 10/50 [05:43<23:06, 34.67s/it]

Processing: AK


Processing states:  22%|██▏       | 11/50 [05:47<16:32, 25.44s/it]

Processing: KY


Processing states:  24%|██▍       | 12/50 [06:05<14:36, 23.06s/it]

Processing: AL


Processing states:  26%|██▌       | 13/50 [06:18<12:18, 19.97s/it]

Processing: NH


Processing states:  28%|██▊       | 14/50 [06:19<08:40, 14.46s/it]

Processing: MN


Processing states:  30%|███       | 15/50 [06:35<08:36, 14.76s/it]

Processing: MI


Processing states:  32%|███▏      | 16/50 [06:51<08:41, 15.34s/it]

Processing: OK


Processing states:  34%|███▍      | 17/50 [07:04<08:00, 14.55s/it]

Processing: IN


Processing states:  36%|███▌      | 18/50 [07:19<07:43, 14.50s/it]

Processing: CO


Processing states:  38%|███▊      | 19/50 [07:29<06:54, 13.37s/it]

Processing: CA


Processing states:  40%|████      | 20/50 [07:57<08:50, 17.68s/it]

Processing: IA


Processing states:  42%|████▏     | 21/50 [08:12<08:11, 16.94s/it]

Processing: CT


Processing states:  44%|████▍     | 22/50 [08:15<05:55, 12.69s/it]

Processing: FL


Processing states:  46%|████▌     | 23/50 [08:32<06:21, 14.12s/it]

Processing: WV


Processing states:  48%|████▊     | 24/50 [08:41<05:22, 12.38s/it]

Processing: RI


Processing states:  50%|█████     | 25/50 [08:42<03:44,  8.98s/it]

Processing: WY


Processing states:  52%|█████▏    | 26/50 [08:45<02:56,  7.34s/it]

Processing: TX


Processing states:  54%|█████▍    | 27/50 [09:30<07:06, 18.55s/it]

Processing: PA


Processing states:  56%|█████▌    | 28/50 [09:44<06:18, 17.19s/it]

Processing: NC


Processing states:  58%|█████▊    | 29/50 [10:00<05:53, 16.83s/it]

Processing: ND


Processing states:  60%|██████    | 30/50 [10:08<04:45, 14.30s/it]

Processing: NM


Processing states:  62%|██████▏   | 31/50 [10:14<03:39, 11.58s/it]

Processing: NJ


Processing states:  64%|██████▍   | 32/50 [10:20<02:59,  9.97s/it]

Processing: ME


Processing states:  66%|██████▌   | 33/50 [10:22<02:11,  7.74s/it]

Processing: AR


Processing states:  68%|██████▊   | 34/50 [10:35<02:27,  9.20s/it]

Processing: NV


Processing states:  70%|███████   | 35/50 [10:39<01:53,  7.57s/it]

Processing: MD


Processing states:  72%|███████▏  | 36/50 [10:44<01:37,  6.99s/it]

Processing: KS


Processing states:  74%|███████▍  | 37/50 [11:01<02:09,  9.94s/it]

Processing: NE


Processing states:  76%|███████▌  | 38/50 [11:17<02:18, 11.55s/it]

Processing: HI


Processing states:  78%|███████▊  | 39/50 [11:18<01:32,  8.45s/it]

Processing: DE


Processing states:  80%|████████  | 40/50 [11:19<01:01,  6.13s/it]

Processing: AZ


Processing states:  82%|████████▏ | 41/50 [11:24<00:53,  5.93s/it]

Processing: NY


Processing states:  84%|████████▍ | 42/50 [11:43<01:18,  9.79s/it]

Processing: ID


Processing states:  86%|████████▌ | 43/50 [11:50<01:02,  8.94s/it]

Processing: OH


Processing states:  88%|████████▊ | 44/50 [12:06<01:06, 11.16s/it]

Processing: OR


Processing states:  90%|█████████ | 45/50 [12:12<00:48,  9.64s/it]

Processing: IL


Processing states:  92%|█████████▏| 46/50 [12:32<00:50, 12.70s/it]

Processing: LA


Processing states:  94%|█████████▍| 47/50 [12:42<00:35, 11.98s/it]

Processing: WI


Processing states:  96%|█████████▌| 48/50 [12:54<00:23, 11.80s/it]

Processing: WA


Processing states:  98%|█████████▊| 49/50 [13:01<00:10, 10.57s/it]

Processing: TN


Processing states: 100%|██████████| 50/50 [13:17<00:00, 15.95s/it]

All states processed.



