### Adj experiment on SVI data

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import geopandas as gpd

import libpysal as ps

In [2]:
# import svi data
svi_data = gpd.read_file('/Users/h6x/ORNL/git/WORKSTAION GIT/universal-experiment-lab/experiment_10_multiple_variables_adj_package/data/SVI2018_US_tract.gdb')

In [3]:
# selected variables for analysis
variables = [
    'EP_POV', 'EP_UNEMP', 'EP_PCI', 'EP_NOHSDP', 'EP_UNINSUR', 'EP_AGE65', 'EP_AGE17', 'EP_DISABL', 
    'EP_SNGPNT', 'EP_LIMENG', 'EP_MINRTY', 'EP_MUNIT', 'EP_MOBILE', 'EP_CROWD', 'EP_NOVEH', 'EP_GROUPQ'
]

variables_with_censusinfo = ['FIPS', 'STCNTY'] + variables + ['geometry'] +['ST_ABBR','COUNTY']

In [4]:
# filter data
svi_data = svi_data[variables_with_censusinfo]

Getting a single state to work on SVI data

In [5]:
texas_df = svi_data[svi_data['ST_ABBR'] == 'TX']

In [6]:
# get the number of negative values for each variable in Texas
negative_values = {}
for variable in variables:
    negative_values[variable] = texas_df[texas_df[variable] < 0].shape[0]

negative_values

{'EP_POV': 37,
 'EP_UNEMP': 31,
 'EP_PCI': 30,
 'EP_NOHSDP': 25,
 'EP_UNINSUR': 32,
 'EP_AGE65': 25,
 'EP_AGE17': 0,
 'EP_DISABL': 32,
 'EP_SNGPNT': 0,
 'EP_LIMENG': 0,
 'EP_MINRTY': 0,
 'EP_MUNIT': 0,
 'EP_MOBILE': 39,
 'EP_CROWD': 0,
 'EP_NOVEH': 40,
 'EP_GROUPQ': 0}

In [7]:
#Ensure the dataframe is a GeoDataFrame
texas_gdf = texas_df.copy()  # Make a copy to avoid modifying the original
texas_gdf = gpd.GeoDataFrame(texas_gdf, geometry="geometry")

In [8]:
# Compute Queen adjacency
w = ps.weights.Queen.from_dataframe(texas_gdf, ids=texas_gdf["FIPS"])

In [9]:
# Store adjacent county FIPS in a new column
texas_gdf["adjacent_fips"] = texas_gdf["FIPS"].apply(lambda fips: w.neighbors.get(fips, []))

In [10]:
# replace negative values with the average of the adjacent counties without negative adjacent values
for county in texas_df['FIPS'].unique():
    for variable in variables:

        if texas_df.loc[texas_df['FIPS'] == county, variable].values[0] < 0:
            # print(f"County {county} has a negative value for variable {variable}")
            # print(f"County {county} has the following adjacent counties: {texas_gdf.loc[texas_gdf['FIPS'] == county, 'adjacent_fips'].values[0]}")

            # get those adjacent counties variable values to a list
            adjacent_values = []
            for adjacent_county in texas_gdf.loc[texas_gdf['FIPS'] == county, 'adjacent_fips'].values[0]:
                adjacent_values.append(texas_df.loc[texas_df['FIPS'] == adjacent_county, variable].values[0])

            # print(f"Adjacent counties have the following values for variable {variable}: {adjacent_values}")
            
            # get the average of the adjacent values without the negative values
            adjacent_values = [value for value in adjacent_values if value >= 0]
            average_adjacent_value = np.mean(adjacent_values)

            # print(f"Average value of adjacent counties for variable {variable} is {average_adjacent_value}")

            # replace the negative value with the average of the adjacent values
            texas_df.loc[texas_df['FIPS'] == county, variable] = average_adjacent_value
            
        else:
            continue

In [11]:
# get the number of negative values for each variable in Texas
negative_values = {}
for variable in variables:
    negative_values[variable] = texas_df[texas_df[variable] < 0].shape[0]

negative_values

{'EP_POV': 0,
 'EP_UNEMP': 0,
 'EP_PCI': 0,
 'EP_NOHSDP': 0,
 'EP_UNINSUR': 0,
 'EP_AGE65': 0,
 'EP_AGE17': 0,
 'EP_DISABL': 0,
 'EP_SNGPNT': 0,
 'EP_LIMENG': 0,
 'EP_MINRTY': 0,
 'EP_MUNIT': 0,
 'EP_MOBILE': 0,
 'EP_CROWD': 0,
 'EP_NOVEH': 0,
 'EP_GROUPQ': 0}

Let's do a exploratory analysis on the SVI data

In [12]:
texas_df.describe()

Unnamed: 0,EP_POV,EP_UNEMP,EP_PCI,EP_NOHSDP,EP_UNINSUR,EP_AGE65,EP_AGE17,EP_DISABL,EP_SNGPNT,EP_LIMENG,EP_MINRTY,EP_MUNIT,EP_MOBILE,EP_CROWD,EP_NOVEH,EP_GROUPQ
count,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0
mean,16.822485,5.708215,29851.123381,18.43729,17.87825,13.005274,25.034716,12.459059,10.350171,7.637019,56.605634,13.446422,7.401808,5.132756,5.987518,2.375771
std,12.181914,3.726698,17673.220963,14.20876,9.89259,6.610645,7.101143,5.767338,6.31345,8.790543,27.886015,18.899184,11.660844,5.208373,6.121535,9.587436
min,0.0,0.0,678.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.6,3.2,18670.25,7.1,10.3,8.4,21.3,8.4,5.7,1.5,33.0,0.5,0.0,1.5,1.8,0.0
50%,14.1,5.0,25731.0,14.7,16.6,12.1,25.6,11.8,9.4,4.1,55.25,4.9,1.2,3.7,4.1,0.0
75%,23.375,7.3,35276.75,26.8,24.4,16.7,29.6,15.7,14.1,10.7,83.0,19.0,11.375,7.2,8.1,0.9
max,100.0,55.4,198114.0,100.0,57.7,78.8,55.0,100.0,57.6,55.5,100.0,100.0,100.0,100.0,66.4,100.0


In [13]:
# min max scaling for each variable
for variable in variables:
    texas_df[variable] = (texas_df[variable] - texas_df[variable].min()) / (texas_df[variable].max() - texas_df[variable].min())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [14]:
texas_df.describe()

Unnamed: 0,EP_POV,EP_UNEMP,EP_PCI,EP_NOHSDP,EP_UNINSUR,EP_AGE65,EP_AGE17,EP_DISABL,EP_SNGPNT,EP_LIMENG,EP_MINRTY,EP_MUNIT,EP_MOBILE,EP_CROWD,EP_NOVEH,EP_GROUPQ
count,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0,5254.0
mean,0.168225,0.103036,0.14776,0.184373,0.309848,0.165042,0.455177,0.124591,0.17969,0.137604,0.566056,0.134464,0.074018,0.051328,0.090173,0.023758
std,0.121819,0.067269,0.089514,0.142088,0.171449,0.083891,0.129112,0.057673,0.109609,0.158388,0.27886,0.188992,0.116608,0.052084,0.092192,0.095874
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.076,0.057762,0.09113,0.071,0.17851,0.106599,0.387273,0.084,0.098958,0.027027,0.33,0.005,0.0,0.015,0.027108,0.0
50%,0.141,0.090253,0.126892,0.147,0.287695,0.153553,0.465455,0.118,0.163194,0.073874,0.5525,0.049,0.012,0.037,0.061747,0.0
75%,0.23375,0.131769,0.17524,0.268,0.422877,0.211929,0.538182,0.157,0.244792,0.192793,0.83,0.19,0.11375,0.072,0.121988,0.009
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
texas_df.head()

Unnamed: 0,FIPS,STCNTY,EP_POV,EP_UNEMP,EP_PCI,EP_NOHSDP,EP_UNINSUR,EP_AGE65,EP_AGE17,EP_DISABL,...,EP_LIMENG,EP_MINRTY,EP_MUNIT,EP_MOBILE,EP_CROWD,EP_NOVEH,EP_GROUPQ,geometry,ST_ABBR,COUNTY
522,48027980001,48027,0.0758,0.0,0.168536,0.0,0.101906,0.0,0.0,0.1198,...,0.0,0.767,0.0,0.0416,0.0,0.034639,1.0,"MULTIPOLYGON (((-97.88313 31.07851, -97.88166 ...",TX,Bell
523,48027980002,48027,0.0839,0.099278,0.140977,0.01,0.091681,0.013325,0.0,0.1064,...,0.0,0.0,0.0,0.0268,0.0,0.02259,0.0,"MULTIPOLYGON (((-97.84694 31.08029, -97.84082 ...",TX,Bell
524,48027980003,48027,0.284167,0.120423,0.110138,0.092143,0.142692,0.083575,0.0,0.211667,...,0.0,0.0,0.0,0.299,0.0,0.050954,0.0,"MULTIPOLYGON (((-97.75925 31.14563, -97.68215 ...",TX,Bell
525,48029980001,48029,0.049714,0.053378,0.264788,0.030571,0.095073,0.16715,0.0,0.08,...,0.0,0.0,0.0,0.024429,0.0,0.021084,0.0,"MULTIPOLYGON (((-98.6315 29.67979, -98.6315 29...",TX,Bexar
526,48029980002,48029,0.291333,0.127557,0.06932,0.373333,0.396303,0.156937,0.0,0.217333,...,0.0,0.0,0.0,0.021,0.0,0.161145,0.0,"MULTIPOLYGON (((-98.55142 29.38069, -98.54909 ...",TX,Bexar


Let's compute the adjacency information for the SVI data

In [16]:
from utills.adjacency_simplex import AdjacencySimplex  # Import the class
from utills.calculate_tda_summaries import compute_persistence

In [17]:
# get the unique county stcnty

county_stcntys = texas_df['STCNTY'].unique()

In [18]:
print(f'Number of unique county stcnty: {len(county_stcntys)}')

Number of unique county stcnty: 254


In [19]:
def process_county_variable(county_id, county_df, result_df, variable,filter_method):
    """
    Process a single county and variable combination, updating the result DataFrame.

    Parameters:
        county_id (str): The county identifier.
        county_df (GeoDataFrame): GeoDataFrame filtered to the current county.
        result_df (DataFrame): The DataFrame to update with results.
        variable (str): The variable to process.

    Returns:
        DataFrame: The updated result DataFrame.
    """
    # Select only the relevant columns
    temp_df = county_df[[variable, 'geometry']]

    # Initialize the AdjacencySimplex object
    adj_simplex = AdjacencySimplex(
        gdf=temp_df,
        variable=variable,
        threshold=None,
        filter_method=filter_method
    )

    # Filter and sort the GeoDataFrame; ignore the second return value if not needed
    filtered_df, _ = adj_simplex.filter_sort_gdf()

    # Calculate adjacent countries and form the simplicial complex
    adj_simplex.calculate_adjacent_countries()
    simplex = adj_simplex.form_simplicial_complex()

    # Compute persistence values
    total_h0_points, tl, al, tml, aml, intervals_dim0 = compute_persistence(
        simplices=simplex,
        filtered_df=filtered_df,
        variable_name=variable
    )

    # Store the computed persistence values in the DataFrame
    result_df.loc[county_id, f'{variable}_TL'] = tl
    result_df.loc[county_id, f'{variable}_AL'] = al
    result_df.loc[county_id, f'{variable}_TML'] = tml
    result_df.loc[county_id, f'{variable}_AML'] = aml
    result_df.loc[county_id, 'filter_method'] = filter_method

    return result_df

In [20]:
# Create a result DataFrame with county identifiers as its index
result_df = pd.DataFrame(index=county_stcntys)
result_df.index.name = 'STCNTY'

# Loop through each county and process each variable of interest
for county_id in county_stcntys:
    # Filter the main GeoDataFrame for the current county
    county_df = texas_df[texas_df['STCNTY'] == county_id]
    
    # Process each variable of interest for this county
    for variable in variables:
        result_df = process_county_variable(county_id, county_df, result_df, variable,filter_method='up')

In [21]:
result_df

Unnamed: 0_level_0,EP_POV_TL,EP_POV_AL,EP_POV_TML,EP_POV_AML,filter_method,EP_UNEMP_TL,EP_UNEMP_AL,EP_UNEMP_TML,EP_UNEMP_AML,EP_PCI_TL,...,EP_CROWD_TML,EP_CROWD_AML,EP_NOVEH_TL,EP_NOVEH_AL,EP_NOVEH_TML,EP_NOVEH_AML,EP_GROUPQ_TL,EP_GROUPQ_AL,EP_GROUPQ_TML,EP_GROUPQ_AML
STCNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48027,11.735867,0.180552,5.867933,0.090276,up,10.433213,0.160511,5.216606,0.080255,8.207167,...,1.4525,0.022695,6.987701,0.107503,3.493850,0.053752,4.520,0.155862,2.2600,0.077931
48029,66.220254,0.180930,33.110127,0.090465,up,40.669303,0.111118,20.334651,0.055559,49.790692,...,8.7035,0.024517,46.223561,0.128756,23.111780,0.064378,7.538,0.036771,3.7690,0.018385
48039,6.224000,0.122039,3.112000,0.061020,up,5.501805,0.107879,2.750903,0.053939,8.286830,...,1.1275,0.022108,3.239458,0.063519,1.619729,0.031759,1.713,0.048943,0.8565,0.024471
48041,12.378000,0.294714,6.189000,0.147357,up,3.999398,0.095224,1.999699,0.047612,5.842004,...,0.8385,0.021500,4.111446,0.111120,2.055723,0.055560,3.206,0.110552,1.6030,0.055276
48061,30.452643,0.354100,15.226321,0.177050,up,12.081743,0.140485,6.040872,0.070243,7.173241,...,5.4670,0.063570,12.083369,0.140504,6.041685,0.070252,1.042,0.034733,0.5210,0.017367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48229,0.000000,0.000000,0.000000,0.000000,up,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.0000,0.000000
48271,0.000000,0.000000,0.000000,0.000000,up,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.0000,0.000000
48283,0.000000,0.000000,0.000000,0.000000,up,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.0000,0.000000
48301,0.000000,0.000000,0.000000,0.000000,up,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.0000,0.000000


In [22]:
result_df

Unnamed: 0_level_0,EP_POV_TL,EP_POV_AL,EP_POV_TML,EP_POV_AML,filter_method,EP_UNEMP_TL,EP_UNEMP_AL,EP_UNEMP_TML,EP_UNEMP_AML,EP_PCI_TL,...,EP_CROWD_TML,EP_CROWD_AML,EP_NOVEH_TL,EP_NOVEH_AL,EP_NOVEH_TML,EP_NOVEH_AML,EP_GROUPQ_TL,EP_GROUPQ_AL,EP_GROUPQ_TML,EP_GROUPQ_AML
STCNTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48027,11.735867,0.180552,5.867933,0.090276,up,10.433213,0.160511,5.216606,0.080255,8.207167,...,1.4525,0.022695,6.987701,0.107503,3.493850,0.053752,4.520,0.155862,2.2600,0.077931
48029,66.220254,0.180930,33.110127,0.090465,up,40.669303,0.111118,20.334651,0.055559,49.790692,...,8.7035,0.024517,46.223561,0.128756,23.111780,0.064378,7.538,0.036771,3.7690,0.018385
48039,6.224000,0.122039,3.112000,0.061020,up,5.501805,0.107879,2.750903,0.053939,8.286830,...,1.1275,0.022108,3.239458,0.063519,1.619729,0.031759,1.713,0.048943,0.8565,0.024471
48041,12.378000,0.294714,6.189000,0.147357,up,3.999398,0.095224,1.999699,0.047612,5.842004,...,0.8385,0.021500,4.111446,0.111120,2.055723,0.055560,3.206,0.110552,1.6030,0.055276
48061,30.452643,0.354100,15.226321,0.177050,up,12.081743,0.140485,6.040872,0.070243,7.173241,...,5.4670,0.063570,12.083369,0.140504,6.041685,0.070252,1.042,0.034733,0.5210,0.017367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48229,0.000000,0.000000,0.000000,0.000000,up,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.0000,0.000000
48271,0.000000,0.000000,0.000000,0.000000,up,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.0000,0.000000
48283,0.000000,0.000000,0.000000,0.000000,up,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.0000,0.000000
48301,0.000000,0.000000,0.000000,0.000000,up,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.0000,0.000000
