In [22]:
# import libraries
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import shapely.geometry as geom

In [23]:
# dataframe

In [24]:
# Build the three parallel columns for a 16-box toy grid:
# zero-based positions 0..15, letter labels 'A'..'P', and values 1..16.
index = list(range(16))
box_names = [chr(ord('A') + position) for position in index]
values = [position + 1 for position in index]

In [25]:
# Assemble the frame, order its rows by ascending 'Value', then number the
# ordered rows from zero in a new 'sortedID' column.
df = (
    pd.DataFrame({'FIPS': box_names, 'Index': index, 'Value': values})
    .sort_values(by='Value')
    .assign(sortedID=lambda ordered: range(len(ordered)))
)

# Map a row's 'Index' to its unit square on a 4-column grid
def calculate_square_coordinates(row):
    """Return the unit-square polygon for the grid cell given by ``row['Index']``.

    The index is laid out row-major on a grid that is 4 cells wide: the
    remainder of ``Index / 4`` is the x offset and the quotient is the y offset.
    Each cell has a side length of 1 unit.

    Parameters
    ----------
    row : mapping
        Anything supporting ``row['Index']`` (e.g. a DataFrame row).

    Returns
    -------
    shapely Polygon with corners listed as
    (x, y), (x+1, y), (x+1, y+1), (x, y+1).
    """
    y, x = divmod(row['Index'], 4)
    corners = [
        (x, y),
        (x + 1, y),
        (x + 1, y + 1),
        (x, y + 1),
    ]
    return geom.Polygon(corners)

# Attach one unit-square polygon per row as the 'geometry' column
df['geometry'] = df.apply(calculate_square_coordinates, axis=1)

# Wrap the frame as a GeoDataFrame, tagging it with EPSG:3395 at construction
# time, and keep only the three columns used downstream.
# Note: EPSG:3395 is WGS 84 / World Mercator (a projected CRS); the toy grid's
# unit squares are not real-world coordinates, the CRS is only a label here.
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:3395")[
    ['FIPS', 'Value', 'geometry']
]

In [26]:
from adjacency_simplex import AdjacencySimplex  # Import the class

In [27]:
# Initialize the AdjacencySimplex class on the 'Value' column of gdf.
# NOTE(review): threshold=[5,17] with filter_method='up' appears to keep the rows
# whose 'Value' falls in that range (the displayed filtered_df starts at Value 5
# and the simplices use IDs 4..15) — confirm bound inclusivity against the class.
adj_simplex = AdjacencySimplex(gdf, 'Value', threshold = [5,17], filter_method = 'up')

# Filter the GeoDataFrame; the first result (filtered_df) carries a 'sortedID'
# column that the simplex vertex IDs refer to later in this notebook.
filtered_df,gdf_id = adj_simplex.filter_sort_gdf()

# Calculate the adjacent regions (the method name says "countries"; here the
# regions are the grid squares). Called for its effect on adj_simplex — the
# return value, if any, is not used.
adj_simplex.calculate_adjacent_countries()

# Form the simplicial complex: a list of vertex-ID lists of length 1, 2 or 3
# (vertices, edges, triangles), as shown in the output of `simplices`.
simplices = adj_simplex.form_simplicial_complex()

In [28]:
gdf.shape

(16, 3)

In [29]:
filtered_df.head()

Unnamed: 0,FIPS,Value,geometry,sortedID
4,E,5,"POLYGON ((0 1, 1 1, 1 2, 0 2, 0 1))",4
5,F,6,"POLYGON ((1 1, 2 1, 2 2, 1 2, 1 1))",5
6,G,7,"POLYGON ((2 1, 3 1, 3 2, 2 2, 2 1))",6
7,H,8,"POLYGON ((3 1, 4 1, 4 2, 3 2, 3 1))",7
8,I,9,"POLYGON ((0 2, 1 2, 1 3, 0 3, 0 2))",8


In [30]:
simplices

[[4],
 [5],
 [4, 5],
 [6],
 [5, 6],
 [7],
 [6, 7],
 [8],
 [4, 8],
 [5, 8],
 [4, 5, 8],
 [9],
 [4, 9],
 [5, 9],
 [4, 5, 9],
 [6, 9],
 [5, 6, 9],
 [8, 9],
 [4, 8, 9],
 [5, 8, 9],
 [10],
 [5, 10],
 [6, 10],
 [5, 6, 10],
 [7, 10],
 [6, 7, 10],
 [9, 10],
 [5, 9, 10],
 [6, 9, 10],
 [11],
 [6, 11],
 [7, 11],
 [6, 7, 11],
 [10, 11],
 [6, 10, 11],
 [7, 10, 11],
 [12],
 [8, 12],
 [9, 12],
 [8, 9, 12],
 [13],
 [8, 13],
 [9, 13],
 [8, 9, 13],
 [10, 13],
 [9, 10, 13],
 [12, 13],
 [8, 12, 13],
 [9, 12, 13],
 [14],
 [9, 14],
 [10, 14],
 [9, 10, 14],
 [11, 14],
 [10, 11, 14],
 [13, 14],
 [9, 13, 14],
 [10, 13, 14],
 [15],
 [10, 15],
 [11, 15],
 [10, 11, 15],
 [14, 15],
 [10, 14, 15],
 [11, 14, 15]]

The TDA summary calculation code is given below.

In [31]:
import gudhi
import numpy as np

In [32]:
# Column of filtered_df that supplies the filtration values for edges and
# triangles, and the finite cap used to replace infinite death times.
variable_name = 'Value'

In [33]:

# Build a GUDHI simplex tree from the adjacency simplices.
#   * vertices enter at filtration 0.0;
#   * every edge and triangle enters at the `variable_name` value of the row
#     whose 'sortedID' equals the simplex's LAST vertex.
# NOTE(review): this presumably relies on each simplex listing its vertices in
# ascending sortedID order, so that the last vertex carries the largest value —
# confirm against AdjacencySimplex.form_simplicial_complex.
st = gudhi.SimplexTree()
# NOTE(review): GUDHI documents that set_dimension disables automatic dimension
# recomputation; kept so the resulting tree is unchanged.
st.set_dimension(2)

# Build the sortedID -> value lookup once instead of scanning filtered_df for
# every simplex. setdefault keeps the first row for an ID, matching the
# previous `.values[0]` behaviour. An unknown ID raises KeyError below.
filtration_by_id = {}
for vertex_id, vertex_value in zip(filtered_df['sortedID'], filtered_df[variable_name]):
    filtration_by_id.setdefault(vertex_id, vertex_value)

# Insertion order is preserved: all vertices, then all edges, then all triangles.
for simplex in simplices:
    if len(simplex) == 1:
        st.insert([simplex[0]], filtration=0.0)

for simplex_size in (2, 3):
    for simplex in simplices:
        if len(simplex) == simplex_size:
            st.insert(simplex, filtration=filtration_by_id[simplex[-1]])

In [34]:
# SimplexTree.persistence() computes the persistence and returns the
# (dimension, (birth, death)) pairs, so a separate compute_persistence() call
# beforehand only repeats the same computation with the same default arguments.
persistence = st.persistence()

In [35]:
# Only the dimension-0 (H0) intervals are used by the summaries in this
# notebook; each row of the returned array is a [birth, death] pair.
# (Dimension-1 intervals would come from st.persistence_intervals_in_dimension(1).)
intervals_dim0 = st.persistence_intervals_in_dimension(0)

In [36]:
intervals_dim0

array([[ 0.,  6.],
       [ 0.,  7.],
       [ 0.,  8.],
       [ 0.,  9.],
       [ 0., 10.],
       [ 0., 11.],
       [ 0., 12.],
       [ 0., 13.],
       [ 0., 14.],
       [ 0., 15.],
       [ 0., 16.],
       [ 0., inf]])

In [37]:
# Largest `variable_name` value among the filtered rows; it serves as the
# finite death time for intervals whose death is inf.
max_value = filtered_df[variable_name].max()
print(f'max value: {max_value}')



max value: 16


In [38]:
# Cap the never-dying intervals: overwrite every infinite death time
# (column 1) with max_value, modifying intervals_dim0 in place.
# TODO (from the original author): the choice of replacement value still needs
# to be settled; a hard-coded 16 was used previously.
never_dies = np.isinf(intervals_dim0[:, 1])
intervals_dim0[never_dies, 1] = max_value

In [39]:
intervals_dim0

array([[ 0.,  6.],
       [ 0.,  7.],
       [ 0.,  8.],
       [ 0.,  9.],
       [ 0., 10.],
       [ 0., 11.],
       [ 0., 12.],
       [ 0., 13.],
       [ 0., 14.],
       [ 0., 15.],
       [ 0., 16.],
       [ 0., 16.]])

In [40]:
# Topological summaries for dimension 0 (H0): start with the number of
# persistence intervals, i.e. the number of rows in intervals_dim0.
H0_data_points = intervals_dim0.shape[0]
print(f'Number of H0 data points: {H0_data_points}')

Number of H0 data points: 12


In [43]:
# TL: sum of (death - birth) over the H0 intervals, accumulated in row order
# starting from 0.
TL = sum((death - birth for birth, death in intervals_dim0), 0)

print(f'Total length of H0 intervals: {TL}')

Total length of H0 intervals: 137.0


In [44]:
# TML: sum of the interval midpoints (death + birth) / 2 over the H0
# intervals, accumulated in row order starting from 0.
TML = sum(((death + birth) / 2 for birth, death in intervals_dim0), 0)

print(f'Total mean length of H0 intervals: {TML}')

Total mean length of H0 intervals: 68.5


In [None]:
# Per-interval averages of the two H0 totals:
#   AL  = TL  / number of intervals
#   AML = TML / number of intervals
# NOTE(review): both lines divide by len(intervals_dim0); an empty interval
# array would make this a division by zero — confirm that cannot occur here.
AL = TL/len(intervals_dim0)
AML = TML/len(intervals_dim0)

In [46]:
# Sanity check: every vertex is inserted at filtration 0.0, so each H0 interval
# here has birth 0 and its midpoint is half its length; TL/2 therefore equals
# the TML printed earlier (68.5).
TL/2

68.5