In [1]:
import pandas as pd
import geopandas as gpd

In [2]:
#Shapefile paths, relative to the notebook's working directory.
# Of these, only `keep1` is referenced by the cells visible in this notebook.
majBlackFN, keep1, clusters = (
    '../data/DURHAM/MajBlack.shp',
    '../data/DURHAM/keep1.shp',
    '../data/DURHAM/clusters.shp',
)

In [3]:
#Load the block shapefile and add an 'OrgID' column set to 0 on every row.
# (0 is only a starting value here; the derived org-unit layers assign their
# own OrgID values further down.)
gdfBlocks = gpd.read_file('../data/DURHAM/DURHAM_blocks.shp').assign(OrgID=0)

In [4]:
#Keep only the blocks whose PctBlack value is 50 or higher
gdfMajBlack = gdfBlocks.loc[gdfBlocks['PctBlack'] >= 50]

In [5]:
#Of the majority-Black blocks, keep those with at least 50 BHH (the BlackHH
# column) as stand-alone org units (1).
# The comparison is inclusive (>=): the clustering step only takes blocks with
# fewer than 50 BHH, so a strict '>' would drop blocks with exactly 50 BHH
# from both groups.
gdf_Org1 = (gdfMajBlack
            .query('BlackHH >= 50')
            #Fresh 0..n-1 index; the old labels are discarded rather than
            # being turned into an 'index' column that then has to be dropped
            .reset_index(drop=True)
            #Remove the identifier columns that are not carried into the output
            .drop(['STATEFP10', 'COUNTYFP10',
                   'TRACTCE10', 'BLOCKCE', 'BLOCKID10',
                   'GEOID10', 'PARTFLG'], axis=1)
           )
#Org unit IDs for this group run 1..n
gdf_Org1['OrgID'] = gdf_Org1.index + 1
gdf_Org1['OrgType'] = 'OriginalBlock'
gdf_Org1.to_file(keep1)

In [6]:
#The majority-Black blocks with fewer than 50 BHH are set aside here;
# these are the blocks that get clustered in the following steps
gdfMajBlack_LT50 = gdfMajBlack.loc[gdfMajBlack['BlackHH'] < 50]

In [7]:
#Cluster: merge all of the small blocks into one geometry, then split that
# geometry back into its separate parts; each part becomes one cluster.
mergedBlocks = gdfMajBlack_LT50.unary_union
#A multi-part geometry exposes its parts through `.geoms`; iterating the
# geometry object itself (list(geom)) is not supported by Shapely 2.x.
# A single-part result (all blocks contiguous) has no `.geoms`, so it is
# wrapped in a one-element list instead.
clusterParts = list(getattr(mergedBlocks, 'geoms', [mergedBlocks]))
gdfClusters = gpd.GeoDataFrame(geometry=clusterParts, crs=gdfMajBlack_LT50.crs)
#The cluster ID is simply the row position of the part
gdfClusters['ClusterID'] = gdfClusters.index
#gdfClusters.to_file('../data/DURHAM/clusters.shp')

In [8]:
#Tag each small block with the ClusterID of the cluster polygon it lies within
# NOTE(review): newer geopandas releases name the `op` argument `predicate` --
# confirm against the installed version before upgrading
blocksWithCluster = gpd.sjoin(gdfMajBlack_LT50, gdfClusters, how='left', op='within')
#The join also adds an 'index_right' column, which is not needed downstream
gdfMajBlack_LT50_2 = blocksWithCluster.drop(columns='index_right')
#gdfMajBlack_LT50_2.to_file('../data/DURHAM/MajBlack1.shp')

In [14]:
#Merge the blocks of each cluster into a single polygon, summing the attributes
gdfClusters_2 = gdfMajBlack_LT50_2.dissolve(by='ClusterID', aggfunc='sum')
#Percentages cannot be summed, so (re)compute them from the summed counts
gdfClusters_2 = gdfClusters_2.assign(
    PctBlack=gdfClusters_2['P003003'].div(gdfClusters_2['P003001']).mul(100),
    PctBlack18=gdfClusters_2['P010004'].div(gdfClusters_2['P010001']).mul(100),
)

#Clusters that still hold fewer than 50 BHH are impractical; discard them
gdfClusters_2 = gdfClusters_2.loc[gdfClusters_2['BlackHH'] >= 50]
#gdfClusters_2.to_file('../data/DURHAM/clusters2.shp')

In [10]:
#Clusters with no more than 100 BHH are kept whole as org units (2).
# Their OrgIDs continue the numbering after the largest OrgID in gdf_Org1.
gdf_Org2 = (gdfClusters_2
            .loc[gdfClusters_2['BlackHH'] <= 100]
            .reset_index()
            .assign(OrgID=lambda gdf: gdf_Org1['OrgID'].max() + gdf.index + 1,
                    OrgType='Full block cluster')
           )
gdf_Org2.to_file('../data/DURHAM/keep2.shp')

In [11]:
#Clusters holding more than 100 BHH are not kept whole. Collect their
# ClusterIDs; within each, individual blocks are grouped until BHH >= 100
isOversized = gdfClusters_2['BlackHH'] > 100
clusterIDs = gdfClusters_2.loc[isOversized].index.unique()

In [21]:
#For each oversized cluster: start from its westernmost block and keep adding
# adjacent blocks until the group holds at least 100 BHH
gdfs = []
for clusterID in clusterIDs:
    #Get the blocks in the cluster (fresh 0..n-1 index)
    gdfBlks = (gdfMajBlack_LT50_2[gdfMajBlack_LT50_2['ClusterID'] == clusterID]
               .reset_index(drop=True))
    #Seed the group with the block whose centroid has the smallest X
    centroidX = gdfBlks.geometry.centroid.x
    gdfNbrs = gdfBlks[centroidX == centroidX.min()]
    #Get the number of BHH and the geometry of the group so far
    BHH = gdfNbrs.BlackHH.sum()
    geom = gdfNbrs.geometry.unary_union

    #Grow the group while it holds fewer than 100 BHH
    # (iter_count caps the loop at 100 passes as a safety net)
    iter_count = 0
    while BHH < 100 and iter_count < 100:
        #Get the blocks that touch the current geometry; this always includes
        # the blocks already in the group
        gdfGrown = gdfBlks[gdfBlks.geometry.intersects(geom)]
        #No new block reached: further passes would repeat the same result
        if len(gdfGrown) == len(gdfNbrs):
            break
        gdfNbrs = gdfGrown
        #Compute the new aggregate BHH and geometry
        BHH = gdfNbrs.BlackHH.sum()
        geom = gdfNbrs.geometry.unary_union
        #Increase the iter_count to catch infinite loops
        iter_count += 1

    #Dissolve exactly the blocks that were counted toward BHH. Re-selecting
    # the blocks that intersect `geom` at this point would pull in one more
    # ring of neighbors that was never counted.
    gdfSelect = gdfNbrs.dissolve(by='ClusterID', aggfunc='sum')

    gdfs.append(gdfSelect)


#Combine each cluster into a single dataframe and write to a file
gdf_Org3 = pd.concat(gdfs).reset_index()
#OrgIDs continue the numbering after the largest OrgID in gdf_Org2
gdf_Org3['OrgID'] = gdf_Org2['OrgID'].max() + gdf_Org3.index + 1
gdf_Org3['OrgType'] = 'Partial block cluster'
#Same 'DURHAM' folder spelling as the other outputs; 'Durham' is a different
# directory on case-sensitive file systems
gdf_Org3.to_file('../data/DURHAM/Keep3.shp')

In [19]:
#For each oversized cluster: start from its westernmost block and claim
# adjacent blocks, ring by ring, until the claimed group holds >= 100 BHH.
# NOTE(review): this cell rebinds `gdfs` and `gdf_Org3`, which the Keep3 cell
# also sets; whichever of the two runs last feeds the final merge.
gdfs = []
for clusterID in clusterIDs:
    #Get the blocks in the cluster (fresh 0..n-1 index)
    gdfBlks = (gdfMajBlack_LT50_2[gdfMajBlack_LT50_2['ClusterID'] == clusterID]
               .reset_index(drop=True))
    #Set a switch to see if the block has been added yet
    gdfBlks['claimed'] = 0
    #Seed the group with the block whose centroid has the smallest X
    centroidX = gdfBlks.geometry.centroid.x
    gdfBlks.loc[centroidX == centroidX.min(), 'claimed'] = 1
    gdfNbrs = gdfBlks[gdfBlks['claimed'] == 1]
    #Get the number of BHH and the geometry of the claimed group
    BHH = gdfNbrs.BlackHH.sum()
    geom = gdfNbrs.geometry.unary_union

    #Grow the group while it holds fewer than 100 BHH
    # (iter_count caps the loop at 100 passes as a safety net)
    iter_count = 0
    while BHH < 100 and iter_count < 100:
        #Unclaimed blocks that touch the current geometry. The parentheses are
        # required: `&` binds tighter than `==`, so `a & b == 0` is evaluated
        # as `(a & b) == 0`.
        isNew = gdfBlks.geometry.intersects(geom) & (gdfBlks['claimed'] == 0)
        #Nothing left to claim: the group cannot grow any further
        if not isNew.any():
            break
        gdfBlks.loc[isNew, 'claimed'] = 1
        #Compute the aggregate BHH and geometry over ALL claimed blocks, so
        # the total accumulates instead of reflecting only the newest ring
        gdfNbrs = gdfBlks[gdfBlks['claimed'] == 1]
        BHH = gdfNbrs.BlackHH.sum()
        geom = gdfNbrs.geometry.unary_union
        #Increase the iter_count to catch infinite loops
        iter_count += 1

    #Dissolve exactly the claimed blocks into one org unit. The summed
    # 'claimed' column in the result equals the number of blocks merged.
    gdfSelect = gdfNbrs.dissolve(by='ClusterID', aggfunc='sum')

    gdfs.append(gdfSelect)

#TODO: repeat, but with the remaining unclaimed blocks with < 50 BHH
# (that second pass is not implemented in this cell)
gdfsClaimed = pd.concat(gdfs).reset_index()


#Combine each cluster into a single dataframe and write to a file
gdf_Org3 = gdfsClaimed.copy()
#OrgIDs continue the numbering after the largest OrgID in gdf_Org2
gdf_Org3['OrgID'] = gdf_Org2['OrgID'].max() + gdf_Org3.index + 1
gdf_Org3['OrgType'] = 'Partial block cluster'
#Same 'DURHAM' folder spelling as the other outputs; 'Durham' is a different
# directory on case-sensitive file systems
gdf_Org3.to_file('../data/DURHAM/Keep4.shp')

In [13]:
#Stack the three groups of org units (original blocks, full block clusters,
# partial block clusters) into one layer, with columns sorted, and save it
gdfAllOrgs = pd.concat([gdf_Org1, gdf_Org2, gdf_Org3], sort=True)
gdfAllOrgs.to_file('../data/DURHAM/Orgs.shp')