In [1]:
import pandas as pd
import geopandas as gpd

In [19]:
# Shapefile paths under the DURHAM data folder. In the cells shown here they
# are only referenced by commented-out to_file() calls.
majBlackFN, keep1, clusters = (
    '../data/DURHAM/MajBlack.shp',
    '../data/DURHAM/keep1.shp',
    '../data/DURHAM/clusters.shp',
)

In [12]:
#Load the block polygons and add an OrgID column.
# OrgID is set to the constant 0 for every block here (a placeholder, not a
# unique id). The voter-point layer is not loaded; its read is commented out.
gdfBlocks = gpd.read_file('../data/DURHAM/DURHAM_blocks.shp').assign(OrgID=0)
#gdfVoters = gpd.read_file('../data/DURHAM/DURHAM_voter_points.shp')

In [14]:
#Keep only the blocks whose PctBlack value is 50 or more (majority black)
gdfMajBlack = gdfBlocks.loc[gdfBlocks['PctBlack'] >= 50]

In [32]:
#Of the majority-black blocks, keep those with at least 50 BHH as org units (1)
# The comparison is `>=` so a block with exactly 50 BHH lands here; the
# small-block selection uses `BlackHH < 50`, so a strict `>` would leave
# those blocks out of both groups.
gdf_Org1 = gdfMajBlack.query('BlackHH >= 50').reset_index()
# OrgIDs run 1..N in row order of the freshly reset index
gdf_Org1['OrgID'] = gdf_Org1.index + 1
gdf_Org1['OrgType'] = 'OriginalBlock'
#gdf_Org1.to_file(keep1)

In [10]:
#The remaining majority-black blocks (fewer than 50 BHH) are the ones to cluster
gdfMajBlack_LT50 = gdfMajBlack.loc[gdfMajBlack['BlackHH'] < 50]
#gdfMajBlack_LT50.to_file(majBlackFN)

In [26]:
#Cluster: merge all small blocks into one geometry, then make each resulting
# part its own cluster polygon.
# unary_union returns a single geometry. A multi-part result exposes its
# members through `.geoms`; a lone Polygon has no `.geoms`, so fall back to a
# one-element list. Iterating the multi-part geometry directly is not
# supported by Shapely 2.x and never worked for a single Polygon.
dissolved = gdfMajBlack_LT50.unary_union
clusterParts = list(getattr(dissolved, 'geoms', [dissolved]))
gdfClusters = gpd.GeoDataFrame(geometry=clusterParts, crs=gdfMajBlack_LT50.crs)
# ClusterID mirrors the default 0..N-1 row index
gdfClusters['ClusterID'] = gdfClusters.index
#gdfClusters.to_file('../data/DURHAM/clusters.shp')

In [27]:
#Tag every small block with the ClusterID of the cluster polygon it lies within
# (left join, so blocks without a match are retained). The helper column
# 'index_right' added by the join is removed again.
# NOTE(review): newer geopandas releases spell this keyword `predicate`
# rather than `op` -- confirm against the installed version.
gdfMajBlack_LT50_2 = (
    gpd.sjoin(gdfMajBlack_LT50, gdfClusters, how='left', op='within')
    .drop(columns='index_right')
)
#gdfMajBlack_LT50_2.to_file('../data/DURHAM/MajBlack1.shp')

In [31]:
#Total the BlackHH of the blocks in each cluster and attach it to the cluster
# polygons. Both frames are matched on their index (ClusterID on the summary
# side, the cluster row index on the polygon side); the default inner merge
# keeps only clusters present in both.
sumBHH = gdfMajBlack_LT50_2.groupby('ClusterID')[['BlackHH']].sum()
gdfClusters_2 = gdfClusters.merge(sumBHH, left_index=True, right_index=True)
gdfClusters_2.to_file('../data/DURHAM/clusters2.shp')

In [37]:
#Drop clusters whose combined BlackHH is below 50; these are impractical.
# The filtered layer is written to clusters2.shp.
gdfClusters_2 = gdfClusters_2.loc[gdfClusters_2['BlackHH'] >= 50]
gdfClusters_2.to_file('../data/DURHAM/clusters2.shp')

In [38]:
#Clusters with 100 or fewer BHH are kept unchanged as org units (2).
# Their OrgIDs continue from the largest OrgID already used in gdf_Org1.
gdf_Org2 = gdfClusters_2.loc[gdfClusters_2['BlackHH'] <= 100].reset_index()
gdf_Org2['OrgID'] = gdf_Org2.index + gdf_Org1['OrgID'].max() + 1
gdf_Org2.to_file('../data/DURHAM/keep2.shp')

In [42]:
#Collect the distinct ClusterIDs of the clusters holding more than 100 BHH;
# the blocks inside these clusters are regrouped into smaller units below
clusterIDs = gdfClusters_2.loc[gdfClusters_2['BlackHH'] > 100, 'ClusterID'].unique()

array([  0,   1,   2,   4,   5,   6,   7,  49,  60,  72,  73,  76,  92,
        94,  95,  96,  99, 120, 149, 181, 186, 202, 204, 206, 214, 222, 236], dtype=int64)

In [56]:
#Iterate through each clusterID and grow a sub-cluster of at most 100 BHH
for clusterID in clusterIDs:
    #Get the blocks in the cluster
    gdfBlks = gdfMajBlack_LT50_2.query('ClusterID == {}'.format(clusterID)).reset_index()
    #Seed with the block(s) whose centroid has the smallest X in the cluster
    gdfBlks['X'] = gdfBlks.geometry.centroid.x
    gdfNbrs = gdfBlks[gdfBlks['X'] == gdfBlks['X'].min()]
    #Running BHH total and merged geometry, starting from the seed
    BHH = gdfNbrs.BlackHH.sum()
    geom = gdfNbrs.geometry.unary_union
    
    #Get the blocks that touch the current geometry. The seed block(s)
    # intersect themselves, so exclude them -- their BHH is already in BHH.
    isNbr = gdfBlks.geometry.intersects(geom) & ~gdfBlks.index.isin(gdfNbrs.index)
    gdfNbrs1 = gdfBlks[isNbr]
    #Sort the selection on BlackHH, largest first (sort_values returns a new
    # frame, so the result has to be assigned back)
    gdfNbrs1 = gdfNbrs1.sort_values('BlackHH',ascending=False)
    #Add each neighbour whose BHH keeps the aggregate at or below 100
    # iterrows() yields (index label, row) pairs: i is the label, r the row
    for i, r in gdfNbrs1.iterrows():
        #Skip if the sum would exceed 100 BHH
        if r.BlackHH + BHH > 100: 
            continue
        #Otherwise merge the BHH and the geom
        BHH += r.BlackHH
        geom = geom.union(r[gdfBlks.geometry.name])