-
Notifications
You must be signed in to change notification settings - Fork 33
/
aggregate.py
executable file
·171 lines (121 loc) · 5.28 KB
/
aggregate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python
"""
How to use aggregate census data to arbitrary geographies:
* Update config.py, adding SUMLEV_BLOCK to the list of summary levels to be loaded. Example:
SUMLEVS = [SUMLEV_BLOCK]
* Optionally, alse in config.py update the filter_geographies() function to reduce the data to a one or more counties. Example:
def filter_geographies(row_dict):
return row_dict['COUNTY'] == '003'
* Load the data for your state. Example:
./batch_sf.sh Hawaii staging
* Download the 2000 (NOT 2010) block-level shapefile for your state (or county), unzip it, and copy the enclosed .dbf file somewhere convenient. You can delete the rest.
* Run this aggregation script, providing the following parameters: the .dbf you just extracted, the shapefile you want to aggregate by, a name for the new summary level. Optionally you may also provide the name of an column in the .dbf to use as a unique id. Example:
./aggregate.py tl_2010_15003_tabblock00.dbf wards/wards.shp ward WARD_ID
* Rerun the crosswalk and compute_deltas scripts, limiting them to only the summary level you just created. Example:
./crosswalk ward
./compute_deltas ward
* You now have arbitrarily aggregated geographies in your Mongo database.
"""
import sys
import dbf
from osgeo import ogr
import config
import utils
if len(sys.argv) < 2:
sys.exit('You must provide a DBF file for the 2000 blocks that will be aggregated, a shapefile to aggregate by, and a name for the new summary level. You may optionally provide the name of a field within the shapefile to use as the unique ID.')
BLOCKS_DBF = sys.argv[1]
SHAPEFILE = sys.argv[2]
NEW_SUMLEV = sys.argv[3]
if len(sys.argv) == 5:
ID_FIELD = sys.argv[4]
else:
ID_FIELD = None
shapes = ogr.Open(SHAPEFILE)
shapes_layer = shapes[0]
def load_2000_block_points(filename):
"""
Builds a dictionary of internal points from the DBF associated with a
shapefile of 2000 blocks (the 2010 release of those blocks).
This works around inaccuracies in the internal points embedded in the
2000 geoheaders.
"""
table = dbf.Table(filename)
points = {}
for row in table:
points[row['blkidfp00']] = {
'lat': float(row['intptlat00']),
'lon': float(row['intptlon00']),
}
print 'Loaded %i points from %s' % (len(points), filename)
return points
def find_geometry_containing_point(lat, lon):
"""
Iterate over all aggregating geometries to see if any contain the given point.
"""
shapes_layer.ResetReading()
point = ogr.Geometry(ogr.wkbPoint)
point.SetPoint_2D(0, lon, lat)
for feature in shapes_layer:
geometry = feature.GetGeometryRef()
if geometry.Contains(point):
if ID_FIELD:
return feature.GetField(ID_FIELD)
return feature.GetFID()
return None
def make_custom_geoid(sumlev, feature_id):
"""
Generate a unique geoid for the given feature.
"""
return '%s_%s' % (sumlev, feature_id)
def aggregate_geographies(collection, dataset, points=None):
"""
Aggregate geographies in the given collection.
"""
print 'Aggregating collection "%s"' % collection.name
collection.remove({ 'sumlev': NEW_SUMLEV }, safe=True)
count_tested = 0
count_allocated = 0
count_new_features = 0
for geography in collection.find({ 'sumlev': config.SUMLEV_BLOCK }):
if points:
lat = points[geography['geoid']]['lat']
lon = points[geography['geoid']]['lon']
else:
lat = float(geography['metadata']['INTPTLAT'])
lon = float(geography['metadata']['INTPTLON'])
feature_id = find_geometry_containing_point(lat, lon)
count_tested += 1
if not feature_id:
continue
new_geoid = make_custom_geoid(NEW_SUMLEV, feature_id)
new_geography = collection.find_one({ 'geoid': new_geoid })
# Create an aggregating geography if it doesn't exist
if not new_geography:
new_geography = {
'sumlev': NEW_SUMLEV,
'geoid': new_geoid,
'metadata': {
'NAME': str(feature_id),
},
'data': {
dataset: {},
}
}
count_new_features += 1
# Update new geography with values from constituent block
for table in geography['data'][dataset]:
if table not in new_geography['data'][dataset]:
new_geography['data'][dataset][table] = {}
for field, value in geography['data'][dataset][table].items():
if field not in new_geography['data'][dataset][table]:
new_geography['data'][dataset][table][field] = 0
new_geography['data'][dataset][table][field] = float(new_geography['data'][dataset][table][field]) + float(value)
collection.save(new_geography, safe=True)
count_allocated += 1
print 'Tested: %i' % count_tested
print 'Allocated: %i' % count_allocated
print 'New Features: %i' % count_new_features
# Run!
points_2000 = load_2000_block_points(BLOCKS_DBF)
aggregate_geographies(utils.get_geography2000_collection(), '2000', points=points_2000)
aggregate_geographies(utils.get_geography_collection(), '2010')