
Added experimental SF 2000 loaders.

commit 340c769267cb4edce925187735590ce16610ac70 (1 parent: 1d64821)
authored by cgroskopf
censusweb/requirements.txt (2 changed lines)
@@ -10,3 +10,5 @@ pycrypto==2.3
 python-memcached==1.47
 simplejson==2.1.3
 wsgiref==0.1.2
+pymongo==1.11
+csvkit==0.2
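
The two new dependencies support the loaders added below: pymongo for talking to Mongo and csvkit for unicode-safe CSV parsing. A minimal smoke test, mirroring the exact imports the new scripts use (pymongo 1.x exposes the old Connection class rather than MongoClient):

    # Verify the two new requirements expose what the loaders import.
    from pymongo import Connection              # pre-MongoClient API in pymongo 1.x
    from csvkit.unicsv import UnicodeCSVReader  # unicode-aware csv.reader wrapper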
dataprocessing/batch_sf.sh (13 changed lines)
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# See batch.sh for notes.
+
+#echo "use census;
+#db.dropDatabase();" | mongo
+
+./fetch_sf_data.sh
+
+./load_sf_geographies_2000.py data/degeo2000.csv
+./load_sf_data_2000.py data/sf_data_2000_delaware_1.csv
+
+#./load_pl_labels_2010.py data/pl_2010_data_labels.csv
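
The commented-out mongo block at the top resets the census database before a batch load. The same reset can be done from Python with pymongo if it ever needs to run inline; a sketch assuming the default localhost connection and the 'census' database named in config.py:

    from pymongo import Connection

    # Python equivalent of the commented-out `db.dropDatabase()` shell snippet.
    connection = Connection()           # localhost:27017 by default
    connection.drop_database('census')  # matches config.CENSUS_DB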
dataprocessing/config.py (3 changed lines)
@@ -6,9 +6,10 @@
 SUMLEV_COUNTY = '050'
 SUMLEV_TRACT = '140'
 SUMLEV_PLACE = '160'
+SUMLEV_BLOCK = '101'

 # Summary levels to load
-SUMLEVS = [SUMLEV_NATION, SUMLEV_STATE, SUMLEV_COUNTY, SUMLEV_PLACE, SUMLEV_TRACT]
+SUMLEVS = [SUMLEV_NATION, SUMLEV_STATE, SUMLEV_COUNTY, SUMLEV_PLACE, SUMLEV_TRACT, SUMLEV_BLOCK]

 # Mongo
 CENSUS_DB = 'census'
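
The new '101' entry is the Census 2000 summary-level code for blocks; the existing constants are '010' (nation), '040' (state), '050' (county), '140' (tract) and '160' (place). Loaders filter rows against this list, as load_sf_geographies_2000.py does below; a minimal sketch with hypothetical rows:

    import config

    # Hypothetical geo rows; any summary level outside config.SUMLEVS is skipped.
    for row_dict in [{'SUMLEV': '101'}, {'SUMLEV': '500'}]:
        if row_dict['SUMLEV'] not in config.SUMLEVS:
            continue  # e.g. '500' (congressional district) is not loaded
        print 'loading summary level %s' % row_dict['SUMLEV']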
dataprocessing/fetch_sf_data.sh (25 changed lines)
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# NO NATIONAL DATA!!!
+
+rm -r data
+mkdir data
+cd data
+
+# 2000 - DELAWARE - SF1
+wget http://www2.census.gov/census_2000/datasets/Summary_File_1/Delaware/de00001_uf1.zip
+unzip de00001_uf1.zip
+
+wget http://www2.census.gov/census_2000/datasets/Summary_File_1/Delaware/degeo_uf1.zip
+unzip degeo_uf1.zip
+
+wget http://www.census.gov/support/2000/SF1/Access97.zip
+unzip Access97.zip
+mdb-export SF1.mdb SF10001 > sf_data_2000_headers_1.csv
+mdb-export SF1.mdb TABLES > sf_2000_data_labels.csv
+
+rm sf_data_2000_delaware_1.csv
+cat sf_data_2000_headers_1.csv > sf_data_2000_delaware_1.csv
+cat de00001.uf1 >> sf_data_2000_delaware_1.csv
+
+in2csv -f fixed -s ../census2000_geo_schema.csv degeo.uf1 > degeo2000.csv
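
The script stitches Delaware's SF1 data file together with header names exported from the Census Access template via mdb-export, then converts the fixed-width geo file to CSV with csvkit's in2csv. A quick sanity check on the combined file, using the same csvkit reader the loaders use (the assertion is illustrative, not part of the pipeline):

    from csvkit.unicsv import UnicodeCSVReader

    # Confirm the mdb-export header row lines up with the data rows beneath it.
    with open('data/sf_data_2000_delaware_1.csv') as f:
        rows = UnicodeCSVReader(f)
        headers = rows.next()
        first = rows.next()
        assert len(headers) == len(first), 'header/data column mismatch'
        print '%i columns, starting with %s' % (len(headers), headers[:5])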
dataprocessing/load_sf_data_2000.py (61 changed lines)
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+import sys
+
+from csvkit.unicsv import UnicodeCSVReader
+from pymongo import Connection
+
+import config
+import utils
+
+if len(sys.argv) < 2:
+    sys.exit('You must provide the filename of a CSV as an argument to this script.')
+
+FILENAME = sys.argv[1]
+
+YEAR = '2000'
+
+connection = Connection()
+db = connection[config.CENSUS_DB]
+collection = db[config.GEOGRAPHIES_2000_COLLECTION]
+
+with open(FILENAME) as f:
+    rows = UnicodeCSVReader(f)
+    headers = rows.next()
+
+    inserts = 0
+    row_count = 0
+
+    for row in rows:
+        row_count += 1
+        row_dict = dict(zip(headers, row))
+
+        xref = utils.xref_from_row_dict(row_dict)
+
+        geography = utils.find_geography_by_xref(collection, xref)
+
+        if not geography:
+            continue
+
+        if YEAR not in geography['data']:
+            geography['data'][YEAR] = {}
+
+        tables = {}
+
+        for k, v in row_dict.items():
+            t = 'SF' + k[3]
+
+            if t not in tables:
+                tables[t] = {}
+
+            tables[t][k] = int(v)
+
+        for k, v in tables.items():
+            geography['data'][YEAR][k] = v
+
+        collection.save(geography)
+        inserts += 1
+
+print 'Row count: %i' % row_count
+print 'Inserted: %i' % inserts
+
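
The loader groups each data column into a bucket named 'SF' plus the fourth character of the column name, then merges the buckets under data['2000'] on the geography matched by xref. After a load, the result can be spot-checked from pymongo; a sketch assuming Delaware's state-level document (state FIPS '10') was matched:

    from pymongo import Connection

    import config

    collection = Connection()[config.CENSUS_DB][config.GEOGRAPHIES_2000_COLLECTION]

    # Delaware's state-level document: sumlev '040', geoid '10' (state FIPS).
    geography = collection.find_one({'sumlev': '040', 'geoid': '10'})

    if geography:
        print 'table groups loaded: %s' % geography['data']['2000'].keys()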
dataprocessing/load_sf_geographies_2000.py (70 changed lines)
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+
+import sys
+
+from csvkit.unicsv import UnicodeCSVReader
+from pymongo import Connection
+
+import config
+import utils
+
+if len(sys.argv) < 2:
+    sys.exit('You must provide the filename of a CSV as an argument to this script.')
+
+FILENAME = sys.argv[1]
+
+connection = Connection()
+db = connection[config.CENSUS_DB]
+collection = db[config.GEOGRAPHIES_2000_COLLECTION]
+
+with open(FILENAME) as f:
+    rows = UnicodeCSVReader(f)
+    headers = rows.next()
+
+    inserts = 0
+    updates = 0
+    row_count = 0
+
+    for row in rows:
+        row_count += 1
+
+        geography = {
+            #'sumlev': '',
+            #'geoid': '',
+            #'metadata': {},
+            #'xrefs': [],
+            #'data': {}
+            #'xwalk': {}
+            #'shape': '' # TODO
+        }
+        row_dict = dict(zip(headers, row))
+
+        if row_dict['SUMLEV'] not in config.SUMLEVS:
+            continue
+
+        geography['sumlev'] = row_dict.pop('SUMLEV')
+        geography['geoid'] = utils.GEOID_COMPUTERS[geography['sumlev']](row_dict)
+
+        xref = utils.xref_from_row_dict(row_dict)
+
+        existing = collection.find_one(geography)
+        if existing:
+            if xref not in existing['xrefs']:
+                existing['xrefs'].append(xref)
+                collection.save(existing)
+
+                updates += 1
+
+            continue
+
+        geography['xrefs'] = [xref]
+        geography['data'] = {}
+        geography['metadata'] = row_dict
+
+        collection.save(geography)
+        inserts += 1
+
+print 'Row count: %i' % row_count
+print 'Inserted: %i' % inserts
+print 'Updated: %i' % updates
+
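
Each geography document is keyed by (sumlev, geoid), so re-running the loader against another source file appends a new xref rather than duplicating the document. The stored shape, inferred from the code above with illustrative values (the exact xref layout comes from utils.xref_from_row_dict, which is outside this diff):

    # Approximate document shape produced by the loader.
    geography = {
        'sumlev': '050',        # county summary level
        'geoid': '10001',       # Kent County, Delaware (STATE + COUNTY)
        'xrefs': ['...'],       # one entry per source file seen
        'data': {},             # filled in later by load_sf_data_2000.py
        'metadata': {'STATE': '10', 'COUNTY': '001', 'NAME': 'Kent County'},  # trimmed
    }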
dataprocessing/utils.py (4 changed lines)
@@ -18,12 +18,16 @@ def geoid_tract(r):
 def geoid_place(r):
     return r['STATE'] + r['PLACE']

+def geoid_block(r):
+    return r['STATE'] + r['COUNTY'] + r['TRACT'] + r['BLOCK']
+
 GEOID_COMPUTERS = {
     config.SUMLEV_NATION: geoid_nation,
     config.SUMLEV_STATE: geoid_state,
     config.SUMLEV_COUNTY: geoid_county,
     config.SUMLEV_TRACT: geoid_tract,
     config.SUMLEV_PLACE: geoid_place,
+    config.SUMLEV_BLOCK: geoid_block,
 }

 def find_geography_by_xref(collection, xref):
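
geoid_block concatenates the four FIPS components into the standard 15-character block GEOID: 2-digit state, 3-digit county, 6-digit tract, 4-digit block. A quick trace with hypothetical Delaware values, assuming it runs from the dataprocessing directory:

    from utils import geoid_block

    # Hypothetical block in Kent County, Delaware (components are zero-padded strings).
    row = {'STATE': '10', 'COUNTY': '001', 'TRACT': '040100', 'BLOCK': '1000'}
    print geoid_block(row)  # '100010401001000' -- 15 characters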