Skip to content
This repository
Browse code

Added experimental SF (Summary File) 2000 loaders.

  • Loading branch information...
commit 340c769267cb4edce925187735590ce16610ac70 1 parent 1d64821
authored May 17, 2011
2  censusweb/requirements.txt
@@ -10,3 +10,5 @@ pycrypto==2.3
10 10
 python-memcached==1.47
11 11
 simplejson==2.1.3
12 12
 wsgiref==0.1.2
  13
+pymongo==1.11
  14
+csvkit==0.2
13  dataprocessing/batch_sf.sh
... ...
@@ -0,0 +1,13 @@
  1
+#!/bin/bash
  2
+
  3
+# See batch.sh for notes.
  4
+
  5
+#echo "use census; 
  6
+#db.dropDatabase();" | mongo
  7
+
  8
+./fetch_sf_data.sh
  9
+
  10
+./load_sf_geographies_2000.py data/degeo2000.csv
  11
+./load_sf_data_2000.py data/sf_data_2000_delaware_1.csv
  12
+
  13
+#./load_pl_labels_2010.py data/pl_2010_data_labels.csv
3  dataprocessing/config.py
@@ -6,9 +6,10 @@
6 6
 SUMLEV_COUNTY = '050'
7 7
 SUMLEV_TRACT = '140'
8 8
 SUMLEV_PLACE = '160'
  9
+SUMLEV_BLOCK = '101'
9 10
 
10 11
 # Summary levels to load
11  
-SUMLEVS = [SUMLEV_NATION, SUMLEV_STATE, SUMLEV_COUNTY, SUMLEV_PLACE, SUMLEV_TRACT]
  12
+SUMLEVS = [SUMLEV_NATION, SUMLEV_STATE, SUMLEV_COUNTY, SUMLEV_PLACE, SUMLEV_TRACT, SUMLEV_BLOCK]
12 13
 
13 14
 # Mongo
14 15
 CENSUS_DB = 'census'
25  dataprocessing/fetch_sf_data.sh
... ...
@@ -0,0 +1,25 @@
  1
+#!/bin/bash
  2
+
  3
+# NOTE: No national data is fetched here -- only the Delaware SF1 files below.
  4
+
  5
+rm -r data
  6
+mkdir data
  7
+cd data
  8
+
  9
+# 2000 - DELAWARE - SF1
  10
+wget http://www2.census.gov/census_2000/datasets/Summary_File_1/Delaware/de00001_uf1.zip
  11
+unzip de00001_uf1.zip
  12
+
  13
+wget http://www2.census.gov/census_2000/datasets/Summary_File_1/Delaware/degeo_uf1.zip
  14
+unzip degeo_uf1.zip
  15
+
  16
+wget http://www.census.gov/support/2000/SF1/Access97.zip
  17
+unzip Access97.zip
  18
+mdb-export SF1.mdb SF10001 > sf_data_2000_headers_1.csv
  19
+mdb-export SF1.mdb TABLES > sf_2000_data_labels.csv
  20
+
  21
+rm sf_data_2000_delaware_1.csv
  22
+cat sf_data_2000_headers_1.csv > sf_data_2000_delaware_1.csv
  23
+cat de00001.uf1 >> sf_data_2000_delaware_1.csv
  24
+
  25
+in2csv -f fixed -s ../census2000_geo_schema.csv degeo.uf1 > degeo2000.csv
61  dataprocessing/load_sf_data_2000.py
... ...
@@ -0,0 +1,61 @@
  1
+#!/usr/bin/env python
  2
+
  3
+import sys
  4
+
  5
+from csvkit.unicsv import UnicodeCSVReader
  6
+from pymongo import Connection
  7
+
  8
+import config
  9
+import utils
  10
+
  11
+if len(sys.argv) < 2:
  12
+    sys.exit('You must provide the filename of a CSV as an argument to this script.')
  13
+
  14
+FILENAME = sys.argv[1]
  15
+
  16
+YEAR = '2000'
  17
+
  18
+connection = Connection()
  19
+db = connection[config.CENSUS_DB] 
  20
+collection = db[config.GEOGRAPHIES_2000_COLLECTION]
  21
+
  22
+with open(FILENAME) as f:
  23
+    rows = UnicodeCSVReader(f)
  24
+    headers = rows.next()
  25
+
  26
+    inserts = 0
  27
+    row_count = 0
  28
+
  29
+    for row in rows:
  30
+        row_count += 1
  31
+        row_dict = dict(zip(headers, row))
  32
+
  33
+        xref = utils.xref_from_row_dict(row_dict)
  34
+
  35
+        geography = utils.find_geography_by_xref(collection, xref) 
  36
+
  37
+        if not geography:
  38
+            continue
  39
+
  40
+        if YEAR not in geography['data']:
  41
+            geography['data'][YEAR] = {}
  42
+
  43
+        tables = {}
  44
+
  45
+        for k, v in row_dict.items():
  46
+            t = 'SF' + k[3]
  47
+
  48
+            if t not in tables:
  49
+                tables[t] = {}
  50
+
  51
+            tables[t][k] = int(v)
  52
+
  53
+        for k, v in tables.items():
  54
+            geography['data'][YEAR][k] = v 
  55
+
  56
+        collection.save(geography)
  57
+        inserts += 1
  58
+
  59
+print 'Row count: %i' % row_count
  60
+print 'Inserted: %i' % inserts
  61
+
70  dataprocessing/load_sf_geographies_2000.py
... ...
@@ -0,0 +1,70 @@
  1
+#!/usr/bin/env python
  2
+
  3
+import sys
  4
+
  5
+from csvkit.unicsv import UnicodeCSVReader
  6
+from pymongo import Connection
  7
+
  8
+import config
  9
+import utils
  10
+
  11
+if len(sys.argv) < 2:
  12
+    sys.exit('You must provide the filename of a CSV as an argument to this script.')
  13
+
  14
+FILENAME = sys.argv[1]
  15
+
  16
+connection = Connection()
  17
+db = connection[config.CENSUS_DB]
  18
+collection = db[config.GEOGRAPHIES_2000_COLLECTION]
  19
+
  20
+with open(FILENAME) as f:
  21
+    rows = UnicodeCSVReader(f)
  22
+    headers = rows.next()
  23
+
  24
+    inserts = 0
  25
+    updates = 0
  26
+    row_count = 0
  27
+
  28
+    for row in rows:
  29
+        row_count += 1
  30
+
  31
+        geography = {
  32
+            #'sumlev': '',
  33
+            #'geoid': '',
  34
+            #'metadata': {},
  35
+            #'xrefs': [],
  36
+            #'data': {}
  37
+            #'xwalk': {}
  38
+            #'shape': ''     # TODO
  39
+        }
  40
+        row_dict = dict(zip(headers, row))
  41
+
  42
+        if row_dict['SUMLEV'] not in config.SUMLEVS:
  43
+            continue
  44
+
  45
+        geography['sumlev'] = row_dict.pop('SUMLEV')
  46
+        geography['geoid'] = utils.GEOID_COMPUTERS[geography['sumlev']](row_dict)
  47
+
  48
+        xref = utils.xref_from_row_dict(row_dict) 
  49
+
  50
+        existing = collection.find_one(geography)
  51
+        if existing:
  52
+            if xref not in existing['xrefs']:
  53
+                existing['xrefs'].append(xref)
  54
+                collection.save(existing)
  55
+
  56
+                updates += 1
  57
+
  58
+            continue
  59
+
  60
+        geography['xrefs'] = [xref]
  61
+        geography['data'] = {}
  62
+        geography['metadata'] = row_dict
  63
+
  64
+        collection.save(geography)
  65
+        inserts += 1
  66
+
  67
+print 'Row count: %i' % row_count
  68
+print 'Inserted: %i' % inserts
  69
+print 'Updated: %i' % updates
  70
+
4  dataprocessing/utils.py
@@ -18,12 +18,16 @@ def geoid_tract(r):
18 18
 def geoid_place(r):
19 19
     return r['STATE'] + r['PLACE']
20 20
 
  21
+def geoid_block(r):
  22
+    return r['STATE'] + r['COUNTY'] + r['TRACT'] + r['BLOCK']
  23
+
21 24
 GEOID_COMPUTERS = {
22 25
     config.SUMLEV_NATION: geoid_nation,
23 26
     config.SUMLEV_STATE: geoid_state,
24 27
     config.SUMLEV_COUNTY: geoid_county,
25 28
     config.SUMLEV_TRACT: geoid_tract,
26 29
     config.SUMLEV_PLACE: geoid_place,
  30
+    config.SUMLEV_BLOCK: geoid_block,
27 31
 }
28 32
 
29 33
 def find_geography_by_xref(collection, xref):

0 notes on commit 340c769

Please sign in to comment.
Something went wrong with that request. Please try again.