#!/usr/bin/env python
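
# Parses a CSV of 2010 SF1 table and label definitions (table id, line number,
# indent, label text), maps each 2010 field key to its 2000 equivalent using
# field_mappings_2000_2010.csv, and saves the resulting label metadata to the
# MongoDB label collection under dataset 'SF1'.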
import csv
import re
import sys

from csvkit.unicsv import UnicodeCSVReader

import config, utils
import logging

TABLE_NAME_PATTERN = re.compile(r'^(?P<name>.+)\s+\[(?P<size>\d+)\].*$')


KEY_MAPPINGS = {}

with open('field_mappings_2000_2010.csv', 'rU') as f:
    reader = csv.DictReader(f)

    for row in reader:
        # Skip fields that don't map
        if not row['field_2000']:
            continue

        if not row['field_2010']:
            continue

        # TODO - skipping computed fields
        if '-' in row['field_2000'] or '+' in row['field_2000']:
            continue

        if '-' in row['field_2010'] or '+' in row['field_2010']:
            continue

        KEY_MAPPINGS[row['field_2010']] = row['field_2000']

YEAR = '2010'

def is_skipworthy_row(row):
    # Rows that carry no data: entirely blank rows, 'NOTE' rows, and section
    # headers (second column filled but no table id).
    chk = set(row)
    if len(chk) == 1 and '' in chk:
        return True
    if row[:3] == ['','',''] and row[3].startswith('NOTE'):
        return True
    if row[1] and not row[0]: # section header
        return True
    return False

def dictify_row(row):
    # Turn a raw CSV row into a dict with table_id, line, indent, the label
    # columns, and a 'continuation' flag; return None for skipworthy rows.
    row = map(unicode.strip,row)
    if is_skipworthy_row(row): return None
    table_id, line, indent = row[0:3]
    continuation = not table_id and not line and not indent
    if table_id:
        if table_id.endswith('.'): table_id = table_id[:-1]
    if indent:
        indent = int(indent)
    if line:
        line = int(line)
    return {
        'table_id': table_id,
        'line': line,
        'indent': indent,
        'labels': row[3:9],
        'continuation': continuation
    }

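# Expects one command-line argument: the path to the SF1 labels CSV to load.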
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('You must provide the filename of a CSV as an argument to this script.')

    FILENAME = sys.argv[1]

    with open(FILENAME) as f:
        rows = UnicodeCSVReader(f, encoding='latin-1')
        headers = rows.next()

        inserts = 0
        row_count = 0
        skipped = 0

        table = None
        tables = {}
        hierarchy = []
        last_key = ''
        last_indent = 0

        for row in rows:
            row_count += 1
            if not row: continue
            row = map(unicode.strip,row)
            row = dictify_row(row)
            if row:
                if row['continuation']:
                    # label text spilled onto a following row; append it to the previous label
                    idx = last_processed['indent'] + 1
                    fragment = row['labels'][idx]
                    last_processed['text'] += ' %s' % fragment
                    continue

                table = tables.setdefault(row['table_id'], { 'key': row['table_id'], 'year': '2010', 'labels': {} })

                if not row['line']: # we probably have a table name or a universe
                    if row['labels'][0].startswith("Universe:"):
                        parts = row['labels'][0].split(":", 2)
                        table['universe'] = parts[1].strip()
                    else:
                        # we know that they have extra labels for "indents" for avg/median that we just want to skip
                        if not row['labels'][0].startswith('Average') and not row['labels'][0].startswith('Median'):
                            match = TABLE_NAME_PATTERN.match(row['labels'][0])
                            if not match:
                                if not row['labels'][0]: continue
                                # the table name wrapped onto the next row; pull it in and retry
                                fix_row = rows.next()
                                dfr = dictify_row(fix_row)
                                row['labels'][0] += ' %s' % dfr['labels'][1]
                                match = TABLE_NAME_PATTERN.match(row['labels'][0])
                                if not match:
                                    logging.warn("Expected a table name at row %i [%s]" % (row_count, row['labels'][0]))
                                    continue
                            name_dict = match.groupdict()
                            table['name'] = name_dict['name']
                            table['size'] = int(name_dict['size'])
                else: # there's a line number
                    key = utils.generate_stat_key(row['table_id'],row['line'])
                    parent = parent_key = None
                    if row['indent'] > 0:
                        # walk back up the table looking for the label one indent level up
                        chk_line = row['line']
                        while parent is None and chk_line > 1:
                            chk_line -= 1
                            parent_key = utils.generate_stat_key(row['table_id'],chk_line)
                            chk_parent = table['labels'][parent_key]
                            if chk_parent['indent'] == row['indent'] - 1:
                                parent = chk_parent
                                parent['has_children'] = True
                                parent_key = parent['key']

                    last_processed = {
                        'key': key,
                        'text': row['labels'][row['indent']],
                        'indent': row['indent'],
                        'parent': parent_key,
                        'has_children': False, # maybe! we'll reset this later in the loop if we discover otherwise. look up.
                        'key_2000': KEY_MAPPINGS[key] if key in KEY_MAPPINGS else None,
                    } # keep it around for later

                    table['labels'][key] = last_processed # but also save it...
            else:
                # row was skipworthy; count it so the summary below reports something useful
                skipped += 1
    # Save final table
    # sanity check:
    for k,v in tables.items():
        if not k:
            print "still have an empty key!"
        else:
            if k != v['key']:
                raise AssertionError("Keys don't match for k=%s" % k)
            try:
                if len(v['labels']) != v['size']:
                    raise AssertionError("Not enough labels for k=%s expected %i got %i" % (k,v['size'],len(v['labels'])))
            except KeyError:
                print "Unexpectedly missing size for table %s keys: %s" % (k, ','.join(v.keys()))

    collection = utils.get_label_collection()
    collection.remove({ 'dataset': 'SF1' }, safe=True)
    collection.save({ 'dataset': 'SF1', 'tables': tables}, safe=True)

    print 'load_sf_labels_2010:'
    print ' Row count: %i' % row_count
    print ' Skipped: %i' % skipped
    print ' Tables: %i' % len(tables)