-
Notifications
You must be signed in to change notification settings - Fork 33
/
load_sf_labels_2010.py
executable file
·168 lines (140 loc) · 6.06 KB
/
load_sf_labels_2010.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/env python
import csv
import re
import sys
from csvkit.unicsv import UnicodeCSVReader
import config, utils
import logging
# Matches a table-name cell like "HOUSEHOLD TYPE [15]": captures the title
# as 'name' and the bracketed cell count as 'size'.
TABLE_NAME_PATTERN = re.compile(r'^(?P<name>.+)\s+\[(?P<size>\d+)\].*$')

# Crosswalk of 2010 stat keys -> equivalent 2000 stat keys, loaded from the
# mapping CSV that ships alongside this script.
KEY_MAPPINGS = {}
with open('field_mappings_2000_2010.csv', 'rU') as mapping_file:
    for mapping in csv.DictReader(mapping_file):
        key_2000 = mapping['field_2000']
        key_2010 = mapping['field_2010']
        # Skip fields that don't map
        if not key_2000 or not key_2010:
            continue
        # TODO - skipping computed fields
        if '-' in key_2000 or '+' in key_2000:
            continue
        if '-' in key_2010 or '+' in key_2010:
            continue
        KEY_MAPPINGS[key_2010] = key_2000
YEAR = '2010'
def is_skipworthy_row(row):
    """Return True if a raw CSV row carries no label data and should be skipped.

    Skipworthy rows are: entirely empty rows, footnote rows (three blank
    cells followed by a cell starting with 'NOTE'), and section-header rows
    (a value in column 1 but no table id in column 0).

    Fix: the original indexed row[1]/row[3] unguarded, raising IndexError on
    rows shorter than the expected width (e.g. a single-cell row); length
    guards make short rows fall through to the normal checks safely.
    """
    distinct = set(row)
    # Entirely empty row: every cell is the empty string.
    if len(distinct) == 1 and '' in distinct:
        return True
    # Footnote row: three leading blanks, then a "NOTE..." cell.
    if len(row) > 3 and row[:3] == ['', '', ''] and row[3].startswith('NOTE'):
        return True
    # Section header: text in column 1 without a table id in column 0.
    if len(row) > 1 and row[1] and not row[0]:
        return True
    return False
def dictify_row(row):
    """Turn a raw CSV row into a dict describing one label line.

    Returns None for rows with no label data. A row whose first three
    cells (table id, line number, indent) are all blank is flagged as a
    continuation of the previous label's text.
    """
    cells = [unicode.strip(cell) for cell in row]
    if is_skipworthy_row(cells):
        return None

    table_id = cells[0]
    line = cells[1]
    indent = cells[2]
    is_continuation = not (table_id or line or indent)

    # Table ids occasionally carry a trailing period; normalize it away.
    if table_id.endswith('.'):
        table_id = table_id[:-1]
    # Blank line/indent cells stay '' (falsy); non-blank ones become ints.
    if indent:
        indent = int(indent)
    if line:
        line = int(line)

    return {
        'table_id': table_id,
        'line': line,
        'indent': indent,
        'labels': cells[3:9],
        'continuation': is_continuation,
    }
if __name__ == '__main__':
    # Usage: load_sf_labels_2010.py <labels.csv>
    if len(sys.argv) < 2:
        sys.exit('You must provide the filename of a CSV as an argument to this script.')
    FILENAME = sys.argv[1]
    with open(FILENAME) as f:
        # Census source files are latin-1; decode to unicode as we read.
        rows = UnicodeCSVReader(f, encoding='latin-1')
        headers = rows.next()  # consume and discard the header row
        inserts = 0  # NOTE(review): never updated or reported — apparently vestigial
        row_count = 0
        skipped = 0  # NOTE(review): printed at the end but never incremented
        table = None
        tables = {}  # table_id -> {key, year, labels, and later name/size/universe}
        hierarchy = []  # NOTE(review): unused below
        last_key = ''  # NOTE(review): unused below
        last_indent = 0  # NOTE(review): unused below
        for row in rows:
            row_count += 1
            if not row: continue
            row = map(unicode.strip,row)
            row = dictify_row(row)
            if row:
                if row['continuation']:
                    # A continuation row extends the text of the most recently
                    # processed label; its fragment sits one column right of
                    # that label's own indent position.
                    # NOTE(review): last_processed is unbound if the first data
                    # row is a continuation — that would raise NameError.
                    idx = last_processed['indent'] + 1
                    fragment = row['labels'][idx]
                    last_processed['text'] += ' %s' % fragment
                    continue
                # Create the table record on first sight of this table_id.
                table = tables.setdefault(row['table_id'],{ 'key': row['table_id'], 'year': '2010', 'labels': {} })
                if not row['line']: # we probably have a table name or a universe
                    if row['labels'][0].startswith("Universe:"):
                        # e.g. "Universe: Total population" -> "Total population"
                        parts = row['labels'][0].split(":", 2)
                        table['universe'] = parts[1].strip()
                    else:
                        # we know that they have extra labels for "indents" for avg/median that we just want to skip
                        if not row['labels'][0].startswith('Average') and not row['labels'][0].startswith('Median'):
                            match = TABLE_NAME_PATTERN.match(row['labels'][0])
                            if not match:
                                if not row['labels'][0]: continue
                                # Table name wrapped onto the next physical row:
                                # pull that row and glue its fragment on before
                                # retrying the pattern.
                                fix_row = rows.next()
                                dfr = dictify_row(fix_row)
                                row['labels'][0] += ' %s' % dfr['labels'][1]
                                match = TABLE_NAME_PATTERN.match(row['labels'][0])
                                if not match:
                                    logging.warn( "Expected a table name at row %i [%s]" % ( row_count, row['labels'][0] ) )
                                    continue
                            # Pattern captures the title and the "[N]" cell count.
                            name_dict = match.groupdict()
                            table['name'] = name_dict['name']
                            table['size'] = int(name_dict['size'])
                else: # there's a line number
                    key = utils.generate_stat_key(row['table_id'],row['line'])
                    parent = parent_key = None
                    if row['indent'] > 0:
                        # Walk backwards through earlier line numbers until we
                        # find the nearest label exactly one indent level up —
                        # that label is this row's parent.
                        chk_line = row['line']
                        while parent is None and chk_line > 1:
                            chk_line -= 1
                            parent_key = utils.generate_stat_key(row['table_id'],chk_line)
                            chk_parent = table['labels'][parent_key]
                            if chk_parent['indent'] == row['indent'] - 1:
                                parent = chk_parent
                                parent['has_children'] = True
                                parent_key = parent['key']
                    last_processed = {
                        'key': key,
                        'text': row['labels'][row['indent']],
                        'indent': row['indent'],
                        'parent': parent_key,
                        'has_children': False, #maybe! we'll reset this later in the loop if we discover otherwise. look up.
                        'key_2000': KEY_MAPPINGS[key] if key in KEY_MAPPINGS else None,
                    } # keep it around for later
                    table['labels'][key] = last_processed # but also save it...
    # Save final table
    # sanity check:
    for k,v in tables.items():
        if not k:
            print "still have an empty key!"
        else:
            if k != v['key']:
                raise AssertionError("Keys don't match for k=%s" % k)
            try:
                # A table must have exactly as many labels as the [N] cell
                # count declared on its table-name row.
                if len(v['labels']) != v['size']:
                    raise AssertionError("Not enough labels for k=%s expected %i got %i" % (k,v['size'],len(v['labels'])))
            except KeyError:
                # 'size' missing means the table-name row was never parsed.
                print "Unexpectedly missing size for table %s keys: %s" % (k, ','.join(v.keys()))
    # Replace any previous SF1 label document wholesale.
    # NOTE(review): safe=True is the legacy pre-PyMongo-3 write-concern flag;
    # remove()/save() were removed in PyMongo 3 — confirm pinned driver version.
    collection = utils.get_label_collection()
    collection.remove({ 'dataset': 'SF1' }, safe=True)
    collection.save({ 'dataset': 'SF1', 'tables': tables}, safe=True)
    print 'load_sf_labels_2010:'
    print ' Row count: %i' % row_count
    print ' Skipped: %i' % skipped
    print ' Tables: %i' % len(tables)