-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractdata
executable file
·164 lines (146 loc) · 6.6 KB
/
extractdata
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/python3
"""Extract the data in the budget appendix tables into a CSV file.
extractdata <filename>
Produces several CSV files, of which only master.csv is currently useful.
"""
# The other csv files will eventually be suitable for loading into an SQL
# database, but that part of the code isn't quite finished yet (there are some
# missing cross links between the tables and no table table).
import sys
import csv
from xml.etree.ElementTree import ElementTree
import argparse
parser = argparse.ArgumentParser(description='Extract table data from US budget appendix')
parser.add_argument('--db', action='store_true', default=False,
help='Generate normalized CSV files suitable for SQL load')
parser.add_argument('sourcexml', help='XML source of budget appendix')
args = parser.parse_args()
def setup_var(varname, header):
globals()[varname+'_seq'] = 0
if args.db:
this = globals()[varname+'_file'] = csv.writer(open(varname+'.csv', 'w'))
this.writerow(header)
def handle_tree(tree):
for agency in tree.iterfind('.//agency'):
handle_agency(agency)
setup_var('agency', ['agency-seq', 'agency-code', 'agency-name'])
def handle_agency(agency):
global agency_code, agency_name, agency_seq
agency_seq += 1
agency_code = agency.attrib['agency-code']
agency_name = agency.find('header').text.strip()
if args.db:
agency_file.writerow([agency_seq, agency_code, agency_name])
for bureau in agency.iterfind('.//bureau'):
handle_bureau(bureau)
setup_var('bureau', ['agency-seq', 'bureau-code', 'bureau-name'])
def handle_bureau(bureau):
global bureau_code, bureau_name
bureau_code = bureau.attrib['bureau-code']
bureau_name = bureau.find('header').text.strip()
if args.db:
bureau_file.writerow([agency_seq, bureau_code, bureau_name])
for account in bureau.iterfind('.//account'):
if account.attrib['level-in-dtd'].strip() != 'ACCOUNT':
continue
handle_account(account)
setup_var('account', ['account-seq', 'bureau-code', 'treasury-code', 'account_name', 'account-deleted'])
def handle_account(account):
global treasury_code, account_code, account_name, account_deleted
account_code = account.attrib['account-code']
treasury_code = account.attrib['treasury-code']
account_name = account.find('header').text.strip()
account_deleted = account.attrib['account-deleted']
if args.db:
account_file.writerow([account_seq, bureau_code, treasury_code, account_name, account_deleted])
for schedule in account.iterfind('.//schedule'):
handle_schedule(schedule)
master = csv.writer(open('master.csv', 'w'))
master.writerow(['agency-code',
'agency-name',
'bureau-code',
'bureau-name',
'treasury-code',
'account-code',
'account-name',
'account-deleted',
'treasury-code',
'schedule-code',
'schedule-treasury-id',
'schedule-name',
'col3-head',
'col4-head',
'col5-head',
'stub-hierarchy',
'row-num',
'col1',
'col2',
'col3',
'col4',
'col5'
])
setup_var('schedule', ['schedule_seq', 'account_seq', 'bureau_code', 'treasury-id', 'schedule-code', 'schedule-treasury-id', 'schedule-name', 'col3_head', 'col4_head', 'col5_head'])
setup_var('tables', ['schedule_seq', 'row-num', 'stub-hierarchy', 'col1', 'col2', 'col3', 'col4', 'col5'])
def handle_schedule(schedule):
global schedule_seq, schedule_code, schedule_name, schedule_treasury_id, table_type, col3_head, col4_head, col5_head
schedule_seq += 1
schedule_code = schedule.attrib['schedule-code']
tables = schedule.findall('table')
assert len(tables) == 1
table = tables[0]
table_title = table.find('ttitle')
if table_title is not None:
table_title = table_title.text
schedule_name = table_title if table_title is None else table_title.strip()
schedule_treasury_id = None
table_head = schedule.find('.//thead')
if table_head is not None:
rows = table_head.find('row').findall('entry')
schedule_treasury_id = rows[0].text
schedule_treasury_id = schedule.find('.//thead').find('row').find('entry').text
if schedule_treasury_id is not None:
schedule_treasury_id = schedule_treasury_id.strip()
expected_prefix = 'Identification code '
if schedule_treasury_id.startswith(expected_prefix):
schedule_treasury_id = schedule_treasury_id[len(expected_prefix):]
schedule_treasury_id = schedule_treasury_id.replace(chr(8211), '-')
col3_head = rows[1].text.strip()
col4_head = rows[2].text.strip()
if len(rows) < 4:
col5_head = 'NA'
else:
col5_head = rows[3].text.strip()
assert len(rows) < 5
if args.db:
schedule_file.writerow([schedule_seq, account_seq, bureau_code, treasury_code, schedule_code, schedule_treasury_id, schedule_name, col3_head, col4_head, col5_head])
table_body = schedule.find('.//tbody')
if table_body is not None:
for row_num, row in enumerate(table_body.findall('row')):
entries = row.findall('entry')
if len(entries) > 1 and 'stub-hierarchy' in entries[1].attrib:
stub_hierarchy = entries[1].attrib['stub-hierarchy']
else:
stub_hierarchy = None
entries = [entry.text.strip() if entry.text else None for entry in entries]
entries.extend([None] * (5 - len(entries)))
if args.db:
tables_file.writerow([schedule_seq, row_num, stub_hierarchy] + entries)
master.writerow([agency_code,
agency_name,
bureau_code,
bureau_name,
treasury_code,
account_code,
account_name,
account_deleted,
treasury_code,
schedule_code,
schedule_treasury_id,
schedule_name,
col3_head,
col4_head,
col5_head,
stub_hierarchy,
row_num,
] + entries)
handle_tree(ElementTree(file=args.sourcexml))