-
Notifications
You must be signed in to change notification settings - Fork 2
/
uscore-data-script.rb
295 lines (260 loc) · 16.4 KB
/
uscore-data-script.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
require 'pry'
require 'fhir_models'
require 'fileutils'
require './lib/time.rb'
require './lib/constraints.rb'
require './lib/modifications.rb'
require './lib/bulk_data_converter.rb'
# Seed passed to Synthea (-s) and to the post-processing modifications.
RAND_SEED = 3
# Script start (epoch seconds); used for the elapsed-time reports.
start = Time.now.to_i

# Passing `mrburns` anywhere on the command line selects the Mr. Burns data set.
MRBURNS = ARGV.include?('mrburns')
if MRBURNS
  puts 'Generating Mr. Burns...'
  # Drop the constraints this mode does not need, then layer its own on top.
  DataScript::Constraints::CONSTRAINTS_MRBURNS_DOES_NOT_NEED.each do |name|
    DataScript::Constraints::CONSTRAINTS.delete(name)
  end
  DataScript::Constraints::CONSTRAINTS.merge!(DataScript::Constraints::CONSTRAINTS_MRBURNS)
  # The medication profile is not required of the selected patients in this mode.
  DataScript::Constraints::REQUIRED_PROFILES.delete('http://hl7.org/fhir/us/core/StructureDefinition/us-core-medication')
end
puts 'Generating Synthetic Patients with Synthea...'
# Working directories, outermost first.
output = 'output'
output_raw = 'output/raw'
output_raw_fhir = 'output/raw/fhir'
# mkdir_p is a no-op for directories that already exist and creates missing
# parents, so no existence check is needed (File.exists? is gone in Ruby 3.2).
[output, output_raw, output_raw_fhir].each { |dir| FileUtils.mkdir_p(dir) }
# Clear the logs and raw FHIR JSON left behind by a previous run.
FileUtils.rm Dir.glob("./#{output}/*.log")
FileUtils.rm Dir.glob("./#{output_raw_fhir}/*.json")
# Manually list out the classpath, because it needs to be loaded in a specific order...
# Keep the literal on ONE line: a line break inside it becomes part of a jar path.
CLASSPATH='lib/synthea/synthea.jar:lib/synthea/SimulationCoreLibrary_v1.5_slim.jar:lib/synthea/hapi-fhir-structures-dstu3-4.1.0.jar:lib/synthea/hapi-fhir-structures-dstu2-4.1.0.jar:lib/synthea/hapi-fhir-structures-r4-4.1.0.jar:lib/synthea/org.hl7.fhir.dstu3-4.1.0.jar:lib/synthea/org.hl7.fhir.r4-4.1.0.jar:lib/synthea/org.hl7.fhir.utilities-4.1.0.jar:lib/synthea/hapi-fhir-base-4.1.0.jar:lib/synthea/gson-2.8.5.jar:lib/synthea/json-path-2.4.0.jar:lib/synthea/freemarker-2.3.26-incubating.jar:lib/synthea/h2-1.4.196.jar:lib/synthea/guava-28.0-jre.jar:lib/synthea/graphviz-java-0.2.2.jar:lib/synthea/commons-csv-1.5.jar:lib/synthea/jackson-dataformat-csv-2.8.8.jar:lib/synthea/snakeyaml-1.25.jar:lib/synthea/commons-math3-3.6.1.jar:lib/synthea/commons-text-1.7.jar:lib/synthea/cql-engine-1.3.10-SNAPSHOT.jar:lib/synthea/cql-to-elm-1.3.17.jar:lib/synthea/cql-1.3.17.jar:lib/synthea/elm-1.3.17.jar:lib/synthea/model-1.3.17.jar:lib/synthea/jaxb-runtime-2.3.0.jar:lib/synthea/jaxb-core-2.3.0.jar:lib/synthea/jaxb-api-2.3.0.jar:lib/synthea/activation-1.1.1.jar:lib/synthea/quick-1.3.17.jar:lib/synthea/qdm-1.3.17.jar:lib/synthea/jaxb2-basics-0.9.4.jar:lib/synthea/jaxb2-basics-tools-0.9.4.jar:lib/synthea/jcl-over-slf4j-1.7.28.jar:lib/synthea/jul-to-slf4j-1.7.25.jar:lib/synthea/slf4j-log4j12-1.7.25.jar:lib/synthea/jsbml-1.4.jar:lib/synthea/jsbml-arrays-1.4.jar:lib/synthea/jsbml-comp-1.4.jar:lib/synthea/jsbml-distrib-1.3.1.jar:lib/synthea/jsbml-dyn-1.4.jar:lib/synthea/jsbml-fbc-1.4.jar:lib/synthea/jsbml-groups-1.4.jar:lib/synthea/jsbml-render-1.4.jar:lib/synthea/jsbml-layout-1.4.jar:lib/synthea/jsbml-multi-1.4.jar:lib/synthea/jsbml-qual-1.4.jar:lib/synthea/jsbml-req-1.4.jar:lib/synthea/jsbml-spatial-1.4.jar:lib/synthea/jsbml-tidy-1.4.jar:lib/synthea/jsbml-core-1.4.jar:lib/synthea/biojava-ontology-4.0.0.jar:lib/synthea/log4j-slf4j-impl-2.1.jar:lib/synthea/slf4j-api-1.7.28.jar:lib/synthea/commons-math-2.2.jar:lib/synthea/jfreechart-1.5.0.jar:lib/synthea/json-smart-2.3.jar:lib/synthea/commons-lang3-3.9.jar:lib/synthea/commons-codec-1.12.jar:lib/synthea/batik-codec-1.9.jar:lib/synthea/batik-rasterizer-1.9.jar:lib/synthea/batik-svgrasterizer-1.9.jar:lib/synthea/batik-transcoder-1.9.jar:lib/synthea/batik-bridge-1.9.jar:lib/synthea/batik-script-1.9.jar:lib/synthea/batik-anim-1.9.jar:lib/synthea/batik-svg-dom-1.9.jar:lib/synthea/batik-dom-1.9.jar:lib/synthea/batik-css-1.9.jar:lib/synthea/xmlgraphics-commons-2.2.jar:lib/synthea/commons-io-2.6.jar:lib/synthea/ucum-1.0.2.jar:lib/synthea/jsr305-3.0.2.jar:lib/synthea/j2v8_macosx_x86_64-4.6.0.jar:lib/synthea/j2v8_linux_x86_64-4.6.0.jar:lib/synthea/j2v8_win32_x86_64-4.6.0.jar:lib/synthea/j2v8_win32_x86-4.6.0.jar:lib/synthea/commons-exec-1.3.jar:lib/synthea/jackson-databind-2.10.1.jar:lib/synthea/jackson-core-2.10.1.jar:lib/synthea/jackson-annotations-2.10.1.jar:lib/synthea/jaxb2-fluent-api-3.0.jar:lib/synthea/hamcrest-all-1.3.jar:lib/synthea/hamcrest-json-0.2.jar:lib/synthea/jaxb-impl-2.3.0.1.jar:lib/synthea/jaxb-core-2.3.0.1.jar:lib/synthea/javax.activation-1.2.0.jar:lib/synthea/eclipselink-2.6.0.jar:lib/synthea/validation-api-1.1.0.Final.jar:lib/synthea/antlr4-4.5.jar:lib/synthea/jopt-simple-4.7.jar:lib/synthea/stax-ex-1.7.8.jar:lib/synthea/FastInfoset-1.2.13.jar:lib/synthea/accessors-smart-1.2.jar:lib/synthea/failureaccess-1.0.1.jar:lib/synthea/listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar:lib/synthea/checker-qual-2.8.1.jar:lib/synthea/error_prone_annotations-2.3.2.jar:lib/synthea/j2objc-annotations-1.3.jar:lib/synthea/animal-sniffer-annotations-1.17.jar:lib/synthea/xpp3-1.1.4c.jar:lib/synthea/xpp3_xpath-1.1.4c.jar:lib/synthea/json-simple-1.1.1.jar:lib/synthea/junit-4.12.jar:lib/synthea/batik-parser-1.9.jar:lib/synthea/batik-gvt-1.9.jar:lib/synthea/batik-svggen-1.9.jar:lib/synthea/batik-awt-util-1.9.jar:lib/synthea/batik-xml-1.9.jar:lib/synthea/batik-util-1.9.jar:lib/synthea/xalan-2.7.2.jar:lib/synthea/serializer-2.7.2.jar:lib/synthea/xml-apis-1.3.04.jar:lib/synthea/jaxb2-basics-runtime-0.9.4.jar:lib/synthea/javaparser-1.0.11.jar:lib/synthea/jsonassert-1.1.1.jar:lib/synthea/hamcrest-core-1.3.jar:lib/synthea/log4j-1.2.17.jar:lib/synthea/javax.persistence-2.1.0.jar:lib/synthea/commonj.sdo-2.1.1.jar:lib/synthea/javax.json-1.0.4.jar:lib/synthea/antlr4-runtime-4.5.jar:lib/synthea/ST4-4.0.8.jar:lib/synthea/antlr-runtime-3.5.2.jar:lib/synthea/txw2-2.3.0.jar:lib/synthea/istack-commons-runtime-3.0.5.jar:lib/synthea/log4j-1.2-api-2.3.jar:lib/synthea/log4j-core-2.3.jar:lib/synthea/woodstox-core-5.0.1.jar:lib/synthea/jigsaw-2.2.6.jar:lib/synthea/xstream-1.3.1.jar:lib/synthea/staxmate-2.3.0.jar:lib/synthea/jtidy-r938.jar:lib/synthea/asm-5.0.4.jar:lib/synthea/batik-ext-1.9.jar:lib/synthea/xml-apis-ext-1.3.04.jar:lib/synthea/batik-constants-1.9.jar:lib/synthea/batik-i18n-1.9.jar:lib/synthea/commons-beanutils-1.9.2.jar:lib/synthea/json-20090211.jar:lib/synthea/commons-collections-3.2.1.jar:lib/synthea/org.abego.treelayout.core-1.0.1.jar:lib/synthea/log4j-api-2.3.jar:lib/synthea/stax2-api-3.1.4.jar:lib/synthea/xpp3_min-1.1.4c.jar:lib/synthea/commons-logging-1.0.4.jar'
# Exporter switches shared by both data sets (whitespace-separated flags).
CONFIG='--exporter.fhir.use_us_core_ig=true --exporter.baseDirectory=./output/raw --exporter.hospital.fhir.export=false --exporter.practitioner.fhir.export=false --exporter.groups.fhir.export=true'

# Arguments that differ between the two data sets; everything else is shared.
synthea_args =
  if MRBURNS
    ['-a', '80-81', '-g', 'M', '-p', '50', *CONFIG.split, '--exporter.years_of_history=0']
  else
    ['-p', '160', *CONFIG.split]
  end

# Run Synthea without a shell; its stdout goes to output/synthea.log.
synthea_ok = system('java', '-cp', CLASSPATH, 'App', '-s', RAND_SEED.to_s, *synthea_args,
                    out: 'output/synthea.log')
# system returns false on a non-zero exit and nil when java cannot be started.
# Stop here: the later stages would otherwise run against an empty data set.
abort 'Synthea generation failed; see output/synthea.log' unless synthea_ok
tok = Time.now.to_i
puts " Generated data in #{DataScript::TimeUtilities.pretty(tok - start)}."

puts 'Loading FHIR Bundles...'
# Patient Bundles parsed from the raw Synthea output.
records = []
# The Group resource found in the same folder, if any (later files overwrite earlier ones).
all_group = nil
input_folder = File.join(File.dirname(__FILE__), './output/raw/fhir')
Dir.foreach(input_folder) do |entry|
  # Only JSON files are read; hospital/practitioner information files are skipped.
  next if !entry.end_with?('.json') || entry.start_with?('hospitalInformation', 'practitionerInformation')

  contents = File.read(File.join(input_folder, entry), encoding: 'UTF-8')
  resource = FHIR.from_contents(contents)
  # Any other resource type is ignored.
  case resource.resourceType
  when 'Group' then all_group = resource
  when 'Bundle' then records << resource
  end
end
tik = Time.now.to_i
puts " Loaded #{records.length} FHIR Bundles in #{DataScript::TimeUtilities.pretty(tik - tok)}."
# Constraint checker used by the selection passes and the final report
constraints = DataScript::Constraints.new
# Selections (plural): the patient bundles picked so far
selections = []
# Selection (singular): the most recently picked patient bundle
selection = nil
# Satisfied: names (as strings) of the constraints the current selections satisfy
satisfied = []
# Unsatisfied: names (as strings) of the constraints that have yet to be satisfied
unsatisfied = DataScript::Constraints::CONSTRAINTS.keys - satisfied
puts 'Selecting patients by constraints...'
# Greedy pass. Keep going while constraints remain unsatisfied, records remain
# to choose from, and the last pick was still useful (it scored above zero).
while !unsatisfied.empty? && !records.empty? && selection&.total != 0
  # Score every remaining record; Bundle#total is borrowed as a scratch score field.
  puts ' Scoring Records...'
  records.each do |candidate|
    constraints.satisfied?([candidate], unsatisfied)
    candidate.total = unsatisfied.length - constraints.violations.length
  end
  # Ascending sort puts the best-scoring record last, so pop takes it off the list.
  records.sort! { |left, right| left.total <=> right.total }
  # puts " #{records.map {|b| b.total}}" # debug scores
  selection = records.pop
  # Re-check the pick against the full constraint set to learn what it satisfies.
  constraints.satisfied?([selection])
  still_violated = constraints.violations
  newly_met = DataScript::Constraints::CONSTRAINTS.keys - still_violated
  satisfied |= newly_met
  unsatisfied = DataScript::Constraints::CONSTRAINTS.keys - satisfied
  # Keep the pick only if it satisfied at least one outstanding constraint.
  if selection.total.positive?
    puts " Selected: #{newly_met}"
    selections << selection
  else
    puts " Done."
  end
end
# Clear the scratch scores again.
selections.each { |picked| picked.total = nil }
# Second pass: which required profiles do the selections not cover yet?
selection = nil
profiles_present = constraints.profiles_present(selections)
# In Mr. Burns mode the medication profile is counted as already present.
profiles_present << 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-medication' if MRBURNS
profiles_missing = DataScript::Constraints::REQUIRED_PROFILES - profiles_present
puts 'Selecting patients by profile...' unless profiles_missing.empty?
# Greedy pass. Keep going while profiles are missing, records remain, and the
# last pick still contributed a profile that was not present before.
while !profiles_missing.empty? && !records.empty? && selection&.total != 0
  # Score = number of profiles a record would add; Bundle#total is the scratch field.
  puts ' Scoring Records...'
  records.each do |candidate|
    candidate_profiles = constraints.profiles_present([candidate])
    candidate.total = (candidate_profiles - profiles_present).length
  end
  # Ascending sort puts the best-scoring record last, so pop takes it off the list.
  records.sort! { |left, right| left.total <=> right.total }
  # puts " #{records.map {|b| b.total}}" # debug scores
  selection = records.pop
  picked_profiles = constraints.profiles_present([selection])
  # Work out what the pick adds before folding it into the running set.
  added_profiles = picked_profiles - profiles_present
  profiles_present |= picked_profiles
  profiles_missing = DataScript::Constraints::REQUIRED_PROFILES - profiles_present
  # Keep the pick only if it contributed at least one new profile.
  if selection.total.positive?
    puts " Selected: #{added_profiles}"
    selections << selection
  else
    puts " Done."
  end
end
# Clear the scratch scores again.
selections.each { |picked| picked.total = nil }
tok = Time.now.to_i
puts " Selected #{selections.length} patients (#{DataScript::TimeUtilities.pretty(tok - tik)})."

# Post-process the selections.
puts 'Modifying selected patients...'
# modify! returns a bundle (or nil) that is written out separately further down.
patient_bundle_absent_name = DataScript::Modifications.modify!(selections, RAND_SEED)
tik = Time.now.to_i
puts " Modified patients (#{DataScript::TimeUtilities.pretty(tik - tok)})."

# Set the last element aside during the checks; it is re-appended below as the Group.
# NOTE(review): presumably modify! appended that Group to selections; confirm in lib/modifications.rb.
group = selections.pop

puts 'Final constraint testing...'
puts(
  if constraints.satisfied?(selections)
    ' All constraints satisfied.'
  else
    " #{constraints.violations.length} remaining constraints violated: #{constraints.violations}"
  end
)
profiles_present = constraints.profiles_present(selections)
profiles_missing = DataScript::Constraints::REQUIRED_PROFILES - profiles_present
puts(
  if profiles_missing.empty?
    ' All profiles present.'
  else
    " Missing #{profiles_missing.length} profiles: #{profiles_missing}"
  end
)

# Add the Group back
selections << group

# The patient with primitive extensions is taken out of both lists,
# because its JSON has to be written out separately.
[selections, records].each { |list| list.delete(patient_bundle_absent_name) } if patient_bundle_absent_name
# Save selections: one JSON file per selected resource under output/data.
tik = Time.now.to_i
output_data = 'output/data'
puts "Overwriting selections into ./#{output_data}"
# mkdir_p tolerates an existing directory (File.exists? is gone in Ruby 3.2).
FileUtils.mkdir_p(output_data)
FileUtils.rm Dir.glob("./#{output_data}/*.json")
selections.each do |bundle|
  # A Bundle is named after its first entry's resource id; anything else by its own id.
  id = bundle.resourceType == 'Bundle' ? bundle.entry.first.resource.id : bundle.id
  json_string = bundle.to_json
  # json_string.gsub!('"value": "DATAABSENTREASONEXTENSIONGOESHERE"', "\"_value\": { \"extension\": [ #{DataScript::Modifications.data_absent_reason.to_json} ] }")
  # File.write closes the handle even if the write raises.
  File.write("#{output_data}/#{id}.json", json_string, encoding: 'UTF-8')
end

# JSON of the name-less Patient resource alone; handed to the bulk-data converter later.
patient_without_name_json = nil
if patient_bundle_absent_name
  # we need to manually manipulate the JSON for this one bundle,
  # because the fhir_models gem does not support primitive extensions.
  json = JSON.parse(patient_bundle_absent_name.to_json)
  json['entry'][0]['resource']['name'] = [{
    '_family' => {
      'extension' => [DataScript::Modifications.data_absent_reason.to_hash]
    },
    '_given' => [{
      'extension' => [DataScript::Modifications.data_absent_reason.to_hash]
    }]
  }]
  patient_without_name_json = JSON.generate(json['entry'][0]['resource'])
  json = JSON.pretty_generate(json)
  # json.gsub!('"value": "DATAABSENTREASONEXTENSIONGOESHERE"', "\"_value\": { \"extension\": [ #{DataScript::Modifications.data_absent_reason.to_json} ] }")
  filename = "#{output_data}/#{patient_bundle_absent_name.entry.first.resource.id}.json"
  File.write(filename, json, encoding: 'UTF-8')
  # The validator is deliberately not run here. The validation pass at the end
  # of the script creates output/validation, clears its *.txt files and
  # validates this file too, so a report written now would only be deleted.
end
tok = Time.now.to_i
puts " Saved #{selections.length + (patient_bundle_absent_name ? 1 : 0)} files (#{DataScript::TimeUtilities.pretty(tok - tik)})."
# Export the selected records in the Bulk Data (ndjson) format.
tik = Time.now.to_i
puts 'Saving *selected* records in Bulk Data ndjson format...'
converter = DataScript::BulkDataConverter.new('selected')
selections.each { |bundle| converter.convert_to_bulk_data(bundle) }
# The name-less patient was taken out of the lists earlier; it is converted
# here together with its hand-built Patient JSON.
if patient_bundle_absent_name
  converter.convert_to_bulk_data(patient_bundle_absent_name, patient_without_name_json)
end
converter.close
tok = Time.now.to_i
puts " Saved #{selections.length + (patient_bundle_absent_name ? 1 : 0)} records as ndjson (#{DataScript::TimeUtilities.pretty(tok - tik)})."

# Export the records still in `records`, plus both Group resources (skipped in Mr. Burns mode).
unless MRBURNS
  tik = Time.now.to_i
  puts 'Saving *all* records in Bulk Data ndjson format...'
  converter = DataScript::BulkDataConverter.new('all')
  records.each { |bundle| converter.convert_to_bulk_data(bundle) }
  if patient_bundle_absent_name
    converter.convert_to_bulk_data(patient_bundle_absent_name, patient_without_name_json)
  end
  [group, all_group].each { |resource| converter.convert_to_bulk_data(resource) }
  converter.close
  tok = Time.now.to_i
  puts " Saved #{records.length + (patient_bundle_absent_name ? 1 : 0)} records as ndjson (#{DataScript::TimeUtilities.pretty(tok - tik)})."
end
puts 'Cleaning...'
# Delete the ndjson files for these resource types anywhere under the output folder.
%w[Claim ExplanationOfBenefit ImagingStudy].each do |resource_type|
  unwanted = Dir.glob("./#{output}/**/#{resource_type}.ndjson")
  FileUtils.rm(unwanted)
end
# Validating: run the FHIR validator over every file written to output/data.
tik = Time.now.to_i
output_validation = 'output/validation'
puts "Validating... Output logged in ./#{output_validation}"
# mkdir_p tolerates an existing directory (File.exists? is gone in Ruby 3.2).
FileUtils.mkdir_p(output_validation)
FileUtils.rm Dir.glob("./#{output_validation}/*.txt")

# Validates output/data/<id>.json and writes the validator's stdout to
# output/validation/<id>.txt. Arguments are passed as a list, so no shell is
# involved in quoting the paths.
validate = lambda do |id|
  system('java', '-jar', 'lib/org.hl7.fhir.validator.jar', "#{output_data}/#{id}.json",
         '-version', '4.0.1', '-ig', 'hl7.fhir.us.core',
         out: "#{output_validation}/#{id}.txt")
end

selections.each do |bundle|
  # Same file-naming rule as when the files were saved.
  validate.call(bundle.resourceType == 'Bundle' ? bundle.entry.first.resource.id : bundle.id)
end
# The name-less patient is not in selections any more, so validate its file explicitly.
validate.call(patient_bundle_absent_name.entry.first.resource.id) if patient_bundle_absent_name
tok = Time.now.to_i
puts " Validated #{selections.length + (patient_bundle_absent_name ? 1 : 0)} files (#{DataScript::TimeUtilities.pretty(tok - tik)})."

# Print the amount of time it took...
stop = Time.now.to_i
puts "Complete (#{DataScript::TimeUtilities.pretty(stop - start)})"