Skip to content

Commit

Permalink
Adjustments to end of pipeline to use more gzipped files and get pubmed ids from final release
Browse files Browse the repository at this point in the history
  • Loading branch information
jakelever committed Jul 12, 2023
1 parent 6eb30ac commit c2866cb
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 13 deletions.
15 changes: 6 additions & 9 deletions pipeline/Snakefile
Expand Up @@ -114,19 +114,16 @@ rule finalFilter:

rule combineWithLastRelease:
input: join(outDir,"alldocuments.filtered.json"),
output: join(outDir,"coronacentral.json")
shell: "python combineWithLastRelease.py --inJSON {input} --lastRelease ../last_release/coronacentral.json.gz --outJSON {output}"
output: join(outDir,"coronacentral.json.gz")
shell: "python combineWithLastRelease.py --inJSON {input} --lastRelease ../last_release/coronacentral.json.gz --outJSONGZ {output}"

rule recordProcessedFiles:
input: join(outDir,"alldocuments.json")
input:
initial=join(outDir,"alldocuments.json"),
final=join(outDir,"coronacentral.json.gz")
output: join(outDir,"process_record.json.gz")
shell: "python recordProcessedFiles.py --pubmedDir ../data/pubmed_corona/ --initialDocuments {input} --outJSONGZ {output}"
shell: "python recordProcessedFiles.py --pubmedDir ../data/pubmed_corona/ --initialDocuments {input.initial} --finalRelease {input.final} --outJSONGZ {output}"

rule gzip:
input: join(outDir,"coronacentral.json")
output: join(outDir,"coronacentral.json.gz")
shell: "gzip -9 -c {input} > {output}"

rule getAltmetricData:
input: join(outDir,"coronacentral.json")
output: join(outDir,"altmetric.json")
Expand Down
4 changes: 2 additions & 2 deletions pipeline/combineWithLastRelease.py
Expand Up @@ -6,7 +6,7 @@ def main():
parser = argparse.ArgumentParser('Combine the output with the last release of CoronaCentral')
parser.add_argument('--inJSON',required=True,type=str,help='Input JSON documents')
parser.add_argument('--lastRelease',required=True,type=str,help='coronacentral.json.gz file from last release')
parser.add_argument('--outJSON',required=True,type=str,help='Output JSON')
parser.add_argument('--outJSONGZ',required=True,type=str,help='Output JSON GZ file')
args = parser.parse_args()

print("Loading last release documents...")
Expand All @@ -28,7 +28,7 @@ def main():
print(f"Added {added} documents from current run to last release")

print("Saving data...")
with open(args.outJSON,'w',encoding='utf8') as f:
with gzip.open(args.outJSONGZ,'wt',encoding='utf8') as f:
json.dump(documents,f)
print(f"Saved {len(documents)} documents")

Expand Down
4 changes: 2 additions & 2 deletions pipeline/recordProcessedFiles.py
Expand Up @@ -7,7 +7,7 @@ def main():
parser = argparse.ArgumentParser('Note which files and Pubmed IDs have been processed')
parser.add_argument('--pubmedDir',required=True,type=str,help='Directory with PubMed files')
parser.add_argument('--initialDocuments',required=True,type=str,help='Initial documents file')
parser.add_argument('--finalRelease',required=True,type=str,help='Final file')
parser.add_argument('--finalRelease',required=True,type=str,help='Final file (gzipped)')
parser.add_argument('--outJSONGZ',required=True,type=str,help='Output JSON GZ')
args = parser.parse_args()

Expand All @@ -18,7 +18,7 @@ def main():
documents = json.load(f)
pubmed_ids = [ d['pubmed_id'] for d in documents if d['pubmed_id'] ]

with open(args.finalRelease) as f:
with gzip.open(args.finalRelease,'rt') as f:
documents = json.load(f)
pubmed_ids += [ d['pubmed_id'] for d in documents if d['pubmed_id'] ]

Expand Down

0 comments on commit c2866cb

Please sign in to comment.