/
tool_runner.py
111 lines (90 loc) · 5.23 KB
/
tool_runner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import shlex
import subprocess
import sys
from argparse import ArgumentParser
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Union
def genargs(prog: Optional[str] = None) -> ArgumentParser:
    """Build the command-line parser for this script.

    Args:
        prog: Program name handed to ``ArgumentParser``; ``None`` lets
            argparse pick its default.

    Returns:
        A parser that requires both ``--tool`` and ``--dump``.
    """
    cli = ArgumentParser(prog)
    # Both options are mandatory and differ only in flag name and help text.
    required_options = (
        ("--tool", "Tool to start the script"),
        ("--dump", "Path to the Wikidata JSON file"),
    )
    for flag, description in required_options:
        cli.add_argument(flag, help=description, required=True)
    return cli
def main(argv: Optional[Union[str, List[str]]] = None, prog: Optional[str] = None) -> int:
    """Parse the command line and run the selected tool on the dump.

    Args:
        argv: Arguments to parse. A string is split on whitespace, a list is
            used as given, and ``None`` falls back to ``sys.argv[1:]``.
        prog: Program name forwarded to the argument parser.

    Returns:
        ``1`` when ``--tool`` names an unknown tool. Otherwise the integer
        status reported by the runner, or ``0`` when the runner reports
        nothing (``None``).
    """
    if isinstance(argv, str):
        argv = argv.split()
    opts = genargs(prog).parse_args(argv if argv is not None else sys.argv[1:])
    opts.dump = Path(opts.dump)

    # Dispatch table instead of an if/elif chain: one entry per supported tool.
    runners = {
        'wdumper': run_wdumper,
        'wdf': run_wdf,
        'wdsub': run_wdsub,
        'kgtk': run_kgtk,
    }
    runner = runners.get(opts.tool)
    if runner is None:
        print(
            'ERROR: Enter a valid tool. Valid tools are: "wdumper", "wdf", "wdsub", "kgtk"')
        return 1

    # Always hand an int back to the caller: a runner that returns None
    # (no status available) is treated as success.
    status = runner(opts.dump)
    return status or 0
def run_wdumper(dump: Path) -> int:
    """Run the WDumper CLI on ``dump`` and record how long it took.

    The command is executed through the shell with its output discarded.
    The elapsed time is printed and written to a timestamped
    ``wdumper_run_<date>.txt`` file in the current directory, whether or not
    the tool succeeded.

    Args:
        dump: Path to the dump file passed as the first CLI argument.

    Returns:
        The exit status of the shell command (``0`` on success).
    """
    print('Starting a new run of WDumper ...')
    print('=================================')
    # Quote the path: the command goes through the shell, so an unquoted path
    # with spaces or metacharacters would be split or executed.
    command = './WDUMPER/wdumper/build/install/wdumper/bin/wdumper-cli {0} {1}'.format(
        shlex.quote(str(dump)), './gene_protein_disease_chemicals.json')
    start_time = datetime.now()
    completed = subprocess.run(
        command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    run_time = datetime.now() - start_time
    print('=================================')
    print('DONE WDumper, Exec time: {0}'.format(run_time.total_seconds()))
    if completed.returncode != 0:
        # Output is discarded, so the exit status is the only failure signal.
        print('WARNING: WDumper exited with status {0}'.format(completed.returncode),
              file=sys.stderr)
    with open('wdumper_run_'+datetime.today().strftime('%Y-%m-%d-%H:%M:%S')+'.txt', 'w') as file:
        file.write(str(run_time))
    return completed.returncode
def run_wdf(dump: Path) -> int:
    """Run Wikibase Dump Filter on ``dump`` and record how long it took.

    The dump is streamed through ``cat | gzip -d`` into
    ``wikibase-dump-filter`` with the claim filter
    ``P31:Q11173,Q12136,Q7187,Q8054``; the filter's output is redirected to
    ``./wdf.ndjson``. The elapsed time is printed and written to a
    timestamped ``wdf_run_<date>.txt`` file in the current directory.

    Args:
        dump: Path to the gzip-compressed dump file.

    Returns:
        The exit status of the shell pipeline. The shell reports the status
        of the last stage only, so an upstream ``cat``/``gzip`` failure may
        not be reflected.
    """
    print('Starting a new run of Wikibase Dump Filter ...')
    print('=================================')
    # The pipe and redirect need a shell, so keep shell=True but quote the
    # path so spaces or metacharacters cannot be reinterpreted.
    command = 'cat {0} | gzip -d | ./WDF/wikibase-dump-filter/node_modules/.bin/wikibase-dump-filter --claim P31:Q11173,Q12136,Q7187,Q8054 > ./wdf.ndjson'.format(
        shlex.quote(str(dump)))
    start_time = datetime.now()
    completed = subprocess.run(
        command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    run_time = datetime.now() - start_time
    print('=================================')
    print('DONE WDF, Exec time: {0}'.format(run_time.total_seconds()))
    if completed.returncode != 0:
        # Output is discarded, so the exit status is the only failure signal.
        print('WARNING: WDF exited with status {0}'.format(completed.returncode),
              file=sys.stderr)
    with open('wdf_run_'+datetime.today().strftime('%Y-%m-%d-%H:%M:%S')+'.txt', 'w') as file:
        file.write(str(run_time))
    return completed.returncode
def run_wdsub(dump: Path) -> int:
    """Run WDSub on ``dump`` and record how long it took.

    Invokes ``wdsubroot dump`` with the schema
    ``./gene_protein_disease_chemicals.shex`` and the output file
    ``./wdsub_output.ttl.gz``. The command's output is discarded. The
    elapsed time is printed and written to a timestamped
    ``wdsub_run_<date>.txt`` file in the current directory.

    Args:
        dump: Path to the dump file passed to ``wdsubroot dump``.

    Returns:
        The exit status of the shell command (``0`` on success).
    """
    print('Starting a new run of WDSub ...')
    print('=================================')
    # Quote the path: the command goes through the shell, so an unquoted path
    # with spaces or metacharacters would be split or executed.
    command = './WDSUB/wdsubroot-0.0.28/bin/wdsubroot dump -s ./gene_protein_disease_chemicals.shex -o ./wdsub_output.ttl.gz {0} --schemaFormat ShEXC --dumpMode WholeEntity --dumpFormat Turtle'.format(
        shlex.quote(str(dump)))
    start_time = datetime.now()
    completed = subprocess.run(
        command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    run_time = datetime.now() - start_time
    print('=================================')
    print('DONE WDSub, Exec time: {0}'.format(run_time.total_seconds()))
    if completed.returncode != 0:
        # Output is discarded, so the exit status is the only failure signal.
        print('WARNING: WDSub exited with status {0}'.format(completed.returncode),
              file=sys.stderr)
    with open('wdsub_run_'+datetime.today().strftime('%Y-%m-%d-%H:%M:%S')+'.txt', 'w') as file:
        file.write(str(run_time))
    return completed.returncode
def run_kgtk(dump: Path) -> int:
    """Run the KGTK import and query steps on ``dump`` and record timings.

    Step 1 runs ``kgtk import-wikidata`` on the dump, writing
    ``nodefile.tsv``, ``edgefile.tsv`` and ``qualfile.tsv``. Step 2 runs
    ``kgtk query`` over ``edgefile.tsv`` and redirects the result to
    ``./kgtk_output.tsv``. The query is started even when the import fails.
    Total, import and query durations are printed and written, one per
    line, to a timestamped ``kgtk_run_<date>.txt`` file in the current
    directory.

    Args:
        dump: Path to the dump file passed to ``import-wikidata -i``.

    Returns:
        The exit status of the import step if it is non-zero, otherwise the
        exit status of the query step.
    """
    print('Starting a new run of KGTK ...')
    print('=================================')
    # Quote the path: the command goes through the shell, so an unquoted path
    # with spaces or metacharacters would be split or executed.
    # NOTE(review): '--collect-seperately' is kept exactly as in the original
    # command; presumably it matches the kgtk flag spelling. Confirm.
    import_command = (
        'kgtk --debug --timing --progress import-wikidata -i {0} '
        '--node nodefile.tsv --edge edgefile.tsv --qual qualfile.tsv '
        '--use-mgzip-for-input True --use-mgzip-for-output True --use-shm True '
        '--procs 6 --mapper-batch-size 5 --max-size-per-mapper-queue 3 '
        '--single-mapper-queue True --collect-results True --collect-seperately True '
        '--collector-batch-size 10 --collector-queue-per-proc-size 3 '
        '--progress-interval 500000 --fail-if-missing False'
    ).format(shlex.quote(str(dump)))
    query_command = '''kgtk query --gc ./wikidata.sqlite3.db -i edgefile.tsv --match '(n1)-[:P31]->(class), (n1)-[p]->(n2)' --where 'class IN ["Q11173","Q12136","Q7187","Q8054"]' --return 'n1, p, n2' > ./kgtk_output.tsv'''

    start_time = datetime.now()
    import_result = subprocess.run(
        import_command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    end_time_import = datetime.now()
    query_result = subprocess.run(
        query_command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    end_time = datetime.now()

    total_run_time = end_time - start_time
    import_time = end_time_import - start_time
    query_time = end_time - end_time_import

    total_seconds = total_run_time.total_seconds()

    def percent_of_total(part_seconds: float) -> float:
        # Guard the division so a zero-length run cannot raise.
        if not total_seconds:
            return 0.0
        return round((part_seconds / total_seconds) * 100, 1)

    print('=================================')
    print('DONE KGTK, Exec total time: {0}, import Wikidata time: {1}({2}%), query time: {3}({4}%)'.format(
        total_seconds,
        import_time.total_seconds(),
        percent_of_total(import_time.total_seconds()),
        query_time.total_seconds(),
        percent_of_total(query_time.total_seconds())))

    # Output is discarded, so the exit status is the only failure signal.
    if import_result.returncode != 0:
        print('WARNING: KGTK import exited with status {0}'.format(import_result.returncode),
              file=sys.stderr)
    if query_result.returncode != 0:
        print('WARNING: KGTK query exited with status {0}'.format(query_result.returncode),
              file=sys.stderr)

    with open('kgtk_run_'+datetime.today().strftime('%Y-%m-%d-%H:%M:%S')+'.txt', 'w') as file:
        file.write('\n'.join((str(total_run_time), str(import_time), str(query_time))))

    return import_result.returncode or query_result.returncode
if __name__ == '__main__':
    # Propagate main()'s status as the process exit code; sys.exit(None)
    # still exits with 0, so a status-less main() keeps working.
    sys.exit(main(sys.argv[1:]))