-
Notifications
You must be signed in to change notification settings - Fork 1
/
nice_plugin.py
135 lines (111 loc) · 5.31 KB
/
nice_plugin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def search_in_nice(data, http):
    """Search the NICE Evidence site for one record and merge the hits.

    Two "evidence" searches are combined:

    1. ``data[0]`` on its own, but only when ``is_doi(data[0])`` is true;
       otherwise an empty result from ``empty_guidelines()`` takes its place.
    2. ``data[1]`` (passed through ``no_full_stop`` and wrapped in double
       quotes) followed by a space and ``data[2]``.

    Only the "evidence" search type is used here; the "standard" search of
    ``search_on_nice_site`` is not invoked by this function.

    ``data`` is indexed at positions 0, 1 and 2 -- presumably DOI, title and
    one further search term; confirm against the caller.
    ``http`` is handed through unchanged to ``search_on_nice_site``.

    Returns whatever ``unify_collector`` produces for the two-element list
    ``[doi_hits, phrase_hits]``.
    """
    from definitions import unify_collector, no_full_stop, is_doi, empty_guidelines

    # First slot: exact lookup by data[0], skipped when it is not a DOI.
    if is_doi(data[0]):
        doi_hits = search_on_nice_site(data[0], http, "evidence")
    else:
        doi_hits = empty_guidelines()

    # Second slot: quoted phrase search plus the additional term.
    phrase_query = '"' + no_full_stop(data[1]) + '"' + " " + data[2]
    phrase_hits = search_on_nice_site(phrase_query, http, "evidence")

    return unify_collector([doi_hits, phrase_hits])
##########################################################
def search_on_nice_site(data, http, stype):
    """Run a paginated search on a NICE site and collect the parsed hits.

    Parameters:
        data:  the search text; it is passed through ``simplify_text`` and
               its spaces are replaced by ``+`` to form the ``q`` parameter.
        http:  object whose ``request("GET", url)`` returns a response with a
               bytes ``data`` attribute (looks like a urllib3 pool manager --
               confirm against the caller).
        stype: ``"standard"`` (www.nice.org.uk, parsed with
               ``parse_nice_site_results``) or ``"evidence"``
               (www.evidence.nhs.uk, parsed with
               ``parse_nice_evidence_results``).

    Returns:
        The structure created by ``empty_guidelines()`` after every fetched
        page has been parsed into it.

    Raises:
        ValueError: if ``stype`` is neither ``"standard"`` nor ``"evidence"``.
    """
    # Reject unknown search types up front; previously this surfaced later as
    # an UnboundLocalError on ``main_url``.
    if stype == "standard":
        main_url = "https://www.nice.org.uk/search?ps=100&pa="
    elif stype == "evidence":
        main_url = "https://www.evidence.nhs.uk/search?ps=100&pa="
    else:
        raise ValueError(
            "unknown stype %r, expected 'standard' or 'evidence'" % (stype,)
        )

    from definitions import empty_guidelines, simplify_text

    # Must match the "ps=100" (page size) parameter in the URLs above.
    page_size = 100

    # "ausnahmen" is German for "exceptions"; presumably the characters that
    # simplify_text leaves untouched -- confirm in definitions.
    searchq = simplify_text(
        data,
        ausnahmen=[" ", "-", "/", "1", "2", "3", "4", "5", "6", "7", "8", "9",
                   "0", ".", "_", "(", ")", '"'],
    ).replace(" ", "+")

    result = empty_guidelines()
    page = 1
    while True:
        url = main_url + str(page) + "&q=" + searchq
        site = http.request("GET", url).data.decode("utf-8", "ignore")

        hits_before = len(result[1])
        if stype == "standard":
            result = parse_nice_site_results(site, result)
        else:
            result = parse_nice_evidence_results(site, result)
        page += 1

        # Stop as soon as a page contributes less than a full page of hits.
        # Checking the cumulative count instead never terminates when the
        # total is an exact multiple of the page size: the following empty
        # pages leave the count unchanged.
        if len(result[1]) - hits_before < page_size:
            break
    return result
##########################################################
def parse_nice_evidence_results(cont, result):
    """Scrape one NICE Evidence result page and append its hits to ``result``.

    ``cont`` is the page HTML as a string. It is cut into chunks at every
    occurrence of ``class="media"``; each chunk is treated as one hit.

    ``result`` is indexed by column:
    ["DOI", "Title", "hit quot", "id", "evidence-level", "society", "url",
     "in_awmf", "in_nice", "in_tripdb"]
    For every chunk exactly one value is appended to each of the ten columns:
    url, title, society and evidence-level are cut out of the markup, DOI,
    "hit quot" and id get the placeholder " ", and the flags are set to
    in_awmf="0", in_nice="1", in_tripdb="0".

    The same ``result`` object is returned.
    """
    # find_earliest_end is imported here but not used in this function.
    from definitions import clear_text_from_inclusions, find_earliest_end, clearer_text

    hit_marker = 'class="media"'
    href_prefix = 'href="'
    source_marker = 'class="media-source"'
    publisher_label = "Publisher:"

    pos = cont.find(hit_marker)
    while pos > -1:
        # A chunk runs up to the next marker, or to the end of the page.
        following = cont.find(hit_marker, pos + 5)
        chunk = cont[pos:] if following == -1 else cont[pos:following]

        # url: first href after the "media-heading" marker
        lo = chunk.find(href_prefix, chunk.find("media-heading")) + len(href_prefix)
        hi = chunk.find('"', lo)
        result[6].append(chunk[lo:hi])

        # title: text between the end of that opening tag and "</a>"
        lo = chunk.find(">", hi) + 1
        hi = chunk.find("</a>", lo)
        result[1].append(clear_text_from_inclusions(chunk[lo:hi]).strip())

        # society/Journal: looked up inside the "media-meta" element
        lo = chunk.find('class="media-meta"', hi)
        hi = chunk.find('</div>', lo)
        meta = chunk[lo:hi]
        source_at = meta.find(source_marker)
        if source_at > -1:
            name_from = meta.find(">", source_at) + 1
            society = meta[name_from:meta.find("<", name_from)]
        elif publisher_label in meta:
            after_label = meta.find(publisher_label) + len(publisher_label)
            society = clearer_text(meta[after_label:]).strip()
        else:
            society = " "
        result[5].append(society)

        # evidence-level: link text following "read-summary pull-left"
        lo = chunk.find("read-summary pull-left", hi)
        lo = chunk.find("<a href", lo)
        lo = chunk.find(">", lo) + 1
        hi = chunk.find("</a", lo)
        result[4].append(chunk[lo:hi].strip())

        # Columns this page does not provide, plus the source flags.
        for column, filler in ((0, " "), (2, " "), (3, " "),
                               (7, "0"), (8, "1"), (9, "0")):
            result[column].append(filler)

        pos = following
    return result
##########################################################
def parse_nice_site_results(cont, result):
    """Scrape one www.nice.org.uk result page and append its hits to ``result``.

    ``cont`` is the page HTML as a string. It is cut into chunks at every
    occurrence of ``class="media-body"``; each chunk is treated as one hit.

    ``result`` is indexed by column:
    ["DOI", "Title", "hit quot", "id", "evidence-level", "society", "url",
     "in_awmf", "in_nice", "in_tripdb"]
    For every chunk exactly one value is appended to each of the ten columns,
    so the columns always stay the same length. Title, id, evidence-level and
    url are cut out of the markup; DOI, "hit quot" and society get the
    placeholder " "; the flags are in_awmf="0", in_nice="1", in_tripdb="0".

    The same ``result`` object is returned.
    """
    from definitions import clear_text_from_inclusions

    hit_marker = 'class="media-body"'
    href_prefix = 'href="'

    start = cont.find(hit_marker)
    while start > -1:
        # A chunk runs up to the next marker, or to the end of the page.
        next_start = cont.find(hit_marker, start + 5)
        block = cont[start:] if next_start == -1 else cont[start:next_start]

        # title
        heading_at = block.find("media-heading")
        startb = block.find(">", heading_at) + 1
        endb = block.find("</a>", startb)
        title = clear_text_from_inclusions(block[startb:endb]).strip()
        result[1].append(title)

        # url: first href between the heading marker and the end of its link.
        # Previously nothing was appended to column 6, which left it shorter
        # than the other columns.
        # NOTE(review): the href is stored exactly as found in the page and
        # may be site-relative -- confirm what the consumer expects.
        url = " "
        href_at = block.find(href_prefix, max(heading_at, 0), endb)
        if href_at > -1:
            href_at += len(href_prefix)
            href_end = block.find('"', href_at)
            if href_end > -1:
                url = block[href_at:href_end]
        result[6].append(url)

        # id: content of the last "(...)" in the title. Without a well-formed
        # pair the placeholder is used; the raw rfind() results of -1 would
        # otherwise slice out the title minus its last character.
        open_at = title.rfind("(")
        close_at = title.rfind(")")
        if -1 < open_at < close_at:
            guideline_id = title[open_at + 1:close_at].strip()
        else:
            guideline_id = " "
        result[3].append(guideline_id)

        # evidence-level: text of the element carrying "documenttype"
        startb = block.find("documenttype")
        startb = block.find(">", startb) + 1
        endb = block.find("<", startb)
        result[4].append(block[startb:endb].strip())

        # Columns this page does not provide, plus the source flags.
        result[0].append(" ")
        result[2].append(" ")
        result[5].append(" ")
        result[7].append("0")
        result[8].append("1")
        result[9].append("0")

        start = next_start
    return result