-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentenceadjust.py
141 lines (113 loc) · 4.13 KB
/
sentenceadjust.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 21 14:16:27 2019
@author: Henry
BRAT strips spaces from sentences, PubTator does not. This causes errors.
Plan:
Divide document into paragraph chunks
Search through every line in paragraph
Add a pipe (|) after every newline, replacing the space after the break
For each pubtator term, scan from beginning of paragraph to pubtator-stated
location. Count number of pipes. Adjust using number of pipes.
The idea here is to encode BRAT's line breaks into PubTator's information.
"""
# Extract the text of every "paragraph" passage from the XML document.
# NOTE(review): urlxml is defined outside this file; element 3 is presumably
# the PubTator/BioC XML as a single string -- confirm against the caller.
import re
xmldoc = urlxml[3]

# Raw string: the pattern text is unchanged, but '\/' is no longer an invalid
# escape sequence in a normal string literal (SyntaxWarning on Python 3.12+).
paragraphre = re.compile(r'<passage>.+?<infon key="section_type">.+?<\/infon><infon key="type">paragraph<\/infon><offset>[0-9]+<\/offset><text>.+?<\/text>')
paragraphs = paragraphre.findall(xmldoc)

# Each match ends with "<text>...</text>", so the last '>...<' run inside it
# is the passage text; [1:-1] drops the surrounding '>' and '<'.
_between_tags_re = re.compile('>.*?<')
paragraphs2 = [_between_tags_re.findall(passage)[-1][1:-1] for passage in paragraphs]
# Load the BRAT sentence file for the first PMID, one list entry per line.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm the sentence files match it.
with open('./PMC_sentences/'+pmid_list[0]+'.txt') as fo:
    # Strip only a real trailing newline: a blind [:-1] would eat the last
    # character of the final line when the file does not end with '\n'.
    brattxt = [line[:-1] if line.endswith('\n') else line for line in fo]
# Flatten both text sources into single strings, using a pipe to mark every
# boundary between consecutive paragraphs / sentence lines.
pubstr = '|'.join(paragraphs2)
bratstr = '|'.join(brattxt)
# Adjust the hand annotations for sentence breaks.
# Each pipe in bratstr stands for one line break. For every annotation, count
# the pipes between the start of the text and the annotation's start, and
# shift both its start and end left by that count.
#
# The count is always taken from position 0 using the ORIGINAL start offset.
# Counting incrementally between already-shifted positions indexes bratstr in
# the wrong coordinate system and miscounts (e.g. "a|b|c|d|efghij|k" with
# starts 8 and 15 must give 4 and 10, not 4 and 9).
#
# handext aliases hand_master[0], so the rows are updated in place.
# NOTE(review): rows are presumably [start, end, ...] lists indexed into the
# BRAT text -- confirm against the code that builds hand_master.
handext = hand_master[0]
for row in handext:
    start = int(row[0])
    adjust = bratstr.count('|', 0, start)
    row[0] = start - adjust
    row[1] = int(row[1]) - adjust
#adjust for PubMed section title offsets
#titlere=re.compile('<infon key="section_type">.+?<\/infon><offset>.*?<\/offset><text>.+?<\/text>')
#titles=re.findall(titlere,xmldoc)
#titles2=[None]*len(titles)
#for i in range(len(titles)):
# holder=re.findall('>.*?<',titles[i])
# titles2[i]=holder[-1][1:(len(holder[-1])-1)]
#
#offsetre=re.compile('<offset>(\d*?)<\/offset>')
#offsetnum=[None]*len(titles)
#for i in range(len(titles)):
# offsety=re.search(offsetre,titles[i])
# if offsety:
# offsetnum[i]=offsety.group(1)
# Build lookup tables of cumulative offset differences between paragraphs.

# Each match is a (passage type, offset, text) tuple, in document order.
# Raw string: same pattern text, without invalid '\/' and '\d' escapes in a
# normal string literal (SyntaxWarning on Python 3.12+).
titlere = re.compile(r'<infon key="section_type">.+?<infon key="type">(.+?)<\/infon><offset>(\d+?)<\/offset><text>(.+?)<\/text>')
titles = titlere.findall(xmldoc)

# Indices into titles of the passages whose type is "paragraph".
paragraph_list = [idx for idx, title in enumerate(titles) if title[0] == 'paragraph']

# For each paragraph after the first: its offset minus the offset of the
# passage that directly follows the previous paragraph (0 when the two
# paragraphs are adjacent in titles).
difflist = [
    int(titles[cur][1]) - int(titles[prev + 1][1])
    for prev, cur in zip(paragraph_list, paragraph_list[1:])
]

# Raises IndexError when the document has no paragraph passages, as before.
firstparagraphflag = paragraph_list[0]
firstoffset = int(titles[firstparagraphflag][1])

# [ticks, difference] pairs, where ticks is the paragraph's offset measured
# from the first paragraph's offset.
difflist2 = [
    [int(titles[paragraph_list[k + 1]][1]) - firstoffset, diff]
    for k, diff in enumerate(difflist)
]

# [ticks, running total of the differences up to and including this entry].
difflist3 = []
totals = 0
for ticks, diff in difflist2:
    totals += diff
    difflist3.append([ticks, totals])

difflist3firsts = [pair[0] for pair in difflist3]
difflist3seconds = [pair[1] for pair in difflist3]
# Maps ticks -> cumulative difference.
difflistdict = dict(zip(difflist3firsts, difflist3seconds))
# Shift every PubTator span left by the cumulative difference recorded for
# the last paragraph boundary at or before the span's start.
# The outer list is copied, so pub_master[0] keeps its original rows.
publistadjust = pub_master[0][:]
# NOTE: this is an alias, not a second copy -- both names refer to the same
# list, so publistadjust also holds the adjusted rows after the loop.
publistadjust2 = publistadjust
for i, entry in enumerate(publistadjust):
    # Coerce before comparing: the offsets are passed through int() below,
    # so they may arrive as strings, which cannot be compared with ints.
    pos = int(entry[0])
    # With fewer than two paragraphs there are no boundaries to look up.
    if difflist3firsts and pos >= difflist3firsts[0]:
        diffdictlookup = max(k for k in difflist3firsts if k <= pos)
        offsetpos = difflistdict[diffdictlookup]
    else:
        offsetpos = 0
    print(offsetpos)
    publistadjust2[i] = [pos - offsetpos, int(entry[1]) - offsetpos, entry[2]]
# Print, for each pair of consecutive passages, the gap between the end of
# the earlier passage's text and the later passage's offset.
for earlier, later in zip(titles, titles[1:]):
    print(int(later[1]) - int(earlier[1]) - len(earlier[2]))

# The same gap, less one, measured between consecutive paragraph passages.
realoffsets = [
    int(titles[cur][1]) - int(titles[prev][1]) - len(titles[prev][2]) - 1
    for prev, cur in zip(paragraph_list, paragraph_list[1:])
]

# Running total of those gaps.
realoffsets2 = []
tally = 0
for gap in realoffsets:
    tally += gap
    realoffsets2.append(tally)

# Maps ticks (offset from the first paragraph) -> cumulative gap.
difflistdict2 = dict(zip(difflist3firsts, realoffsets2))