#!/usr/bin/env python2.7
# try to infer PMIDs by comparing issn/vol/issue/page against medline
# load default python packages
import logging, optparse, sys, os, marshal, unicodedata
from os.path import *
# add <scriptDir>/lib/ to package search path
sys.path.insert(0, join(dirname(abspath(__file__)), "lib"))
import pubGeneric, pubConf, maxCommon
# counters for rows skipped because of missing fields
noIssuePage = 0
noIssn = 0
def remove_accents(input_str):
    nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
def getFingerprint2(row):
    " fp2 is based on authors, year, title words "
    if row.year=="":
        return None
    authors = row.authors.split(";")
    authors = [x.strip() for x in authors]
    famNames = [a.split(",")[0] for a in authors][:5]
    famNameStr = "-".join(famNames)
    if len(famNameStr)<5:
        return None
    famNameStr = remove_accents(famNameStr).lower()
    title1 = row.title.split(".")[0]
    titleWords = title1.strip().strip(".").split()
    if len(titleWords) <= 3:
        return None
    title = (titleWords[0]+"-"+titleWords[-1]).lower()
    fprint = "|".join((famNameStr, row.year, title))
    logging.debug("fingerprint2: %s" % fprint)
    return fprint
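# Illustrative only (made-up row values): authors "Smith, J; Doe, A", year
# "2001" and title "Gene expression in yeast." give the fingerprint
#   "smith-doe|2001|gene-yeast"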
def getFingerprint(row, withEIssn = False):
    " fp1 is based on ISSN, volume, issue and first page "
    global noIssuePage
    global noIssn
    # require page, volume and issue (equivalent to the original check)
    if row.page=="" or row.vol=="" or row.issue=="":
        noIssuePage += 1
        logging.debug("Missing page, volume or issue: %s" % str(row))
        return None
    issn = row.printIssn
    if issn=="" or withEIssn:
        issn = row.eIssn
    if issn=="":
        if not withEIssn:
            logging.debug("No issn: %s" % str(row))
            noIssn += 1
        return None
    # keep only the first token of issue and page, e.g. "860-921" -> "860"
    issue = row.issue.split()[0].split("-")[0]
    page = row.page.split()[0].split("-")[0]
    fprint = "|".join((issn, row.vol, issue, page))
    logging.debug("fingerprint1: %s" % fprint)
    return fprint
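# Illustrative only (made-up row values): printIssn "0028-0836", vol "409",
# issue "6822", page "860-921" give the fingerprint
#   "0028-0836|409|6822|860"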
def addFprint(data, fprint, articleId):
    # do not filter for duplicates anymore:
    # we can have duplicates on both sides (medline and elsevier),
    # just get one match; later duplicates simply overwrite earlier ones
    if fprint==None:
        return
    data[fprint] = articleId
def lookupFprint(fprint, artMap, artIds):
    if fprint==None:
        return None
    artId = artMap.get(fprint, None)
    if artId==None:
        return None
    match = artIds[artId]
    extId, doi, pmid = match
    return (artId, extId, doi, str(pmid))
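# Illustrative only (made-up values): with
#   artMap = {"0028-0836|409|6822|860": 12}
#   artIds = {12: ("PMC12345", "10.1000/xyz", 11237011)}
# lookupFprint("0028-0836|409|6822|860", artMap, artIds)
# returns (12, "PMC12345", "10.1000/xyz", "11237011")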
def main(args, options):
    step, dataSet1, dataSet2 = args
    dir1 = pubConf.resolveTextDir(dataSet1)
    dir2 = pubConf.resolveTextDir(dataSet2)
    pubGeneric.setupLogging("", options)
    mapStoreFname = join(dir1, "fingerprints.marshal")
    outputFname = join(dir2, "%s.ids.tab" % dataSet1)
    noMatchFname = join(dir2, "%s.ids.noMatch.tab" % dataSet1)
    noPrintFname = join(dir2, "%s.ids.noFingerprint.tab" % dataSet1)
    global noIssuePage
    if step=="map":
        map1 = {}
        map2 = {}
        artIds = {}
        logging.info("Indexing %s" % dir1)
        count = 0
        for row in maxCommon.iterTsvDir(dir1, ext=".articles.gz"):
            articleId = int(row.articleId)
            fprint = getFingerprint(row)
            addFprint(map1, fprint, articleId)
            fprint2 = getFingerprint2(row)
            addFprint(map2, fprint2, articleId)
            artIds[articleId] = (row.externalId, row.doi, int(row.pmid))
            count += 1
        logging.info("Processed %d articles" % count)
        logging.info("No ISSN = %d, missing page/vol/issue = %d" % (noIssn, noIssuePage))
        logging.info("Writing results to %s" % mapStoreFname)
        ofh = open(mapStoreFname, "wb") # marshal needs a binary file handle
        data = (map1, map2, artIds)
        marshal.dump(data, ofh)
        ofh.close()
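        # Sanity check, illustrative only (not part of the pipeline): reload
        # the dump and report the index sizes
        #   m1, m2, ids = marshal.load(open(mapStoreFname, "rb"))
        #   logging.info("fp1=%d fp2=%d articles=%d" % (len(m1), len(m2), len(ids)))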
    elif step=="lookup":
        logging.info("Reading %s" % mapStoreFname)
        ifh = open(mapStoreFname, "rb")
        map1, map2, artIds = marshal.load(ifh)
        ofh = open(outputFname, "w")
        headers = ["fingerprint", "artId1", "extId1", "doi1", "articleId", "extId", "doi", "pmid"]
        ofh.write("\t".join(headers))
        ofh.write("\n")
        noPrints = []
        noMatches = []
        for row in maxCommon.iterTsvDir(dir2, ext=".articles.gz"):
            # first try a lookup with the normal fingerprint
            fprint = getFingerprint(row)
            matchFprint = fprint
            artData = lookupFprint(fprint, map1, artIds)
            if artData==None:
                # retry the normal fingerprint with the eIssn if the first one failed
                fprint = getFingerprint(row, withEIssn=True)
                matchFprint = fprint
                artData = lookupFprint(fprint, map1, artIds)
            if artData==None:
                # if the eIssn failed too, fall back to fingerprint 2
                fprint2 = getFingerprint2(row)
                if fprint2==None:
                    logging.debug("No fingerprint: %s" % str(row))
                    noPrints.append(row)
                matchFprint = fprint2
                artData = lookupFprint(fprint2, map2, artIds)
            if artData==None:
                if fprint2!=None:
                    noMatches.append(row)
                    logging.debug("No match: %s" % str(row))
                    logging.debug("fingerprints: %s and %s" % (fprint, fprint2))
                continue
            artId, extId, doi, pmid = artData
            # write all eight header columns, including the matched pmid
            outRow = (matchFprint, row.articleId, row.externalId, row.doi, artId, extId, doi, pmid)
            outRow = [unicode(x) for x in outRow]
            ofh.write("\t".join(outRow).encode("utf8"))
            ofh.write("\n")
        ofh.close()
        mfh = open(noPrintFname, "w")
        for p in noPrints:
            mfh.write("\t".join(p).encode("utf8")+"\n")
        mfh.close()
        mfh = open(noMatchFname, "w")
        for m in noMatches:
            mfh.write("\t".join(m).encode("utf8")+"\n")
        mfh.close()
        logging.info("No fingerprint %d, no match %d" % (len(noPrints), len(noMatches)))
        logging.info("Results written to %s" % outputFname)
        logging.info("Non-matching articles written to %s and %s" % (noPrintFname, noMatchFname))
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] <step> <dataset1> <dataset2> - infer PMIDs for a dataset by comparing against medline, using fingerprints of the papers. <step> is "map" (index the fingerprints of dataset1) or "lookup" (match the articles of dataset2 against that index).
When compared to data from crossref and a random collection of 10,000 DOIs, this achieves a precision of 99.99%.
""")
parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages")
#parser.add_option("-s", "--wordList", dest="wordList", action="store", help="optional list of words to use")
(options, args) = parser.parse_args()
if args==[]:
parser.print_help()
exit(1)
main(args, options)