Skip to content

Commit

Permalink
End python-2.7 Support (cidgoh#33)
Browse files Browse the repository at this point in the history
* pass partialMatchedResourceListSet as list to retainedPhrase()

* Updated tests for retainedPhrase()

* Updated small_simple expected test output

* Clean up test output files

* Sort outputs before printing, GComponent -> Component, add newline at end

* Show more detail when test output doesn't match expected output

* Print ontofetch/ontohelper to stderr. Update test output

* Use print function in py2.7

* Remove unused pretty printer

* Stop testing py27; start testing py37

* Drop py37 testing
  • Loading branch information
dfornika committed Oct 2, 2018
1 parent 3f39921 commit ee8582a
Show file tree
Hide file tree
Showing 25 changed files with 172 additions and 178 deletions.
4 changes: 0 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
language: python
python:
- "2.7"
- "3.6"
os:
- linux
Expand All @@ -12,9 +11,6 @@ git:
# TODO: remove these two settings after refactoring pipeline
env:
- PYTHONHASHSEED=0
matrix:
allow_failures:
- python: "2.7"

install:
- sudo apt-get update -qq
Expand Down
9 changes: 5 additions & 4 deletions lexmapr/ontofetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
**************************************************************************
"""

from __future__ import print_function
import json
import sys
import os
Expand Down Expand Up @@ -183,7 +184,7 @@ def __main__(self):
(main_ontology_file, output_file_basename) = self.onto_helper.check_ont_file(args[0], options)

# Load main ontology file into RDF graph
print("Fetching and parsing " + main_ontology_file + " ...")
print("Fetching and parsing " + main_ontology_file + " ...", file=sys.stderr)

try:
# ISSUE: ontology file taken in as ascii; rdflib doesn't accept
Expand All @@ -200,17 +201,17 @@ def __main__(self):

# Load self.struct with ontology metadata
self.onto_helper.set_ontology_metadata(self.onto_helper.queries['ontology_metadata'])
print("Metadata:", json.dumps(self.onto_helper.struct['metadata'], sort_keys=False, indent=4, separators=(',', ': ')))
print("Metadata:", json.dumps(self.onto_helper.struct['metadata'], sort_keys=False, indent=4, separators=(',', ': ')), file=sys.stderr)

# Retrieve all subclasses of 'owl:Thing' in given ontology
# and place in self.onto_helper.struct.specifications
# To retrieve just a given term like BFO:entity
# specBinding = {'root': rdflib.URIRef(self.get_expanded_id('BFO:0000001'))}
print('Doing term hierarchy query')
print('Doing term hierarchy query', file=sys.stderr)
specBinding = {'root': rdflib.URIRef(self.onto_helper.get_expanded_id('owl:Thing'))}
entities = self.onto_helper.do_query_table(self.queries['tree'], specBinding )

print('Doing terms', len(entities))
print('Doing terms', len(entities), file=sys.stderr)
self.do_entities(entities)

self.onto_helper.do_output_json(self.onto_helper.struct, output_file_basename)
Expand Down
3 changes: 2 additions & 1 deletion lexmapr/ontohelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

from __future__ import print_function
import os
import json
import sys
Expand Down Expand Up @@ -276,7 +277,7 @@ def do_ontology_includes(self, main_ontology_file):
ORDER BY (?import_file)
""")

print("It has %s import files ..." % len(imports))
print("It has %s import files ..." % len(imports), file=sys.stderr)

for result_row in imports:

Expand Down
28 changes: 12 additions & 16 deletions lexmapr/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,8 @@ def retainedPhrase(termList):
wordList = []
retainedSet = []
returnedSet = []
termList = termList.replace("{", "")
termList = termList.replace("}", "")
#termList = termList.replace("'", "")
lst = termList.split("',")
# print("ddddddddddddddddeeeee " + str(lst))
for x in lst:
for x in termList:
x.replace("'", "")
lst2 = x.split(":")
a = lst2[0]
a = a.replace("=", ",")
Expand Down Expand Up @@ -966,11 +962,11 @@ def find_full_term_match(sample, lookup_table, cleaned_sample, status_addendum):
ret.update({
"matched_term": matched_term,
"all_match_terms_with_resource_ids":
str(list(retained_tokens)),
str(sorted(list(retained_tokens))),
"retained_terms_with_resource_ids":
str(list(retained_tokens)),
str(sorted(list(retained_tokens))),
"match_status_macro_level": "Full Term Match",
"match_status_micro_level": str(list(final_status)),
"match_status_micro_level": str(sorted(list(final_status))),
})
# Return
return ret
Expand Down Expand Up @@ -1355,7 +1351,7 @@ def run(args):

# Write to all headers
if args.format == "full":
fw.write("\t" + full_term_match["matched_term"] + "\t"
fw.write("\t" + str([full_term_match["matched_term"]]) + "\t"
+ full_term_match["all_match_terms_with_resource_ids"]
+ "\t"
+ full_term_match["retained_terms_with_resource_ids"]
Expand Down Expand Up @@ -1391,7 +1387,7 @@ def run(args):
status_addendum)

partial_matches = set(component_and_token_matches["component_matches"]) # Makes a set of all matched components from the above processing
status = "GComponent Match" #Note: GComponent instead of is used as tag to help sorting later in result file
status = "Component Match"

# Iterate over token_matches in component_and_token_matches
for token in component_and_token_matches["token_matches"]:
Expand Down Expand Up @@ -1468,7 +1464,7 @@ def run(args):

# If the set has at least one member, look for the retained matched terms by the defined criteria
if (len(partialMatchedResourceListSet) > 0):
retainedSet = retainedPhrase(str(partialMatchedResourceListSet))
retainedSet = retainedPhrase(list(partialMatchedResourceListSet))
logger.debug("retainedSet " + str(retainedSet))
# HERE SHOULD HAVE ANOTHER RETAINED SET

Expand All @@ -1477,7 +1473,7 @@ def run(args):
# In case it is for component matching and we have at least one component matched
if (len(partial_matches) > 0):
if args.format == 'full':
fw.write('\t' + str(list(partial_matches)) + '\t' + str(list(partialMatchedResourceListSet)) + '\t' + str(list(retainedSet)) + '\t' + str(len(retainedSet)) + '\t' + status + '\t' + str(list(final_status)) + '\t' + str(list(remSetDiff)))
fw.write('\t' + str(sorted(list(partial_matches))) + '\t' + str(sorted(list(partialMatchedResourceListSet))) + '\t' + str(sorted(list(retainedSet))) + '\t' + str(len(retainedSet)) + '\t' + status + '\t' + str(sorted(list(final_status))) + '\t' + str(sorted(list(remSetDiff))))

compctr = 0
if args.format == 'full':
Expand All @@ -1488,7 +1484,7 @@ def run(args):
fw.write("\t" + str(memb))

if args.format == 'full':
for comp in retainedSet:
for comp in sorted(list(retainedSet)):
compctr += 1
if (compctr == 1):
fw.write("Component" + str(compctr) + "-> " + str(comp))
Expand All @@ -1497,9 +1493,9 @@ def run(args):
trigger = True
else: # In case of no matching case
if args.format == 'full':
fw.write('\t' + str(list(partial_matches)) + '\t' + str(list(partial_matches_with_ids)) + '\t\t' + "\t" + "Sorry No Match" + "\t" + str(list(remaining_tokens)))

fw.write('\t' + str(sorted(list(partial_matches))) + '\t' + str(sorted(list(partial_matches_with_ids))) + '\t\t' + "\t" + "Sorry No Match" + "\t" + str(sorted(list(remaining_tokens))))

fw.write('\n')
#Output files closed
if fw is not sys.stdout:
fw.close()
2 changes: 1 addition & 1 deletion lexmapr/tests/output/empty.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Sample_Id Sample_Desc Cleaned_Sample Phrase_POS_Tagged Probable_Candidate_Terms Matched_Term All_matched_Terms_with_Resource_IDs Retained_Terms_with_Resource_IDs Number of Components(In case of Component Match) Match_Status(Macro Level) Match_Status(Micro Level) Remaining_Tokens Different Components(In case of Component Match)
Sample_Id Sample_Desc Cleaned_Sample Phrase_POS_Tagged Probable_Candidate_Terms Matched_Term All_matched_Terms_with_Resource_IDs Retained_Terms_with_Resource_IDs Number of Components(In case of Component Match) Match_Status(Macro Level) Match_Status(Micro Level) Remaining_Tokens Different Components(In case of Component Match)
2 changes: 1 addition & 1 deletion lexmapr/tests/output/empty_not_full.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Sample_Id Sample_Desc Cleaned_Sample
Sample_Id Sample_Desc Cleaned_Sample
10 changes: 5 additions & 5 deletions lexmapr/tests/output/small_simple.tsv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Sample_Id Sample_Desc Cleaned_Sample Phrase_POS_Tagged Probable_Candidate_Terms Matched_Term All_matched_Terms_with_Resource_IDs Retained_Terms_with_Resource_IDs Number of Components(In case of Component Match) Match_Status(Macro Level) Match_Status(Micro Level) Remaining_Tokens Different Components(In case of Component Match)
small_simple1 Chicken Breast chicken breast [('chicken', 'NN'), ('breast', 'NN')] chicken breast chicken breast ['chicken breast:CandidateTerm_FoodOn_246'] ['chicken breast:CandidateTerm_FoodOn_246'] Full Term Match ['Change of Case in Input Data']
small_simple2 Baked Potato baked potato [('baked', 'VBN'), ('potato', 'NN')] potato baked potato ['potato (baked):FOODON_03302196'] ['potato (baked):FOODON_03302196'] Full Term Match ['Permutation of Tokens in Bracketed Resource Term']
small_simple3 Canned Corn canned corn [('canned', 'VBN'), ('corn', 'NN')] corn canned corn ['corn (canned):FOODON_03302665'] ['corn (canned):FOODON_03302665'] Full Term Match ['Permutation of Tokens in Bracketed Resource Term']
small_simple4 Frozen Yogurt frozen yogurt [('frozen', 'JJ'), ('yogurt', 'NN')] yogurt frozen yogurt ['frozen yogurt:FOODON_03307445'] ['frozen yogurt:FOODON_03307445'] Full Term Match ['Change of Case in Input Data']
small_simple5 Apple Pie apple pie [('apple', 'NN'), ('pie', 'NN')] apple pie ['pie', 'apple food product'] ['pie:CandidateTerm_FoodOn_839', 'apple food product:FOODON_00001611'] ["apple food product:FOODON_00001611'", 'pie:CandidateTerm_FoodOn_839'] 2 GComponent Match ['Suffix Addition- food product to the Input'] [] Component1-> apple food product:FOODON_00001611', Component2-> pie:CandidateTerm_FoodOn_839
small_simple1 Chicken Breast chicken breast [('chicken', 'NN'), ('breast', 'NN')] chicken breast ['chicken breast'] ['chicken breast:CandidateTerm_FoodOn_246'] ['chicken breast:CandidateTerm_FoodOn_246'] Full Term Match ['Change of Case in Input Data']
small_simple2 Baked Potato baked potato [('baked', 'VBN'), ('potato', 'NN')] potato ['baked potato'] ['potato (baked):FOODON_03302196'] ['potato (baked):FOODON_03302196'] Full Term Match ['Permutation of Tokens in Bracketed Resource Term']
small_simple3 Canned Corn canned corn [('canned', 'VBN'), ('corn', 'NN')] corn ['canned corn'] ['corn (canned):FOODON_03302665'] ['corn (canned):FOODON_03302665'] Full Term Match ['Permutation of Tokens in Bracketed Resource Term']
small_simple4 Frozen Yogurt frozen yogurt [('frozen', 'JJ'), ('yogurt', 'NN')] yogurt ['frozen yogurt'] ['frozen yogurt:FOODON_03307445'] ['frozen yogurt:FOODON_03307445'] Full Term Match ['Change of Case in Input Data']
small_simple5 Apple Pie apple pie [('apple', 'NN'), ('pie', 'NN')] apple pie ['apple food product', 'pie'] ['apple food product:FOODON_00001611', 'pie:CandidateTerm_FoodOn_839'] ['apple food product:FOODON_00001611', 'pie:CandidateTerm_FoodOn_839'] 2 Component Match ['Suffix Addition- food product to the Input'] [] Component1-> apple food product:FOODON_00001611, Component2-> pie:CandidateTerm_FoodOn_839
2 changes: 1 addition & 1 deletion lexmapr/tests/output/small_simple_not_full.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ small_simple1 Chicken Breast chicken breast chicken breast ['chicken breast:Cand
small_simple2 Baked Potato baked potato baked potato ['potato (baked):FOODON_03302196']
small_simple3 Canned Corn canned corn canned corn ['corn (canned):FOODON_03302665']
small_simple4 Frozen Yogurt frozen yogurt frozen yogurt ['frozen yogurt:FOODON_03307445']
small_simple5 Apple Pie apple pie apple food product:FOODON_00001611' pie:CandidateTerm_FoodOn_839
small_simple5 Apple Pie apple pie pie:CandidateTerm_FoodOn_839 apple food product:FOODON_00001611
18 changes: 9 additions & 9 deletions lexmapr/tests/output/test_abbreviations.tsv
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Sample_Id Sample_Desc Cleaned_Sample Phrase_POS_Tagged Probable_Candidate_Terms Matched_Term All_matched_Terms_with_Resource_IDs Retained_Terms_with_Resource_IDs Number of Components(In case of Component Match) Match_Status(Macro Level) Match_Status(Micro Level) Remaining_Tokens Different Components(In case of Component Match)
small_simple1 Breast breast [('breast', 'NN')] breast breast ['breast:CandidateTerm_FoodOn_188'] ['breast:CandidateTerm_FoodOn_188'] Full Term Match ['Change of Case in Input Data']
small_simple2 Baked Potato baked potato [('baked', 'VBN'), ('potato', 'NN')] potato baked potato ['potato (baked):FOODON_03302196'] ['potato (baked):FOODON_03302196'] Full Term Match ['Permutation of Tokens in Bracketed Resource Term']
small_simple3 fld fluid [('fluid', 'NN')] fluid fluid ['fluid:CandidateTerm_OtherOntology_76'] ['fluid:CandidateTerm_OtherOntology_76'] Full Term Match ['Abbreviation-Acronym Treatment', 'A Direct Match with Cleaned Sample']
small_simple4 froz fld frozen fluid [('frozen', 'JJ'), ('fluid', 'NN')] fluid ['fluid', 'preservation by freezing'] ['preservation by freezing:FOODON_03470136', 'fluid:CandidateTerm_OtherOntology_76'] ["fluid:CandidateTerm_OtherOntology_76'", 'preservation by freezing:FOODON_03470136'] 2 GComponent Match ['Synonym Usage', 'Abbreviation-Acronym Treatment'] ['froz'] Component1-> fluid:CandidateTerm_OtherOntology_76', Component2-> preservation by freezing:FOODON_03470136
small_simple5 fld apple fluid apple [('fluid', 'NN'), ('apple', 'NN')] fluid apple ['fluid', 'apple food product'] ['apple food product:FOODON_00001611', 'fluid:CandidateTerm_OtherOntology_76'] ["fluid:CandidateTerm_OtherOntology_76'", 'apple food product:FOODON_00001611'] 2 GComponent Match ['Abbreviation-Acronym Treatment', 'Suffix Addition- food product to the Input'] [] Component1-> fluid:CandidateTerm_OtherOntology_76', Component2-> apple food product:FOODON_00001611
small_simple6 csf cerebrospinal fluid [('cerebrospinal', 'JJ'), ('fluid', 'NN')] fluid cerebrospinal fluid ['cerebrospinal fluid:ENVO_02000029'] ['cerebrospinal fluid:ENVO_02000029'] Full Term Match ['Change Case and Abbreviation-Acronym Treatment', 'A Direct Match with Cleaned Sample']
small_simple7 csf f cerebrospinal fluid fahrenheit [('cerebrospinal', 'JJ'), ('fluid', 'NN'), ('fahrenheit', 'NN')] fluid fahrenheit ['cerebrospinal fluid', 'fluid', 'cerebrospinal'] ['cerebrospinal:Quality-BodyRelated', 'cerebrospinal fluid:ENVO_02000029', 'fluid:CandidateTerm_OtherOntology_76'] ['cerebrospinal fluid:ENVO_02000029'] 1 GComponent Match ['Using Semantic Tagging Resources', 'Change Case and Abbreviation-Acronym Treatment'] ['f'] Component1-> cerebrospinal fluid:ENVO_02000029
small_simple8 Pie csf pie cerebrospinal fluid [('pie', 'JJ'), ('cerebrospinal', 'JJ'), ('fluid', 'NN')] fluid ['cerebrospinal fluid', 'fluid', 'pie', 'cerebrospinal'] ['cerebrospinal:Quality-BodyRelated', 'cerebrospinal fluid:ENVO_02000029', 'pie:CandidateTerm_FoodOn_839', 'fluid:CandidateTerm_OtherOntology_76'] ['cerebrospinal fluid:ENVO_02000029', 'pie:CandidateTerm_FoodOn_839'] 2 GComponent Match ['Using Semantic Tagging Resources', 'Change Case and Abbreviation-Acronym Treatment'] [] Component1-> cerebrospinal fluid:ENVO_02000029, Component2-> pie:CandidateTerm_FoodOn_839
small_simple9 csf fld cerebrospinal fluid fluid [('cerebrospinal', 'JJ'), ('fluid', 'NN'), ('fluid', 'NN')] fluid fluid ['cerebrospinal fluid', 'fluid', 'cerebrospinal'] ['cerebrospinal:Quality-BodyRelated', 'cerebrospinal fluid:ENVO_02000029', 'fluid:CandidateTerm_OtherOntology_76'] ['cerebrospinal fluid:ENVO_02000029'] 1 GComponent Match ['Abbreviation-Acronym Treatment', 'Using Semantic Tagging Resources', 'Change Case and Abbreviation-Acronym Treatment'] [] Component1-> cerebrospinal fluid:ENVO_02000029
small_simple1 Breast breast [('breast', 'NN')] breast ['breast'] ['breast:CandidateTerm_FoodOn_188'] ['breast:CandidateTerm_FoodOn_188'] Full Term Match ['Change of Case in Input Data']
small_simple2 Baked Potato baked potato [('baked', 'VBN'), ('potato', 'NN')] potato ['baked potato'] ['potato (baked):FOODON_03302196'] ['potato (baked):FOODON_03302196'] Full Term Match ['Permutation of Tokens in Bracketed Resource Term']
small_simple3 fld fluid [('fluid', 'NN')] fluid ['fluid'] ['fluid:CandidateTerm_OtherOntology_76'] ['fluid:CandidateTerm_OtherOntology_76'] Full Term Match ['A Direct Match with Cleaned Sample', 'Abbreviation-Acronym Treatment']
small_simple4 froz fld frozen fluid [('frozen', 'JJ'), ('fluid', 'NN')] fluid ['fluid', 'preservation by freezing'] ['fluid:CandidateTerm_OtherOntology_76', 'preservation by freezing:FOODON_03470136'] ['fluid:CandidateTerm_OtherOntology_76', 'preservation by freezing:FOODON_03470136'] 2 Component Match ['Abbreviation-Acronym Treatment', 'Synonym Usage'] ['froz'] Component1-> fluid:CandidateTerm_OtherOntology_76, Component2-> preservation by freezing:FOODON_03470136
small_simple5 fld apple fluid apple [('fluid', 'NN'), ('apple', 'NN')] fluid apple ['apple food product', 'fluid'] ['apple food product:FOODON_00001611', 'fluid:CandidateTerm_OtherOntology_76'] ['apple food product:FOODON_00001611', 'fluid:CandidateTerm_OtherOntology_76'] 2 Component Match ['Abbreviation-Acronym Treatment', 'Suffix Addition- food product to the Input'] [] Component1-> apple food product:FOODON_00001611, Component2-> fluid:CandidateTerm_OtherOntology_76
small_simple6 csf cerebrospinal fluid [('cerebrospinal', 'JJ'), ('fluid', 'NN')] fluid ['cerebrospinal fluid'] ['cerebrospinal fluid:ENVO_02000029'] ['cerebrospinal fluid:ENVO_02000029'] Full Term Match ['A Direct Match with Cleaned Sample', 'Change Case and Abbreviation-Acronym Treatment']
small_simple7 csf f cerebrospinal fluid fahrenheit [('cerebrospinal', 'JJ'), ('fluid', 'NN'), ('fahrenheit', 'NN')] fluid fahrenheit ['cerebrospinal', 'cerebrospinal fluid', 'fluid'] ['cerebrospinal fluid:ENVO_02000029', 'cerebrospinal:Quality-BodyRelated', 'fluid:CandidateTerm_OtherOntology_76'] ['cerebrospinal fluid:ENVO_02000029'] 1 Component Match ['Change Case and Abbreviation-Acronym Treatment', 'Using Semantic Tagging Resources'] ['f'] Component1-> cerebrospinal fluid:ENVO_02000029
small_simple8 Pie csf pie cerebrospinal fluid [('pie', 'JJ'), ('cerebrospinal', 'JJ'), ('fluid', 'NN')] fluid ['cerebrospinal', 'cerebrospinal fluid', 'fluid', 'pie'] ['cerebrospinal fluid:ENVO_02000029', 'cerebrospinal:Quality-BodyRelated', 'fluid:CandidateTerm_OtherOntology_76', 'pie:CandidateTerm_FoodOn_839'] ['cerebrospinal fluid:ENVO_02000029', 'pie:CandidateTerm_FoodOn_839'] 2 Component Match ['Change Case and Abbreviation-Acronym Treatment', 'Using Semantic Tagging Resources'] [] Component1-> cerebrospinal fluid:ENVO_02000029, Component2-> pie:CandidateTerm_FoodOn_839
small_simple9 csf fld cerebrospinal fluid fluid [('cerebrospinal', 'JJ'), ('fluid', 'NN'), ('fluid', 'NN')] fluid fluid ['cerebrospinal', 'cerebrospinal fluid', 'fluid'] ['cerebrospinal fluid:ENVO_02000029', 'cerebrospinal:Quality-BodyRelated', 'fluid:CandidateTerm_OtherOntology_76'] ['cerebrospinal fluid:ENVO_02000029'] 1 Component Match ['Abbreviation-Acronym Treatment', 'Change Case and Abbreviation-Acronym Treatment', 'Using Semantic Tagging Resources'] [] Component1-> cerebrospinal fluid:ENVO_02000029

0 comments on commit ee8582a

Please sign in to comment.