Skip to content

Commit

Permalink
resource names get capitalized, #72
Browse files — browse the repository at this point in the history
  • Loading branch information
janetzki committed Feb 5, 2017
1 parent 5866c0d commit f71c08e
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 8 deletions.
6 changes: 6 additions & 0 deletions helper_functions/uri_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ def convert_to_dbpedia_resource_uri(uri):
return 'http://dbpedia.org/resource/' + entity_name


def capitalize(uri):
    """
    Upper-case the first character of the last path segment of a URI.

    http://dbpedia.org/resource/pyrotechnic -> http://dbpedia.org/resource/Pyrotechnic

    Everything before the last '/' is returned unchanged, and only the first
    character of the entity name is touched (the rest keeps its casing).

    Edge cases are handled without raising:
    - a URI that ends with '/' (empty entity name) is returned as is;
    - a string without any '/' is treated as a bare entity name and gets its
      first character upper-cased;
    - the empty string is returned as is.

    :param uri: resource URI (or bare entity name) as a string
    :return: the URI with a capitalized entity name
    """
    # rpartition never fails to unpack: without a '/' it yields ('', '', uri),
    # so bare names are capitalized instead of raising ValueError.
    prefix, separator, entity_name = uri.rpartition('/')
    # Slicing instead of indexing keeps an empty entity name from raising
    # IndexError.
    return prefix + separator + entity_name[:1].upper() + entity_name[1:]


def strip_cleaned_name(uri):
"""
http://dbpedia.org/resource/Alain_Connes -> 'Alain Connes'
Expand Down
3 changes: 3 additions & 0 deletions pattern_recognition/fact_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,9 @@ def test(fact_extractor):
print(fact_extractor.extract_facts_from_html(
'Merkel was educated in Templin and at the <a href="/wiki/University_of_Leipzig" class="mw-redirect" title="University of Leipzig">University of Leipzig</a>, where she studied <a href="/wiki/Physics" title="Physics">physics</a> from 1973 to 1978.',
'Angela Merkel'))
print(fact_extractor.extract_facts_from_html(
'He loves <a href="/wiki/pyrotechnic">pyrotechnic</a>.',
'Me'))


if __name__ == '__main__':
Expand Down
9 changes: 2 additions & 7 deletions wikipedia_connector/tagged_sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@

class TaggedSentence(object):
def __init__(self, sentence, links, relative_position):
if 'divorce' in sentence:
pass
self.sentence = []
sentence = TaggedSentence.__clean_input(sentence)
tokens = stanford_tokenizer.tokenize(sentence)
Expand Down Expand Up @@ -176,7 +174,7 @@ def __init__(self, token, target_url=None):
self._link = target_url
# TODO if target url is set look for dbpedia redirects as aliases
if target_url is not None and len(target_url) > 0:
self._link = target_url[0].upper() + target_url[1:] # Hotfix for issue #72, TODO: find better solution
self._link = uri_rewriting.capitalize(target_url) # Hotfix for issue #72, TODO: find better solution

@property
def text(self):
Expand All @@ -187,10 +185,7 @@ def link(self):
return self._link

def is_link(self):
if self._link:
return True
else:
return False
return self._link is not None

def __str__(self):
string = self._text
Expand Down
3 changes: 2 additions & 1 deletion wikipedia_connector/wikipedia_dump_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ def get_wikipedia_html_from_dump(self, resource):
page = self._extract_wikipedia_page_via_offset(offset)
text = WikipediaDumpExtractor._extract_wikipedia_text_from_page(page)
if not WikipediaDumpExtractor._is_wikimarkup_consistent(text):
pass
if self.warnings:
print('[WARN] Wikimarkup is inconsistent.')
html_text = WikipediaDumpExtractor._make_wikipedia_text_to_html(text)
self._test_cleaning(html_text)
return html_text
Expand Down

0 comments on commit f71c08e

Please sign in to comment.