resource names get capitalized, #72

janetzki · Feb 5, 2017 · f71c08e · f71c08e
1 parent 5866c0d
commit f71c08e
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 8 deletions.
diff --git a/helper_functions/uri_rewriting.py b/helper_functions/uri_rewriting.py
@@ -17,6 +17,12 @@ def convert_to_dbpedia_resource_uri(uri):
     return 'http://dbpedia.org/resource/' + entity_name
 
 
+def capitalize(uri):
+    """
+    http://dbpedia.org/resource/pyrotechnic -> http://dbpedia.org/resource/Pyrotechnic
+    /wiki/pyrotechnic -> /wiki/Pyrotechnic
+    pyrotechnic -> Pyrotechnic
+
+    Upper-cases the first character of the part after the last '/' and
+    leaves the rest of the URI untouched. An empty entity name (URI ends
+    with '/' or is empty) is returned unchanged.
+    """
+    # rpartition always yields three parts, so a URI without any '/' is
+    # treated as a bare entity name instead of raising on unpacking.
+    prefix, separator, entity_name = uri.rpartition('/')
+    # Slicing instead of indexing keeps an empty entity name from raising.
+    return prefix + separator + entity_name[:1].upper() + entity_name[1:]
+
+
 def strip_cleaned_name(uri):
     """
     http://dbpedia.org/resource/Alain_Connes -> 'Alain Connes'

diff --git a/pattern_recognition/fact_extractor.py b/pattern_recognition/fact_extractor.py
@@ -194,6 +194,9 @@ def test(fact_extractor):
     print(fact_extractor.extract_facts_from_html(
         'Merkel was educated in Templin and at the <a href="/wiki/University_of_Leipzig" class="mw-redirect" title="University of Leipzig">University of Leipzig</a>, where she studied <a href="/wiki/Physics" title="Physics">physics</a> from 1973 to 1978.',
         'Angela Merkel'))
+    print(fact_extractor.extract_facts_from_html(
+        'He loves <a href="/wiki/pyrotechnic">pyrotechnic</a>.',
+        'Me'))
 
 
 if __name__ == '__main__':

diff --git a/wikipedia_connector/tagged_sentence.py b/wikipedia_connector/tagged_sentence.py
@@ -15,8 +15,6 @@
 
 class TaggedSentence(object):
     def __init__(self, sentence, links, relative_position):
-        if 'divorce' in sentence:
-            pass
         self.sentence = []
         sentence = TaggedSentence.__clean_input(sentence)
         tokens = stanford_tokenizer.tokenize(sentence)
@@ -176,7 +174,7 @@ def __init__(self, token, target_url=None):
         self._link = target_url
         # TODO if target url is set look for dbpedia redirects as aliases
         if target_url is not None and len(target_url) > 0:
-            self._link = target_url[0].upper() + target_url[1:]  # Hotfix for issue #72, TODO: find better solution
+            self._link = uri_rewriting.capitalize(target_url)  # Hotfix for issue #72, TODO: find better solution
 
     @property
     def text(self):
@@ -187,10 +185,7 @@ def link(self):
         return self._link
 
     def is_link(self):
-        if self._link:
-            return True
-        else:
-            return False
+        return self._link is not None
 
     def __str__(self):
         string = self._text

diff --git a/wikipedia_connector/wikipedia_dump_extractor.py b/wikipedia_connector/wikipedia_dump_extractor.py
@@ -145,7 +145,8 @@ def get_wikipedia_html_from_dump(self, resource):
         page = self._extract_wikipedia_page_via_offset(offset)
         text = WikipediaDumpExtractor._extract_wikipedia_text_from_page(page)
         if not WikipediaDumpExtractor._is_wikimarkup_consistent(text):
-            pass
+            if self.warnings:
+                print('[WARN]   Wikimarkup is inconsistent.')
         html_text = WikipediaDumpExtractor._make_wikipedia_text_to_html(text)
         self._test_cleaning(html_text)
         return html_text