Fix incorrectly chopped excerpts in .awik/.wik

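Excerpts containing Unicode text were not properly truncated to fit within a single IRC message: the length limit was checked against the UTF-8 byte length, but the excerpt was cut by character count, so multi-byte text could still exceed the limit.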
apertium · Dec 17, 2013 · 95d837a · 95d837a
1 parent 062fdab
commit 95d837a
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 6 deletions.
diff --git a/modules/apertium_wiki.py b/modules/apertium_wiki.py
@@ -60,12 +60,12 @@ def awik(phenny, input):
       else:
         text = page.findall(".//*[@id='mw-content-text']")[0]
 
-   sentences = text.text_content().split(". ")   
+   sentences = text.text_content().split(". ")
    sentence = '"' + sentences[0] + '"'
-   
-   maxlength = 430 - len(' - ' + wikiuri % (format_term_display(term)))
+
+   maxlength = 430 - len((' - ' + wikiuri % (format_term_display(term))).encode('utf-8'))
    if len(sentence.encode('utf-8')) > maxlength: 
-      sentence = sentence[:maxlength]
+      sentence = sentence.encode('utf-8')[:maxlength].decode('utf-8', 'ignore')
       words = sentence[:-5].split(' ')
       words.pop()
       sentence = ' '.join(words) + ' [...]'

diff --git a/modules/wikipedia.py b/modules/wikipedia.py
@@ -66,9 +66,9 @@ def parse_wiki_page(url, term, section = None):
     sentences = text.text_content().split(". ")   
     sentence = '"' + sentences[0] + '"'
 
-    maxlength = 440 - len(' - ' + url)
+    maxlength = 430 - len((' - ' + url).encode('utf-8'))
     if len(sentence.encode('utf-8')) > maxlength: 
-        sentence = sentence[:maxlength]
+        sentence = sentence.encode('utf-8')[:maxlength].decode('utf-8', 'ignore')
         words = sentence[:-5].split(' ')
         words.pop()
         sentence = ' '.join(words) + ' [...]'