
Fixed encoding issues when retrieving page titles.

The encoding problem was fixed by using BeautifulSoup. This library takes care
of properly decoding page data and also reduces the amount of code required to
retrieve the title of a page.

For this to work you'll need to have BeautifulSoup installed:

    $ pip install beautifulsoup
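
For illustration, a minimal sketch of the kind of title extraction this enables.
The response body below is made up for the example, and the BeautifulSoup 3
import path is used, matching the diff:

    from BeautifulSoup import BeautifulSoup

    # Hypothetical response body: UTF-8 bytes with a non-ASCII title.
    body = '<html><head><title>  Caf\xc3\xa9 menu </title></head></html>'

    # BeautifulSoup detects the encoding and returns unicode, so no manual
    # decode() call is needed before reading the title.
    soup = BeautifulSoup(body)
    title = soup.title.string.strip()

    print title  # prints: Café menu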

I also made some changes to the method get_html_and_response(). Instead of
returning the raw response object (an addinfourl instance) it now returns the
raw response body in the second index.
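
As a rough usage sketch, assuming it runs where the bot's util.http helper is
importable (the URL is a placeholder and error handling is omitted), a plugin
can now re-parse or log the body directly:

    from util import http
    from BeautifulSoup import BeautifulSoup

    # First element: the lxml document, as before.
    # Second element: the raw response body, new in this commit.
    page, body = http.get_html_and_response('http://example.com/')

    links = page.xpath('//a/@href')
    title = BeautifulSoup(body).title  # may be None if the page has no <title>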

See #11 for more information.

Signed-off-by: Yorick Peterse <yorick@isset.nl>
Commit f1833f524a01fb996318b27e4ff727eb20663023 (1 parent: 14c9d64), Yorick Peterse committed Apr 6, 2012
Showing with 13 additions and 19 deletions.
  1. +9 −16 plugins/urls.py
  2. +4 −3 plugins/util/http.py
plugins/urls.py
@@ -1,6 +1,8 @@
import re
-from util import hook, urlnorm, http
-from urllib2 import Request
+
+from BeautifulSoup import BeautifulSoup
+from util import hook, urlnorm, http
+from urllib2 import Request
ignore = ['buttbot']
ignore_hosts = ['youtube.com', 'twitter.com', 'youtu.be']
@@ -13,25 +15,16 @@ def show_title(match, nick='', chan='', say=None):
if not nick in ignore:
page, response = http.get_html_and_response(url)
- title = page.xpath('//title')
message = ''
- # Only ignore URLs of which "twitter" or "youtube" is part of the
- # domain and not just part some some URI segment.
if host not in ignore_hosts:
- # Don't show the title if there isn't one
- if title:
- titleList = []
- short_url = 'Not Found'
+ parser = BeautifulSoup(response)
+ title = parser.title.string.strip()
- for i in title:
- if i.text_content():
- titleList.append(i.text_content().strip())
-
- if titleList:
- titleList = ''.join(titleList)
- message = 'URL title: %s' % (titleList)
+ if title:
+ message = 'URL title: %s' % (title)
+ # Shorten URLs that are over 80 characters.
if len(url) >= 80:
short_url = http.get(
'http://is.gd/create.php',
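
Pieced together from the hunk above, the patched title-handling portion of
show_title() reads roughly as follows; the indentation is assumed, since the
rendered diff strips it:

    # Inside show_title(), after the 'if not nick in ignore:' check
    # (indentation assumed).
    page, response = http.get_html_and_response(url)
    message = ''

    if host not in ignore_hosts:
        parser = BeautifulSoup(response)
        title = parser.title.string.strip()

        if title:
            message = 'URL title: %s' % (title)

Note that parser.title (and its .string) can be None for pages without a
<title> element, so the happy path shown here, like the diff itself, would
raise an AttributeError in that case.
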
plugins/util/http.py
@@ -30,12 +30,13 @@ def get_html(*args, **kwargs):
# Returns a tuple containing the return value of html.fromstring in the first
-# index and the raw response object in the second index.
+# index and the raw response body in the second index.
def get_html_and_response(*args, **kwargs):
response = open(*args, **kwargs)
- htmlobj = html.fromstring(response.read())
+ text = response.read()
+ htmlobj = html.fromstring(text)
- return (htmlobj, response)
+ return (htmlobj, text)
def get_xml(*args, **kwargs):
