Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add automatic hashtag parsing from note-like posts #57

Merged
merged 12 commits into from Jun 4, 2015
4 changes: 4 additions & 0 deletions redwind/admin.py
Expand Up @@ -275,6 +275,10 @@ def save_post(post):
post.audience = util.multiline_string_to_list(audience)

tags = request.form.getlist('tags')
if post.post_type != "article" and post.content:
# parse out hashtags as tag links from note-like posts
post.content, htags = util.parse_hashtags(post.content)
tags += htags
tags = list(filter(None, map(util.normalize_tag, tags)))
post.tags = [Tag.query.filter_by(name=tag).first() or Tag(tag)
for tag in tags]
Expand Down
4 changes: 2 additions & 2 deletions redwind/templates/admin/venues.jinja2
Expand Up @@ -24,8 +24,8 @@
{{venue.name}}
</a>
{% if current_user.is_authenticated() %}
<a href="{{url_for('edit_venue', id=venue.id)}}"><i class="glyphicon glyphicon-edit"></i> Edit</a>
<a href="{{url_for('delete_venue', id=venue.id)}}"><i class="glyphicon glyphicon-trash"></i> Delete</a>
<a href="{{url_for('admin.edit_venue', id=venue.id)}}"><i class="glyphicon glyphicon-edit"></i> Edit</a>
<a href="{{url_for('admin.delete_venue', id=venue.id)}}"><i class="glyphicon glyphicon-trash"></i> Delete</a>
{% endif %}
</li>
{% endfor %}
Expand Down
52 changes: 41 additions & 11 deletions redwind/util.py
Expand Up @@ -36,6 +36,7 @@
INSTAGRAM_RE = re.compile(r'https?://(?:www\.|mobile\.)?instagram\.com/p/([a-zA-Z0-9_\-]+)/?')
PEOPLE_RE = re.compile(r"\[\[([\w ]+)(?:\|([\w\-'. ]+))?\]\]")
RELATIVE_PATH_RE = re.compile('\[([^\]]*)\]\(([^/)]+)\)')
HASHTAG_RE = re.compile('(?<!\w)#(\w\w+)')

AT_USERNAME_RE = re.compile(r"""(?<!\w)@([a-zA-Z0-9_]+)(?=($|[\s,:;.?'")]))""")
LINK_RE = re.compile(
Expand Down Expand Up @@ -160,15 +161,7 @@ def person_to_microcard(contact, nick, soup):
return a_tag


def autolink(plain, url_processor=url_to_link,
person_processor=person_to_microcard):
"""Replace bare URLs in a document with an HTML <a> representation
"""
blacklist = ('a', 'script', 'pre', 'code', 'embed', 'object',
'audio', 'video')
soup = bs4.BeautifulSoup(plain)

def bs4_sub(regex, repl):
def bs4_sub(soup, regex, repl, blacklist):
"""Process text elements in a BeautifulSoup document with a regex and
replacement string.

Expand Down Expand Up @@ -199,6 +192,15 @@ def bs4_sub(regex, repl):
for offset, node in enumerate(nodes):
parent.insert(ii + offset, node)


def autolink(plain, url_processor=url_to_link,
person_processor=person_to_microcard):
"""Replace bare URLs in a document with an HTML <a> representation
"""
blacklist = ('a', 'script', 'pre', 'code', 'embed', 'object',
'audio', 'video')
soup = bs4.BeautifulSoup(plain)

def link_repl(m):
url = (m.group(1) or 'http://') + m.group(2)
return url_processor(url, soup)
Expand All @@ -216,10 +218,10 @@ def process_nick(m):
return m.group(0)

if url_processor:
bs4_sub(LINK_RE, link_repl)
bs4_sub(soup, LINK_RE, link_repl, blacklist)

if person_processor:
bs4_sub(AT_USERNAME_RE, process_nick)
bs4_sub(soup, AT_USERNAME_RE, process_nick, blacklist)

return ''.join(str(t) for t in soup.body.contents) if soup.body else ''

Expand Down Expand Up @@ -359,6 +361,8 @@ def format_as_text(html, link_fn=None):
for a in soup.find_all('a'):
if link_fn:
link_fn(a)
elif a.text[0] == '#':
a.replace_with(a.text)
else:
a.replace_with(a.get('href') or '[link]')

Expand Down Expand Up @@ -471,3 +475,29 @@ def find_first_syndicated(originals):
find_first_syndicated(post.repost_of),
find_first_syndicated(post.like_of),
)


def parse_hashtags(s):
    """Replace #hashtags in *s* with links to their tag pages.

    Text inside blacklisted elements (existing links, code, media embeds,
    etc.) is left untouched. Returns a tuple of (new string, list of
    lowercased tag names encountered, in order of appearance).
    """
    blacklist = ('a', 'script', 'pre', 'code', 'embed', 'object',
                 'audio', 'video')
    soup = bs4.BeautifulSoup(s)
    found = []

    def linkify(match):
        # match.group(1) is the tag name without the leading '#'
        name = match.group(1).lower()
        found.append(name)
        anchor = soup.new_tag('a', href='/tags/' + name)
        anchor.string = match.group(0)
        return anchor

    try:
        bs4_sub(soup, HASHTAG_RE, linkify, blacklist)
        if soup.body:
            return ''.join(str(node) for node in soup.body.contents), found
        return '', found
    except TypeError:
        # best-effort: on unparseable input, hand back the original string
        return s, []
52 changes: 52 additions & 0 deletions tests/util_test.py
Expand Up @@ -141,3 +141,55 @@ def simple_name_marker(contact, name, soup):
assert out == util.autolink(
inp, person_processor=simple_name_marker,
url_processor=None)


def test_parsing_hashtags():
    """Exercise the #-tag matching regex.

    Each case is (input, expected output HTML, expected tag list):
    covers basic linking, mid-string and end-of-string tags, numeric-only
    tags, punctuation boundaries, case folding, duplicates, a tag inside a
    long paragraph, an interior '#' (hash#tag), and a URL fragment.
    """

    test_cases = [
        ('#hashtag should be linked',
         '<a href="/tags/hashtag">#hashtag</a> should be linked',
         ['hashtag']),
        ('hashtag should not be linked',
         'hashtag should not be linked',
         []),
        ('match #hashtags in the middle',
         'match <a href="/tags/hashtags">#hashtags</a> in the middle',
         ['hashtags']),
        ('match a tag at the #end',
         'match a tag at the <a href="/tags/end">#end</a>',
         ['end']),
        ('#1 should not be linked',
         '#1 should not be linked',
         []),
        ('#12345 should be linked',
         '<a href="/tags/12345">#12345</a> should be linked',
         ['12345']),
        ('#.foobar should not be linked',
         '#.foobar should not be linked',
         []),
        ('#foo.bar should be partially linked',
         '<a href="/tags/foo">#foo</a>.bar should be partially linked',
         ['foo']),
        ('capital letters in #HashTags will be lowercased',
         'capital letters in <a href="/tags/hashtags">#HashTags</a> will be lowercased',
         ['hashtags']),
        ('duplicate #hashtags should parse both #hashtags fine',
         'duplicate <a href="/tags/hashtags">#hashtags</a> should parse both <a href="/tags/hashtags">#hashtags</a> fine',
         ['hashtags', 'hashtags']),
        ('Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent tincidunt aliquam sem, in tempus elit lacinia vel. Integer accumsan cursus purus et euismod. Nullam ultricies nunc sit amet ante consequat porta. Pellentesque et porta odio. Sed et neque cursus, iaculis lorem nec, laoreet odio. Donec molestie volutpat vestibulum. Curabitur rhoncus elit ut massa pretium luctus. #Nullam sollicitudin ligula vitae tincidunt suscipit. Maecenas in neque porta, scelerisque metus at, mollis nunc. Fusce accumsan imperdiet velit, in tincidunt tellus aliquam ac. Nullam iaculis vel urna sed vulputate. Aliquam erat volutpat. Etiam et tortor turpis. Vivamus mattis enim lacus, in aliquet nulla blandit.',
         'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent tincidunt aliquam sem, in tempus elit lacinia vel. Integer accumsan cursus purus et euismod. Nullam ultricies nunc sit amet ante consequat porta. Pellentesque et porta odio. Sed et neque cursus, iaculis lorem nec, laoreet odio. Donec molestie volutpat vestibulum. Curabitur rhoncus elit ut massa pretium luctus. <a href="/tags/nullam">#Nullam</a> sollicitudin ligula vitae tincidunt suscipit. Maecenas in neque porta, scelerisque metus at, mollis nunc. Fusce accumsan imperdiet velit, in tincidunt tellus aliquam ac. Nullam iaculis vel urna sed vulputate. Aliquam erat volutpat. Etiam et tortor turpis. Vivamus mattis enim lacus, in aliquet nulla blandit.',
         ['nullam']),
        ('this hash#tag will not be parsed',
         'this hash#tag will not be parsed',
         []),
        ('http://example.com/path#fragment',
         'http://example.com/path#fragment',
         []),
    ]

    for inp, out, tags in test_cases:
        res, ts = util.parse_hashtags(inp)
        assert out == res
        assert tags == ts