Permalink
Branch: master
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
173 lines (146 sloc) 7.77 KB
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import re
import sys
import os.path
# Python 2 / Python 3 compatibility shim.
# Compare the numeric major version: `sys.version` is a free-form string, and
# a lexicographic comparison against '3' would misclassify e.g. '10.0'.
if sys.version_info[0] < 3:
    from urlparse import urlparse
    from urllib import unquote_plus
    string_type = unicode  # noqa: F821 -- only defined on Python 2
else:
    from urllib.parse import urlparse, unquote_plus
    xrange = range  # Python 3 has no xrange; range is already lazy
    string_type = str
def auto_link_re():
    """Return the compiled, case-insensitive pattern used to find link targets.

    The pattern has two top-level alternatives:

    * an ``@`` followed by 1-17 characters from ``[_a-zA-Z0-9]``;
    * a web address: an optional ``http``/``https``/``irc`` ``://`` prefix
      (with optional user-info), then either a host name ending in one of the
      listed TLDs or a dotted-quad IPv4 address, an optional ``:port`` and an
      optional ``/path``, followed by a word-boundary/whitespace/end lookahead.

    Every group is non-capturing, so ``findall`` yields whole matches and
    ``split`` yields only the text between matches.
    """
    # ccTLD compressed regular expression clauses (re)created.
    # .mobi .jobs deliberately excluded to discourage layer violations.
    # see http://flic.kr/p/2kmuSL for more on the problematic new gTLDs
    # Part of the pattern is derived from the Android Open Source Project
    # (Apache 2.0), with a bunch of subsequent fixes/improvements
    # (e.g. ttk.me/t44H2); thus auto_link_re is also Apache 2.0 licensed:
    # http://www.apache.org/licenses/LICENSE-2.0
    # - Tantek 2010-046 (moved to auto_link_re 2012-062)
    pattern = '(?:\\@[_a-zA-Z0-9]{1,17})|(?:(?:(?:(?:http|https|irc)?:\\/\\/(?:(?:[!$&-.0-9;=?A-Z_a-z]|(?:\\%[a-fA-F0-9]{2}))+(?:\\:(?:[!$&-.0-9;=?A-Z_a-z]|(?:\\%[a-fA-F0-9]{2}))+)?\\@)?)?(?:(?:(?:[a-zA-Z0-9][-a-zA-Z0-9]*\\.)+(?:(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])|(?:biz|b[abdefghijmnorstvwyz])|(?:cat|com|coop|c[acdfghiklmnoruvxyz])|d[ejkmoz]|(?:edu|e[cegrstu])|f[ijkmor]|(?:gov|g[abdefghilmnpqrstuwy])|h[kmnrtu]|(?:info|int|i[delmnoqrst])|j[emop]|k[eghimnrwyz]|l[abcikrstuvy]|(?:mil|museum|m[acdeghklmnopqrstuvwxyz])|(?:name|net|n[acefgilopruz])|(?:org|om)|(?:pro|p[aefghklmnrstwy])|qa|r[eouw]|s[abcdeghijklmnortuvyz]|(?:tel|travel|t[cdfghjklmnoprtvwz])|u[agkmsyz]|v[aceginu]|w[fs]|y[etu]|z[amw]))|(?:(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9])\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9])\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9])))(?:\\:\\d{1,5})?)(?:\\/(?:(?:[!#&-;=?-Z_a-z~])|(?:\\%[a-fA-F0-9]{2}))*)?)(?=\\b|\\s|$)'
    return re.compile(pattern, re.IGNORECASE)
# Names that may follow '@' without being a username: CSS at-rules and the
# 17-letter run the pattern picks out of an ASCII alphabet listing.
_UNLINKED_AT_NAMES = frozenset([
    'charset', 'font', 'font-face', 'import', 'media',
    'namespace', 'page', 'ABCDEFGHIJKLMNOPQ',
])


def _escape_html_text(value):
    """Escape the characters that are significant in HTML text content."""
    return (value.replace('&', '&amp;').replace('<', '&lt;')
            .replace('>', '&gt;').replace('"', '&quot;'))


def auto_link(text, do_embed=False, max_url_length=0):
    """Hyperlink the web addresses and @-names found in *text*.

    Matches that already look linked are left alone (preceded by ``=``,
    ``="``/``='`` or ``t;``, or followed by ``</a``), which is what lets the
    function run over its own output or over typical markup.

    Args:
        text: plain text or markup to scan.
        do_embed: when true, image URLs (.jpeg/.jpg/.png/.gif/.svg) become
            ``<img>`` figures, video URLs (.mp4/.mov/.ogv) become ``<video>``
            figures, and URLs whose decoded fragment contains a space become
            a ``<blockquote>`` citing that fragment text.
        max_url_length: when non-zero, the visible link text is rebuilt as
            host + path (+ ``?query``) and cut to this many characters,
            followed by an ellipsis.

    Returns:
        The text with matches replaced by markup; *text* itself, unchanged,
        when nothing matches.
    """
    regex = auto_link_re()
    matches = regex.findall(text)
    if not matches:
        return text
    # The pattern has no capturing groups, so split() yields exactly
    # len(matches) + 1 pieces of surrounding text.
    pieces = regex.split(text)
    out = []
    for i, mi in enumerate(matches):
        before = pieces[i]
        out.append(before)
        if pieces[i + 1].startswith('/'):  # regex omits end/ before </a
            pieces[i + 1] = pieces[i + 1][1:]
            mi = mi + '/'  # include / in the match
        spe = before[-2:]
        # avoid 2x-linking, don't link CSS @-rules, attr values, asciibet.
        # The at-name is compared without its leading '@'; comparing the
        # whole match could never equal an entry of the exclusion list.
        if (re.match('(?:\\=[\\"\\\']?|t;)', spe) or
                pieces[i + 1].strip()[:3] == '</a' or
                (mi.startswith('@') and mi[1:] in _UNLINKED_AT_NAMES)):
            out.append(mi)
            continue

        # Move trailing punctuation out of the link; a closing ')' stays
        # when the match itself contains a '(' (one balanced pair).
        afterlink = ''
        while (mi[-1] in '.!?,;"\')]}' and
               (mi[-1] != ')' or '(' not in mi)):
            afterlink = mi[-1] + afterlink
            mi = mi[:-1]

        fe = ''
        if do_embed:
            fe = os.path.splitext(mi.lower())[1]
        wmi = web_address_to_uri(mi, True)
        wp = urlparse(wmi)
        prot = wp.scheme + ':'
        hn = wp.netloc
        pa = wp.path
        ih = wmi.startswith('http')

        display_url = mi
        if max_url_length:
            display_url = string_type(wp.netloc + wp.path)
            if wp.query:
                display_url = display_url + '?' + wp.query
            if len(display_url) > max_url_length:
                # Mark the cut; appending an empty string (as before) left a
                # shortened URL indistinguishable from a complete one.
                display_url = (string_type(display_url[:max_url_length]) +
                               '\u2026')

        plain_link = ('<a class="auto-link" href="' + wmi + '">' +
                      display_url + '</a>')

        if fe in ('.jpeg', '.jpg', '.png', '.gif', '.svg'):
            # Parenthesised so the 'a ' prefix applies to both alternatives.
            alt = 'a ' + ('photo' if 'photo' in mi else fe[1:])
            html = ('<a class="auto-link figure" href="' + wmi +
                    '"><img alt="' + alt + '" src="' + wmi + '"/></a>')
        elif fe in ('.mp4', '.mov', '.ogv'):
            # Joined with '+': separating the parts with commas built a
            # tuple, which broke every later string concatenation.
            html = ('<a class="auto-link figure" href="' + wmi +
                    '"><video controls="controls" src="' + wmi +
                    '"></video></a>')
        # NOTE(review): the Vimeo and YouTube branches do not consult
        # do_embed, so players are emitted even when it is false -- confirm
        # this is intended before changing it.
        elif ih and hn == 'vimeo.com' and pa[1:].isdigit():
            html = (plain_link +
                    ' <iframe class="vimeo-player auto-link figure" width="480" height="385" style="border:0" src="' +
                    prot + '//player.vimeo.com/video/' + pa[1:] +
                    '"></iframe>')
        elif (hn == 'youtu.be' or
              (hn in ('youtube.com', 'www.youtube.com') and
               'watch?v=' in mi)):
            if hn == 'youtu.be':
                yvid = pa[1:]
            else:
                offs = mi.index('watch?v=')
                yvid = mi[offs + 8:].split('&', 1)[0]
            html = (plain_link +
                    ' <iframe class="youtube-player auto-link figure" width="480" height="385" style="border:0" src="' +
                    prot + '//www.youtube.com/embed/' + yvid +
                    '"></iframe>')
        elif mi.startswith('@'):
            if (pieces[i + 1][:1] == '.' and
                    before != '' and ctype_email_local(before[-1])):
                # if email address, simply append info, no linking
                html = display_url
            else:
                # treat it as a Twitter @-username reference and link it
                html = ('<a class="auto-link h-x-username" href="' + wmi +
                        '">' + display_url + '</a>')
        else:
            fragmentioned = unquote_plus(wp.fragment) if wp.fragment else ''
            if do_embed and ' ' in fragmentioned:
                if fragmentioned.startswith('#'):
                    fragmentioned = fragmentioned[1:]
                # The fragment was percent-decoded, so it can contain markup
                # characters the URL pattern itself would never match.
                html = ('<blockquote class="auto-mention"><a class="auto-link" href="' +
                        wmi + '"><cite>' + wp.netloc + '</cite><p>' +
                        _escape_html_text(fragmentioned) +
                        '</p></a></blockquote>')
            else:
                # Every other address -- including one with an ordinary
                # fragment -- becomes a plain link instead of being dropped.
                html = plain_link
        out.append(html + afterlink)
    out.append(pieces[len(matches)])
    return ''.join(out)
def web_address_to_uri(wa, addhttp=False):
    """Turn a loosely typed web address into a URI.

    Args:
        wa: the address as written, e.g. ``example.com``, ``Http://x.org``
            or ``@name``.
        addhttp: when true, prefix ``http://`` to an address that has no
            recognised scheme.

    Returns:
        * *wa* unchanged when it is empty/None;
        * the address with its ``http://``, ``https://`` or ``irc://``
          scheme lower-cased when one is present in any capitalisation
          (covers iOS4-style ``Http://`` and all-caps ``HTTP://``);
        * ``https://twitter.com/<name>`` for an ``@name``;
        * otherwise *wa*, prefixed with ``http://`` if *addhttp* is true.
    """
    if not wa:
        return wa
    # URI schemes are case-insensitive, so detect them that way; otherwise
    # 'HTTP://x' would fall through and become 'http://HTTP://x' below.
    head = wa[:8].lower()
    for scheme in ('https://', 'http://', 'irc://'):
        if head.startswith(scheme):
            return scheme + wa[len(scheme):]
    # TBI: may want to handle typos as well like:
    # missing/extra : or / http:/ http///
    # missing letter in protocol: ttps htps htts, ttp htp htt, ir ic rc
    if wa[0] == '@':
        return 'https://twitter.com/' + wa[1:]
    if addhttp:
        wa = 'http://' + wa
    return wa
def ctype_email_local(s):
    """Check that *s* is made only of e-mail local-part characters.

    Accepted characters are ASCII letters, digits, ``_``, ``%``, ``+`` and
    ``-``. This is a deliberate approximation: ``.`` is left out because
    auto_link() only passes the single character preceding an @-match.

    Returns the ``re`` match object (truthy) on success, otherwise ``None``.
    """
    local_part = re.compile("^[a-zA-Z0-9_%+-]+$")
    return local_part.match(s)
if __name__ == '__main__':
    # Demo run: one URL containing a balanced pair of parentheses and one URL
    # that is merely wrapped in parentheses.
    print(auto_link(
        "it's pretty interesting how twitter auto-links it... "
        "http://example.com/a_link_(with_parens) vs. "
        "(http://example.com/a_link_without)"))