Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
48 lines (40 sloc) 1.58 KB
import re
# Emoticon pattern, written for re.VERBOSE: whitespace and "#" comments inside
# the pattern text are ignored by the regex engine.  It accepts emoticons in
# both orientations (eyes first, e.g. ":-)", or mouth first, e.g. "(-:"),
# plus the "<3" heart.
emoticons_str = r"""
(?:
[<>]?
[:;=8] # eyes
[\-o\*\'-]? # optional nose
[\)\]\(\[dDpP/\:\>\<\}\{@\|\\] # mouth
|
[\)\]\(\[dDpP/\:\>\<\}\{@\|\\] # mouth
[\-o\*\'-]? # optional nose
[:;=8] # eyes
[<>]?
|
<3 # heart
)"""

# Token patterns, tried left to right at every position: the first alternative
# that matches wins, so more specific patterns must come before generic ones
# (e.g. e-mail addresses before @-mentions, words before the catch-all).
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    # E-mail addresses.  The group is non-capturing on purpose: tokens_re
    # already wraps the whole alternation in one capturing group, and
    # re.findall() returns tuples instead of strings as soon as a pattern
    # contains more than one capturing group.
    r'(?:[a-zA-Z0-9_%\.+-]+@[a-zA-Z0-9\.-]+\.[a-zA-Z0-9-.]+)',
    r'(?:@[\w_]+)',  # @-mentions
    # Hash-tags; "#" is escaped because a bare "#" starts a comment
    # when the pattern is compiled with re.VERBOSE.
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)',  # anything else
]

# Matches one token at a time; exactly one capturing group, so findall()
# yields a flat list of token strings.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Matches only when the whole string is a single emoticon.
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
    """Split the text *s* into tokens using the module-level ``tokens_re``.

    Returns the list produced by ``tokens_re.findall(s)``: the matches in
    left-to-right order.  Text that no token pattern matches is dropped.
    The shape of each element follows ``re.findall``'s rules for the
    capturing groups present in ``tokens_re``.
    """
    matches = tokens_re.findall(s)
    return matches
def preprocess(s, lowercase=False):
    """Tokenize *s*, optionally lowercasing every token that is not an emoticon.

    s: text to tokenize (passed straight to ``tokenize``).
    lowercase: when true, each token is lowercased unless ``emoticon_re``
        matches it, so case-sensitive emoticons such as ":D" keep their
        original form.  When false, the tokens are returned untouched.

    Returns the list of tokens.
    """
    pieces = tokenize(s)
    if not lowercase:
        return pieces

    normalized = []
    for piece in pieces:
        if emoticon_re.search(piece):
            # Emoticons are kept verbatim.
            normalized.append(piece)
        else:
            normalized.append(piece.lower())
    return normalized
# Demo: run the tokenizer on a sample tweet that contains a mention, a
# hash-tag, an emoticon, an e-mail address and a URL, then print the tokens.
# The text is a single logical line; the adjacent literals are concatenated
# by the parser (a quoted string may not span physical lines).
tweet = (
    '@Priscilla: The world is flat! #flat-world :-> email me at '
    'priscilla.Presley%23@dodo.gmail.com, https://neverland.ut'
)
print(preprocess(tweet))
You can’t perform that action at this time.