Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 69 lines (54 sloc) 2.026 kb
e757197 cleanup emoticons regexes
brendano authored
1 """ emoticon recognition via patterns. tested on english-language twitter, but
2 probably works for other social media dialects. """
f3d02d2 checkpointing stuff
brendano authored
3
4 __author__ = "Brendan O'Connor (anyall.org, brenocon@gmail.com)"
e757197 cleanup emoticons regexes
brendano authored
5 __version__= "april 2009"
f3d02d2 checkpointing stuff
brendano authored
6
2729119 tokenizer
brendano authored
7 #from __future__ import print_function
8 import re,sys
9
def mycompile(pat):
    """Compile the regex source *pat* with the ``re.UNICODE`` flag set."""
    return re.compile(pat, re.UNICODE)


# Earlier, looser patterns, kept for reference only:
#SMILEY = mycompile(r'[:=].{0,1}[\)dpD]')
#MULTITOK_SMILEY = mycompile(r' : [\)dp]')

# --- Pattern fragments (plain regex source strings, concatenated below) ---

# Eyes.
NormalEyes = r'[:=]'
Wink = r'[;]'

# Optional nose: empty, 'o', 'O' or '-'.  This is a capturing group, so it
# contributes one group to every compiled pattern that includes it.
NoseArea = r'(|o|O|-)'  ## rather tight precision, \S might be reasonable...

# Mouths, one character class per emotion.
HappyMouths = r'[D\)\]]'
SadMouths = r'[\(\[]'
Tongue = r'[pP]'
OtherMouths = r'[doO/\\]'  # remove forward slash if http://'s aren't cleaned

# --- Compiled per-emotion patterns ---

# Raw string: '\^' is not a valid string escape, so a non-raw literal here
# triggers an "invalid escape sequence" warning on modern Python.  The
# pattern text itself is unchanged.
Happy_RE = mycompile(r'(\^_\^|' + NormalEyes + NoseArea + HappyMouths + ')')
Sad_RE = mycompile(NormalEyes + NoseArea + SadMouths)

Wink_RE = mycompile(Wink + NoseArea + HappyMouths)
Tongue_RE = mycompile(NormalEyes + NoseArea + Tongue)
Other_RE = mycompile('(' + NormalEyes + '|' + Wink + ')' + NoseArea + OtherMouths)

# --- Any emoticon: (eyes)(nose)(mouth) ---
Emoticon = (
    "(" + NormalEyes + "|" + Wink + ")" +
    NoseArea +
    "(" + Tongue + "|" + OtherMouths + "|" + SadMouths + "|" + HappyMouths + ")"
)
Emoticon_RE = mycompile(Emoticon)
2729119 tokenizer
brendano authored
37
e757197 cleanup emoticons regexes
brendano authored
38 #Emoticon_RE = "|".join([Happy_RE,Sad_RE,Wink_RE,Tongue_RE,Other_RE])
39 #Emoticon_RE = mycompile(Emoticon_RE)
2729119 tokenizer
brendano authored
40
def analyze_tweet(text):
    """Classify *text* by the happy/sad emoticons it contains.

    Both ``Happy_RE`` and ``Sad_RE`` are searched over the whole text.

    Returns one of:
      "BOTH_HS" -- both a happy and a sad emoticon were found
      "HAPPY"   -- only a happy emoticon was found
      "SAD"     -- only a sad emoticon was found
      "NA"      -- neither was found
    """
    found_happy = Happy_RE.search(text) is not None
    found_sad = Sad_RE.search(text) is not None

    # Label lookup keyed on (happy present, sad present).
    labels = {
        (True, True): "BOTH_HS",
        (True, False): "HAPPY",
        (False, True): "SAD",
        (False, False): "NA",
    }
    return labels[(found_happy, found_sad)]
48
49 # more complex & harder, so disabled for now
e757197 cleanup emoticons regexes
brendano authored
50 #w= Wink_RE.search(text)
51 #t= Tongue_RE.search(text)
52 #a= Other_RE.search(text)
2729119 tokenizer
brendano authored
53 #h,w,s,t,a = [bool(x) for x in [h,w,s,t,a]]
54 #if sum([h,w,s,t,a])>1: return "MULTIPLE"
55 #if sum([h,w,s,t,a])==1:
56 # if h: return "HAPPY"
57 # if s: return "SAD"
58 # if w: return "WINK"
59 # if a: return "OTHER"
60 # if t: return "TONGUE"
61 #return "NA"
62
e757197 cleanup emoticons regexes
brendano authored
if __name__ == '__main__':
    # Script mode: read lines from stdin and display emoticon matches in each.
    # sane_re is only needed here; import it once instead of on every line.
    import sane_re

    for line in sys.stdin:
        # Strip only the line terminator.  Slicing with [:-1] would cut off a
        # real character when the final line has no trailing newline.
        sane_re._S(line.rstrip('\n')).show_match(Emoticon_RE, numbers=False)
        #print(analyze_tweet(line.strip()), line.strip(), sep="\t")
695fe20 a few mixed emotion mouths
brendano authored
68
Something went wrong with that request. Please try again.