In [1]:
# Echo the `spark` object. It is not created anywhere in this notebook, so it
# is presumably the SparkSession pre-defined by the PySpark kernel -- confirm.
spark

In [2]:
import os
import sys
import re
from unicode_codes import EMOJI_UNICODE, EMOJI_UNICODE_SET

In [3]:
# Root directory of the input data (a relative path); the load cell joins
# sub-paths onto it.
DATA_DIR = 'small_data'

In [4]:
# Record the Python interpreter version this notebook was run with.
sys.version

'3.6.4 (default, Jan 17 2018, 15:26:12) \n[GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.39.2)]'

In [5]:
def get_re(match, window=2, literal=True):
    """Compile a regex capturing up to `window` tokens on each side of `match`.

    A context token is either a word (optionally prefixed with ', # or @) or
    a single non-word character, e.g. one emoji code point. Tokens may be
    separated by spaces. Every context token is optional, so a hit at the very
    start or end of the text still matches; the missing positions come back
    as empty strings from ``findall``.

    The compiled pattern has ``2 * window + 1`` groups: `window` tokens
    before, the match itself, then `window` tokens after. With ``window=1``
    each ``findall`` hit is therefore ``(before, match, after)``.

    Args:
        match: Text to search for.
        window: Number of context tokens to capture on each side (>= 1).
        literal: When True (default) `match` is escaped and searched for
            verbatim, so values containing regex metacharacters (such as the
            ``*`` of a keycap emoji) are safe. Pass False to supply `match`
            as a regular-expression fragment instead.

    Returns:
        The compiled regular-expression object.

    Raises:
        ValueError: If `window` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be >= 1, got {!r}".format(window))

    # Raw string: \w and \W are regex escapes, not Python string escapes.
    word_or_emoji_re = r"(['#@]?\w[\w'-]*|\W)?"
    target = re.escape(match) if literal else match

    # Template layout for window=2: "{} *?{} *?({}) *{} *{}".
    # Gaps before the match are lazy (" *?"), gaps after it are greedy (" *").
    r = "{} " + (window - 1) * "*?{} " \
        + "*?({}) " + (window - 1) * "*{} " + "*{}"
    a = [word_or_emoji_re] * window + [target] + [word_or_emoji_re] * window

    return re.compile(r.format(*a))

In [6]:
# The emoji under study, looked up by its ':pistol:' alias.
match = EMOJI_UNICODE[':pistol:']
# SQL LIKE pattern ("%<emoji>%") used to pre-filter rows whose text contains it.
match_sql = u"%{}%".format(match)

# Regex capturing one context token on each side of the emoji.
match_re_1 = get_re(match, window=1)

In [7]:
# Read every entry under <DATA_DIR>/01 as JSON via the Spark reader.
files = spark.read.json(os.path.join(DATA_DIR, "01", "*"))

In [8]:
# Count the records whose `delete` field is null; the summary cell reports
# this as the total number of tweets.
# NOTE(review): presumably records with a non-null `delete` are deletion
# notices rather than tweets -- confirm against the data schema.
cnt = files.filter(files.delete.isNull()).count()

In [9]:
# Keep the records with a null `delete` field whose text matches the LIKE
# pattern `match_sql`; cached because several later cells derive RDDs from it.
files_filtered = (
    files
    .filter(files.delete.isNull())
    .filter(files.text.like(match_sql))
    .cache()
)

In [10]:
# RDD holding the `text` field of every filtered record.
tweets = files_filtered.rdd.map(lambda row: row.text)

In [11]:
# RDD holding the `lang` field of every filtered record.
langs = files_filtered.rdd.map(lambda row: row.lang)

In [12]:
# RDD of (geo, user time zone) pairs, one per filtered record.
locs = files_filtered.rdd.map(
    lambda row: (row.geo, row.user.time_zone)
)

In [13]:
# Peek at the first 10 (geo, time_zone) pairs.
locs.take(10)

[(None, 'Brussels'),
 (None, 'Eastern Time (US & Canada)'),
 (None, 'Brasilia'),
 (None, None),
 (None, None),
 (None, None),
 (None, 'Athens'),
 (None, None),
 (None, 'Central Time (US & Canada)'),
 (None, None)]

In [14]:
# Tally the filtered tweets per language code; the result is a dict-like
# mapping of language -> count.
langs_cnt = langs.countByValue()

In [15]:
# Display the per-language counts.
langs_cnt

defaultdict(int,
            {'ar': 8,
             'en': 107,
             'es': 17,
             'fr': 7,
             'in': 8,
             'it': 1,
             'ja': 7,
             'pl': 1,
             'pt': 6,
             'ro': 1,
             'ru': 3,
             'sv': 1,
             'th': 3,
             'tl': 9,
             'tr': 1,
             'und': 17})

In [16]:
# Number of tweets that passed the LIKE filter, i.e. contain the emoji.
cnt_tweets = tweets.count()

In [17]:
# One element per regex hit. With window=1 each hit is the tuple
# (token before, emoji, token after); cached because it feeds three
# downstream computations.
results = (
    tweets
    .flatMap(lambda tweet: match_re_1.findall(tweet))
    .cache()
)

# Neighbouring tokens that are themselves emoji, paired with 1 for counting.
before = (
    results
    .filter(lambda hit: hit[0] in EMOJI_UNICODE_SET)
    .map(lambda hit: (hit[0], 1))
)
after = (
    results
    .filter(lambda hit: hit[2] in EMOJI_UNICODE_SET)
    .map(lambda hit: (hit[2], 1))
)

In [18]:
# Total number of regex hits across all tweets; a single tweet can contribute
# more than one hit.
cnt_match = results.count()

In [19]:
# Summary: overall tweet count, tweets containing the emoji, and total hits.
print(
    'There are: {} tweets total with {} tweets containing: {}'
    .format(cnt, cnt_tweets, match)
)
print(
    'There are: {} matches of the character'
    .format(cnt_match)
)

There are: 465656 tweets total with 197 tweets containing: 🔫
There are: 222 matches of the character


In [20]:
# Add up the 1s per token to get how often each emoji appears before / after
# the matched emoji.
before_cnt = before.reduceByKey(lambda total, one: total + one)
after_cnt = after.reduceByKey(lambda total, one: total + one)

In [21]:
# Top 15 emoji preceding the match; the negated count sorts in descending order.
before_cnt.takeOrdered(15, key=lambda pair: -pair[1])

[('😂', 15),
 ('🔫', 11),
 ('😭', 10),
 ('💥', 9),
 ('😩', 6),
 ('🔪', 5),
 ('😊', 5),
 ('💣', 4),
 ('😃', 3),
 ('😫', 3),
 ('💂', 2),
 ('👀', 2),
 ('😓', 2),
 ('💵', 2),
 ('🔥', 2)]

In [22]:
# Top 15 emoji following the match; the negated count sorts in descending order.
after_cnt.takeOrdered(15, key=lambda pair: -pair[1])

[('🔫', 41),
 ('🔪', 7),
 ('💣', 5),
 ('😩', 5),
 ('😀', 4),
 ('👈', 2),
 ('💥', 2),
 ('⚡', 1),
 ('👮', 1),
 ('🐼', 1),
 ('🚨', 1),
 ('😭', 1),
 ('💨', 1),
 ('🚫', 1),
 ('🏃', 1)]