In [1]:
# `spark` is not defined anywhere in this notebook; presumably it is the
# SparkSession injected by the PySpark kernel/launcher — TODO confirm.
# Evaluating it here raises NameError right away if it is missing.
spark

In [2]:
import os
import sys
import re
from unicode_codes import EMOJI_UNICODE, EMOJI_UNICODE_SET
from utils import get_re

In [3]:
import pandas as pd

In [4]:
# Directory whose entries are loaded as JSON below (it is joined with a '*' glob).
DATA_DIR = 'small_data/01'

In [5]:
# Show the Python interpreter version this notebook ran under.
print(sys.version)

3.6.4 (default, Jan 17 2018, 15:26:12) 
[GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.39.2)]


#### Match strings (char, sql, and regex)

In [7]:
# Emoji under study, looked up by its ':pistol:' alias.
match = EMOJI_UNICODE[':pistol:']

# Pattern built by the project helper `get_re`; its exact shape is defined in
# utils. Later cells index each regex hit at positions 0 and 2, so it presumably
# captures `window` character(s) on either side of `match` — TODO confirm.
match_re = get_re(match, window=1)

# SQL LIKE pattern: the surrounding '%' wildcards let the character appear
# anywhere in the text.
match_sql = '%{}%'.format(match)

#### Load JSON files

In [8]:
# Read every entry under DATA_DIR as JSON into a single DataFrame.
input_glob = os.path.join(DATA_DIR, '*')
files = spark.read.json(input_glob)

# Keep only the records whose `delete` field is null (presumably this drops
# deletion notices and leaves actual tweets — verify against the data).
files_filtered = files.where(files['delete'].isNull())

#### Find tweets with match char

In [9]:
# Narrow to the records whose text matches the SQL LIKE pattern, i.e. whose
# text contains the match character.
text_has_match = files_filtered['text'].like(match_sql)
files_match = files_filtered.where(text_has_match)

In [10]:
# Project the text, lang and geo columns and switch to an RDD. It is cached
# because several later cells run separate actions over it.
tweets = files_match.select('text', 'lang', 'geo').rdd.cache()

#### Find tweets with any emoji char

In [11]:
# Text of every record kept by the delete filter (not restricted to the match
# character), as an RDD of Rows.
tweets_all = files_filtered.select(files_filtered['text']).rdd

In [12]:
# For each record, the set of emoji characters occurring in its text, emitted as
# (emoji, 1) pairs ready for reduceByKey. Notes:
#   * set intersection means an emoji is counted at most once per record, however
#     many times it repeats in the text;
#   * the text is examined character by character, so only emoji that consist of
#     a single character can be found;
#   * `row.text or ''` guards against records without a text field (fields absent
#     from a JSON record are read as null -> None); iterating None would raise
#     TypeError and fail the whole job;
#   * intersection() accepts any iterable, so the string is passed directly
#     instead of being copied into a list first.
allemoji = tweets_all.flatMap(lambda row: EMOJI_UNICODE_SET.intersection(row.text or '')) \
                     .map(lambda t: (t, 1))

#### Regex match on tweets containing the match char

In [29]:
# Run the regex over each matching tweet and flatten, giving one element per
# regex hit (a single tweet can contribute several). Cached because it feeds
# both the before/after filters and the hit count.
results = tweets.flatMap(
    lambda tweet: re.findall(match_re, tweet.text)
).cache()

In [14]:
# Each regex hit is indexed at position 0 for the neighbour before the match and
# at position 2 for the neighbour after it (assumes get_re yields three groups
# per hit — TODO confirm). Only neighbours that are emoji are kept, as
# (emoji, 1) pairs.
before = results.map(lambda hit: hit[0]) \
                .filter(lambda ch: ch in EMOJI_UNICODE_SET) \
                .map(lambda ch: (ch, 1))
after = results.map(lambda hit: hit[2]) \
               .filter(lambda ch: ch in EMOJI_UNICODE_SET) \
               .map(lambda ch: (ch, 1))

#### Counts

In [15]:
# Tally the `lang` value of the matching tweets. countByValue is an action: the
# {lang: count} mapping comes back to the driver as a local object.
langs_cnt = tweets.map(lambda tweet: tweet.lang).countByValue()

In [16]:
# Number of records that passed the delete filter (reported below as "tweets total").
cnt = files_filtered.count()

In [17]:
# Number of tweets whose text contains the match character.
cnt_tweets = tweets.count()

In [18]:
# Total number of regex hits; one tweet can yield several.
cnt_match = results.count()

In [19]:
# Report the three counts computed above.
summary_tmpl = 'There are: {} tweets total with {} tweets containing: {}'
matches_tmpl = 'There are: {} matches of the character'

print(summary_tmpl.format(cnt, cnt_tweets, match))
print(matches_tmpl.format(cnt_match))

There are: 465656 tweets total with 197 tweets containing: 🔫
There are: 222 matches of the character


In [20]:
def _sum_counts(left, right):
    """Combine two partial counts belonging to the same key."""
    return left + right


# Collapse the (emoji, 1) pairs into (emoji, total) pairs. These are
# transformations only; nothing runs until an action is called on them.
allemoji_cnt = allemoji.reduceByKey(_sum_counts)
before_cnt = before.reduceByKey(_sum_counts)
after_cnt = after.reduceByKey(_sum_counts)

#### Tables of adjacent emoji chars and all emoji chars

In [21]:
# Up to 15 (emoji, count) pairs with the highest counts; negating the count
# makes takeOrdered's ascending order return the largest first.
bd = before_cnt.takeOrdered(15, key=lambda pair: -pair[1])

In [22]:
# Up to 15 (emoji, count) pairs with the highest counts; negating the count
# makes takeOrdered's ascending order return the largest first.
ad = after_cnt.takeOrdered(15, key=lambda pair: -pair[1])

In [23]:
# Up to 15 (emoji, count) pairs with the highest counts; negating the count
# makes takeOrdered's ascending order return the largest first.
alld = allemoji_cnt.takeOrdered(15, key=lambda pair: -pair[1])

In [24]:
# Turn the collected (key, count) pairs into pandas tables for display.
df_before = pd.DataFrame(data=bd, columns=['Emoji', 'CountBefore'])
df_after = pd.DataFrame(data=ad, columns=['Emoji', 'CountAfter'])
df_allemoji = pd.DataFrame(data=alld, columns=['Emoji', 'Count'])

# The language tally is a mapping rather than a list of pairs, so expand it
# into (lang, count) rows first. Rows keep the mapping's own order (unsorted).
lang_rows = [(lang, n) for lang, n in langs_cnt.items()]
df_lang = pd.DataFrame(data=lang_rows, columns=['Lang', 'Count'])

In [25]:
# Emoji taken from position 0 of the regex hits (the neighbour before the
# match), highest count first.
df_before

Unnamed: 0,Emoji,CountBefore
0,😂,15
1,🔫,11
2,😭,10
3,💥,9
4,😩,6
5,🔪,5
6,😊,5
7,💣,4
8,😃,3
9,😫,3


In [26]:
# Emoji taken from position 2 of the regex hits (the neighbour after the
# match), highest count first.
df_after

Unnamed: 0,Emoji,CountAfter
0,🔫,41
1,🔪,7
2,💣,5
3,😩,5
4,😀,4
5,👈,2
6,💥,2
7,⚡,1
8,👮,1
9,🐼,1


In [27]:
# Emoji counts over all records kept by the delete filter, highest count first.
# Each emoji is counted at most once per record.
df_allemoji

Unnamed: 0,Emoji,Count
0,😂,4582
1,♥,2830
2,❤,2669
3,💕,2359
4,😍,2277
5,😭,1989
6,😊,1430
7,😘,1334
8,👌,1187
9,☺,1131


In [28]:
# `lang` values of the tweets containing the match character; rows are not
# sorted by count.
df_lang

Unnamed: 0,Lang,Count
0,en,107
1,und,17
2,ja,7
3,es,17
4,in,8
5,pt,6
6,ar,8
7,sv,1
8,fr,7
9,tl,9
