In [1]:
from __future__ import print_function
import os
from operator import itemgetter

from utils.twitter import download_data
from utils.nlp_utils import is_positive, is_negative, profanity_check, contains_nes, contains_blacklisted_pos_tags



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/t-igshal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Configuration: folder that holds every dataset artefact of this notebook and
# the file name of the raw Twitter corpus inside that folder.
DATA_FOLDER = 'data'
TWITTER_DATASET_FILENAME = 'twitter_en_big.txt'

# exist_ok=True makes this cell idempotent without a separate
# "check, then create" step (which can race with another process).
# If the path exists but is not a directory, this raises FileExistsError
# here instead of failing later when the dataset file is opened.
os.makedirs(DATA_FOLDER, exist_ok=True)

In [3]:
# Path of the raw corpus inside the data folder.
FULL_FILENAME = os.path.join(DATA_FOLDER, TWITTER_DATASET_FILENAME)

# Only call download_data when nothing exists at that path yet, so re-running
# the cell does not repeat the call.
# NOTE(review): presumably download_data writes the corpus to the given path;
# confirm in utils.twitter.
dataset_already_present = os.path.exists(FULL_FILENAME)
if not dataset_already_present:
    download_data(FULL_FILENAME)

In [4]:
# Load the corpus, normalise every line (lower-case, surrounding whitespace
# stripped) and drop duplicates.
#
# dict.fromkeys de-duplicates while keeping first-seen order, so `lines` has
# the same order on every kernel restart. A plain set() iterates strings in an
# order that depends on per-process hash randomisation, which made the
# previews and the written files differ from run to run.
# Iterating the file handle directly avoids building the intermediate list
# that readlines() would create.
with open(FULL_FILENAME, 'r', encoding='utf-8') as twitter_in:
    lines = list(dict.fromkeys(raw_line.lower().strip() for raw_line in twitter_in))

In [5]:
# Report the corpus size after normalisation and de-duplication.
unique_utterance_count = len(lines)
print('# unique utterances: {}'.format(unique_utterance_count))

# unique utterances: 4691739


In [6]:
def utterance_is_ok(in_utterance):
    """Decide whether an utterance survives the content filters.

    The three helpers imported from ``utils.nlp_utils`` are consulted in
    order, and evaluation stops at the first one that rejects the utterance:

    1. ``contains_blacklisted_pos_tags`` must be falsy,
    2. ``profanity_check`` must be truthy (presumably "truthy" means the
       utterance is clean -- confirm against the helper),
    3. ``contains_nes`` must be falsy (presumably a named-entity check --
       confirm against the helper).

    :param in_utterance: the utterance to inspect; it is passed unchanged to
        each helper.
    :return: ``True`` when all three checks pass; otherwise ``False`` or the
        falsy value produced by ``profanity_check``.
    """
    if contains_blacklisted_pos_tags(in_utterance):
        return False

    profanity_verdict = profanity_check(in_utterance)
    if not profanity_verdict:
        # Hand back the helper's own falsy value, exactly as a chained
        # ``and`` expression would.
        return profanity_verdict

    return not contains_nes(in_utterance)

In [7]:
# Keep only the utterances that is_positive accepts for the given thresholds.
# NOTE(review): the thresholds are presumably bounds on sentiment scores;
# confirm their exact meaning in utils.nlp_utils.
positive_utterances = [
    candidate
    for candidate in lines
    if is_positive(candidate, positive_threshold=0.9, negative_threshold=0.1)
]

In [8]:
# Rebind positive_utterances to the subset that passes utterance_is_ok.
positive_utterances = [candidate for candidate in positive_utterances
                       if utterance_is_ok(candidate)]

In [15]:
# Report how many positive utterances remain, then preview at most the first
# 100 of them, one per line.
print('# positive utterances after filtering: {}'.format(len(positive_utterances)))
print(*positive_utterances[:100], sep='\n')

# positive utterances after filtering: 3201
lol amazing
clearly lmfao
smart? yeah.
i appreciate!!
beautifully legal 👌
lol. thanks.
yes it's pretty good!!
free entertainment
still super happy n proud!!
yay, good job bff
cool, thanks!
proudly sharing
good save lol
ha! good luck! 😙
neat! thanks for sharing!
soooo awesome!! congratulations!! 😃
yes, awesome.
better? :-)
kindly please play
congratulations! 🌟
well thanks! :d
yes, they fit perfectly!!
lol, well played!
gorgeous stunning beauty
outstanding !
beautiful dear friend, hugs &amp; blessings
stunning. truly.
freelancing?
well? lol 📲
- congrats!
that's a great compliment. thanks!
perfect solution.
haha thanks?
hilarious, !
kindly play
true. how lovely dear friend
wow whimsical
wow!! love it!!
haha! cute.
yes agreed.
congrats friend!
thanks joy!
lol - i win.
thanks love 💜
lol, agreed!
great honor.
sweet dreams. beautiful photo
nice! thanks!
haha your amazing!! thanks haha
lol yeah ok
super cute!!! have fun!!!
i love love love him
peace,

In [10]:
# Persist the filtered positive utterances, one per line.
# The utterances contain emoji and other non-ASCII characters, so the encoding
# is set explicitly: relying on the platform default raises UnicodeEncodeError
# on non-UTF-8 locales. UTF-8 also matches the encoding used to read the corpus.
with open(os.path.join(DATA_FOLDER, 'twitter_ood_positive.txt'), 'w', encoding='utf-8') as positive_out:
    for utterance in positive_utterances:
        print(utterance, file=positive_out)

In [11]:
# Keep only the utterances that is_negative accepts for the given thresholds.
# NOTE(review): the thresholds are presumably bounds on sentiment scores;
# confirm their exact meaning in utils.nlp_utils.
negative_utterances = [
    candidate
    for candidate in lines
    if is_negative(candidate, positive_threshold=0.3, negative_threshold=0.8)
]

In [12]:
# Rebind negative_utterances to the subset that passes utterance_is_ok.
negative_utterances = [candidate for candidate in negative_utterances
                       if utterance_is_ok(candidate)]

In [16]:
# Report how many negative utterances remain, then preview at most the first
# 100 of them, one per line.
print('# negative utterances after filtering: {}'.format(len(negative_utterances)))
print(*negative_utterances[:100], sep='\n')

# negative utterances after filtering: 1896
staaaappp hating smh
no unfortunately not
stop stealing
sad! no leadership!
rejection??
really frustrating :(
totally horrifying!
hopelessly naive.
thats harsh :/
frustrating sir frustrating
worst people
... hate? seriously, hate? what?
seriously 🙈
? seriously?
no arguing that
panic attacks are never good 💜
ugh, the worst.
-god! how vile! disgusting!
smh that's disappointing
dire straits-why worry
worst batman
sorry sorry
seriously doubt that
ironic, no?
a boring, boring, boring choice.
no you're wrong
wrong. bigly. sad!
be no problem
no, you're crying
seriously terrifying
omg no worries
irony overload
or terrible irony?
freaking insane
unhealthy *
hated gays...
im choking stop
smh smh smh smh
i screamed. screamed.
. ignorant fool.
ugh, i hate that!!!
pressure!! 😳
creepy freaky weirdo!
smh poor wayne
i am crying, seriously
is an evil scumbag.
unfortunately 😐
they violated smh
no freaking fg!!!!
unfortunately no
stolen valor smh
no doubt. he's

In [14]:
# Persist the filtered negative utterances, one per line.
# The utterances contain emoji and other non-ASCII characters, so the encoding
# is set explicitly: relying on the platform default raises UnicodeEncodeError
# on non-UTF-8 locales. UTF-8 also matches the encoding used to read the corpus.
with open(os.path.join(DATA_FOLDER, 'twitter_ood_negative.txt'), 'w', encoding='utf-8') as negative_out:
    for utterance in negative_utterances:
        print(utterance, file=negative_out)