In [1]:
from __future__ import print_function
import os
from operator import itemgetter

from utils.twitter import download_data
from utils.nlp_utils import is_positive, is_negative, profanity_check, contains_nes, contains_blacklisted_pos_tags

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ishalyminov/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Configuration: where the dataset lives and what the raw corpus file is called.
DATA_FOLDER = 'data'
TWITTER_DATASET_FILENAME = 'twitter_en_big.txt'
# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(DATA_FOLDER, exist_ok=True)

In [3]:
# Full path of the raw Twitter corpus inside the data folder.
FULL_FILENAME = os.path.join(DATA_FOLDER, TWITTER_DATASET_FILENAME)

# Only call download_data when nothing exists at that path yet, so re-running
# the notebook does not trigger it again.
# NOTE(review): presumably download_data writes the corpus to the given path;
# an existing but incomplete file would not be re-fetched -- confirm.
if not os.path.exists(FULL_FILENAME):
    download_data(FULL_FILENAME)

In [4]:
# Load the corpus, normalise every line (lower-case, strip surrounding
# whitespace) and drop duplicates.
# dict.fromkeys keeps the first occurrence of each utterance in file order, so
# `lines` -- and everything derived from it (previews, output files) -- comes
# out identical on every run. A plain set() iterates strings in an order that
# changes with per-process hash randomisation.
# Iterating the file object directly also avoids the extra list that
# readlines() would build for a multi-million-line corpus.
with open(FULL_FILENAME, 'r', encoding='utf-8') as twitter_in:
    lines = list(dict.fromkeys(raw_line.lower().strip() for raw_line in twitter_in))

In [8]:
# Report how many distinct normalised utterances were loaded.
print('# unique utterances: {}'.format(len(lines)))

# unique utterances: 4691739


In [9]:
def utterance_is_ok(in_utterance):
    """Decide whether an utterance passes all three content filters.

    The checks run in a fixed order and stop at the first failure:

    1. rejected if ``contains_blacklisted_pos_tags`` is truthy;
    2. rejected if ``profanity_check`` is falsy
       (NOTE(review): assumes a truthy result means "clean" -- confirm
       against utils.nlp_utils);
    3. rejected if ``contains_nes`` is truthy.

    :param in_utterance: the utterance text to check.
    :return: a truthy value when every check passes, a falsy one otherwise.
    """
    if contains_blacklisted_pos_tags(in_utterance):
        return False

    profanity_verdict = profanity_check(in_utterance)
    if not profanity_verdict:
        # Hand back the helper's own falsy value, exactly as the chained
        # `and` expression would.
        return profanity_verdict

    return not contains_nes(in_utterance)

In [10]:
# Keep only the utterances that is_positive accepts at these thresholds.
positive_utterances = list(filter(
    lambda text: is_positive(text, positive_threshold=0.9, negative_threshold=0.1),
    lines,
))

In [11]:
# Drop every candidate that utterance_is_ok rejects.
positive_utterances = [candidate for candidate in positive_utterances
                       if utterance_is_ok(candidate)]

In [12]:
# Show how many positive utterances survived, then preview the first 100,
# one per line.
print('# positive utterances after filtering: {}'.format(len(positive_utterances)))
print(*positive_utterances[:100], sep='\n')

# positive utterances after filtering: 2468
encouraging
winning. :)
lol alright alright
congrats! welcome
amazing textures! good luck!
interesting, thanks.
honestly truth
i support u
super helpful! thanks!
- congrats!
yes! yes! wow
yay my favorite holiday
wow!!! absolutely amazing!
thanks charity
yes!!! well deserved!!! congrats!
ha hah ha ha ha
yes!! please share!
yes, thanks
thanks, friend.
thanks friends
awesome :-d
definitely cute!
thanks joy!
cheers! love
wow good deal! ok
love, love!!
awesome!! thanks :)
honest, raw, truthful, proud, survivor,
awesome kindness!
- congrats. great school great love
haha yes ! 😋
amazing. congrats.
thanks rich!
yeah, she's special alright. thanks!
positivity 💜
beautiful!! i love architecture!!
wow! wonderful!
yes. good luck!
cheers! support . . . .
wow she's talented. thanks .
you're welcome lovely! good luck! :*
yay! vindicated!
ha!! is pretty good.
excellent - thanks!
also love love love
thanks gorgeous
congratulations, !
yes, thankfully. :-)
wow g

In [13]:
# Persist the filtered positive utterances, one per line.
# The encoding is pinned to UTF-8 (matching how the corpus is read) because the
# utterances contain emoji; with the platform-default encoding this write can
# raise UnicodeEncodeError on non-UTF-8 locales.
with open(os.path.join(DATA_FOLDER, 'twitter_ood_positive.txt'), 'w', encoding='utf-8') as positive_out:
    for utterance in positive_utterances:
        print(utterance, file=positive_out)

In [14]:
# Keep only the utterances that is_negative accepts at these thresholds.
negative_utterances = list(filter(
    lambda text: is_negative(text, positive_threshold=0.3, negative_threshold=0.8),
    lines,
))

In [15]:
# Drop every candidate that utterance_is_ok rejects.
negative_utterances = [candidate for candidate in negative_utterances
                       if utterance_is_ok(candidate)]

In [16]:
# Show how many negative utterances survived, then preview the first 100,
# one per line.
print('# negative utterances after filtering: {}'.format(len(negative_utterances)))
print(*negative_utterances[:100], sep='\n')

# negative utterances after filtering: 1449
disgusting and murderous!
stop embarrassing yourself
what's worst?
ridiculous! a non-issue, a distraction
what's wrong? :(
blocking this fool.
stop i might cry
obsessed!!!
fail. pathetic.
ugh what a nerd
my fault :/
no scandal here
that's horrendous!! those poor terrified horses!!
ugh that's absolutely awful :/
stop i hate it
nothing like optimism
low key sad
strangely, no.
obsessed with shameless
tired and irritated
:( eeeh sorry!!
no tears. none.
pay up fool
sorry, i misunderstood.
anxiety is exhausting.
a sad, sad, situation! 😢
hopelessness.
seriously ?
ugh! what a witch
sarcastic or no
wrong !!!!!!!!!!!!!!!!!!!!!!!!
foolishness
outrageous!!!
so terribly sad :(
so sad. terrorists reign!
i predict fatality.
unfortunately probably puking 😞
no pictures? :(
... hate? seriously, hate? what?
doubtful, very doubtful.
disgusting puppets!!!!
haters are looming
r u complaining
wicked wicked wicked wicked
horrible and horrifying
serious crush!! 😘 ser

In [17]:
# Persist the filtered negative utterances, one per line.
# The encoding is pinned to UTF-8 (matching how the corpus is read) because the
# utterances contain emoji; with the platform-default encoding this write can
# raise UnicodeEncodeError on non-UTF-8 locales.
with open(os.path.join(DATA_FOLDER, 'twitter_ood_negative.txt'), 'w', encoding='utf-8') as negative_out:
    for utterance in negative_utterances:
        print(utterance, file=negative_out)