In [42]:
from __future__ import print_function
import os
from operator import itemgetter
import json

from utils.twitter import download_data
from utils.nlp_utils import is_positive, is_negative, profanity_check, contains_nes, contains_blacklisted_pos_tags

In [40]:
# Folder used for the input dataset and for the output files this notebook writes.
DATA_FOLDER = 'data'
TWITTER_DATASET_FILENAME = 'twitter_en_big.txt'
# NOTE(review): this second assignment replaces the value above, so once this
# cell has run TWITTER_DATASET_FILENAME is 'reddit.txt' for every cell that
# reads it. Looks like a leftover from switching datasets by hand -- confirm
# which file the Twitter cells are meant to load.
TWITTER_DATASET_FILENAME = 'reddit.txt'
# Create the data folder if nothing exists at that path yet.
if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)

In [3]:
# Path of the dataset file inside the data folder.
FULL_FILENAME = os.path.join(DATA_FOLDER, TWITTER_DATASET_FILENAME)

# Only call the downloader when nothing exists at the target path yet.
# (download_data is a project helper; presumably it writes the dataset to the
# given path -- verify against utils.twitter.)
dataset_is_cached = os.path.exists(FULL_FILENAME)
if not dataset_is_cached:
    download_data(FULL_FILENAME)

In [22]:
def utterance_is_ok(in_utterance):
    """Decide whether an utterance passes all three content filters.

    The checks run in a fixed order and stop at the first one that fails:

    1. ``contains_blacklisted_pos_tags`` must be falsy,
    2. ``profanity_check`` must be truthy (presumably meaning the text is
       clean -- confirm against ``utils.nlp_utils``),
    3. ``contains_nes`` must be falsy.

    :param in_utterance: the utterance text handed to each filter helper.
    :return: a truthy value when every check passes, a falsy value otherwise.
    """
    if contains_blacklisted_pos_tags(in_utterance):
        return False

    profanity_result = profanity_check(in_utterance)
    if not profanity_result:
        # Hand back the helper's own falsy value, exactly as an `and` chain would.
        return profanity_result

    return not contains_nes(in_utterance)

In [4]:
# Read the dataset file, lower-case and trim every line, and keep one copy of
# each distinct result.
with open(os.path.join(DATA_FOLDER, TWITTER_DATASET_FILENAME), 'r', encoding='utf-8') as twitter_in:
    lines = list({raw_line.lower().strip() for raw_line in twitter_in})

In [5]:
# Report how many distinct utterances were loaded.
unique_utterance_count = len(lines)
print('# unique utterances: {}'.format(unique_utterance_count))

# unique utterances: 4691739


In [13]:
# Keep only utterances with fewer than six whitespace-separated tokens.
utterances_short = list(filter(lambda text: len(text.split()) < 6, lines))

In [44]:
# Substring match: any short utterance containing the characters 'sorry' is kept.
utterances_sorry = list(filter(lambda text: 'sorry' in text, utterances_short))

In [45]:
# Drop every candidate for which utterance_is_ok returns a falsy value.
twitter_sorry_filtered = list(filter(utterance_is_ok, utterances_sorry))

In [46]:
# Show the filtered 'sorry' utterances as this cell's output.
twitter_sorry_filtered

['sorry sorry',
 'lol sorry man',
 "unreal, i'm sorry",
 'i’m sorry who are you',
 'im so sorry for you',
 "a7: (sorry i'm late) napping",
 "i'm sorry, what....",
 "sorry folks. he's back.",
 'sorry honey athlete probs',
 "they're awful, i'm sorry.",
 "(miz voice) oh....you're sorry!?!?!?",
 'awww...so sorry',
 'sorry for this ignorant person.',
 '"sorry about that"',
 'modernmoneypublicpurpose.com! sorry for the lag.',
 'not me, sorry!',
 "i'm so sorry, man...",
 'sold sorry!',
 'sorry, test post',
 "monogaramen i'm sorry",
 "i'm sorry it's so hard.",
 'sigh. sorry 💜',
 'sorry you got harassed. xo',
 "lmao i'm sorry.",
 'sorry thats my other account',
 'sorry sorry, erm. asda?',
 'sorry about the mangled prepositions.',
 'sorry where are you? 🤔',
 "i'm sorry...",
 'not yet, sorry!',
 'sorry. deal with it.',
 'sorry tweeted wrong account ...',
 'sorry twitter name',
 "sorry, can't help you...",
 "i'm really sorry.",
 'sorry for being in',
 "i'm sorry. 😔",
 'aw, sorry that happened!',
 

In [47]:
# Write the filtered utterances to the output file, one per line.
# The encoding is given explicitly because the utterances were read as UTF-8
# and can contain non-ASCII characters (emoji, curly apostrophes); a
# platform-default codec may be unable to encode them and would raise
# UnicodeEncodeError part-way through the file.
with open(os.path.join(DATA_FOLDER, 'twitter_ood_breakdown.txt'), 'w', encoding='utf-8') as twitter_out:
    for utterance in twitter_sorry_filtered:
        print(utterance, file=twitter_out)

In [43]:
# The Reddit dump is opened under its own name rather than through
# TWITTER_DATASET_FILENAME, so this cell does not depend on that constant
# having been pointed at the Reddit file.
REDDIT_DATASET_FILENAME = 'reddit.txt'

# Distinct, lower-cased comment bodies with fewer than six whitespace-separated
# tokens.
reddit_lines_filtered_short = set()
with open(os.path.join(DATA_FOLDER, REDDIT_DATASET_FILENAME), 'r', encoding='utf-8') as reddit_in:
    for line_json in reddit_in:
        # Blank lines are skipped up front; json.loads would raise on them.
        if not line_json.strip():
            continue
        # Each remaining line is parsed as one JSON object with a 'body' field.
        line = json.loads(line_json)['body'].lower().strip()
        # set.add already ignores duplicates, so no membership test is needed.
        if line and len(line.split()) < 6:
            reddit_lines_filtered_short.add(line)

In [61]:
# Substring match: any short body containing the characters 'mistake' is kept
# (so 'mistakes' and 'mistaken' match as well).
reddit_mistake = list(filter(lambda body: 'mistake' in body, reddit_lines_filtered_short))

In [62]:
# Drop every candidate for which utterance_is_ok returns a falsy value.
reddit_mistake_filtered = list(filter(utterance_is_ok, reddit_mistake))

In [58]:
# Show the filtered 'mistake' utterances as this cell's output.
reddit_mistake_filtered

['sorry about that mistake.',
 'not making that mistake again',
 'fair enough. my mistake.',
 'yeah, fixed. my mistake',
 'my mistake, thanks for clarifying.',
 'https://www.erieinsurance.com/auto-insurance/driving-safety/driving-mistakes',
 'ah right, my mistake',
 '*gasp* miggy made a mistake?!',
 "don't learn from mistakes?",
 'understood.  mistakes were made.',
 "because people don't make mistakes?",
 "i don't ever mkae mistakes.",
 "i'm sorry.\n\n\nmistakes were made.",
 'a costly mistake.',
 'same same. mistakes.',
 'lol honest mistake',
 'except it was a mistake',
 'mwd on. big mistake.',
 'mistake is more like it.',
 'easily [mistaken](http://media-cache-ak0.pinimg.com/736x/8e/a1/0f/8ea10ffe22d35d9d771b0c9de80bb9b2.jpg).',
 'eh we all make mistakes',
 'i stand with my mistake',
 "this mistake isn't really reasonable.",
 'my mistake. thanks!',
 'that is a huge mistake',
 'conflicts with mistake :\\',
 "my mistake. won't happen again.",
 'rookie mistake by the keeper.',
 'well. m

In [63]:
# Write the filtered utterances to the output file, one per line.
# The encoding is given explicitly because the bodies were read as UTF-8 and
# may contain characters a platform-default codec cannot encode.
with open(os.path.join(DATA_FOLDER, 'reddit_ood_breakdown.txt'), 'w', encoding='utf-8') as reddit_out:
    for utterance in reddit_mistake_filtered:
        # Bodies can contain embedded newlines; collapsing every whitespace run
        # to a single space keeps each utterance on exactly one output line.
        print(' '.join(utterance.split()), file=reddit_out)