In [1]:
from __future__ import print_function
import os
from operator import itemgetter
import json

from utils.twitter import download_data
from utils.nlp_utils import is_positive, is_negative, profanity_check, contains_nes, contains_blacklisted_pos_tags

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ishalyminov/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
DATA_FOLDER = 'data'
TWITTER_DATASET_FILENAME = 'twitter_en_big.txt'
REDDIT_DATASET_FILENAME = 'reddit.txt'
if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)

In [4]:
FULL_FILENAME = os.path.join(DATA_FOLDER, TWITTER_DATASET_FILENAME)

if not os.path.exists(FULL_FILENAME):
    download_data(FULL_FILENAME)

In [5]:
def utterance_is_ok(in_utterance):
    return (not contains_blacklisted_pos_tags(in_utterance)) \
           and profanity_check(in_utterance) \
           and (not contains_nes(in_utterance))

In [6]:
with open(os.path.join(DATA_FOLDER, TWITTER_DATASET_FILENAME), 'r', encoding='utf-8') as twitter_in:
    lines = list(set(map(lambda x: x.lower().strip(), twitter_in.readlines())))

In [7]:
print('# unique utterances: {}'.format(len(lines)))

# unique utterances: 4691739


In [8]:
utterances_short = [utt for utt in lines if len(utt.split()) < 6]

In [9]:
utterances_sorry = [utt for utt in utterances_short if 'sorry' in utt]

In [10]:
twitter_sorry_filtered = [utt for utt in utterances_sorry if utterance_is_ok(utt)]

In [11]:
twitter_sorry_filtered[:100]

["sorry here's your money back",
 "i'm sorry about that.",
 'ah, sorry for my misunderstanding.',
 "i'm sorry man",
 'totally agree so sorry',
 'sorry! us too....',
 'that completely sucks, i’m sorry',
 "i'm sorry, what?",
 "that's horrible i'm sorry",
 'im sorry who?',
 'so very sorry...',
 'u better b sorry 🙄',
 "oh boy i'm sorry",
 'sorry, meant host in handshake.',
 "i'm sorry and i know!!",
 'yes, sorry. i deleted :-)',
 "and i'm very sorry",
 'sorry it was so fun',
 'too busy. sorry',
 'sorry fpr your loss //:',
 "i'm sorry, that's so awful.",
 'gold. sorry!',
 'i’m sorry who are you',
 "i'm sorry she involved you",
 'sorry i meant this:',
 'lol true sorry for this',
 'i am sorry. hugs!!',
 "i'm so sorry. ily.",
 '🤔 true! sorry 😅',
 "oh i'm sorry and*",
 "i'm sorry fis...?",
 'sorry my life sucks',
 "sorry still smokin' though",
 'weak, sorry dude',
 'irrelevant fk..sorry for language guys...',
 'sorry released',
 'whoops sorry',
 "😲 so sorry! that's awful!",
 "sorry, dude. there

In [12]:
with open(os.path.join(DATA_FOLDER, 'twitter_ood_breakdown.txt'), 'w') as twitter_out:
    for utterance in twitter_sorry_filtered:
        print(utterance, file=twitter_out)

In [13]:
reddit_lines_filtered_short = set([])
with open(os.path.join(DATA_FOLDER, TWITTER_DATASET_FILENAME), 'r', encoding='utf-8') as twitter_in:
    for line_json in twitter_in:
        line = json.loads(line_json)['body'].lower().strip()
        if not line or line in reddit_lines_filtered_short:
            continue
        if len(line.split()) < 6:
            reddit_lines_filtered_short.add(line)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [61]:
reddit_mistake = [line for line in reddit_lines_filtered_short if 'mistake' in line]

In [62]:
reddit_mistake_filtered = [utt for utt in reddit_mistake if utterance_is_ok(utt)]

In [58]:
reddit_mistake_filtered

['sorry about that mistake.',
 'not making that mistake again',
 'fair enough. my mistake.',
 'yeah, fixed. my mistake',
 'my mistake, thanks for clarifying.',
 'https://www.erieinsurance.com/auto-insurance/driving-safety/driving-mistakes',
 'ah right, my mistake',
 '*gasp* miggy made a mistake?!',
 "don't learn from mistakes?",
 'understood.  mistakes were made.',
 "because people don't make mistakes?",
 "i don't ever mkae mistakes.",
 "i'm sorry.\n\n\nmistakes were made.",
 'a costly mistake.',
 'same same. mistakes.',
 'lol honest mistake',
 'except it was a mistake',
 'mwd on. big mistake.',
 'mistake is more like it.',
 'easily [mistaken](http://media-cache-ak0.pinimg.com/736x/8e/a1/0f/8ea10ffe22d35d9d771b0c9de80bb9b2.jpg).',
 'eh we all make mistakes',
 'i stand with my mistake',
 "this mistake isn't really reasonable.",
 'my mistake. thanks!',
 'that is a huge mistake',
 'conflicts with mistake :\\',
 "my mistake. won't happen again.",
 'rookie mistake by the keeper.',
 'well. m

In [63]:
with open(os.path.join(DATA_FOLDER, 'reddit_ood_breakdown.txt'), 'w') as reddit_out:
    for utterance in reddit_mistake_filtered:
        print(utterance, file=reddit_out)