In [1]:
from __future__ import print_function
import os
from operator import itemgetter
import json

from utils.twitter import download_data
from utils.nlp_utils import is_positive, is_negative, profanity_check, contains_nes, contains_blacklisted_pos_tags

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ishalyminov/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Locations of the corpora used below; all paths are resolved relative to DATA_FOLDER.
DATA_FOLDER = 'data'
TWITTER_DATASET_FILENAME = 'twitter_en_big.txt'
REDDIT_DATASET_FILENAME = 'reddit.txt'

# exist_ok=True keeps the cell idempotent without a separate existence check
# (no check-then-create race) and fails loudly if DATA_FOLDER exists but is
# not a directory, instead of deferring the error to the first file access.
os.makedirs(DATA_FOLDER, exist_ok=True)

In [3]:
# Full path of the Twitter corpus inside DATA_FOLDER.
FULL_FILENAME = os.path.join(DATA_FOLDER, TWITTER_DATASET_FILENAME)

# Only call download_data when nothing exists at that path yet, so re-running
# the cell does not repeat the call.
# NOTE(review): download_data comes from utils.twitter; presumably it writes
# the dataset to the given path -- confirm there.
if not os.path.exists(FULL_FILENAME):
    download_data(FULL_FILENAME)

In [4]:
def utterance_is_ok(in_utterance):
    """Decide whether an utterance passes all three content filters.

    The checks run in a fixed order and stop at the first failure:

    1. ``contains_blacklisted_pos_tags`` must be falsy,
    2. ``profanity_check`` must be truthy,
    3. ``contains_nes`` must be falsy.

    NOTE(review): the three helpers are imported from ``utils.nlp_utils``;
    their exact semantics (e.g. ``contains_nes`` presumably detecting named
    entities) should be confirmed there.

    :param in_utterance: the utterance text to check.
    :return: a truthy value when the utterance passes every filter, a falsy
        value otherwise.
    """
    if contains_blacklisted_pos_tags(in_utterance):
        return False
    profanity_verdict = profanity_check(in_utterance)
    if not profanity_verdict:
        # Hand back the falsy verdict itself, exactly as an `and` chain would.
        return profanity_verdict
    return not contains_nes(in_utterance)

In [5]:
# Load the Twitter corpus, normalising every line (lower-cased, surrounding
# whitespace stripped) and dropping duplicates.
#
# dict.fromkeys keeps the first occurrence of each utterance in file order
# (guaranteed on Python 3.7+), so `lines` and everything derived from it is
# the same on every run.  list(set(...)) ordering varies with the per-process
# string hash seed, which made the previews and the written files differ
# between otherwise identical runs.  On older interpreters this still
# de-duplicates correctly; only the ordering guarantee is lost.
#
# Iterating the handle directly also avoids building the full readlines()
# list next to the de-duplicated copy.
with open(FULL_FILENAME, 'r', encoding='utf-8') as twitter_in:
    lines = list(dict.fromkeys(raw_line.lower().strip() for raw_line in twitter_in))

In [6]:
# Report how many distinct normalised lines were loaded from the Twitter corpus.
print('# unique utterances: {}'.format(len(lines)))

# unique utterances: 4691739


In [7]:
# Keep only utterances with fewer than 6 whitespace-separated tokens.
utterances_short = list(filter(lambda text: len(text.split()) < 6, lines))

In [8]:
# Of the short utterances, keep those containing the substring 'sorry'.
utterances_sorry = list(filter(lambda text: 'sorry' in text, utterances_short))

In [9]:
# Drop every 'sorry' utterance that fails the utterance_is_ok content filters.
twitter_sorry_filtered = list(filter(utterance_is_ok, utterances_sorry))

In [10]:
# Preview the first 100 retained utterances; kept as a bare trailing
# expression so the notebook renders it as the cell output.
twitter_sorry_filtered[:100]

["you're sorry all right",
 'thanks, but sorry, empty-handed.',
 'oh ok sorry about that',
 'sorry for forgetting about you',
 "that's awful man, sorry.",
 'wrong women! sorry',
 "unreal, i'm sorry",
 'sorry for the disappointment',
 'sorry for unfollowing,',
 'lol, sorry. fixing.',
 'sorry i missed this! :-/',
 'i am so sorry brother!',
 'sorry meant pen is..',
 'sorry for this ignorant person.',
 "i'm so sorry.",
 'yes really and sorry...',
 'sorry for such short notice!!',
 'whoops sorry',
 'sorry, forgot the tag',
 'i know. sorry guys',
 'sorry my mistake, 4s then',
 "i'm sorry !! 😕",
 'no!!!! (old slide, sorry)',
 'sorry wrong link:',
 'sorry, thats what i meant.',
 'staying home sorry!',
 'im sorry okay',
 "i'm sorry that's no cat",
 'sorry we were just practicing',
 'ah sorry. room service nko?',
 'married up, sorry.',
 "sorry i'm just excited",
 'was it me? sorry',
 'so sorry. poor piggy.',
 'sorry i asked.',
 'ugh. sorry.',
 '... no info yet, sorry',
 'omg sorry to u both',
 '

In [11]:
# Persist the filtered Twitter utterances, one per line.
# The encoding is pinned to UTF-8 to mirror how the corpus is read: the
# utterances contain non-ASCII characters (e.g. emoji), which a non-UTF-8
# platform default codec would reject with UnicodeEncodeError.
with open(os.path.join(DATA_FOLDER, 'twitter_ood_breakdown.txt'), 'w', encoding='utf-8') as twitter_out:
    for utterance in twitter_sorry_filtered:
        print(utterance, file=twitter_out)

In [20]:
# Collect the distinct short Reddit comments: the 'body' field of each JSON
# line, lower-cased and stripped, kept when it is non-empty and has fewer
# than 6 whitespace-separated tokens.
reddit_lines_filtered_short = set()
with open(os.path.join(DATA_FOLDER, REDDIT_DATASET_FILENAME), 'r', encoding='utf-8') as reddit_in:
    for line_json in reddit_in:
        if not line_json.strip():
            # Blank lines are not valid JSON and would make json.loads raise.
            continue
        line = json.loads(line_json)['body'].lower().strip()
        # An empty body splits into zero tokens and would pass the length
        # test, so it is excluded explicitly.  No membership test is needed
        # before add(): a set ignores duplicates on its own.
        if line and len(line.split()) < 6:
            reddit_lines_filtered_short.add(line)

In [21]:
# Keep the short Reddit comments containing the substring 'mistake'.
reddit_mistake = list(filter(lambda comment: 'mistake' in comment, reddit_lines_filtered_short))

In [22]:
# Drop every 'mistake' comment that fails the utterance_is_ok content filters.
reddit_mistake_filtered = list(filter(utterance_is_ok, reddit_mistake))

In [23]:
# Show the retained Reddit comments; kept as a bare trailing expression so
# the notebook renders it as the cell output.
reddit_mistake_filtered

['ah, my mistake.',
 'lying or mistaken?',
 'upmodded for learning from mistakes.',
 'common mistake.',
 'mistake',
 'whoops, my mistake.',
 '.. and spelling mistakes.',
 'boy was he mistaken!!',
 '...my mistake. mispost.',
 'my mistake ... :d',
 'my mistake. apologies.',
 'it was no mistake.',
 'mistakes like this, yes.',
 "he's made his mistakes.",
 'my mistake.  thanks!']

In [24]:
# Persist the filtered Reddit comments, one per line.
# The encoding is pinned to UTF-8 to match how the Reddit dump is read, so
# non-ASCII text round-trips regardless of the platform's default codec
# (which could otherwise raise UnicodeEncodeError).
with open(os.path.join(DATA_FOLDER, 'reddit_ood_breakdown.txt'), 'w', encoding='utf-8') as reddit_out:
    for utterance in reddit_mistake_filtered:
        print(utterance, file=reddit_out)