In [1]:
import json
import random
import re
import nltk

from nltk.tokenize import TweetTokenizer

In [2]:
# Subreddits whose comments are kept; is_valid rejects everything else.
VALID_SUBREDDITS = {
    'worldnews',
}

# Substrings that mark a comment body as unusable. is_valid lower-cases both
# sides before the containment check, so matching is case-insensitive.
INVALID_BODY_CONTENT = [
    '[removed]',
    '[deleted]',
    'i am a bot',
]


def is_valid(comment):
    """Decide whether a comment should be kept for the dataset.

    A comment is rejected as soon as its ``'body'`` contains any entry of
    ``INVALID_BODY_CONTENT`` (case-insensitive substring match). Otherwise it
    is kept only when its ``'subreddit'`` is a member of ``VALID_SUBREDDITS``.

    Args:
        comment: Mapping providing the ``'body'`` and ``'subreddit'`` keys.

    Returns:
        bool: True when the comment passes both checks, False otherwise.
    """
    has_invalid_body = any(
        marker.lower() in comment['body'].lower()
        for marker in INVALID_BODY_CONTENT
    )
    if has_invalid_body:
        return False

    return comment['subreddit'] in VALID_SUBREDDITS

def is_controversial(comment):
    """Report whether a comment's ``'controversiality'`` value is positive.

    Args:
        comment: Mapping providing the ``'controversiality'`` key.

    Returns:
        bool: True when ``comment['controversiality']`` is greater than zero.
    """
    score = comment['controversiality']
    return score > 0

# Module-level TweetTokenizer instance.
# NOTE(review): nothing in the visible cells uses this object; preprocess_comment
# calls nltk.tokenize.word_tokenize instead. Confirm which tokenizer is intended.
tokenizer = TweetTokenizer()

# Ordered (pattern, replacement) pairs applied by preprocess_comment.
# Order matters: markdown is unwrapped first, then URLs are dropped, then the
# placeholder tokens are inserted, and finally remaining slashes are spaced out.
# Replacements that use backreferences MUST be raw strings: a plain "\1" is the
# control character chr(1), not "group 1".
_COMMENT_FILTERS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Markdown filters
        (r"\[([^\[]+)\]\(([^\)]+)\)", r"\1"),  # [text](url) -> text
        (r"(\*\*|__)(.*?)\1", r"\2"),          # **bold** / __bold__ -> bold
        (r"(\*|_)(.*?)\1", r"\2"),             # *italic* / _italic_ -> italic

        # Get rid of URLs
        (r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", ""),

        # Reddit-specific
        (r"\/?u\/[A-Za-z0-9_-]+", " __user_token__ "),
        (r"[-+]?[.\d]*[\d]+[:,.\d]*", " __number_token__ "),

        # Split all by slash
        (r"\/", " / "),
    )
)


def preprocess_comment(comment):
    """Normalise a comment body into a single space-separated token string.

    The body is run through ``_COMMENT_FILTERS`` in order (markdown links and
    emphasis are unwrapped to their inner text, URLs are removed, user
    mentions and numbers become placeholder tokens, slashes are padded with
    spaces), lower-cased, tokenised with ``nltk.tokenize.word_tokenize`` and
    re-joined with single spaces.

    Args:
        comment: Mapping providing the ``'body'`` key with the raw text.

    Returns:
        str: The tokens joined by single spaces; empty when tokenisation
        yields no tokens.
    """
    text = comment['body']
    for pattern, replacement in _COMMENT_FILTERS:
        text = pattern.sub(replacement, text)

    text = text.lower()

    tokens = nltk.tokenize.word_tokenize(text)
    return ' '.join(tokens)

In [3]:
def train_test_split(data_path, train_path, test_path):
    """Split a line-delimited JSON comment dump into four text files.

    Every non-blank line of ``data_path`` is parsed as one JSON comment.
    Comments rejected by ``is_valid`` or whose preprocessed text is empty are
    dropped. The rest are written, one ``print`` call per comment, to:

    * ``{train_path}.c``  / ``{train_path}.nc`` for odd line indices,
    * ``{test_path}.c``   / ``{test_path}.nc``  for even line indices,

    where ``.c`` holds comments for which ``is_controversial`` is true and
    ``.nc`` holds the others.

    All files are opened as UTF-8 so the result does not depend on the
    platform's default encoding. The input is opened first, so a missing
    ``data_path`` raises before any existing output file is truncated.

    Args:
        data_path: Path of the input file with one JSON object per line.
        train_path: Path prefix for the two training output files.
        test_path: Path prefix for the two test output files.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(data_path, 'r', encoding='utf-8') as infile, \
         open(f"{train_path}.c", 'w', encoding='utf-8') as train_c, \
         open(f"{train_path}.nc", 'w', encoding='utf-8') as train_nc, \
         open(f"{test_path}.c", 'w', encoding='utf-8') as test_c, \
         open(f"{test_path}.nc", 'w', encoding='utf-8') as test_nc:

        outfiles = {
            'train': {
                'c': train_c,
                'nc': train_nc,
            },
            'test': {
                'c': test_c,
                'nc': test_nc,
            }
        }

        for idx, line in enumerate(infile):
            # Tolerate blank lines instead of crashing inside json.loads.
            if not line.strip():
                continue

            comment = json.loads(line)

            if not is_valid(comment):
                continue

            # Maintain a roughly even train/test split by alternating on the
            # raw line index (skipped lines still consume an index).
            split = 'train' if idx % 2 else 'test'
            controversial = 'c' if is_controversial(comment) else 'nc'

            preprocessed = preprocess_comment(comment)
            if not preprocessed:
                continue

            outfile = outfiles[split][controversial]
            print(preprocessed, file=outfile)

In [4]:
# Build the dataset files from the RC_2018-03 dump. This writes
# ../datasets/train.c, ../datasets/train.nc, ../datasets/test.c and
# ../datasets/test.nc (overwriting them); all paths are relative.
train_test_split('../datasets/RC_2018-03', '../datasets/train', '../datasets/test')