# Preprocessing

## Constants

In [None]:
# Fraction (between 0 and 1, despite the name) of each author's posts that
# separate_data() holds out as the testing set. The number of test posts is
# floor(number_of_posts * PERCENTAGE_FOR_TESTING); the rest are training posts.
PERCENTAGE_FOR_TESTING = 0.2

# Minimum number of posts an author must have to be split into train/test.
# Authors below this threshold are written to the leftover set instead.
MIN_NUM_POSTS = 5

# Minimum number of word unigrams, summed over all of an author's posts, that
# an author must have to be split into train/test. Authors below this
# threshold are written to the leftover set instead.
MIN_NUM_WORDS = 1000

# Determines whether or not a list of posts attributed to a single author
# will be shuffled before being separated into training and testing datasets
WILL_SHUFFLE = True

## Filename Constants

In [None]:
import os

# Directory holding the raw per-author blog files (read by the processing loop).
blog_directory = "blogs/"
# Directory that receives every generated *.json file.
json_directory = "jsons/"

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(json_directory, exist_ok=True)

# Base file names, one per n-gram feature. The storage-opening cell prefixes
# them with "train_" / "test_" to get the actual output files.
unigram_chars_json_filename = "unigram_chars.json"
bigram_chars_json_filename = "bigram_chars.json"
trigram_chars_json_filename = "trigram_chars.json"
unigram_words_json_filename = "unigram_words.json"
bigram_words_json_filename = "bigram_words.json"
trigram_words_json_filename = "trigram_words.json"
unigram_pos_json_filename = "unigram_pos.json"
bigram_pos_json_filename = "bigram_pos.json"
trigram_pos_json_filename = "trigram_pos.json"

# File receiving the posts of authors that do not meet the minimum requirements.
leftover_json_filename = "leftover.json"

## N-Gram Builders

Classes that take characters, words, or POS tags and build a list of n-grams. Each builder can be re-used after calling package_and_reset().

In [None]:
class NGramBuilder:
    """Builds the list of n-grams over a stream of items fed in one at a time.

    Items are pushed through add(). Whenever the sliding window holds ``n``
    items, a tuple of those items is recorded and the oldest item is dropped,
    so consecutive n-grams overlap by ``n - 1`` items. package_and_reset()
    hands back everything recorded so far and leaves the builder empty.
    """

    def __init__(self, n):
        """Create a builder producing n-grams of size ``n``.

        Args:
            n: Number of items per n-gram; must be at least 1.

        Raises:
            ValueError: If ``n`` is smaller than 1. Such a window can never
                be completed, so the builder would silently buffer every
                item forever without producing a single n-gram.
        """
        if n < 1:
            raise ValueError("n must be at least 1, got {!r}".format(n))
        self.__n = n
        self.__all_ngrams = []  # Holds the list of all of the n-grams
        self.__ngram = []  # Sliding window holding the n-gram being built

    def add(self, item):
        """Push ``item`` into the window, recording an n-gram once it is full."""
        self.__ngram.append(item)
        if len(self.__ngram) == self.__n:
            # The window is full: store an immutable snapshot of it.
            self.__all_ngrams.append(tuple(self.__ngram))
            # Drop the oldest item so the next add() completes the next n-gram.
            del self.__ngram[0]

    def package_and_reset(self):
        """Return the n-grams built so far and reset the builder for reuse.

        Returns:
            A list of tuples, each of length ``n``, in the order they were
            completed. Items of a partially filled window are discarded.
        """
        ret_value = self.__all_ngrams
        self.__all_ngrams = []
        self.__ngram = []

        return ret_value


class CharacterNGramBuilder(NGramBuilder):
    """N-gram builder whose items are the characters handed to add_char()."""

    def add_char(self, char: str):
        """Feed ``char`` into the builder as the next n-gram item."""
        next_item = char
        self.add(next_item)


class WordNGramBuilder(NGramBuilder):
    """N-gram builder whose items are the ``text`` of the tokens it is given."""

    def add_word(self, token):
        """Feed the ``text`` attribute of a spacy token into the builder."""
        word_text = token.text
        self.add(word_text)


class POSNGramBuilder(NGramBuilder):
    """N-gram builder whose items are the ``tag_`` of the tokens it is given."""

    def add_postag(self, token):
        """Feed the ``tag_`` attribute of a spacy token into the builder."""
        pos_tag = token.tag_
        self.add(pos_tag)

## Data to Disk: JSON Storage

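Due to the large amount of data, objects are serialized to JSON and appended to `*.json` files in order to free up memory during the preprocessing phase. Each call appends one JSON object on its own line, so a file ends up holding many JSON objects (one per line) rather than a single JSON document. These values can be loaded at another time by reading the file line by line and passing each line to `json.loads()`; calling `json.load()` on the whole file will fail as soon as it contains more than one object.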

In [None]:
import json

class JSONStorage:
    """Appends JSON-serialised items to a file, one object per line.

    The file is opened in append mode, so anything already in it is kept and
    new objects are written after it. Because every object sits on its own
    line, the file must be read back line by line (``json.loads`` per line).

    The storage can be used as a context manager so that the underlying file
    is closed even when an exception interrupts the writes; calling close()
    explicitly keeps working as before.
    """

    def __init__(self, file_name: str):
        """Open ``file_name`` for appending, creating it if it does not exist."""
        self.__file = open(file_name, "a")

    def __enter__(self):
        """Return the storage itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file on leaving the ``with`` block; never swallows errors."""
        self.close()
        return False

    def close(self):
        """Close the underlying file. Calling it more than once is harmless."""
        self.__file.close()

    def add(self, item: dict):
        """Serialise ``item`` with ``json.dumps`` and append it as one line."""
        self.__file.write(json.dumps(item) + "\n")

## Read in Posts

This defines a function that takes in the filename of an author's 
dataset file, reads the data, and returns the id number of the author
and a list of the author's posts. 

In [None]:
import re

def get_posts(data_directory: str, data_filename: str):
    """Read one author's dataset file and return the author id and the posts.

    The author id is the run of digits at the very start of ``data_filename``.
    The file is scanned line by line: a line consisting solely of ``<post>``
    starts a post, a line consisting solely of ``</post>`` ends it, and the
    lines in between form the post's text. Everything outside a post is
    ignored.

    Args:
        data_directory: Directory containing the file (a trailing path
            separator is optional).
        data_filename: Name of the file inside ``data_directory``; it must
            start with the numeric author id.

    Returns:
        A tuple ``(a_id, post_list)`` where ``a_id`` is the author id as a
        string of digits and ``post_list`` is the list of post texts in file
        order.

    Raises:
        ValueError: If ``data_filename`` does not start with a digit.
    """
    # Get the author id from the leading digits of the file name
    id_match = re.match(r"(\d+)", data_filename)
    if id_match is None:
        raise ValueError(
            "Cannot extract an author id from file name {!r}".format(data_filename))
    a_id = id_match.group(1)

    # Get the array of the author's posts
    post_list = []
    post_mode = False
    post_lines = []  # Non-empty lines of the post currently being read
    # The with-block guarantees the file is closed, even if reading fails.
    with open(os.path.join(data_directory, data_filename), "r", encoding='latin1') as file:
        for line in file:
            line = line.strip()
            if line == "<post>":
                # Found the start of a post
                post_mode = True
            elif line == "</post>":
                # End of the post. Lines are joined with a single space so the
                # last word of one line is not fused with the first word of
                # the next one.
                post_mode = False
                post_list.append(" ".join(post_lines))
                post_lines = []
            elif post_mode and line:
                # Keep the line as part of the current post
                post_lines.append(line)

    return a_id, post_list

## Analyze Data

Takes in a list of posts and creates a list of dictionaries with the following information:
- unigram_chars: A list of the unigram of each character in order of the sentence
- bigram_chars: A list of the bigram of each character in order of the sentence
- trigram_chars: A list of the trigram of each character in order of the sentence
- unigram_words: A list of the unigrams in order of the sentence
- bigram_words: A list of the bigrams in order of the sentence
- trigram_words: A list of the trigrams in order of the sentence
- unigram_pos: A list of the unigram of each POS tag in order of the sentence
- bigram_pos: A list of the bigram of each POS tag in order of the sentence
- trigram_pos: A list of the trigram of each POS tag in order of the sentence

In [None]:
import spacy

_NLP = None  # spaCy pipeline shared by all tokenize_data() calls; loaded lazily


def _get_nlp():
    """Return the spaCy pipeline, loading it from disk on first use only."""
    global _NLP
    if _NLP is None:
        _NLP = spacy.load("en_core_web_sm")
    return _NLP


def tokenize_data(post_list: list):
    """Turn each post into its character, word and POS n-gram lists.

    Every token's text is also counted in the module-level
    ``all_word_frequency`` dictionary, which this function mutates.

    Args:
        post_list: List of post texts (strings).

    Returns:
        A list with one dictionary per post, in the same order as
        ``post_list``. Each dictionary maps ``unigram_chars``,
        ``bigram_chars``, ``trigram_chars``, ``unigram_words``,
        ``bigram_words``, ``trigram_words``, ``unigram_pos``, ``bigram_pos``
        and ``trigram_pos`` to the corresponding list of n-gram tuples.
        The builders are reset after every post, so no n-gram spans two posts.
    """
    # Loading the model is expensive, so it is done once and reused across
    # calls instead of once per author.
    nlp = _get_nlp()

    # One builder per feature; insertion order fixes the key order of post_info.
    char_builders = {"unigram_chars": CharacterNGramBuilder(1),
                     "bigram_chars": CharacterNGramBuilder(2),
                     "trigram_chars": CharacterNGramBuilder(3)}
    word_builders = {"unigram_words": WordNGramBuilder(1),
                     "bigram_words": WordNGramBuilder(2),
                     "trigram_words": WordNGramBuilder(3)}
    pos_builders = {"unigram_pos": POSNGramBuilder(1),
                    "bigram_pos": POSNGramBuilder(2),
                    "trigram_pos": POSNGramBuilder(3)}

    ret_value = []

    # Iterate through each post
    for post in post_list:
        post_doc = nlp(post)

        # Handle at a character level
        for char in post:
            for builder in char_builders.values():
                builder.add_char(char)

        # Now handle at a token level
        for token in post_doc:
            word = token.text

            # Add the word to the global word frequency of the whole corpus
            all_word_frequency[word] = all_word_frequency.get(word, 0) + 1

            for builder in word_builders.values():
                builder.add_word(token)
            for builder in pos_builders.values():
                builder.add_postag(token)

        # Collect this post's n-grams; package_and_reset() also empties each
        # builder so that it starts fresh for the next post.
        post_info = {}
        for builders in (char_builders, word_builders, pos_builders):
            for feature_name, builder in builders.items():
                post_info[feature_name] = builder.package_and_reset()
        ret_value.append(post_info)

    return ret_value

## Separate Data

Separates the list of posts put through tokenize_data() into the training dataset and the testing dataset according to the fraction given by PERCENTAGE_FOR_TESTING. It assumes that all of the posts within the list are attributed to the author id provided. If the post list does not reach the minimum word count or the minimum post count, all of its posts are placed in the leftover dataset instead.

The data will be loaded into json files for storage.

In [None]:
import random
import math

def add_data_to_leftover(a_id: int, post_list: list):
    """Write every post of ``post_list`` to the leftover storage.

    Each post becomes one record holding the author id, the post's index
    within ``post_list`` and the post's complete n-gram dictionary.
    """
    global leftover_json
    for index, entry in enumerate(post_list):
        leftover_json.add({"author_id": a_id, "post_id": index, "post_info": entry})


def add_data_to_train(a_id: int, post_list: list):
    """Write every post of ``post_list`` to the nine training storages.

    For each post, one record per n-gram feature is appended to the matching
    ``train_*_json`` storage. A record holds the author id, the post's index
    within ``post_list`` and that feature's n-gram list.
    """
    if len(post_list) == 0:
        return

    # Feature key paired with the storage that receives it, in write order.
    destinations = (
        ("unigram_chars", train_unigram_chars_json),
        ("bigram_chars", train_bigram_chars_json),
        ("trigram_chars", train_trigram_chars_json),
        ("unigram_words", train_unigram_words_json),
        ("bigram_words", train_bigram_words_json),
        ("trigram_words", train_trigram_words_json),
        ("unigram_pos", train_unigram_pos_json),
        ("bigram_pos", train_bigram_pos_json),
        ("trigram_pos", train_trigram_pos_json),
    )

    for index, features in enumerate(post_list):
        for feature_name, storage in destinations:
            storage.add({"author_id": a_id, "post_id": index, feature_name: features[feature_name]})


def add_data_to_test(a_id: int, post_list: list):
    """Write every post of ``post_list`` to the nine testing storages.

    For each post, one record per n-gram feature is appended to the matching
    ``test_*_json`` storage. A record holds the author id, the post's index
    within ``post_list`` and that feature's n-gram list.
    """
    if len(post_list) == 0:
        return

    # Feature key paired with the storage that receives it, in write order.
    destinations = (
        ("unigram_chars", test_unigram_chars_json),
        ("bigram_chars", test_bigram_chars_json),
        ("trigram_chars", test_trigram_chars_json),
        ("unigram_words", test_unigram_words_json),
        ("bigram_words", test_bigram_words_json),
        ("trigram_words", test_trigram_words_json),
        ("unigram_pos", test_unigram_pos_json),
        ("bigram_pos", test_bigram_pos_json),
        ("trigram_pos", test_trigram_pos_json),
    )

    for index, features in enumerate(post_list):
        for feature_name, storage in destinations:
            storage.add({"author_id": a_id, "post_id": index, feature_name: features[feature_name]})


def separate_data(a_id: int, post_list: list):
    """Route one author's tokenized posts to train/test or to the leftover set.

    When WILL_SHUFFLE is set, ``post_list`` is shuffled in place first. An
    author with fewer than MIN_NUM_POSTS posts, or fewer than MIN_NUM_WORDS
    word unigrams in total, goes entirely to the leftover set. Otherwise the
    first floor(len(post_list) * PERCENTAGE_FOR_TESTING) posts become the
    testing set and the remaining posts the training set.
    """
    if WILL_SHUFFLE:
        random.shuffle(post_list)

    # Measure the author's data against the minimum requirements
    total_posts = len(post_list)
    total_words = sum(len(entry["unigram_words"]) for entry in post_list)

    if total_posts < MIN_NUM_POSTS or total_words < MIN_NUM_WORDS:
        # This post list doesn't make the cut. Put it in the leftover set
        add_data_to_leftover(a_id, post_list)
        return

    # Cut the list in two: the head is the testing share, the tail is training
    split_at = math.floor(total_posts * PERCENTAGE_FOR_TESTING)
    held_out = post_list[:split_at]
    remainder = post_list[split_at:]

    # Now add them to the respective global dataset
    add_data_to_test(a_id, held_out)
    add_data_to_train(a_id, remainder)

## Initialize Globals

In [None]:
# Maps each token text to the number of times it occurs across all processed
# posts. Filled in by tokenize_data() and dumped to vocabulary.json at the end.
all_word_frequency = {}

## Open JSON Storage

In [None]:
def _open_storage(prefix, filename):
    """Open a JSONStorage for the file ``prefix + filename`` in json_directory.

    os.path.join is used so the path is correct whether or not json_directory
    ends with a path separator.
    """
    return JSONStorage(os.path.join(json_directory, prefix + filename))


# Training storages, one per n-gram feature.
train_unigram_chars_json = _open_storage("train_", unigram_chars_json_filename)
train_bigram_chars_json = _open_storage("train_", bigram_chars_json_filename)
train_trigram_chars_json = _open_storage("train_", trigram_chars_json_filename)
train_unigram_words_json = _open_storage("train_", unigram_words_json_filename)
train_bigram_words_json = _open_storage("train_", bigram_words_json_filename)
train_trigram_words_json = _open_storage("train_", trigram_words_json_filename)
train_unigram_pos_json = _open_storage("train_", unigram_pos_json_filename)
train_bigram_pos_json = _open_storage("train_", bigram_pos_json_filename)
train_trigram_pos_json = _open_storage("train_", trigram_pos_json_filename)

# Testing storages, one per n-gram feature.
test_unigram_chars_json = _open_storage("test_", unigram_chars_json_filename)
test_bigram_chars_json = _open_storage("test_", bigram_chars_json_filename)
test_trigram_chars_json = _open_storage("test_", trigram_chars_json_filename)
test_unigram_words_json = _open_storage("test_", unigram_words_json_filename)
test_bigram_words_json = _open_storage("test_", bigram_words_json_filename)
test_trigram_words_json = _open_storage("test_", trigram_words_json_filename)
test_unigram_pos_json = _open_storage("test_", unigram_pos_json_filename)
test_bigram_pos_json = _open_storage("test_", bigram_pos_json_filename)
test_trigram_pos_json = _open_storage("test_", trigram_pos_json_filename)

# Storage for authors that do not meet the minimum requirements.
leftover_json = _open_storage("", leftover_json_filename)

## Process All Data

In [None]:
# Go through all of the files
print("Reading through data...")

# Only the .xml files are author datasets. Filtering up front lists the
# directory once and makes the progress total count only the files that are
# actually processed.
xml_filenames = [name for name in os.listdir(blog_directory) if name.endswith(".xml")]
num_files = len(xml_filenames)

for i, filename in enumerate(xml_filenames, start=1):
    # Read the raw posts, turn them into n-grams, then write them to storage
    author_id, posts = get_posts(blog_directory, filename)
    posts = tokenize_data(posts)
    separate_data(author_id, posts)
    print(i, "/", num_files, ": Finished processing author", author_id)

## Load Into JSON

In [None]:
print("Outputting vacabulary data to json file...")

# Close every storage: training files first, then testing, then leftover.
for storage in (
    train_unigram_chars_json,
    train_bigram_chars_json,
    train_trigram_chars_json,
    train_unigram_words_json,
    train_bigram_words_json,
    train_trigram_words_json,
    train_unigram_pos_json,
    train_bigram_pos_json,
    train_trigram_pos_json,
    test_unigram_chars_json,
    test_bigram_chars_json,
    test_trigram_chars_json,
    test_unigram_words_json,
    test_bigram_words_json,
    test_trigram_words_json,
    test_unigram_pos_json,
    test_bigram_pos_json,
    test_trigram_pos_json,
    leftover_json,
):
    storage.close()

# Write the corpus-wide token frequencies as a single JSON document.
with open(json_directory + 'vocabulary.json', 'w') as json_file:
    json.dump(all_word_frequency, json_file)

print("Processing completed.")