# Preprocessing

## Constants

In [1]:
# Percentage of posts that will be pulled from each author's dataset as the testing set.
PERCENTAGE_FOR_TESTING = 0.2

# Minimum number of posts
MIN_NUM_POSTS = 5

# Minimum number of words
MIN_NUM_WORDS = 1000

# Determines whether or not a list of posts attributed to a single author
# will be shuffled before being separated into training and testing datasets
WILL_SHUFFLE = True

## Globals

In [2]:
testing_data = []
training_data = []
all_word_frequency = {}

# List of data that did not make the minimum amount of data threshold
leftover_data = []

## Read in Posts

This defines a function that takes in the filename of an author's 
dataset file, reads the data, and returns the id number of the author
and a list of the author's posts. 

In [3]:
import re

def get_posts(data_directory: str, data_filename: str):
    # Get the author id using regex
    a_id = re.search("^(\d+)", data_filename).group(1)
    
    # Get the array of the author's posts
    post_list = []
    file = open(data_directory + data_filename, "r", encoding='latin1')
    post_mode = False
    post = ""
    for line in file.readlines():
        line = line.strip()
        if re.search("^<post>$", line):
            # Found the start of a post
            post_mode = True
        elif re.search("^</post>$", line):
            # End of the post. Clean up the post and add it to the list
            post_mode = False
            post = post.strip()
            post_list.append(post)
            post = ""
        elif post_mode:
            # Concatenate the line to the post
            post += line
    
    return a_id, post_list

## Count the Data

Counts the number of words within a list of posts attributed to a single author.
It returns the total number of words within the whole list and the number of posts.
It will also add the words to the dictionary containing the frequency of words within the entire
corpus.

In [4]:
import spacy

def count_data(author_posts: list):
    global all_word_frequency
    
    nlp = spacy.load("en_core_web_sm")
    
    # Counters
    num_words = 0
    num_posts = len(author_posts)
    
    for post in author_posts:
        doc = nlp(post)
        
        for token in doc:
            word = token.text
            
            # Add to the total word count of this series of posts
            num_words += 1
            
            # Add the word to the global word frequency of the whole corpus
            if word in all_word_frequency.keys():
                all_word_frequency[word] += 1
            else:
                all_word_frequency[word] = 1
    
    return num_words, num_posts

## Add the Data to Dataset

Adds the post list to the specified global dataset list with the 
provided author id as the tag. It assumes that all posts within the list
is attributed to the provided author id. The global dataset list should be one of the 
following:
- testing_data
- training_data
- leftover_data

In [5]:
def add_data_to_set(a_id: int, post_list: list, global_dataset: list):
    for post in post_list:
        post_info = {"author_id": a_id, "post": post}
        global_dataset.append(post_info)

## Separate Data

Separates the list of posts into the training dataset and the testing dataset
by the percentage outlined by PERCENTAGE_FOR_TESTING. It assumes that 
all of the posts within the list is attributed to the author id provided.
In addition, it assumes that there are enough posts to meet the MIN_NUM_POSTS
requirements, so it won't have enough posts to provide at least one post for
either the training or testing set.

In [6]:
import random
import math

def separate_data(a_id: int, post_list: list):
    global WILL_SHUFFLE
    global PERCENTAGE_FOR_TESTING
    
    if WILL_SHUFFLE:
        random.shuffle(post_list)
        
    # Separate the post list into two parts, where one is PERCENTAGE_FOR_TESTING
    # of the whole post list
    num_posts_testing = math.floor(len(post_list) * PERCENTAGE_FOR_TESTING)
    testing_set = post_list[:num_posts_testing]
    training_set = post_list[num_posts_testing:]
    
    # Now add them to the respective global dataset
    global testing_data
    global training_data
    add_data_to_set(a_id, testing_set, testing_data)
    add_data_to_set(a_id, training_set, training_data)

## Process Data and Output to Files

In [None]:
import os
directory = "blogs/"

# Go through all of the files
print("Reading through data...")
for filename in os.listdir(directory):
    if filename.endswith(".xml"):
        author_id, posts = get_posts(directory, filename)
        word_count, post_count = count_data(posts)
        
        # Determine if this data makes the cut
        if word_count >= MIN_NUM_WORDS and post_count >= MIN_NUM_POSTS:
            # Separate out the data into training and dataset
            separate_data(author_id, posts)
        else:
            # Add the data to the leftover data
            add_data_to_set(author_id, posts, leftover_data)
        print("Finished processing author", author_id)
    else:
        continue

# Now output the data into JSON files
import json

print("Outputting the data to json files...")
with open('training_set.json', 'w') as json_file:
    dataset = {"data": training_data}
    json.dump(dataset, json_file, indent = 2, sort_keys=True)
with open('testing_set.json', 'w') as json_file:
    dataset = {"data": testing_data}
    json.dump(dataset, json_file, indent = 2, sort_keys=True)
with open('leftover_set.json', 'w') as json_file:
    dataset = {"data": leftover_data}
    json.dump(dataset, json_file, indent = 4, sort_keys=True)
with open('vocabulary.json', 'w') as json_file:
    json.dump(all_word_frequency, json_file)
print("Processing completed...")    

Reading through data...
Finished processing author 4162441
Finished processing author 3489929
Finished processing author 3954575
Finished processing author 3364931
Finished processing author 3162067
Finished processing author 813360
Finished processing author 4028373
Finished processing author 3630901
Finished processing author 2467122
Finished processing author 3732850
Finished processing author 3846432
Finished processing author 3600967
Finished processing author 3753301
Finished processing author 4157968
Finished processing author 3699514
Finished processing author 2727849
Finished processing author 3791552
Finished processing author 4278694
Finished processing author 1618178
Finished processing author 669719
Finished processing author 3865169
Finished processing author 4310425
Finished processing author 4186320
Finished processing author 2221350
Finished processing author 2318045
Finished processing author 827534
Finished processing author 3842783
Finished processing author 3315654