# Preprocessing

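Run this notebook to create `processed_dataset.csv` in `../data`. It pairs each token of the original Post Scriptum Spanish documents with the corresponding token of their modernized versions and writes the pairs as two columns, `Original` and `Modernized`.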

In [None]:
import glob
import string
import os
import re
import random
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

def get_token_list(file_path):
    """Return the lowercased, punctuation-free word tokens of a document body.

    The file is read as UTF-8 and split on its ``[BODY]:`` marker. Only the
    text after the first marker (and before any later marker) is tokenized.
    The characters in ``string.punctuation`` are deleted, then the text is
    split on runs of whitespace.

    Args:
        file_path: Path of the text file to tokenize.

    Returns:
        A list of non-empty token strings in document order.

    Raises:
        IndexError: If the file contains no ``[BODY]:`` marker.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        file_text = f.read()
    pattern = r"[\r\n]*\[BODY\]:\s*[\r\n]*"
    body = re.split(pattern, file_text)[1]
    body = body.lower()
    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab".
    translation_map = str.maketrans("", "", string.punctuation)
    body = body.translate(translation_map)
    # Split on any whitespace run. Newlines must act as word separators:
    # simply deleting them would fuse the last word of one line with the
    # first word of the next. split() with no argument also drops empty
    # tokens, so no extra filtering is needed.
    return body.split()


# Each dataset row holds the input (a single original token) in the first
# column and the label (the corresponding modernized token) in the second.
# Pairs are accumulated in a plain list and converted to an array once at the
# end; growing a NumPy array with np.append inside the loop would re-copy the
# whole dataset on every document.
token_pairs = []

# Iterate over original and modernized documents from each century and add them to the dataset if the documents have the same number of tokens
# This is a naive (but largely effective) attempt at word alignments
centuries = ["16th_century", "17th_century", "18th_century", "19th_century"]
for century in centuries:
    print(f"Processing {century} documents...")
    # glob returns paths in arbitrary order, so both listings are sorted to
    # make index-wise pairing deterministic.
    # NOTE(review): this assumes the sorted listings line up document by
    # document (e.g. matching file names in both folders) - confirm.
    original_files = sorted(glob.glob(f"../data/post_scriptum_spanish/original/{century}/*.txt"))
    modernized_files = sorted(glob.glob(f"../data/post_scriptum_spanish/modernized/{century}/*.txt"))
    if len(original_files) != len(modernized_files):
        print(
            f"Warning: {century} has {len(original_files)} original but "
            f"{len(modernized_files)} modernized documents; unmatched files are skipped."
        )
    for original_file, modernized_file in zip(original_files, modernized_files):
        original_tokens = get_token_list(original_file)
        modernized_tokens = get_token_list(modernized_file)
        # Equal token counts are the (naive) signal that the two documents
        # align word for word; other document pairs are dropped.
        if len(original_tokens) == len(modernized_tokens):
            token_pairs.extend(zip(original_tokens, modernized_tokens))

# Build the (n, 2) string array in one step; reshape keeps the two-column
# shape even when no document pair qualified.
dataset = np.array(token_pairs, dtype=str).reshape(-1, 2)

df = pd.DataFrame(dataset, columns=["Original", "Modernized"])
df.to_csv(r"../data/processed_dataset.csv", index=False)