In [84]:
import os
import re
from tqdm import tqdm
import pandas as pd

# Read in and clean text

In [85]:
def collect_txt(directory: str) -> dict:
    """Read in all TXT files in the folder and store the text into memory
    as a dictionary with special characters removed and converted to lowercase

    Args:
        directory (str): Folder name where all letter files are stored

    Returns:
        dict: Dictionary with year as the key and the cleaned
            (lowercase and special characters removed) text as the value
    """

    letters = {}

    # Ensure that the directory exists
    assert os.path.isdir(directory)

    # Walk through the directory with the letter files
    for curr_path, directories, files in os.walk(directory):

        # Extract year values to use as the key
        year = curr_path[-4:]

        # Find all TXT files
        for file in files:
            if file.endswith('txt'):

                # Read in the raw text of each TXT file
                with open(os.path.join(os.getcwd(), directory, year, file),
                          'r', encoding='ISO-8859-1') as f:
                    raw_text = f.read()

                # Convert to lower-case and remove special characters
                clean_text = re.sub(r'[^\w\d&]', ' ', raw_text.lower())
                letters[year] = clean_text

    return letters

In [86]:
letters = collect_txt('letters')

# Import NLTK library and dependencies

In [97]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jlee0\AppData\Roaming\nltk_data...


True

# Pre-process text to prepare for text analysis

## Tokenize text

In [88]:
def tokenize_text(letters: dict) -> dict:
    """Tokenize annual letters using NLTK's tokenizer

    Args:
        letters (dict): Dictionary with year as the key and the cleaned
            (lowercase and special characters removed) text as the value

    Returns:
        dict: Dictionary with year as the key and a list of tokenized text as the value
    """

    tokenized = {}

    # Tokenize each document by splitting the document into tokens
    for year, text in letters.items():
        tokenized[year] = word_tokenize(text)

    # Remove numbers, but not words that contain numbers
    tokenized = {year: [token for token in text if not token.isnumeric()] for year, text in tokenized.items()}

    # Remove words that are only one character
    tokenized = {year: [token for token in text if len(token) > 1] for year, text in tokenized.items()}

    return tokenized


In [89]:
tokenized = tokenize_text(letters)

## Remove stopwords

In [90]:
def remove_stopwords(tokenized: dict) -> dict:
    """Remove English stopwords from annual letters

    Args:
        tokenized (dict): Dictionary with year as the key and the tokenized text as the value

    Returns:
        dict: Dictionary with year as the key and a list of stop-word removed tokens as the value
    """

    no_stopwords = {}

    for year, text in tqdm(tokenized.items()):
        no_stopwords[year] = [token for token in text if token not in stopwords.words('english')]

    return no_stopwords

In [91]:
no_stopwords = remove_stopwords(tokenized)

100%|██████████| 46/46 [05:15<00:00,  6.85s/it]


## Lemmatize text

In [95]:
def lemmatize_text(no_stopwords: dict) -> dict:
    """Lemmatize each stopword-removed list of tokens to standardize the tokens
    for each letter

    Args:
        no_stopwords (dict): Dictionary with year as the key and a list of stop-word removed tokens as the value

    Returns:
        dict: Dictionary with year as the key and a list of lemmatized and 
            stop-word removed tokens as the value
    """
    
    lemmatizer = WordNetLemmatizer()

    lemmatized = {}
    for year, text in tqdm(no_stopwords.items()):
        lemmatized[year] = [lemmatizer.lemmatize(token) for token in text]

    return lemmatized

In [98]:
lemmatized = lemmatize_text(no_stopwords)

100%|██████████| 46/46 [00:02<00:00, 20.36it/s]


# Generate wordcloud