### Text Cleaning Functions/Code to be used for Modeling

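Each function below takes a single string and returns a cleaned string; apply the functions element-wise to the text columns of a pandas DataFrame.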

In [36]:
# libraries used
import pandas as pd
import string
import nltk
import re

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [37]:
# initializing nltk objects used by the cleaning functions in later cells
lemmatizer = WordNetLemmatizer()
# stored as a set so the stopword membership test is a hash lookup
stopwords_eng = set(stopwords.words('english'))

# reading in practice file
df = pd.read_csv('data/last_words_of_the_executed.csv')

# optional check for missing values: df.isna().sum()

# preview the data; a notebook cell only renders its LAST expression, so the
# preview has to come after every assignment in this cell
df.head()


In [38]:
def clean_text(text):
    """Normalize raw text into lowercase, letters-only, single-spaced text.

    Steps, in order:
      1. drop every non-ASCII character,
      2. drop ASCII digits and punctuation (``string.punctuation``),
      3. lowercase,
      4. collapse each run of whitespace to one space and trim both ends.

    Punctuation is deleted, not replaced by a space, so hyphenated words are
    joined ("Ale-house" -> "alehouse").

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a float ``NaN``
        standing in for a missing cell) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # Non-strings (None, float NaN, ...) have no .encode(); return an empty
    # string for them instead of raising AttributeError mid-.apply().
    if not isinstance(text, str):
        return ""

    # remove non-ASCII ("weird") characters
    ascii_text = text.encode('ascii', errors='ignore').decode()

    # remove digits and punctuation in a single pass; after the ASCII step the
    # only characters str.isdigit() can match are 0-9, i.e. string.digits
    drop_table = str.maketrans('', '', string.digits + string.punctuation)
    stripped = ascii_text.translate(drop_table)

    # lowercasing
    lowered = stripped.lower()

    # fix any whitespace inconsistencies
    return re.sub(r"\s+", " ", lowered).strip()

# Create Statement_cleaned by running clean_text() on every raw statement
df['Statement_cleaned'] = df['Statement'].map(clean_text)

df.head()

Unnamed: 0,Name,Month,Day,Year,Statement,Statement_cleaned
0,MARMADUKE STEVENSON,October,27.0,1659,"Be it known to all this day, that we suffer no...",be it known to all this day that we suffer not...
1,WILLIAM ROBINSON,October,27.0,1659,"This is the day of your visitation, wherein th...",this is the day of your visitation wherein the...
2,WILLIAM LEDDRA,March,14.0,1661,For bearing my testimony for the Lord against ...,for bearing my testimony for the lord against ...
3,UNNAMED RINGLEADER,,,1673,"I have been among drawn Swords, flying bullets...",i have been among drawn swords flying bullets ...
4,THOMAS LUTHERLAND,February,23.0,1692,I had rather go to an Ale-house than to any Ch...,i had rather go to an alehouse than to any chu...


In [39]:
def stopword_text(clean_text):
    """Drop English stopwords from a string.

    The text is split into tokens with NLTK's ``word_tokenize``; every token
    that appears in the module-level ``stopwords_eng`` set is discarded.

    Parameters
    ----------
    clean_text : str
        Text to filter. The membership test is case-sensitive, so the text
        should already be in the same case as the entries of ``stopwords_eng``.

    Returns
    -------
    str
        The remaining tokens, in their original order, separated by single
        spaces, with no leading or trailing whitespace.
    """
    words = nltk.tokenize.word_tokenize(clean_text)

    # keep only the tokens that are not stopwords
    kept_words = [word for word in words if word not in stopwords_eng]

    # re-assemble into one string and strip any extra spaces
    return " ".join(kept_words).strip()

# Overwrite Statement_cleaned with its stopword-free version
df['Statement_cleaned'] = df['Statement_cleaned'].map(stopword_text)

df.head()

Unnamed: 0,Name,Month,Day,Year,Statement,Statement_cleaned
0,MARMADUKE STEVENSON,October,27.0,1659,"Be it known to all this day, that we suffer no...",known day suffer evil doers conscience sake da...
1,WILLIAM ROBINSON,October,27.0,1659,"This is the day of your visitation, wherein th...",day visitation wherein lord hath visited youth...
2,WILLIAM LEDDRA,March,14.0,1661,For bearing my testimony for the Lord against ...,bearing testimony lord deceivers deceived brou...
3,UNNAMED RINGLEADER,,,1673,"I have been among drawn Swords, flying bullets...",among drawn swords flying bullets roaring cann...
4,THOMAS LUTHERLAND,February,23.0,1692,I had rather go to an Ale-house than to any Ch...,rather go alehouse church pray young people ta...


In [40]:
def lemmatize_text(clean_text):
    """Lemmatize every token of a string and return the re-joined text.

    Parameters
    ----------
    clean_text : str
        Text to lemmatize; it is tokenized with NLTK's ``word_tokenize``.

    Returns
    -------
    str
        The lemmatized tokens, in their original order, joined by single
        spaces.
    """
    lemmatized_words = []
    for word in nltk.word_tokenize(clean_text):
        # no part-of-speech argument is passed, so the module-level
        # lemmatizer falls back to its default part of speech
        lemmatized_words.append(lemmatizer.lemmatize(word))

    # convert back into string
    return " ".join(lemmatized_words)

# Overwrite Statement_cleaned with its lemmatized version
df['Statement_cleaned'] = df['Statement_cleaned'].map(lemmatize_text)

df.head()

Unnamed: 0,Name,Month,Day,Year,Statement,Statement_cleaned
0,MARMADUKE STEVENSON,October,27.0,1659,"Be it known to all this day, that we suffer no...",known day suffer evil doer conscience sake day...
1,WILLIAM ROBINSON,October,27.0,1659,"This is the day of your visitation, wherein th...",day visitation wherein lord hath visited youth...
2,WILLIAM LEDDRA,March,14.0,1661,For bearing my testimony for the Lord against ...,bearing testimony lord deceiver deceived broug...
3,UNNAMED RINGLEADER,,,1673,"I have been among drawn Swords, flying bullets...",among drawn sword flying bullet roaring cannon...
4,THOMAS LUTHERLAND,February,23.0,1692,I had rather go to an Ale-house than to any Ch...,rather go alehouse church pray young people ta...
