In [1]:
# Relative path of the directory that holds the newspaper test corpus.
# The trailing slash matters: the file name below is appended to it by plain
# string concatenation when the file is opened.
path_to_newspapers = '../../data/corpora/newspapers_test/'
# Name of the corpus text file inside that directory.
newspaper_test = 'newspaper_test.txt'

In [2]:
# Imports
import codecs
from nltk.tokenize import sent_tokenize
import re
import spacy
import unicodedata

In [3]:
# Regular expressions used by the preprocessing helpers, keyed by what they match.
# "prefixes" and "months_abrv" are applied case-insensitively by preprocess_text,
# so both start with \b: without it they also fire inside ordinary words
# ("first.", "grammar.") and hide real sentence-ending periods.
regex_expressions = {
    # one or more capital-letter initials, each followed by a period and optional whitespace
    "initials": r"\b([A-Z][.](\s)?)+",
    # abbreviated titles followed by a period; group 1 is the title itself
    "prefixes": r"\b(Mr|St|Mrs|Ms|Dr|Esq|Sec|Secretar)[.]",
    # placeholders, no pattern defined yet
    "addresses": "",
    "dates": "",
    # "¬" marker followed by a line ending; \r? also accepts Windows (CRLF) endings
    "line_break": r"¬\r?\n",
    # a single whitespace character (was "/s", which only matched a literal "/s")
    "space": r"\s",
    # runs of one or more ASCII hyphens
    "dashes": r"[-]+",
    # curly double quotation marks
    "quote_marks": r"(“|”)",
    # abbreviated month with period, optionally followed by a day number and a further number
    "months_abrv": r"\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[.](\s*(\d{1,2})(,|\.)?)?(\s*\d+)?",
    # a number followed by "d." (e.g. "6 d.")
    "pennies": r"(\d+[.]?\s*)[d][.]",
    # a one- or two-digit number, a period, then a four-digit number
    "months_and_years": r"\d{1,2}[.]\s*(\d{4})",
}


In [4]:
# strip all accented characters:
def strip_accents(text):
    """Return ``text`` reduced to plain ASCII.

    The text is first decomposed with Unicode NFD normalization, which splits
    an accented letter into its base letter plus combining marks. Encoding to
    ASCII with ``'ignore'`` then discards the marks, along with every other
    character that has no ASCII representation.
    """
    decomposed = unicodedata.normalize('NFD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode("utf-8")

def process_periods(text):
    """Swap every literal period in ``text`` for the ``<prd>`` placeholder.

    Takes a plain string rather than a regex match object; it is meant to be
    called by the other processing callbacks on text they have already
    pulled out of a match.
    """
    return text.replace(".", "<prd>")

def process_periods_to_commas(matchobj):
    """``re.sub`` callback: return the matched text with each period turned into a comma."""
    return matchobj.group(0).replace(".", ",")

# process initials for regex, and return a format that we can identify
def process_initials(matchobj):
    """``re.sub`` callback for a run of initials.

    The periods in the matched text are replaced via ``process_periods``, all
    whitespace inside the match is removed, and a single trailing space is
    appended so the initials stay separated from the following word.
    """
    protected = process_periods(matchobj.group(0))
    compact = re.sub(r"\s*", "", protected)
    return compact + " "

def process_months_abrv(matchobj):
    """``re.sub`` callback for an abbreviated-month match.

    Returns the matched text with its periods replaced via ``process_periods``.
    """
    return process_periods(matchobj.group(0))

def process_pennies(matchobj):
    """``re.sub`` callback for a pence amount such as ``6 d.``.

    Each ``d`` (with its optional period) in the matched text becomes the word
    ``pennies``; any period still left is then replaced via ``process_periods``.
    """
    amount = matchobj.group(0)
    spelled_out = re.sub(r"d[.]?", "pennies", amount)
    return process_periods(spelled_out)

def preprocess_text(text):
    """Normalise raw newspaper text before it is split into sentences.

    Periods that do not end a sentence (initials, titles, month abbreviations,
    pence amounts) are swapped for the ``<prd>`` placeholder or rewritten, so a
    sentence tokenizer run afterwards does not split on them.

    Args:
        text: the raw text as a single string.

    Returns:
        The normalised text, reduced to ASCII by ``strip_accents``.
    """
    # Remove the "¬" line-break markers first: "¬" is not ASCII, so it would be
    # dropped by strip_accents below and could no longer be matched.
    text = re.sub(regex_expressions["line_break"], "", text)
    # Initials ("F.E. ") -> periods protected, inner whitespace removed.
    text = re.sub(regex_expressions["initials"], process_initials, text)
    # Titles ("Mr.", "Dr.", ...) -> keep the title, protect its period.
    text = re.sub(regex_expressions["prefixes"], "\\1<prd>", text, flags=re.IGNORECASE)
    # Month abbreviations ("Jan. 12, 1850") -> protect their periods.
    text = re.sub(regex_expressions["months_abrv"], process_months_abrv, text, flags=re.IGNORECASE)
    # "<1-2 digits>. <4 digits>" -> the period becomes a comma.
    text = re.sub(regex_expressions["months_and_years"], process_periods_to_commas, text)
    # "No." / "Nos." -> "number". The leading \b restricts this to the standalone
    # abbreviation; without it the case-insensitive pattern also rewrote word
    # endings such as "piano." into "pianumber".
    text = re.sub(r"\b(No|Nos)[.]", "number", text, flags=re.IGNORECASE)
    # Every run of hyphens becomes a single space.
    text = re.sub(regex_expressions["dashes"], " ", text)
    # Curly double quotes become the plain '"' character.
    text = re.sub(regex_expressions["quote_marks"], '"', text)
    # Pence amounts ("6 d.") -> "6 pennies".
    text = re.sub(regex_expressions["pennies"], process_pennies, text)
    # Finally drop accents and any remaining non-ASCII characters.
    text = strip_accents(text)
    return text


def clean_tokenized_sent(sent):
    """Tidy one tokenized sentence for display and tagging.

    Args:
        sent: a sentence string, possibly containing newlines, repeated
            whitespace and ``<prd>`` placeholders.

    Returns:
        The sentence with every run of whitespace (newlines and carriage
        returns included) collapsed to one space and each ``<prd>``
        placeholder turned back into a period. Leading and trailing
        whitespace is collapsed, not stripped.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    # \s+ already matches '\n' and '\r', so they need no separate replacement.
    clean_sent = re.sub(r'\s+', ' ', sent)
    # put back the periods:
    clean_sent = clean_sent.replace("<prd>", ".")
    return clean_sent


def clean_tokenized_list(sent_list):
    """Run ``clean_tokenized_sent`` over every sentence in ``sent_list``.

    Returns a new list with the cleaned sentences in their original order.
    """
    return [clean_tokenized_sent(sentence) for sentence in sent_list]

In [5]:
# Read the whole corpus file into memory as one string.
# Built-in open() replaces codecs.open(), which is deprecated as of Python 3.14.
# newline="" returns line endings exactly as stored, matching codecs.open(),
# which never translated them. errors="ignore" still silently drops bytes
# that are not valid UTF-8.
with open(path_to_newspapers + newspaper_test, 'r', encoding='utf-8', errors="ignore", newline="") as raw_text:
    dirty_text = raw_text.read()

# print(dirty_text[:6000])

In [6]:
# Run the regex-based normalisation over the raw corpus text.
preprocessed_text = preprocess_text(dirty_text)
# print(preprocessed_text)

In [7]:
# Split the preprocessed text into sentences with NLTK's sent_tokenize.
tokenized_sentences = sent_tokenize(preprocessed_text)
# Collapse whitespace in each sentence and turn the <prd> placeholders back into periods.
cleaned_tokenized_sentences = clean_tokenized_list(tokenized_sentences)

In [8]:
# Part-of-speech tagging experiment.
# Load spaCy's "en_core_web_sm" English pipeline once; pos_tag_sentence reuses it.
nlp = spacy.load("en_core_web_sm")

def pos_tag_sentence(sent):
    """Tag one sentence with spaCy and return a list of ``(token, tag)`` pairs.

    The sentence is run through the module-level ``nlp`` pipeline with the
    ``ner`` component disabled. Each pair holds the spaCy token object itself
    (not its text) together with that token's ``tag_`` value.
    """
    doc = nlp(sent, disable = ['ner'])
    # one pair per token, in sentence order
    return [(tok, tok.tag_) for tok in doc]

In [9]:
# Look up spaCy's description of the 'NNP' tag.
spacy.explain('NNP')

'noun, proper singular'

In [10]:
# Spot check: tag and print the first 51 sentences (indices 0 through 50).
# Slicing up front ends the loop there, instead of walking the rest of the
# corpus only to skip every remaining sentence.
for index, sent in enumerate(cleaned_tokenized_sentences[:51]):
    tagged_sentence = pos_tag_sentence(sent)
    print(index, end=": ")
    print(sent)
    print(tagged_sentence, end="\n\n")

0: AFE F.E. WRITE immediately.
[(AFE, 'NNP'), (F.E., 'NNP'), (WRITE, 'VBP'), (immediately, 'RB'), (., '.')]

1: In London de vue de are 1 HIMNEY TOP.
[(In, 'IN'), (London, 'NNP'), (de, 'FW'), (vue, 'NNP'), (de, 'NNP'), (are, 'VBP'), (1, 'CD'), (HIMNEY, 'NNP'), (TOP, 'NN'), (., '.')]

2: E.E.F. , alive, weil, and at Hythe.
[(E.E.F., 'NNP'), (,, ','), (alive, 'JJ'), (,, ','), (weil, 'NN'), (,, ','), (and, 'CC'), (at, 'IN'), (Hythe, 'NNP'), (., '.')]

3: A Happy New Year to you sil.
[(A, 'DT'), (Happy, 'NNP'), (New, 'NNP'), (Year, 'NNP'), (to, 'IN'), (you, 'PRP'), (sil, 'VB'), (., '.')]

4: AMES.
[(AMES, 'NNP'), (., '.')]

5: Dear James, pray send to the last address, posible, LETTERS, of great important three from your ever, and most affectionate WILLIAM, HE Overseers of the Parish of Grately.
[(Dear, 'NNP'), (James, 'NNP'), (,, ','), (pray, 'VB'), (send, 'VB'), (to, 'IN'), (the, 'DT'), (last, 'JJ'), (address, 'NN'), (,, ','), (posible, 'JJ'), (,, ','), (LETTERS, 'NNS'), (,, ','), (of, '