In [1]:
path_to_newspapers = '../../data/corpora/newspapers_test/'
# newspaper_test = 'newspaper_test.txt'

In [2]:
# Imports
import codecs
from nltk.tokenize import sent_tokenize
import os
import pandas as pd
import re
import spacy
import unicodedata

In [3]:
regex_expressions = {"initials": r"\b([A-Z][.](\s)?)+", "prefixes": r"(Mr|St|Mrs|Ms|Dr|Esq|Sec|Secretar)[.]",\
                     "addresses": "", "dates": "", "line_break": r"¬\n", "space": r"/s",\
                     "dashes": r"[-]+", "quote_marks": r"(“|”)", \
                     "months_abrv": r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[.](\s*(\d{1,2})(,|\.)?)?(\s*\d+)?",\
                     "pennies": r"(\d+[.]?\s*)[d][.]", "months_and_years": r"\d{1,2}[.]\s*(\d{4})"}


In [4]:
def input_corpus_of_txts(path=path_to_newspapers):
    list_of_filenames_and_dirty_texts = []
    for filename in os.listdir(path):
        with codecs.open(path + filename, 'r', encoding='utf-8', errors="ignore") as raw_text:
            dirty_text = raw_text.read()
        list_of_filenames_and_dirty_texts.append((filename, dirty_text))
    return list_of_filenames_and_dirty_texts


# strip all accented characters:
def strip_accents(text):
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
    return str(text)

def process_periods(text):
    # no matchobj needed since this is only called in other processing functions
    text = re.sub(r"[.]","<prd>", text)
    return text

def process_periods_to_commas(matchobj):
    text = matchobj.group(0)
    text = re.sub(r"[.]", ",", text)
    return text

# processing functions for regex calls in preprocess_text() function
# process initials for regex, and return a format that we can identify
def process_initials(matchobj):
    text = matchobj.group(0)
    text = process_periods(text)
    text = re.sub(r"\s*", "", text)
    text = text + " "
    return text

def process_months_abrv(matchobj):
    text = matchobj.group(0)
    text = process_periods(text)
    # text = " <date>"+text+"<date> "
    return text

def process_pennies(matchobj):
    text = matchobj.group(0)
    text = re.sub(r"d[.]?","pennies", text)
    text = process_periods(text)
    return text

# combine it together
def preprocess_text(text):
    # remove all the line breaks created by newspaper processor
    text = re.sub(regex_expressions["line_break"],"", text)
    # marking initials:
    text = re.sub(regex_expressions["initials"], process_initials, text)
    # process titles:
    text = re.sub(regex_expressions["prefixes"],"\\1<prd>", text, flags=re.IGNORECASE)
    # process month abbreviations:
    text = re.sub(regex_expressions["months_abrv"], process_months_abrv, text, flags=re.IGNORECASE)
    # process instances of months [period] year:
    text = re.sub(regex_expressions["months_and_years"], process_periods_to_commas, text)
    # process instances of "No."
    text = re.sub(r"(No|Nos)[.]","number", text, flags=re.IGNORECASE)
    # strip all dashes:
    text = re.sub(regex_expressions["dashes"], " ", text)
    # transform all quotes to ' " ':
    text = re.sub(regex_expressions["quote_marks"], '"', text)
    # strip all pennies "XX d." in the text:
    text = re.sub(regex_expressions["pennies"], process_pennies, text)
    # strip all accents from the text:
    text = strip_accents(text)
    return text


In [5]:


def clean_tokenized_sent(sent):
    # removing newline notations
    clean_sent = re.sub('\n', ' ', sent)
    clean_sent = re.sub('\r', ' ', clean_sent)
    # transforming multiple spaces to one space
    clean_sent = re.sub('\s+',' ', clean_sent)
    split_sentence = clean_sent.split()
    
    # transform all the words that are completely uppercase to lowercase
    for index, word in enumerate(split_sentence):
        if (word.isupper()):
            new_word = word.lower()
            split_sentence[index] = new_word
    clean_sent = " ".join(split_sentence)
    
    # put back the periods:
    clean_sent = re.sub("<prd>", ".", clean_sent)
    # clean_sent = clean_sent.lower()
    return clean_sent


### Not needed if done in df
def clean_tokenized_list(sent_list):
    cleaned_tokenized_sentences = []
    for sent in sent_list:
        clean_set = clean_tokenized_sent(sent)
        cleaned_tokenized_sentences.append(clean_set)
    return cleaned_tokenized_sentences

def process_dirty_texts_to_df(list_of_filenames_and_dirty_texts):
    filenames = []
    cleaned_texts = []
    cleaned_corpus_as_dictionary = {}
    for filename, dirty_text in list_of_filenames_and_dirty_texts:
        preprocessed_text = preprocess_text(dirty_text)
        tokenized_sentences = sent_tokenize(preprocessed_text)
        cleaned_tokenized_sentences = clean_tokenized_list(tokenized_sentences)
        for clean_tokenized_sentence in cleaned_tokenized_sentences:
            filenames.append(filename)
            cleaned_texts.append(clean_tokenized_sentence)
    cleaned_corpus_as_dictionary['file_names'] = filenames
    cleaned_corpus_as_dictionary['sentences'] = cleaned_texts
    
    df = pd.DataFrame(cleaned_corpus_as_dictionary)
    return df

In [52]:
# POS try:
# first loading english language support

# faster but less accurate model:
# nlp = spacy.load("en_core_web_sm")

# slower but more accurate model:
# download it first
# !python -m spacy download en_core_web_lg

nlp = spacy.load("en_core_web_lg")

def get_pos_counts_from_tagged_sentence(analyzed_sent):
    pos_counts = []
    pos_counts_raw = analyzed_sent.count_by(spacy.attrs.IDS['POS'])
    for pos, count in pos_counts_raw.items():
        tag = analyzed_sent.vocab[pos].text
        pos_count = (tag, count)
        pos_counts.append(pos_count)
    # return a list of pos_counts
    return pos_counts

def pos_tag_sentence(sent):
    tagged_sentence = []
    analyzed_sent = nlp(sent, disable = ['ner'])
    # getting the complete tokenized sentence
    for token in analyzed_sent:
        tagged_word = (token, token.pos_)
        tagged_sentence.append(tagged_word)
    pos_counts = get_pos_counts_from_tagged_sentence(analyzed_sent)
    # return a tuple of both
    return (tagged_sentence, pos_counts)

def pos_tag_list_of_sentences(list_of_cleaned_sentences):
    pos_tagged_text = []
    for sent in list_of_cleaned_sentences:
        tagged_sent = pos_tag_sentence(sent)
        pos_tagged_text.append(tagged_sent)
    # returns a list of tuples
    return pos_tagged_text


def pos_tag_texts_from_df(df, sentences_column='sentences'):
    df['tagged_sentences'] = ''
    df['pos_counts'] = ''
    for index, row in df.iterrows():
        sentence = row[sentences_column]
        tagged_sentence, pos_counts = pos_tag_sentence(sentence)
        df.at[index, 'tagged_sentences'] = tagged_sentence
        df.at[index, 'pos_counts'] = pos_counts
        # print(tagged_sentence)
        # if index >= 10:
        #     break
    return df


In [53]:
## Get total POS counts from df once ready
def get_total_pos_counts(list_of_pos_counts):
    for index, pos_counts in enumerate(list_of_pos_counts):
        for tag, count in pos_counts:
            print(tag)
            print(count)
        if index >= 10:
            break

In [54]:
## Creating a dictionary of all the texts with keys as their filenames
dirty_texts = input_corpus_of_txts(path=path_to_newspapers)
# print(dirty_texts)
df = process_dirty_texts_to_df(dirty_texts)
df.head()

Unnamed: 0,file_names,sentences
0,newspaper_test2.txt,afe F.E. write immediately.
1,newspaper_test2.txt,In London de vue de are 1 himney top.
2,newspaper_test2.txt,"E.E.F. , alive, weil, and at Hythe."
3,newspaper_test2.txt,a Happy New Year to you sil.
4,newspaper_test2.txt,ames.


In [55]:
df = pos_tag_texts_from_df(df)
df.head()

Unnamed: 0,file_names,sentences,tagged_sentences,pos_counts
0,newspaper_test2.txt,afe F.E. write immediately.,"[(afe, PROPN), (F.E., PROPN), (write, VERB), (...","[(PROPN, 2), (VERB, 1), (ADV, 1), (PUNCT, 1)]"
1,newspaper_test2.txt,In London de vue de are 1 himney top.,"[(In, ADP), (London, PROPN), (de, X), (vue, X)...","[(ADP, 1), (PROPN, 1), (X, 3), (AUX, 1), (NUM,..."
2,newspaper_test2.txt,"E.E.F. , alive, weil, and at Hythe.","[(E.E.F., PROPN), (,, PUNCT), (alive, ADJ), (,...","[(PROPN, 3), (PUNCT, 4), (ADJ, 1), (CCONJ, 1),..."
3,newspaper_test2.txt,a Happy New Year to you sil.,"[(a, DET), (Happy, PROPN), (New, PROPN), (Year...","[(DET, 1), (PROPN, 3), (ADP, 1), (PRON, 1), (N..."
4,newspaper_test2.txt,ames.,"[(ames, PROPN), (., PUNCT)]","[(PROPN, 1), (PUNCT, 1)]"


In [None]:
 # tagged_sentence = pos_tag_list_of_sentences(cleaned_tokenized_sentences)

In [None]:
# print(tagged_sentence[0][1])

In [None]:
# spacy.explain('X')

In [None]:
# print(dirty_text)

In [None]:
for index, sent in enumerate(cleaned_tokenized_sentences):
    if index <= 50:
        tagged_sentence, pos_counts = pos_tag_sentence(sent)
        print(index, end=": ")
        print(sent)
        print(tagged_sentence)
        print(pos_counts, end="\n\n")