In [1]:
import sys
def progress(count, total, status=''):
    """Draw a single-line text progress bar on stdout.

    The line ends with a carriage return, so successive calls overwrite
    each other in the console / cell output.

    Parameters
    ----------
    count : int or float
        Number of work items finished so far.
    total : int or float
        Total number of work items. A total of 0 is rendered as a
        completed bar instead of raising ZeroDivisionError.
    status : str, optional
        Free text appended after the percentage.
    """
    bar_len = 60
    if total:
        filled_len = int(round(bar_len * count / float(total)))
        # The factor must be exactly 100.0; the previous 100.1 made a finished
        # run report "100.1%".
        percents = round(100.0 * count / float(total), 1)
    else:
        # Nothing to do means nothing left to do: show the bar as complete.
        filled_len = bar_len
        percents = 100.0
    bar = '#' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()  # As suggested by Rom Ruben (see: http://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113#comment50529068_27871113)

In [5]:
## the Times XML parser p1
import csv
import os
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET


# Directory that holds the newspaper XML files. It is the default value of
# `path_to_XML_sheets` in write_newspaper_XMLs_to_df below.
path = './2022_01_31_Newspaper_Parser/PC-bu/xml_sheets/'

def getMainAttribute(tree, writer, minOCR=40.0):
    """Write one CSV row per usable first-page article of a parsed issue.

    Parameters
    ----------
    tree : xml.etree.ElementTree.ElementTree
        Parsed XML of one newspaper issue. The root needs an ``ID``
        attribute, a ``composed`` element (the date text) and a ``pageid``
        element carrying a ``width`` attribute.
    writer : object with a ``writerow(list)`` method
        Receives ``[issueID, full_date, title, ocrScore, text]`` for every
        article that is kept (e.g. a ``csv.writer``).
    minOCR : float, optional
        Articles whose ``ocr`` score is below this value are skipped.

    Notes
    -----
    * Only ``article`` elements with ``type='Article'`` are considered.
    * An article with a low (or missing) OCR score is skipped on its own;
      it no longer stops processing of the remaining articles in the issue.
    * Processing stops at the first article that has no ``pi`` element with
      ``pgref='1'``.
      NOTE(review): this relies on articles being listed in page order -
      confirm against the XML schema.
    """
    root = tree.getroot()

    # Issue identifier: the root tag followed by its ID attribute.
    issueID = root.tag + root.attrib['ID']

    # Publication date exactly as written in the XML.
    full_date = root.find('.//composed').text

    # Horizontal limits used to decide which words are kept, derived from the
    # page width. Computed once instead of once per word.
    page_width = int(root.find('.//pageid').attrib['width'])
    left_limit = (page_width / 6) + 5
    right_limit = (3 * (page_width / 6)) + 5

    for Article in root.findall(".//article[@type='Article']"):
        # Skip (not abort on) articles whose OCR quality is unknown or too low.
        ocr_node = Article.find(".//ocr")
        if ocr_node is None or ocr_node.text is None:
            continue
        ocrScore = float(ocr_node.text)
        if ocrScore < minOCR:
            continue

        # Stop as soon as an article is not on the first page.
        if Article.find(".//pi[@pgref='1']") is None:
            break

        title_node = Article.find(".//ti")
        title = title_node.text if title_node is not None else None

        kept_words = []
        text_node = Article.find(".//text")
        # <text> -> <text.cr> containers -> <pg>/<p> -> words carrying a 'pos'.
        for TextContainer in (text_node if text_node is not None else ()):
            for Paragraph in TextContainer:
                for SubParagraph in Paragraph:
                    # pos = "left,bottom,right,top"; only the right edge is used.
                    right_edge = int(SubParagraph.attrib['pos'].split(',')[2])
                    if right_edge <= left_limit or right_edge >= right_limit:
                        # First word outside the accepted band ends this paragraph.
                        break
                    if SubParagraph.text is not None:
                        kept_words.append(SubParagraph.text)

        # Every kept word is followed by a single space (including the last).
        TextInput = ''.join(word + ' ' for word in kept_words)
        writer.writerow([issueID, full_date, title, ocrScore, TextInput])


In [6]:
def write_newspaper_XMLs_to_df(name_of_csv='testrun', path_to_XML_sheets=path, minOCR=40.0):
    """Parse every ``.xml`` file in a directory into a CSV and load it as a DataFrame.

    Parameters
    ----------
    name_of_csv : str, optional
        Output file name without the ``.csv`` extension. The file is
        (over)written, then read back to build the returned frame.
    path_to_XML_sheets : str, optional
        Directory that is scanned for files ending in ``.xml``.
    minOCR : float, optional
        Passed through to ``getMainAttribute``.

    Returns
    -------
    pandas.DataFrame
        Columns ``issueID, full_date, title, ocrScore, text``. Rows that
        contain a missing value, or the literal strings 'None' / 'nan',
        are dropped.
    """
    csv_path = name_of_csv + '.csv'

    # List the directory once and keep only XML files, so the progress bar
    # reflects the real amount of work and the directory passed in.
    xml_files = [name for name in os.listdir(path_to_XML_sheets) if name.endswith('.xml')]

    with open(csv_path, 'w', newline='', encoding="utf8", errors='ignore') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(["issueID", "full_date", "title", "ocrScore", "text"])
        for done, filename in enumerate(xml_files, start=1):
            tree = ET.parse(os.path.join(path_to_XML_sheets, filename))
            getMainAttribute(tree, writer, minOCR)
            progress(done, len(xml_files))

    # Read back the file that was just written (previously always 'testrun.csv',
    # regardless of name_of_csv).
    df = pd.read_csv(csv_path, delimiter=',')
    # Treat textual placeholders as missing, then drop incomplete rows.
    df = df.replace(['None', 'nan'], np.nan)
    df = df.dropna(how='any')
    return df

def give_me_just_the_year_from_date(df, date_column='full_date'):
    """Add a ``text_year`` column holding the four-digit year of each date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place and also returned.
    date_column : str, optional
        Name of the column holding date text such as "September 1, 1880".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with a ``text_year`` column of year strings.
        Rows whose date text contains no four-digit run get a missing value.
    """
    # Vectorized extraction of the first run of four digits. Unlike taking the
    # third whitespace-separated token, this does not raise on dates that are
    # not shaped "Month D, YYYY", and it creates the column for an empty frame.
    df['text_year'] = df[date_column].astype(str).str.extract(r'(\d{4})', expand=False)
    return df
#reformat for a new output csv
# out = df.astype(str).groupby(['date', 'issueID', 'title']).agg(', '.join)
# print(out)
# out.to_csv('formatted_output.csv')
# out.head(-20)

In [7]:
# Parse the XML sheets into a DataFrame, derive the year column from the date
# text, and preview the first rows.
df = give_me_just_the_year_from_date(write_newspaper_XMLs_to_df())
df.head()

[############################################################] 100.1% ...

Unnamed: 0,issueID,full_date,title,ocrScore,text,text_year
0,issueN0029851,"September 1, 1880",Births,48.95,BIRTHS.1,1880
3,issueN0027091,"August 9, 1872",Deaths,58.25,"DEATHS. On'the 3d inst.. at Ostende, deeply la...",1872
4,issueN0027091,"August 9, 1872",L. G.-Have an important communication for you.,51.69,G.-Have an iTmuporta3nt communication for you....,1872
5,issueN0014350,"December 10, 1831","FOR CALCUTTA, will land passengers at Madras, ...",47.82,"OS P 'ARCEMENT DEED, consisting of tw,slt,betw...",1831
9,issueN0012785,"December 9, 1826","SHIP LOWTHER CASTLE, East Indiaman.-The CREDIT...",45.49,"1~OTJD; Mae, aGtNTLaMA's~ ~NRY~~DIGS~ WHIEI d?...",1826


In [11]:
import re
from nltk.tokenize import sent_tokenize

# Honorifics / abbreviations whose trailing period must not end a sentence.
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
# One or more consecutive double hyphens.
dashes = "(--)+"


def _split_into_clean_sentences(dirty_text):
    """Split raw article text into a list of whitespace-normalized sentences."""
    # Hide abbreviation periods behind a placeholder so the tokenizer does not
    # split after "Mr." etc.
    shielded = re.sub(prefixes, "\\1<prd>", dirty_text)
    # Surround runs of double hyphens with spaces (a run collapses to one "--").
    shielded = re.sub(dashes, ' \\1 ', shielded)
    # Normalize curly double quotes to straight ones.
    shielded = re.sub('(“|”)', '"', shielded)

    clean_sentences = []
    for sent in sent_tokenize(shielded):
        # Put the real period back; previously the "<prd>" placeholder leaked
        # into the stored sentences.
        sent = sent.replace("<prd>", ".")
        # Collapse newlines, carriage returns and repeated blanks to one space.
        clean_sentences.append(re.sub(r'\s+', ' ', sent))
    return clean_sentences


# One list of sentences per row, stored in a new 'sentences' column.
df['sentences'] = ''
for index, row in df.iterrows():
    df.at[index, 'sentences'] = _split_into_clean_sentences(row['text'])
    

In [12]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,issueID,full_date,title,ocrScore,text,text_year,sentences
0,issueN0029851,"September 1, 1880",Births,48.95,BIRTHS.1,1880,[BIRTHS.1]
3,issueN0027091,"August 9, 1872",Deaths,58.25,"DEATHS. On'the 3d inst.. at Ostende, deeply la...",1872,"[DEATHS., On'the 3d inst.. at Ostende, deeply ..."
4,issueN0027091,"August 9, 1872",L. G.-Have an important communication for you.,51.69,G.-Have an iTmuporta3nt communication for you....,1872,[G.-Have an iTmuporta3nt communication for you...
5,issueN0014350,"December 10, 1831","FOR CALCUTTA, will land passengers at Madras, ...",47.82,"OS P 'ARCEMENT DEED, consisting of tw,slt,betw...",1831,"[OS P 'ARCEMENT DEED, consisting of tw,slt,bet..."
9,issueN0012785,"December 9, 1826","SHIP LOWTHER CASTLE, East Indiaman.-The CREDIT...",45.49,"1~OTJD; Mae, aGtNTLaMA's~ ~NRY~~DIGS~ WHIEI d?...",1826,"[1~OTJD; Mae, aGtNTLaMA's~ ~NRY~~DIGS~ WHIEI d..."


In [15]:
# Show every OCR score present in the frame, followed by their mean.
print(df["ocrScore"].values.tolist())
print(df["ocrScore"].values.mean())

[48.95, 58.25, 51.69, 47.82, 45.49, 43.19, 53.33, 47.45, 44.01, 49.07, 66.11, 62.18, 62.04, 60.84, 56.4, 60.87, 54.93, 66.33, 56.48, 65.99, 50.28, 56.79, 69.26, 67.03, 52.73, 47.99, 50.97, 53.38, 54.15, 51.7]
55.19000000000001


In [None]:
## POS tagging:

def progress(count, total, status=''):
    """Draw a single-line text progress bar on stdout.

    NOTE(review): this re-defines a function of the same name from an
    earlier cell and shadows it once this cell is run.

    Parameters
    ----------
    count : int or float
        Number of work items finished so far.
    total : int or float
        Total number of work items. A total of 0 is rendered as a
        completed bar instead of raising ZeroDivisionError.
    status : str, optional
        Free text appended after the percentage.
    """
    bar_len = 60
    if total:
        filled_len = int(round(bar_len * count / float(total)))
        # The factor must be exactly 100.0; 100.1 made a finished run report "100.1%".
        percents = round(100.0 * count / float(total), 1)
    else:
        # Nothing to do means nothing left to do: show the bar as complete.
        filled_len = bar_len
        percents = 100.0
    bar = '#' * filled_len + '-' * (bar_len - filled_len)
    # The trailing carriage return lets the next call overwrite this line.
    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()
    
# spaCy is used by this cell and by get_POS_tags_for_text_in_df, but no visible
# cell imports it, so a fresh kernel would fail with NameError without this line.
import spacy

# first loading english language support
nlp = spacy.load("en_core_web_sm")

def get_POS_tags_for_text_in_df(df, text_row_to_analyze='text'):
    """Count part-of-speech tags for every text in ``df`` and add them as columns.

    For each row the text is run through the module-level spaCy pipeline
    ``nlp`` and the following columns are filled:

    * ``all_pos_counts`` - list of ``[tag, count]`` pairs for the text
    * ``POS_<tag>_count`` - one column per tag seen anywhere in the frame
    * ``parts_of_speech_total_count`` - sum of all ``POS_*`` counts of the row
    * ``%POS_<tag>_count`` - share of that tag in percent, rounded to 3 places
      (0.0 for a row whose total is 0)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts. The count columns are added to this object
        in place; the percentage columns exist only on the returned frame,
        because ``fillna`` produces a new frame.
    text_row_to_analyze : str, optional
        Name of the column that holds the text to tag.

    Returns
    -------
    pandas.DataFrame
        A new frame with every missing value replaced by 0 (this applies to
        all columns, not only the POS ones) plus the columns listed above.
    """
    # setting up columns for pos counts
    df['all_pos_counts'] = ''
    df["parts_of_speech_total_count"] = ''

    row_total = len(df.index)
    # Progress is driven by row position; index labels may have gaps (e.g.
    # after dropna), so they cannot be used as a counter.
    for position, (index, row) in enumerate(df.iterrows(), start=1):
        progress(position, row_total)

        text = row[text_row_to_analyze]

        # Raise spaCy's length limit to fit the current text
        # https://datascience.stackexchange.com/questions/38745/increasing-spacy-max-nlp-limit
        nlp.max_length = len(text) + 100

        # disable modules not in use to save memory
        analyzed_doc = nlp(text, disable=['ner'])

        # counts keyed by spaCy's integer POS ids
        pos_counts_in_text = analyzed_doc.count_by(spacy.attrs.IDS['POS'])

        # translate the ids to readable tag names, kept as [tag, count] pairs
        human_readable_pos_count_list = [
            [analyzed_doc.vocab[pos].text, count]
            for pos, count in pos_counts_in_text.items()
        ]

        # one column per tag, the count goes into the current row
        for tag, count in human_readable_pos_count_list:
            df.at[index, 'POS_' + str(tag) + '_count'] = count

        df.at[index, 'all_pos_counts'] = human_readable_pos_count_list

    # Tags that never occurred in a text are missing so far; make them 0.
    df = df.fillna(0)

    pos_columns = [name for name in df.columns.values.tolist() if name.startswith("POS_")]

    # Create every percentage column up front. Creating them only when the
    # index label equals 0 could reset a column mid-run or never fire at all.
    for name in pos_columns:
        df["%" + name] = 0.0

    for index, row in df.iterrows():
        total = 0.0
        for name in pos_columns:
            total += row[name]
        df.at[index, "parts_of_speech_total_count"] = int(total)

        if total == 0:
            # No tags at all: show the offending text once and keep the 0.0
            # defaults instead of reusing a stale or undefined percentage.
            print(row[text_row_to_analyze])
            continue

        for name in pos_columns:
            df.at[index, "%" + name] = round((row[name] / total) * float(100), 3)
    return df

In [5]:
# new_df.to_csv("test_each_AC_same_row.csv")