### Imports, Reading in files & Defining functions

In [51]:
import os
import io
import regex as re
import numpy as np
from tqdm import tqdm
from termcolor import colored
from colorama import Back, Style

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

from autocorrect import Speller # Spell checker
import nltk

#nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = [word.upper() for word in stopwords.words('english')]
hard_coded_non_section_words = ['URL', 'UTC', 'JSTOR', 'AMERICAN', 'JOURNAL', 'SOCIOLOGY', 'ABSTRACT', 'TABLE', 'CHART',
                                'AMERICAN JOURNAL OF SOCIOLOGY', 'UNIVERSITY OF CHICAGO', 'AMERICA', 'JOURNAL OF SOCIOLOGY',
                                'REFERENCES', 'AMERICAN SOCIOLOGICAL REVIEW', 'AMERICAN SOCIOLOGICAL ASSOCIATION']

def PDFtoString(filePath, pdfFolder=None):
    
    out = io.StringIO()
    if pdfFolder is not None:
        filePath = os.path.join(pdfFolder, filePath)
    with open(filePath, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, out, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for indx, page in enumerate(PDFPage.create_pages(doc)):
            #if indx != 0:
            interpreter.process_page(page)
            
    return out.getvalue() 

def printMetaInfo(convertedStrings, pdfPaths, journal = 'AJS'):
    
    # Print the regex matches for access date, abstract, table/chart, and references
    for indx, string in enumerate(convertedStrings):
        print(indx, colored(pdfPaths[indx], 'red'))
        if re.findall(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string):
            print('\t', colored(re.findall(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string)[0], 'blue'))
        print('\t', colored('Abstract match:', 'magenta'), colored(re.search(r'ABSTRACT', string), 'magenta'))
        
        # Chart, table, or figure matches
        matches = re.finditer(r'CHART', string)
        for match in matches:
            print('\t', colored('Chart match:', 'green'), colored(match, 'green'))
        matches = re.finditer(r'TABLE', string)
        for match in matches:
            print('\t', colored('Table match:', 'green'), colored(match, 'green'))
        matches = re.finditer(r'FIG|FIGURE|Fig', string)
        for match in matches:
            print('\t', colored('Figure match:', 'green'), colored(match, 'green'))  
            
        print('\t', colored('Reference match:', 'magenta'), colored(re.search(r'REFERENCES', string), 'magenta'))
        
        # This finds the title words which aren't stop words & aren't digits and upper cases them all
        if journal == 'AJS':
            titles = [re.findall(r'(?<=AJS_\d{4}_\d{1,3}_\d{1,2}_).*(?=.pdf)', file_name)[0] for file_name in pdfPaths]
        elif journal == 'ASR':
            titles = [re.findall(r'(?<=ASR_\d{4}_\d{1,3}_\d{1,2}_).*(?=.pdf)', file_name)[0] for file_name in pdfPaths]
        title_words = [i for i in titles[indx].upper().split() if i not in stop_words and not i.isdigit()]
        
        # Then adds them to stop words & other hard-coded regularly occuring words
        non_section_words = hard_coded_non_section_words + title_words + stop_words 
        non_section_words += [title.upper() for title in titles] + [title.strip('The ').upper() for title in titles]
        
        matches = re.finditer(r'[A-Z]{4,}(\s+?[A-Z]{2,}){0,}', string)
        sections = dict()
        
        for match in matches:
            if match.group() not in non_section_words and match not in sections:
                sections[match.group()] = match.span()
        for section in sections:
            print('\t', colored('Section match:', 'blue'), colored([section, sections[section]], 'blue'))

def createOutputStrings(convertedStrings, folder = None, journal = 'AJS'):
    out = list() #Output strings go here, hopefully one out per pdf s.t. len(out) == len(convertedStrings)
    for indx, string in enumerate(convertedStrings):
        # Define meta-information
        outString = '-------------\n||Meta-info||\n-------------\n'

        # The second condition is more restrictive, so check that that's not None.
        header = article = None
        if re.search(r'extend access to\n(.)+?(?=C)(.)+?(?=\w{2,})', string) is not None:
            header = string[:re.search(r'extend access to\n(.)+?(?=C)', string).end()].strip()
            if re.search(r' Your use of the JSTOR', header) is not None:
                header = header[:re.search(r' Your use of the JSTOR', header).span()[0]]
            article = string[re.search(r'extend access to\n(.)+?(?=C)(.)+?(?=\w{2,})', string).end():]
            outString += re.sub(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', '', header)

        # Add access date
        outString += '\n\n' + re.findall(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string)[0] + '\n'

        # Add abstract info
        abstract = re.search(r'ABSTRACT', string)
        if abstract is not None:
            outString += 'Abstract match at {}'.format(abstract.span()) + '\n'
        elif abstract is None:
            outString += 'No abstract found.\n'

        # Add chart info
        chart_matches = re.finditer(r'CHART', string)    
        if chart_matches is not None:
            for match in chart_matches:
                outString += 'Chart match at {}'.format(match.span()) + '\n'

        # Add table info 
        table_matches = re.finditer(r'TABLE', string)
        if table_matches is not None:
            for match in table_matches:
                outString += 'Chart match at {}'.format(match.span()) + '\n'
                
        # Add figure info
        figure_matches = re.finditer(r'FIG|FIGURE|Fig', string)
        if figure_matches is not None:
            for match in figure_matches:
                outString += 'Figure match at {}'.format(match.span()) + '\n'

        # Add references info
        references = re.search(r'REFERENCES', string)
        if references is not None:
            outString += 'Reference match at {}'.format(references.span()) + '\n'
        elif references is None:
            outString += 'No reference section found.\n'

        # Add section info 

        ### First, look for the title words and tokenize them.
        if journal == 'AJS':
            titles = [re.findall(r'(?<=AJS_\d{4}_\d{1,3}_\d{1,2}_).*(?=.pdf)', file_name)[0] for file_name in folder]
        elif journal == 'ASR':
            titles = [re.findall(r'(?<=ASR_\d{4}_\d{1,3}_\d{1,2}_).*(?=.pdf)', file_name)[0] for file_name in folder]
        title_words = [i for i in titles[indx].upper().split() if i not in stop_words and not i.isdigit()]

        ### Then add them to stop words & other hard-coded regularly occuring words
        non_section_words = hard_coded_non_section_words + title_words + stop_words
        matches = re.finditer(r'[A-Z]{4,}(\s+?[A-Z]{2,}){0,}', string)

        if matches is not None:
            sections = dict()
            for match in matches:
                if match.group() not in non_section_words and match not in sections:
                    sections[match.group()] = match.span()
        for section in sections:
            outString += 'Section header "{}" found at {}'.format(section, sections[section]) + '\n'

        # Add in the article itself
        outString += '\n-----------\n||Article||\n-----------\n'
        if article is not None:
            outString += article
        elif article is None: #If regex wasn't able to split it according to the JSTOR access message just put the full article
            outString += string
        # Append the newly made string to the list. Again, there should be 1 per pdf
        out.append(outString)
    return out
    
def writeOut(out, pdfFolder = None, outFolder = None):
    for indx, file in tqdm(enumerate(out)):
        writeFilePath = 'corpus/{}/{}.txt'.format(outFolder, pdfFolder[indx][:-4])
        with open(writeFilePath, 'w') as f:
            f.write(file)
    print('done!')

def highlight(pattern, text, printOut = True):
    output = text
    lookforward = 0
    for match in pattern.finditer(text):
        start, end = match.start() + lookforward, match.end() + lookforward
        output = output[:start] + Back.YELLOW + Style.BRIGHT + output[start:end] + Style.RESET_ALL + output[end:]
        lookforward = len(output) - len(text)  

    if printOut:
        print(output)
    else:
        return output

In [2]:
with open('example.txt', 'r') as f:
    d = f.read()
pattern = re.compile(r'[A-Z]{4,}(\s+?[A-Z]{2,}){0,}')
highlight(pattern, d)

-------------
||Meta-info||
-------------
Moving Teenagers Out of High-Risk Neighborhoods: How Girls Fare Better than Boys
Author(s): Susan Clampet-Lundquist, Kathryn Edin, Jeffrey R. Kling and  Greg J. Duncan
Source: American Journal of Sociology, Vol. 116, No. 4 (January 2011), pp. 1154-89
Published by: The University of Chicago Press
Stable URL: http://www.jstor.org/stable/10.1086/657352


Accessed: 08-06-2016 04:54 UTC
No abstract found.
Chart match at (31510, 31515)
Chart match at (90528, 90533)
Figure match at (23265, 23268)
Figure match at (69817, 69820)
Figure match at (93828, 93831)
Reference match at (92509, 92519)
Section header "[43m[1mINTRODUCTION[0m" found at (1984, 1996)
Section header "[43m[1mPOLICY BACKGROUND[0m" found at (5842, 5859)
Section header "[43m[1mLITERATURE REVIEW[0m" found at (15602, 15619)
Section header "[43m[1mDATA AND METHODS[0m" found at (21550, 21566)
Section header "[43m[1mRESULTS[0m" found at (32317, 32324)
Section header "[43m[1mCO

## American Journal of Sociology articles

In [3]:
%%time
# AJS articles - split into 3 periods

pre1946 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/AJS pdf files/pre1946/'
pre1946pdfs = os.listdir(pre1946) # list of all the pdf files 
pre1946pdfs.sort() # sort by year (and title)

l946to1966 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/AJS pdf files/1946to1966/'
l946to1966pdfs = os.listdir(l946to1966) # list of all the pdf files
l946to1966pdfs.sort()

post1971 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/AJS pdf files/post1971/'
post1971pdfs = os.listdir(post1971) # list of all the pdf files
post1971pdfs.sort()

# Convert the articles to strings... this is the time-consuming step
convertedStrings_pre1946 = [PDFtoString(os.path.join(pre1946, file)) for file in tqdm(pre1946pdfs) if file[-4:] == '.pdf']
convertedStrings_1946to1966 = [PDFtoString(os.path.join(l946to1966, file)) for file in tqdm(l946to1966pdfs) if file[-4:] == '.pdf']
convertedStrings_post1971 = [PDFtoString(os.path.join(post1971, file)) for file in tqdm(post1971pdfs) if file[-4:] == '.pdf']

100%|██████████| 63/63 [01:20<00:00,  1.27s/it]
100%|██████████| 37/37 [00:41<00:00,  1.13s/it]
100%|██████████| 68/68 [02:43<00:00,  2.40s/it]

CPU times: user 4min 41s, sys: 1.75 s, total: 4min 43s
Wall time: 4min 44s





### String pre-processing

In [4]:
# Remove the double lines / extra space characters & footer download / use notice
convertedStrings_pre1946 = [' '.join(re.split('\n\n+', string)) for string in convertedStrings_pre1946]
pattern = re.compile(r'This content downloaded from (.)+\n(.)+\x0c')
#highlight(pattern, convertedStrings_pre1946[4])
convertedStrings_pre1946 = [' '.join(re.split(r'This content downloaded from (.)+\n(.)+\x0c', string)) for string in convertedStrings_pre1946]
#printMetaInfo(convertedStrings_pre1946, pdfPaths = pre1946pdfs)

In [5]:
convertedStrings_1946to1966 = [' '.join(re.split('\n\n+', string)) for string in convertedStrings_1946to1966]
pattern = re.compile(r'[A-Z]{4,}(\s+?[A-Z0-9]{2,}){0,}')
#highlight(pattern, convertedStrings_1946to1966[10])
convertedStrings_1946to1966 = [' '.join(re.split(r'This content downloaded from (.)+\n(.)+\x0c', string)) for string in convertedStrings_1946to1966]
#printMetaInfo(convertedStrings_1946to1966, pdfPaths = l946to1966pdfs)

In [6]:
convertedStrings_post1971 = [' '.join(re.split('\n\n+', string)) for string in convertedStrings_post1971]
convertedStrings_post1971 = [' '.join(re.split(r'This content downloaded from (.)+\n(.)+\x0c', string)) for string in convertedStrings_post1971]
pattern = re.compile(r'[A-Z]{4,}(\s+?[A-Z0-9]{2,}){0,}')
#highlight(pattern, convertedStrings_post1971[2])
#printMetaInfo(convertedStrings_post1971, pdfPaths = post1971pdfs)

### Writing the files out to .txt files

In [7]:
out_pre1946 = createOutputStrings(convertedStrings_pre1946, folder = pre1946pdfs)
out_1946to1966 = createOutputStrings(convertedStrings_1946to1966, folder = l946to1966pdfs)
out_post1971 = createOutputStrings(convertedStrings_post1971, folder = post1971pdfs)

writeOut(out_pre1946, pdfFolder = pre1946pdfs, outFolder = 'AJS_pre1946')
writeOut(out_1946to1966, pdfFolder = l946to1966pdfs, outFolder = 'AJS_1946to1966')
writeOut(out_post1971, pdfFolder = post1971pdfs, outFolder = 'AJS_post1971')

63it [00:00, 3387.06it/s]
37it [00:00, 4570.17it/s]
68it [00:00, 2181.44it/s]

done!
done!
done!





## American Sociological Review articles

In [8]:
ASRpre1946 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/ASR pdf files/pre1946/'
ASRpre1946pdfs = os.listdir(ASRpre1946) # list of all the pdf files 
ASRpre1946pdfs.sort() # sort by year (and title)

ASRpost1946 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/ASR pdf files/post1946/'
ASRpost1946pdfs = os.listdir(ASRpost1946) # list of all the pdf files
ASRpost1946pdfs.sort()

# Convert the articles to strings... this is the time-consuming step
convertedStrings_ASRpre1946 = [PDFtoString(os.path.join(ASRpre1946, file)) for file in tqdm(ASRpre1946pdfs)]
convertedStrings_ASRpost1946 = [PDFtoString(os.path.join(ASRpost1946, file)) for file in tqdm(ASRpost1946pdfs)]

100%|██████████| 16/16 [00:13<00:00,  1.22it/s]
100%|██████████| 121/121 [04:08<00:00,  2.06s/it]


In [9]:
# Remove the double lines / extra space characters & footer download / use notice
convertedStrings_ASRpre1946 = [' '.join(re.split('\n\n+', string)) for string in convertedStrings_ASRpre1946]
convertedStrings_ASRpre1946 = [' '.join(re.split(r'This content downloaded from (.)+\n(.)+\x0c', string)) for string in convertedStrings_ASRpre1946]
#printMetaInfo(convertedStrings_ASRpre1946, pdfPaths = ASRpre1946pdfs, journal = 'ASR')

In [10]:
convertedStrings_ASRpost1946 = [' '.join(re.split('\n\n+', string)) for string in convertedStrings_ASRpost1946]
convertedStrings_ASRpost1946 = [' '.join(re.split(r'This content downloaded from (.)+\n(.)+\x0c', string)) for string in convertedStrings_ASRpost1946]
#printMetaInfo(convertedStrings_ASRpost1946, pdfPaths = ASRpost1946pdfs, journal = 'ASR')

In [11]:
out_ASRpre1946 = createOutputStrings(convertedStrings_ASRpre1946, folder = ASRpre1946pdfs, journal = 'ASR')
out_ASRpost1946 = createOutputStrings(convertedStrings_ASRpost1946, folder = ASRpost1946pdfs, journal = 'ASR')

writeOut(out_ASRpre1946, pdfFolder = ASRpre1946pdfs, outFolder = 'ASR_pre1946')
writeOut(out_ASRpost1946, pdfFolder = ASRpost1946pdfs, outFolder = 'ASR_post1946')

16it [00:00, 3318.11it/s]
121it [00:00, 3718.03it/s]

done!
done!





### Consolidating PyTesseract output

In [12]:
# First, for a few test cases - the first two files of AJS_pre1946
import shutil

tess_bF = 'tesseract-corpus-raw'
journalFolder= 'AJS_pre1946'

# For each folder in the journalFolders, find the out.txt file and copy it to the corpus w/ the name of the folder
for folder in os.listdir(os.path.join(tess_bF, journalFolder))[:2]: # First two files for now
    for file in os.listdir(os.path.join(tess_bF, journalFolder, folder)):
        if file[-4:] == '.txt':
            outFile = open(os.path.join('tesseract-corpus', journalFolder, folder) + '.txt', 'w')
            srcFile = open(os.path.join(tess_bF, journalFolder, folder, file), 'r')
            shutil.copyfileobj(srcFile, outFile)

In [13]:
# Then, cycle through each journal and do the same
journalFolders = ['AJS_pre1946', 'ASR_pre1946', 'ASR_post1946', 'AJS_1946to1966', 'AJS_post1971']

# For each folder in the journalFolders, find the out.txt file and copy it to the corpus w/ the name of the folder
for journal in journalFolders:
    for folder in os.listdir(os.path.join(tess_bF, journal)):
        if folder[0] != '.':
            for file in os.listdir(os.path.join(tess_bF, journal, folder)):
                if file[-4:] == '.txt':
                    outFile = open(os.path.join('tesseract-corpus', journal, folder) + '.txt', 'w')
                    srcFile = open(os.path.join(tess_bF, journal, folder, file), 'r')
                    shutil.copyfileobj(srcFile, outFile) 
                    
# Resulting output is the corpus .txt files in tesseract-corpus nicely sectioned into each of the journal folders

### Adding headers back into the PyTesseract output

In [19]:
os.listdir('tesseract-corpus')

for journal in journalFolders[:1]: # [AJS_pre1946, ASR_pre1946, ASR_post1946, AJS_1946to1966, AJS_post1971]
    convertedTxtFiles = os.listdir(os.path.join('tesseract-corpus', journal))
    convertedTxtFiles.sort()
    for indx, file in enumerate(convertedTxtFiles[:5]): # First five files
        print(indx, file)
        tess_file = open(os.path.join('tesseract-corpus', journal, file), 'r').read()

0 AJS_1896_1_4_Anti-Monopoly Legislation in the US.txt
1 AJS_1896_1_4_Business Men and Social Theorists.txt
2 AJS_1896_1_4_Christian Sociology.txt
3 AJS_1896_1_4_Contributions to Social Philosophy.txt
4 AJS_1896_1_4_Recent Sociological Tendencies in France.txt


In [32]:
# Example tesseract output text file
#print(convertedTxtFiles[0])
tess_file = open(os.path.join('tesseract-corpus', 'AJS_pre1946', convertedTxtFiles[0]), 'r').read()
print(tess_file)

Anti-Monopoly Legislation in the United States

Author(s): J. D. Forrest

Source: American Journal of Sociology, Vol. 1, No. 4 (Jan., 1896), pp. 411-425
Published by: The University of Chicago Press

Stable URL: http://www.jstor.org/stable/2761872

Accessed: 08-05-2016 22:57 UTC
Recent legislation against trusts, monopolistic corporations,
and railway consolidations is the modern expression of a senti-
ment which finds its roots far back in English history. Modern
monopolies bear little resemblance to those which existed prior
to the middle of this century; yet the opposition to the latter,
taking shape in constitutions and statutes, has largely influenced
public opinion against modern capitalistic combinations. Modern
monopolies are the outgrowth of industrial and economic condi-
tions, The older ones were arbitrarily created by kings for the
benefit of favorites or for purposes of revenue. It was but natural,
therefore, that such monopolies should be vigorously attacked
during the lo

### We want to repurpose the section headers - we don't care about the indices so much as we do about the words around it

In [49]:
journal = 'AJS'
folder = pre1946pdfs
string = convertedStrings_pre1946[1]
outString = ''
### First, look for the title words and tokenize them.
if journal == 'AJS':
    titles = [re.findall(r'(?<=AJS_\d{4}_\d{1,3}_\d{1,2}_).*(?=.pdf)', file_name)[0] for file_name in folder]
elif journal == 'ASR':
    titles = [re.findall(r'(?<=ASR_\d{4}_\d{1,3}_\d{1,2}_).*(?=.pdf)', file_name)[0] for file_name in folder]
title_words = [i for i in titles[indx].upper().split() if i not in stop_words and not i.isdigit()]

### Then add them to stop words & other hard-coded regularly occuring words
non_section_words = hard_coded_non_section_words + title_words + stop_words
matches = re.finditer(r'[A-Z]{4,}(\s+?[A-Z]{2,}){0,}', string)

if matches is not None:
    sections = dict()
    for match in matches:
        if match.group() not in non_section_words and match not in sections:
            sections[match.group()] = match.span()
for section in sections:
    outString += 'Section header "{}" found at {}'.format(section, sections[section]) + '\n'

In [53]:
%%time
spell = Speller('en')
spell(string)

CPU times: user 13.2 s, sys: 86.7 ms, total: 13.3 s
Wall time: 13.4 s


'      Business Men and Social Theorists\nAuthor(s): C. R. Henderson\nSource: American Journal of Sociology, Vol. 1, No. 4 (Jan., 1896), pp. 385-397\nPublished by: The University of Chicago Press\nStable URL: http://www.jstor.org/stable/2761870\nAccessed: 08-05-2016 22:26 UTC Your use of the STR archive indicates your acceptance of the Terms & Conditions of Use, available at http://about.jstor.org/terms STR is a not-for-profit service that helps scholars, researchers, and students discover, use, and build upon a wide range of content in a trusted digital archive. We use information technology and tools to increase productivity and facilitate new forms of scholarship. For more information about STR, please contact support@jstor.org. The University of Chicago Press is collaborating with STR to digitized, preserve and extend access to\nAmerican Journal of Sociology  C    THE AMERICAN  JOURNAL Of SOCIOLOGY  VOLUME I JANUARY, I896 NUMBER 4  BUSINESS EN AND SOCIAL THEORISTS.  REPRESENTATIVES

In [50]:
matches = re.finditer(r'[A-Z]{4,}(\s+?[A-Z]{2,}){0,}', string)
if matches is not None:
    sections = dict()
    for match in matches:
        if match.group() not in non_section_words and match not in sections:
            sections[match.group()] = match.span()
            print(match.group(), match.span())

AMERICAN  JOURNAL OF SOCIOLOGY  VOLUME (885, 923)
JANUARY (926, 933)
NUMBER (940, 946)
BUSINESS MEN AND SOCIAL THEORISTS (950, 983)
REPRESENTATIVES (986, 1001)
BUSINESS MEN AND SOCIAL THEORISTS (4722, 4755)
BUSINESS MEN AND SOCIAL THEORISTS (9141, 9174)
AMERICAN IOURNAL OF SOCIOLOGY (11412, 11441)
BUSINESS MEN AND SOCIAL THEORISTS (13673, 13706)
MERICAN JOURNAL OF SOCIOLOGY (16001, 16029)
BUSINESS MEN AND SOCIAL THEORVISTS (18316, 18350)
BUSINESS MEN (22983, 22995)
SOCIAL THEORISTS (23001, 23017)
BUSINESS MEN AND SOCIAL THEORISTS (27435, 27468)
HENDERSON (28668, 28677)


In [34]:
# Add section info 

        ### First, look for the title words and tokenize them.
        if journal == 'AJS':
            titles = [re.findall(r'(?<=AJS_\d{4}_\d{1,3}_\d{1,2}_).*(?=.pdf)', file_name)[0] for file_name in folder]
        elif journal == 'ASR':
            titles = [re.findall(r'(?<=ASR_\d{4}_\d{1,3}_\d{1,2}_).*(?=.pdf)', file_name)[0] for file_name in folder]
        title_words = [i for i in titles[indx].upper().split() if i not in stop_words and not i.isdigit()]

        ### Then add them to stop words & other hard-coded regularly occuring words
        non_section_words = hard_coded_non_section_words + title_words + stop_words
        matches = re.finditer(r'[A-Z]{4,}(\s+?[A-Z]{2,}){0,}', string)

        if matches is not None:
            sections = dict()
            for match in matches:
                if match.group() not in non_section_words and match not in sections:
                    sections[match.group()] = match.span()
        for section in sections:
            outString += 'Section header "{}" found at {}'.format(section, sections[section]) + '\n'

-------------
||Meta-info||
-------------
Anti-Monopoly Legislation in the United States
Author(s): J. D. Forrest
Source: American Journal of Sociology, Vol. 1, No. 4 (Jan., 1896), pp. 411-425
Published by: The University of Chicago Press
Stable URL: http://www.jstor.org/stable/2761872


Accessed: 08-05-2016 22:57 UTC
No abstract found.
No reference section found.
Section header "ANTI" found at (32901, 32905)
Section header "MONOPOLY LEGISLATION IN THE UNITED  STATES" found at (897, 939)
Section header "RECENT" found at (942, 948)
Section header "MONOPOLY LEGISLA TION" found at (32907, 32928)
Section header "XXIV" found at (7166, 7170)
Section header "MONOPOLY LEGISLATION" found at (9796, 9816)
Section header "MONOPOL" found at (18830, 18837)
Section header "LEGISLA TION" found at (28510, 28522)
Section header "XIII" found at (16283, 16287)
Section header "XVII" found at (16416, 16420)
Section header "HALLE" found at (21144, 21149)
Section header "MONVOPOL" found at (28499, 28507)
Sect

In [20]:
string = out_pre1946[0] # Example string

# Split string into header tag and main article
header_tag, article_header, main_article = re.split('(-----------\n\|\|Article\|\|\n-----------)', string)
article = article_header + main_article

section_pattern = re.compile(r'(?<=Section header ") ?([A-Z]*\s*){1,}.*(?=" found at)')
section_location = re.compile(r'(\d{1,}+, \d{1,}+)')
sections = dict()
for match_pair in zip(section_pattern.finditer(header_tag), section_location.finditer(header_tag)):
    sections[match_pair[0].group()] = [int(indx) for indx in match_pair[1].group().split(', ')]
for section, section_indices in sections.items():
    print(section, section_indices)

ANTI [32901, 32905]
MONOPOLY LEGISLATION IN THE UNITED  STATES [897, 939]
RECENT [942, 948]
MONOPOLY LEGISLA TION [32907, 32928]
XXIV [7166, 7170]
MONOPOLY LEGISLATION [9796, 9816]
MONOPOL [18830, 18837]
LEGISLA TION [28510, 28522]
XIII [16283, 16287]
XVII [16416, 16420]
HALLE [21144, 21149]
MONVOPOL [28499, 28507]
FORREST [33722, 33729]


In [223]:
for section, section_indices in sections.items():
    pattern = re.compile(section)
    smallest_indx_diff = 1e5
    for counter, match in enumerate(pattern.finditer(string)):
        print(counter, match)
        if np.abs(section_indices[0] - match.span()[0]) < smallest_indx_diff:
            smallest_indx_diff = np.abs(section_indices[0] - match.span()[0])
    for indx, match in enumerate(pattern.finditer(string)):
        if indx == counter:
            print(match.group())
            print(re.search(re.compile(match.group()), article))

0 <regex.Match object; span=(383, 387), match='ANTI'>
1 <regex.Match object; span=(1082, 1086), match='ANTI'>
2 <regex.Match object; span=(5328, 5332), match='ANTI'>
3 <regex.Match object; span=(9981, 9985), match='ANTI'>
4 <regex.Match object; span=(14525, 14529), match='ANTI'>
5 <regex.Match object; span=(24073, 24077), match='ANTI'>
6 <regex.Match object; span=(28683, 28687), match='ANTI'>
7 <regex.Match object; span=(33091, 33095), match='ANTI'>
ANTI
<regex.Match object; span=(36, 40), match='ANTI'>
0 <regex.Match object; span=(429, 471), match='MONOPOLY LEGISLATION IN THE UNITED  STATES'>
1 <regex.Match object; span=(1087, 1129), match='MONOPOLY LEGISLATION IN THE UNITED  STATES'>
MONOPOLY LEGISLATION IN THE UNITED  STATES
<regex.Match object; span=(41, 83), match='MONOPOLY LEGISLATION IN THE UNITED  STATES'>
0 <regex.Match object; span=(509, 515), match='RECENT'>
1 <regex.Match object; span=(1132, 1138), match='RECENT'>
RECENT
<regex.Match object; span=(86, 92), match='RECENT'>
0

In [224]:
re.search(re.compile(match.group()), article)

<regex.Match object; span=(32866, 32873), match='FORREST'>

In [200]:
pattern = re.compile('ANTI')
for match in pattern.finditer(string):
    print(match.group(), match.span(), match.span()[0] - len(header_tag), match.span()[1] - len(header_tag),
          match.span())

ANTI (383, 387) -663 -659
ANTI (1082, 1086) 36 40
ANTI (5328, 5332) 4282 4286
ANTI (9981, 9985) 8935 8939
ANTI (14525, 14529) 13479 13483
ANTI (24073, 24077) 23027 23031
ANTI (28683, 28687) 27637 27641
ANTI (33091, 33095) 32045 32049


In [225]:
highlight(re.compile('ANTI'), string)

-------------
||Meta-info||
-------------
Anti-Monopoly Legislation in the United States
Author(s): J. D. Forrest
Source: American Journal of Sociology, Vol. 1, No. 4 (Jan., 1896), pp. 411-425
Published by: The University of Chicago Press
Stable URL: http://www.jstor.org/stable/2761872


Accessed: 08-05-2016 22:57 UTC
No abstract found.
No reference section found.
Section header "[43m[1mANTI[0m" found at (32901, 32905)
Section header "MONOPOLY LEGISLATION IN THE UNITED  STATES" found at (897, 939)
Section header "RECENT" found at (942, 948)
Section header "MONOPOLY LEGISLA TION" found at (32907, 32928)
Section header "XXIV" found at (7166, 7170)
Section header "MONOPOLY LEGISLATION" found at (9796, 9816)
Section header "MONOPOL" found at (18830, 18837)
Section header "LEGISLA TION" found at (28510, 28522)
Section header "XIII" found at (16283, 16287)
Section header "XVII" found at (16416, 16420)
Section header "HALLE" found at (21144, 21149)
Section header "MONVOPOL" found at (28499

In [144]:
section_location = re.compile(r'(\d{1,}+, \d{1,}+)')
for match_pair in zip(section_pattern.finditer(header_tag), section_location.finditer(header_tag)):
    print(match_pair[0].group(), match_pair[1].group())

ANTI 32901, 32905
MONOPOLY LEGISLATION IN THE UNITED  STATES 897, 939
RECENT 942, 948
MONOPOLY LEGISLA TION 32907, 32928
XXIV 7166, 7170
MONOPOLY LEGISLATION 9796, 9816
MONOPOL 18830, 18837
LEGISLA TION 28510, 28522
XIII 16283, 16287
XVII 16416, 16420
HALLE 21144, 21149
MONVOPOL 28499, 28507
FORREST 33722, 33729


In [16]:
string = out_pre1946[-7] # Example string - 
header_tag = string[:re.search(r'-----------\n\|\|Article\|\|\n-----------', string).span()[0]]
print(header_tag)

pattern = re.compile(r'Section header "?([A-Z]*\s*){1,}.*')
matches = pattern.finditer(header_tag)
for match in matches:
    print(match.group())

-------------
||Meta-info||
-------------
International Peace-By Court or Government?
Author(s): Hans Kelsen
Source: American Journal of Sociology, Vol. 46, No. 4 (Jan., 1941), pp. 571-581
Published by: The University of Chicago Press
Stable URL: http://www.jstor.org/stable/2769924


Accessed: 03-06-2016 20:16 UTC
Abstract match at (948, 956)
No reference section found.
Section header "INTERNATIONAL PEACE" found at (25179, 25198)
Section header "COURT OR  GOVERNMENT" found at (911, 931)
Section header "HANS KELSEN  ABSTRACT" found at (935, 956)
Section header "AMERICAN JOURNAL OF SOCIOLOGY  III" found at (8401, 8435)
Section header "VIII" found at (19535, 19539)
Section header "HARVARD UNIVERSITY" found at (25795, 25813)


Section header "INTERNATIONAL PEACE" found at (25179, 25198)
Section header "COURT OR  GOVERNMENT" found at (911, 931)
Section header "HANS KELSEN  ABSTRACT" found at (935, 956)
Section header "AMERICAN JOURNAL OF SOCIOLOGY  III" found at (8401, 8435)
Section header 

In [499]:
non_section_words = hard_coded_non_section_words + stop_words
lookback = 30

for string in out_pre1946:
    header_tag = string[:re.search(r'-----------\n\|\|Article\|\|\n-----------', string).span()[0]]
    article = string[re.search(r'-----------\n\|\|Article\|\|\n-----------', string).span()[0]:]
    pattern = re.compile(r'Section header "?([A-Z]*\s*){1,}.*')
    
    matches = re.finditer(r'[A-Z]{4,}(\s+?[A-Z]{2,}){0,}', header_tag)
    for match in matches:
        if match.group() not in non_section_words and match not in sections:
            sections[match.group()] = match.span()
    compareIndices = list()
    for section in sections:
        match_indx = sections[section][0]
        match_spans = dict()
        substring = article[match_indx - lookback:match_indx]
        for indx, word in enumerate(substring.split()):
            print(substring.split())
            matches = re.finditer(word, string)
            match_spans[word] = np.array([match.span() for match in matches])
        for word in match_spans:
            counter = match_spans[word].shape[0] - 1
            if counter != -1:
                compareIndices.append(match_spans[word][counter])

    start_indices, end_indices = list(), list()
    for i in range(1, len(compareIndices)):
        if compareIndices[i-1][1] + 1 == compareIndices[i][0]:
            start_indx = compareIndices[i-1][0]
            end_indx = compareIndices[i][1]

            start_indices.append(start_indx)
            end_indices.append(end_indx)
    for indx1 in start_indices:
        for indx2 in end_indices:
            if indx2 - indx1 > 10 and indx2 - indx1 < 30:
                unique_identifier = string[indx1:indx2]

    print('Tag match found: {}'.format(re.search(unique_identifier, tess_file)))

['this', 'century;', 'yet', 'the', 'opposit']
['this', 'century;', 'yet', 'the', 'opposit']
['this', 'century;', 'yet', 'the', 'opposit']
['this', 'century;', 'yet', 'the', 'opposit']
['this', 'century;', 'yet', 'the', 'opposit']
['e', 'in', 'constitutions', 'and', 'statute']
['e', 'in', 'constitutions', 'and', 'statute']
['e', 'in', 'constitutions', 'and', 'statute']
['e', 'in', 'constitutions', 'and', 'statute']
['e', 'in', 'constitutions', 'and', 'statute']
['ublic', 'opinion', 'against', 'modern', 'c']
['ublic', 'opinion', 'against', 'modern', 'c']
['ublic', 'opinion', 'against', 'modern', 'c']
['ublic', 'opinion', 'against', 'modern', 'c']
['ublic', 'opinion', 'against', 'modern', 'c']
['ern', 'monopolies', 'are', 'the', 'outgro']
['ern', 'monopolies', 'are', 'the', 'outgro']
['ern', 'monopolies', 'are', 'the', 'outgro']
['ern', 'monopolies', 'are', 'the', 'outgro']
['ern', 'monopolies', 'are', 'the', 'outgro']
['and', 'economic', 'condi-', 'tions.', 'Th']
['and', 'economic', 'con

['latter,', 'taking', 'shape', 'in', 'cons']
['tes,', 'has', 'largely', 'influenced', 'p']
['tes,', 'has', 'largely', 'influenced', 'p']
['tes,', 'has', 'largely', 'influenced', 'p']
['tes,', 'has', 'largely', 'influenced', 'p']
['tes,', 'has', 'largely', 'influenced', 'p']
['onopolies', 'are', 'the', 'outgrowth', 'of']
['onopolies', 'are', 'the', 'outgrowth', 'of']
['onopolies', 'are', 'the', 'outgrowth', 'of']
['onopolies', 'are', 'the', 'outgrowth', 'of']
['onopolies', 'are', 'the', 'outgrowth', 'of']
['by', 'kings', 'for', 'the', 'benefit', 'of', 'f']
['by', 'kings', 'for', 'the', 'benefit', 'of', 'f']
['by', 'kings', 'for', 'the', 'benefit', 'of', 'f']
['by', 'kings', 'for', 'the', 'benefit', 'of', 'f']
['by', 'kings', 'for', 'the', 'benefit', 'of', 'f']
['by', 'kings', 'for', 'the', 'benefit', 'of', 'f']
['by', 'kings', 'for', 'the', 'benefit', 'of', 'f']
['was', 'but', 'natural,', 'therefore,']
['was', 'but', 'natural,', 'therefore,']
['was', 'but', 'natural,', 'therefore,']
['w

['r', 'many', 'years,', 'burst', 'forth', 'in', 'i']
['lence', 'that', 'Elizabeth', 'yielded', 'h']
['lence', 'that', 'Elizabeth', 'yielded', 'h']
['lence', 'that', 'Elizabeth', 'yielded', 'h']
['lence', 'that', 'Elizabeth', 'yielded', 'h']
['lence', 'that', 'Elizabeth', 'yielded', 'h']
['The', 'evil', 'flourished', 'again', 'unde']
['The', 'evil', 'flourished', 'again', 'unde']
['The', 'evil', 'flourished', 'again', 'unde']
['The', 'evil', 'flourished', 'again', 'unde']
['The', 'evil', 'flourished', 'again', 'unde']
['parliament', 'passed', 'an', 'act', 'decla']
['parliament', 'passed', 'an', 'act', 'decla']
['parliament', 'passed', 'an', 'act', 'decla']
['parliament', 'passed', 'an', 'act', 'decla']
['parliament', 'passed', 'an', 'act', 'decla']
['certain', 'For', 'a', 'description', 'of']
['certain', 'For', 'a', 'description', 'of']
['certain', 'For', 'a', 'description', 'of']
['certain', 'For', 'a', 'description', 'of']
['certain', 'For', 'a', 'description', 'of']
['ee', 'Hallam,',

['hts', 'was', 'the', 'model', 'for', 'the', 'sect']
['hts', 'was', 'the', 'model', 'for', 'the', 'sect']
['titutions', 'bearing', 'the', 'same', 'tit']
['titutions', 'bearing', 'the', 'same', 'tit']
['titutions', 'bearing', 'the', 'same', 'tit']
['titutions', 'bearing', 'the', 'same', 'tit']
['titutions', 'bearing', 'the', 'same', 'tit']
['was', 'that', 'the', 'people', 'had', 'a', 'rig']
['was', 'that', 'the', 'people', 'had', 'a', 'rig']
['was', 'that', 'the', 'people', 'had', 'a', 'rig']
['was', 'that', 'the', 'people', 'had', 'a', 'rig']
['was', 'that', 'the', 'people', 'had', 'a', 'rig']
['was', 'that', 'the', 'people', 'had', 'a', 'rig']
['was', 'that', 'the', 'people', 'had', 'a', 'rig']
['persons,', 'property,', 'and', 'privil']
['persons,', 'property,', 'and', 'privil']
['persons,', 'property,', 'and', 'privil']
['persons,', 'property,', 'and', 'privil']
['were', 'made', 'by', 'all', 'the', 'colonie']
['were', 'made', 'by', 'all', 'the', 'colonie']
['were', 'made', 'by', 'all

['in', 'New', 'York', 'and', 'published', 'a', 'd']
['people', 'to', 'inherent', 'privileges']
['people', 'to', 'inherent', 'privileges']
['people', 'to', 'inherent', 'privileges']
['people', 'to', 'inherent', 'privileges']
['nies', 'except', 'Georgia', 'met', 'in', 'th']
['nies', 'except', 'Georgia', 'met', 'in', 'th']
['nies', 'except', 'Georgia', 'met', 'in', 'th']
['nies', 'except', 'Georgia', 'met', 'in', 'th']
['nies', 'except', 'Georgia', 'met', 'in', 'th']
['nies', 'except', 'Georgia', 'met', 'in', 'th']
['tal', 'Congress', 'and', 'issued', 'a', 'decl']
['tal', 'Congress', 'and', 'issued', 'a', 'decl']
['tal', 'Congress', 'and', 'issued', 'a', 'decl']
['tal', 'Congress', 'and', 'issued', 'a', 'decl']
['tal', 'Congress', 'and', 'issued', 'a', 'decl']
['tal', 'Congress', 'and', 'issued', 'a', 'decl']
['e', 'English', 'model', 'and', 'explicitly']
['e', 'English', 'model', 'and', 'explicitly']
['e', 'English', 'model', 'and', 'explicitly']
['e', 'English', 'model', 'and', 'explici

['pposition', 'to', 'the', 'latter,', 'taki']
['pposition', 'to', 'the', 'latter,', 'taki']
['nfluenced', 'public', 'opinion', 'agai']
['nfluenced', 'public', 'opinion', 'agai']
['nfluenced', 'public', 'opinion', 'agai']
['nfluenced', 'public', 'opinion', 'agai']
['listic', 'combinations.', 'Modern', 'm']
['listic', 'combinations.', 'Modern', 'm']
['listic', 'combinations.', 'Modern', 'm']
['listic', 'combinations.', 'Modern', 'm']
['d', 'prior', 'to', 'the', 'middle', 'of', 'this']
['d', 'prior', 'to', 'the', 'middle', 'of', 'this']
['d', 'prior', 'to', 'the', 'middle', 'of', 'this']
['d', 'prior', 'to', 'the', 'middle', 'of', 'this']
['d', 'prior', 'to', 'the', 'middle', 'of', 'this']
['d', 'prior', 'to', 'the', 'middle', 'of', 'this']
['d', 'prior', 'to', 'the', 'middle', 'of', 'this']
['nst', 'modern', 'capitalistic', 'combin']
['nst', 'modern', 'capitalistic', 'combin']
['nst', 'modern', 'capitalistic', 'combin']
['nst', 'modern', 'capitalistic', 'combin']
['ndustrial', 'and', 'ec

['ter,', 'taking', 'shape', 'in', 'constitu']
['tutes,', 'has', 'largely', 'influenced']
['tutes,', 'has', 'largely', 'influenced']
['tutes,', 'has', 'largely', 'influenced']
['tutes,', 'has', 'largely', 'influenced']
['against', 'modern', 'capitalistic', 'c']
['against', 'modern', 'capitalistic', 'c']
['against', 'modern', 'capitalistic', 'c']
['against', 'modern', 'capitalistic', 'c']
['s', 'are', 'the', 'outgrowth', 'of', 'industr']
['s', 'are', 'the', 'outgrowth', 'of', 'industr']
['s', 'are', 'the', 'outgrowth', 'of', 'industr']
['s', 'are', 'the', 'outgrowth', 'of', 'industr']
['s', 'are', 'the', 'outgrowth', 'of', 'industr']
['s', 'are', 'the', 'outgrowth', 'of', 'industr']
['trarily', 'created', 'by', 'kings', 'for', 't']
['trarily', 'created', 'by', 'kings', 'for', 't']
['trarily', 'created', 'by', 'kings', 'for', 't']
['trarily', 'created', 'by', 'kings', 'for', 't']
['trarily', 'created', 'by', 'kings', 'for', 't']
['trarily', 'created', 'by', 'kings', 'for', 't']
['purposes

['g', 'opposed', 'to', 'the', 'natural', 'freed']
['g', 'opposed', 'to', 'the', 'natural', 'freed']
['benefit', 'of', 'favorites', 'or', 'for', 'p']
['benefit', 'of', 'favorites', 'or', 'for', 'p']
['benefit', 'of', 'favorites', 'or', 'for', 'p']
['benefit', 'of', 'favorites', 'or', 'for', 'p']
['benefit', 'of', 'favorites', 'or', 'for', 'p']
['benefit', 'of', 'favorites', 'or', 'for', 'p']
['e', 'Great', 'Charter', 'which', 'guarante']
['e', 'Great', 'Charter', 'which', 'guarante']
['e', 'Great', 'Charter', 'which', 'guarante']
['e', 'Great', 'Charter', 'which', 'guarante']
['e', 'Great', 'Charter', 'which', 'guarante']
['clauses,"', 'as', 'Hallam', 'terms', 'th']
['clauses,"', 'as', 'Hallam', 'terms', 'th']
['clauses,"', 'as', 'Hallam', 'terms', 'th']
['clauses,"', 'as', 'Hallam', 'terms', 'th']
['clauses,"', 'as', 'Hallam', 'terms', 'th']
['taking', 'shape', 'in', 'constitutions']
['taking', 'shape', 'in', 'constitutions']
['taking', 'shape', 'in', 'constitutions']
['taking', 'shape

['.', 'Modern', 'monopolies', 'are', 'the', 'o']
['.', 'Modern', 'monopolies', 'are', 'the', 'o']
['.', 'Modern', 'monopolies', 'are', 'the', 'o']
['.', 'Modern', 'monopolies', 'are', 'the', 'o']
['.', 'Modern', 'monopolies', 'are', 'the', 'o']
['.', 'Modern', 'monopolies', 'are', 'the', 'o']
['created', 'by', 'kings', 'for', 'the', 'bene']
['created', 'by', 'kings', 'for', 'the', 'bene']
['created', 'by', 'kings', 'for', 'the', 'bene']
['created', 'by', 'kings', 'for', 'the', 'bene']
['created', 'by', 'kings', 'for', 'the', 'bene']
['created', 'by', 'kings', 'for', 'the', 'bene']
['nue.', 'It', 'was', 'but', 'natural,', 'ther']
['nue.', 'It', 'was', 'but', 'natural,', 'ther']
['nue.', 'It', 'was', 'but', 'natural,', 'ther']
['nue.', 'It', 'was', 'but', 'natural,', 'ther']
['nue.', 'It', 'was', 'but', 'natural,', 'ther']
['nue.', 'It', 'was', 'but', 'natural,', 'ther']
['ously', 'attacked', 'during', 'the', 'lon']
['ously', 'attacked', 'during', 'the', 'lon']
['ously', 'attacked', 'dur

['re', 'common', 'necessaries', 'of', 'life,']
[';', 'yet', 'the', 'opposition', 'to', 'the', 'la']
[';', 'yet', 'the', 'opposition', 'to', 'the', 'la']
[';', 'yet', 'the', 'opposition', 'to', 'the', 'la']
[';', 'yet', 'the', 'opposition', 'to', 'the', 'la']
[';', 'yet', 'the', 'opposition', 'to', 'the', 'la']
[';', 'yet', 'the', 'opposition', 'to', 'the', 'la']
[';', 'yet', 'the', 'opposition', 'to', 'the', 'la']
['n', 'against', 'modern', 'capitalistic']
['n', 'against', 'modern', 'capitalistic']
['n', 'against', 'modern', 'capitalistic']
['n', 'against', 'modern', 'capitalistic']
['l', 'and', 'economic', 'condi-', 'tions.']
['l', 'and', 'economic', 'condi-', 'tions.']
['l', 'and', 'economic', 'condi-', 'tions.']
['l', 'and', 'economic', 'condi-', 'tions.']
['l', 'and', 'economic', 'condi-', 'tions.']
['the', 'benefit', 'of', 'favorites', 'or']
['the', 'benefit', 'of', 'favorites', 'or']
['the', 'benefit', 'of', 'favorites', 'or']
['the', 'benefit', 'of', 'favorites', 'or']
['the', '

error: unbalanced parenthesis at position 3

In [511]:
lookback = 30
compareIndices = list()
for string in out_pre1946:
    # Split article into header tag containing meta-information and the main article 
    header_tag = string[:re.search(r'-----------\n\|\|Article\|\|\n-----------', string).span()[0]]
    article = string[re.search(r'-----------\n\|\|Article\|\|\n-----------', string).span()[0]:]
    
    # Look for sections within the article
    matches = re.finditer(r'[A-Z]{4,}(\s+?[A-Z]{2,}){0,}', string)
    sections = dict()
    for match in matches:
        if match.group() not in non_section_words and match not in sections:
            sections[match.group()] = match.span()
    for section in sections:
        match_indx = sections[section][0]
        match_spans = dict()
        substring = string[match_indx - lookback:match_indx]
        print(substring.split())
        #for indx, word in enumerate(substring.split()):
        #    matches = re.finditer(word, string)
        #    match_spans[word] = np.array([match.span() for match in matches])
        #for word in match_spans:
        #    counter = match_spans[word].shape[0] - 1
        #    if counter != -1:
        #        compareIndices.append(match_spans[word][counter])

['itutional', 'Law,"', 'p.', '247.', 'C']
['||Article||', '-----------', 'ANTI-']
['ATION', 'IN', 'THE', 'UNITED', 'STATES.']
['nal', 'Law,"', 'p.', '247.', 'C', 'ANTI-']
['ave', 'made', 'this', 'dec-', '1', 'Lecture']
['ey', 'are', 'supposed', 'to', 'C', 'ANTI-']
[',', 'Trusts,', 'p.', '17.', 'C', 'A', 'NTI-']
['asso-', 'C', 'ANTI-', 'MONVOPOL', 'Y']
['2', 'Art.', 'XI.,', 'Sec.', 'i1.', '1o', 'Art.']
['5', 'Art.', 'XI.,', 'Sec.', '3.', "'3", 'Art.']
['ncile', 'the', 'two', 'statements.', 'VON']
['cooperative', 'asso-', 'C', 'ANTI-']
['d', 'controlled', 'by', 'public', 'J.', 'D.']
['-', '||Article||', '-----------', 'THE']
['OURNAL', 'OF', 'SOCIOLOGY', 'VOLUME', 'I']
['OLOGY', 'VOLUME', 'I', 'JANUARY,', 'I896']
['t', 'of', 'human', 'life', 'will', 'be', 'C']
['SS', 'MEN', 'AND', 'SOCIAL', 'THEORISTS.']
['literary', 'skill,', 'C', '390', 'THE']
['p.', 'i', 'o-i', 'I', 'I.', 'C', '392', 'THE', 'A']
['in', 'the', 'issue', 'of', 'social', 'C']
['ght', 'and', 'protection,', 'and', 'C']
[','

In [477]:
lookback = 30
compareIndices = list()
for section in sections:
    match_indx = sections[section][0]
    match_spans = dict()
    substring = string[match_indx - lookback:match_indx]
    for indx, word in enumerate(substring.split()):
        matches = re.finditer(word, string)
        match_spans[word] = np.array([match.span() for match in matches])
    for word in match_spans:
        counter = match_spans[word].shape[0] - 1
        if counter != -1:
            compareIndices.append(match_spans[word][counter])

start_indices, end_indices = list(), list()
for i in range(1, len(compareIndices)):
    if compareIndices[i-1][1] + 1 == compareIndices[i][0]:
        start_indx = compareIndices[i-1][0]
        end_indx = compareIndices[i][1]

        start_indices.append(start_indx)
        end_indices.append(end_indx)
for indx1 in start_indices:
    for indx2 in end_indices:
        if indx2 - indx1 > 10 and indx2 - indx1 < 30:
            unique_identifier = string[indx1:indx2]

print('Tag match found: {}'.format(re.search(unique_identifier, tess_file)))

Tag match found: <regex.Match object; span=(19611, 19629), match='ideology of power.'>
