### Imports, Reading in files & Defining functions

In [1]:
import os
import re
import io
from tqdm import tqdm
from termcolor import colored
from colorama import Back, Style

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


def PDFtoString(filePath, pdfFolder=None):
    """Extract the text of a PDF file and return it as a single string.

    Args:
        filePath: Path of the PDF to read, or a file name to be joined onto
            ``pdfFolder`` when that argument is given.
        pdfFolder: Optional folder that is joined in front of ``filePath``.

    Returns:
        The text the pdfminer ``TextConverter`` wrote for all pages, in the
        order the pages are yielded by ``PDFPage.create_pages``.
    """
    if pdfFolder is not None:
        filePath = os.path.join(pdfFolder, filePath)

    out = io.StringIO()
    with open(filePath, 'rb') as f:
        doc = PDFDocument(PDFParser(f))
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, out, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
        finally:
            # Release the converter even when a page fails to parse.
            device.close()

    return out.getvalue()

def printMetaInfo(convertedStrings, pdfPaths):
    """Print a coloured summary of the marker matches found in each article.

    For every string in ``convertedStrings`` this prints its index and the
    entry of ``pdfPaths`` at the same index, followed by: the first
    ``Accessed: ... UTC`` stamp (only when one is present), the result of
    searching for ``ABSTRACT``, every ``CHART`` and ``TABLE`` match, and the
    result of searching for ``REFERENCE``.

    Args:
        convertedStrings: Article texts to scan.
        pdfPaths: Names to print next to each article; indexed in parallel
            with ``convertedStrings``.
    """
    # (regex, label) pairs reported once per occurrence, in this order.
    repeated_markers = ((r'CHART', 'Chart match:'), (r'TABLE', 'Table match:'))

    for position, text in enumerate(convertedStrings):
        print(position, colored(pdfPaths[position], 'red'))

        stamp = re.search(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', text)
        if stamp is not None:
            print('\t', colored(stamp.group(0), 'blue'))

        abstract_hit = re.search(r'ABSTRACT', text)
        print('\t', colored('Abstract match:', 'magenta'), colored(abstract_hit, 'magenta'))

        for marker, label in repeated_markers:
            for hit in re.finditer(marker, text):
                print('\t', colored(label, 'green'), colored(hit, 'green'))

        reference_hit = re.search(r'REFERENCE', text)
        print('\t', colored('Reference match:', 'magenta'), colored(reference_hit, 'magenta'))
    
def createOutputFile(convertedStrings):
    """Prefix each article with a meta-info block describing its markers.

    The meta-info block lists the first ``Accessed: ... UTC`` stamp, the span
    of the first ``ABSTRACT`` match, the span of every ``CHART`` and ``TABLE``
    match, and the span of the first ``REFERENCES`` match. A placeholder line
    is written for the stamp, abstract and references when they are absent.

    Args:
        convertedStrings: Iterable of article texts.

    Returns:
        A list with one output string per article: the meta-info block, an
        ``||Article||`` separator, then the unmodified article text.
    """
    access_re = re.compile(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC')
    out = []
    for string in convertedStrings:
        parts = ['-------------\n||Meta-info||\n-------------\n']

        # An article without the stamp must not abort the whole batch.
        accessed = access_re.search(string)
        if accessed is not None:
            parts.append(accessed.group(0) + '\n')
        else:
            parts.append('No access date found.\n')

        abstract = re.search(r'ABSTRACT', string)
        if abstract is not None:
            parts.append('Abstract match at {}'.format(abstract.span()) + '\n')
        else:
            parts.append('No abstract found.\n')

        for match in re.finditer(r'CHART', string):
            parts.append('Chart match at {}'.format(match.span()) + '\n')

        # Tables get their own label so they are not counted as charts.
        for match in re.finditer(r'TABLE', string):
            parts.append('Table match at {}'.format(match.span()) + '\n')

        references = re.search(r'REFERENCES', string)
        if references is not None:
            parts.append('Reference match at {}'.format(references.span()) + '\n')
        else:
            parts.append('No reference section found.\n')

        parts.append('\n-----------\n||Article||\n-----------\n')
        parts.append(string)
        out.append(''.join(parts))
    return out

def highlight(pattern, text, print_output=True):
    """Wrap every match of ``pattern`` in ``text`` with highlight codes.

    Each match is preceded by ``Back.YELLOW + Style.BRIGHT`` and followed by
    ``Style.RESET_ALL``; the text between matches is left untouched.

    Args:
        pattern: Compiled regular expression (anything with ``finditer``).
        text: The string to search.
        print_output: When true the result is printed and nothing is
            returned; otherwise the highlighted string is returned.

    Returns:
        The highlighted string when ``print_output`` is false, else ``None``.
    """
    pieces = []
    cursor = 0  # index in ``text`` just past the previous match
    for hit in pattern.finditer(text):
        begin, stop = hit.span()
        pieces.append(text[cursor:begin])
        pieces.append(Back.YELLOW + Style.BRIGHT + text[begin:stop] + Style.RESET_ALL)
        cursor = stop
    pieces.append(text[cursor:])
    output = ''.join(pieces)

    if not print_output:
        return output
    print(output)

### Converting AJS articles into strings (this is the time consuming step)

In [2]:
%%time
# AJS articles - split into 3 periods

# Folders holding the PDFs of each period
pre1946 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/AJS pdf files/pre1946/'
l946to1966 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/AJS pdf files/1946to1966/'
post1971 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/AJS pdf files/post1971/'

# Names of all files in each folder, sorted (by year and title, given the file naming)
pre1946pdfs = sorted(os.listdir(pre1946))
l946to1966pdfs = sorted(os.listdir(l946to1966))
post1971pdfs = sorted(os.listdir(post1971))

# Convert the articles to strings, one list per period, with a tqdm progress bar.
# This is the time-consuming step.
convertedStrings_pre1946 = [PDFtoString(os.path.join(pre1946, name)) for name in tqdm(pre1946pdfs)]
convertedStrings_1946to1966 = [PDFtoString(os.path.join(l946to1966, name)) for name in tqdm(l946to1966pdfs)]
convertedStrings_post1971 = [PDFtoString(os.path.join(post1971, name)) for name in tqdm(post1971pdfs)]

100%|██████████| 63/63 [01:29<00:00,  1.41s/it]
100%|██████████| 37/37 [00:46<00:00,  1.26s/it]
100%|██████████| 68/68 [02:56<00:00,  2.60s/it]

CPU times: user 4min 57s, sys: 3.5 s, total: 5min 1s
Wall time: 5min 12s





### String pre-processing

In [3]:
# Remove the double lines / extra space characters: every run of two or more
# newlines becomes a single space.
convertedStrings_pre1946 = [re.sub(r'\n\n+', ' ', string) for string in convertedStrings_pre1946]

# Footer download / use notice: "This content downloaded from ...", the line
# after it, and the form feed that ends it. The pattern deliberately has no
# capturing groups: re.split() re-inserts captured text into its result, which
# left stray characters in the cleaned articles.
pattern = re.compile(r'This content downloaded from .+\n.+\x0c')
highlight(pattern, convertedStrings_pre1946[4])  # eyeball what will be removed
convertedStrings_pre1946 = [pattern.sub(' ', string) for string in convertedStrings_pre1946]
printMetaInfo(convertedStrings_pre1946, pdfPaths = pre1946pdfs)
#out = createOutputFile(convertedStrings_pre1946)

      Recent Sociological Tendencies in France
Author(s): James H. Tufts
Source: American Journal of Sociology, Vol. 1, No. 4 (Jan., 1896), pp. 446-456
Published by: The University of Chicago Press
Stable URL: http://www.jstor.org/stable/2761875
Accessed: 08-05-2016 23:09 UTC Your use of the JSTOR archive indicates your acceptance of the Terms & Conditions of Use, available at http://about.jstor.org/terms JSTOR is a not-for-profit service that helps scholars, researchers, and students discover, use, and build upon a wide range of content in a trusted digital archive. We use information technology and tools to increase productivity and facilitate new forms of scholarship. For more information about JSTOR, please contact support@jstor.org. The University of Chicago Press is collaborating with JSTOR to digitize, preserve and extend access to
American Journal of Sociology [43m[1mThis content downloaded from 170.140.105.123 on Sun, 08 May 2016 23:09:52 UTC
All use subject to http://about.

In [None]:
# Try splitting one sample article (index 50) into JSTOR header and article body.
string = convertedStrings_pre1946[50]
# header: everything up to the end of the first match of 'extend access to\n'
# plus the shortest run of characters that is followed by a 'C'.
header = string[:re.search(r'extend access to\n(.)+?(?=C)', string).end()]
# article: everything after that same match extended by the shortest further run
# that is followed by two or more word characters.
# Both lines raise AttributeError if the pattern is not found (re.search returns None).
article = string[re.search(r'extend access to\n(.)+?(?=C)(.)+?(?=\w{2,})', string).end():]

In [None]:
# Highlight in `string` the span consumed by the pattern used to locate the
# start of the article, to check visually what it matches.
p = re.compile(r'extend access to\n(.)+?(?=C)(.)+?(?=\w{2,})')
#re.search(r'\w{2,}', article)
highlight(p, string)

In [None]:
# Show everything after 'extend access to\n' and the rest of the line following
# it ('.' does not match a newline, so the greedy group stops at the line end).
string[re.search(r'extend access to\n(.)+', string).end():]

In [None]:
#At some point, I'll want to use this to find section headers
#re.findall(r'[A-Z]{2,}', string) #Find all the all caps characters
#[i for i in re.findall(r'[A-Z]{2,}', string) if i not in ['URL', 'UTC', 'JSTOR']]

### 1946 to 1966 | AJS articles

In [None]:
# This will be used on the output strings in the input analysis text to separate
# the meta-info from the article itself.
# The separator is split on as a literal string: passed to re.split() the '|'
# characters are regex alternation (with empty alternatives), so the text was
# not cut at the separator. maxsplit=1 guarantees exactly two pieces.
meta_info, article = out[1].split('\n-----------\n||Article||\n-----------\n', 1)

#re.search(r'Abstract match at \(\d*, \d*\)', meta_info)
re.findall(r'Chart match at \(\d*, \d*\)', meta_info)

In [None]:
%%time
#re.findall(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string)
# Convert every PDF in pdfFolder and report its access stamp, abstract and charts.
for indx, file in enumerate(pdfPaths):
    print(indx, colored(file, 'red'))
    # PDFtoString joins pdfFolder onto the name itself; joining it here as well
    # only worked because pdfFolder is an absolute path.
    string = PDFtoString(file, pdfFolder = pdfFolder)
    # Only print the stamp when the article has one (indexing [0] on an empty
    # findall result would raise IndexError and stop the loop).
    accessed = re.search(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string)
    if accessed is not None:
        print(colored(accessed.group(0), 'blue'))
    print(colored('Abstract match:', 'magenta'), colored(re.search(r'ABSTRACT', string), 'magenta'))
    matches = re.finditer(r'CHART', string)
    for match in matches:
        print(colored('Chart match:', 'green'), colored(match, 'green'))

In [None]:
#Define the folder in which all the pdfs are in 
# - assuming the type of analysis needs to be different depending on the journal / time period

# Laptop file paths
#baseFolder = '/home/ishi/Desktop/franzosi/Language-of-Science/articles/AJS pdf files/1946to1966'
#pdfFolder = '/Users/saranmedical-smile/Desktop/Language-of-Science/articles/AJS pdf files/1946to1966'
#txtFolder = '/Users/saranmedical-smile/Desktop/Language-of-Science/corpus/AJS_1946to1966'

# iMacPro file paths
pdfFolder = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/AJS pdf files/1946to1966'
txtFolder = '/Users/Praveens/Desktop/ishan/Language-of-Science/corpus/AJS_1946to1966'

# Sorted names of everything in the txt and pdf folders
txtPaths = sorted(os.listdir(txtFolder))
pdfPaths = sorted(os.listdir(pdfFolder))

# Preview the first five pdf names
for indx, f in enumerate(pdfPaths[:5]):
    print(colored(indx, 'green'), colored(f, 'magenta'), '\n')

# Example output of PDFtoString, kept in `string` for the cells below
string = PDFtoString(pdfPaths[35], pdfFolder = pdfFolder)

### Print the corpus file

In [None]:
# NOTE(review): this cell referred to `baseFolder` and `filePaths`, neither of
# which is defined in any active cell; the corpus folder/listing defined in the
# setup cell (`txtFolder`, `txtPaths`) appear to be what was meant - confirm.
with open(os.path.join(txtFolder, txtPaths[6]), 'r') as f:
    data = f.read()
print(data[:1000])# The header looks terrible - might make sense to use pdf2text just to extract header info

### Regex Method 1 - re.compile.finditer(string)

This yields match objects, each carrying the indices (span) at which the pattern occurs

In [None]:
# A digit, an optional space, a space, then a word character.
pattern = re.compile(r'\d[ ]? \w') #Important to start regex expression with r --> raw string
# finditer returns a lazy iterator of match objects over `string`
matches = pattern.finditer(string)
for match in matches:
    pass#print(match)

### Regex Method 2 - re.findall(pattern, string)

This returns the matched substrings themselves

In [None]:
%%time
#re.findall(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string)
# Convert every PDF in pdfFolder and report its access stamp, abstract and charts.
for indx, file in enumerate(pdfPaths):
    print(indx, colored(file, 'red'))
    string = PDFtoString(os.path.join(pdfFolder, file))
    # Only print the stamp when the article has one (indexing [0] on an empty
    # findall result would raise IndexError and stop the loop).
    accessed = re.search(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string)
    if accessed is not None:
        print(colored(accessed.group(0), 'blue'))
    print(colored('Abstract match:', 'magenta'), colored(re.search(r'ABSTRACT', string), 'magenta'))
    matches = re.finditer(r'CHART', string)
    for match in matches:
        print(colored('Chart match:', 'green'), colored(match, 'green'))

### Split string with re.split(pattern, string)

In [None]:
# Access timestamp, e.g. "Accessed: 08-05-2016 23:09 UTC". The whole pattern is
# one capturing group so that re.split() keeps the matched stamp in its result.
access_ts = r'(Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC)'
#JSTOR_msg = r'\n \nREFERENCES \nLinked references are available on JSTOR for this article:\nhttp://www.jstor.org/stable/2774936?seq=1&cid=pdf-reference#references_tab_contents \nYou may need to log in to JSTOR to access the linked references.\n \nYour use of the JSTOR archive indicates your acceptance of the Terms & Conditions of Use, available at\n\nhttp://about.jstor.org/terms\n\n \n\nJSTOR is a not-for-profit service that helps scholars, researchers, and students discover, use, and build upon a wide range of content in a trusted\n\ndigital archive. We use information technology and tools to increase productivity and facilitate new forms of scholarship. For more information about\n\nJSTOR, please contact support@jstor.org.\n\nThe University of Chicago Press is collaborating with JSTOR to digitize, preserve and extend access to\nAmerican Journal of Sociology\n\n'
# Timestamp of the form "Sun, 08 May 2016 23:09:52 UTC" (single spaces between fields)
JSTOR_msg = r'[A-Za-z]*, [0-9]* [A-Za-z]* [0-9]* [0-9][0-9]:[0-9][0-9]:[0-9][0-9] UTC'
# Split on the access stamp; because access_ts is a capturing group the stamp
# itself is kept as an element of the result.
str_split = re.split(access_ts, string)
# 0 --> Header w/out access date; 1 --> access_ts; 2 --> Body

In [None]:
# Print every JSTOR_msg match found in the body part of the split (element 2)
matches = re.finditer(JSTOR_msg, str_split[2])
for match in matches:
    print(match)

In [None]:
# Same timestamp pattern, but with \s* between the fields and after "UTC", so
# any amount of whitespace (including line breaks) is accepted.
JSTOR_msg = r'[A-Za-z]*,\s*[0-9]*\s*[A-Za-z]*\s*[0-9]*\s*[0-9][0-9]:[0-9][0-9]:[0-9][0-9]\s*UTC\s*'
matches = re.finditer(JSTOR_msg, str_split[2])
# Number the matches while printing them; this loop consumes the iterator
for indx, match in enumerate(matches):
    print(indx, match)

In [None]:
# "THE AMERICAN JOURNAL OF SOCIOLOGY" with any whitespace between the words and
# optional digits before/after it - presumably the running page header with its
# page number (TODO confirm against the converted text).
AJS_mark = r'\d*\s*THE\s*AMERICAN\s*JOURNAL\s*OF\s*SOCIOLOGY\s*\d*\s*'
AJS_matches = re.finditer(AJS_mark, string)
for indx, match in enumerate(AJS_matches):
    print(indx, match)

In [None]:
# Iterates whatever `matches` currently refers to. NOTE(review): if the cells
# are run in order, `matches` is the finditer iterator already consumed by the
# enumerate loop above, so this prints nothing - confirm intent.
for match in matches:
    print(match)

In [None]:
# Print every "Table <digits>" match in `string` (the digits are optional).
# The loop variable `table` stays bound to the last match after the loop ends.
for table in re.finditer(r'Table \d*', string):
    print(table)

In [None]:
# Uses the `table` left bound by the for-loop over "Table <digits>" matches:
# prints the match object, then evaluates to the tuple (None, matched text).
print(table), table.group()