In [55]:
import os
import re
import io

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

from termcolor import colored
#Define the folder in which all the pdfs are in 
# - assuming the type of analysis needs to be different depending on the journal / time period
baseFolder = '/home/ishi/Desktop/franzosi/Language-of-Science/articles/AJS pdf files/1946to1966'
filePaths = os.listdir(baseFolder) # list of all files
filePaths.sort()

In [59]:
def PDFtoString(filePath):
    
    out = io.StringIO()
    with open(os.path.join(baseFolder,filePath), 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, out, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    #string = out.getvalue()
    return out.getvalue()

### Regex Method 1 - re.compile.finditer(string)

This returns the indices at which the pattern occurs

In [60]:
pattern = re.compile(r'\d[ ]? \w') #Important to start regex expression with r --> raw string
matches = pattern.finditer(string)
for match in matches:
    pass#print(match)

### Regex Method 2 - re.findall(pattern, string)

This returns the pattern itself

In [81]:
%%time
#re.findall(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string)
for indx, file in enumerate(filePaths):
    print(colored(indx, 'green'), colored(file, 'red'))
    string = PDFtoString(os.path.join(baseFolder, file))
    print(colored(re.findall(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string)[0], 'blue'))

[32m0[0m [31mAJS_1946_51_4_Antipathy and Social Behavior.pdf[0m
[34mAccessed: 06-06-2016 18:17 UTC[0m
[32m1[0m [31mAJS_1946_51_4_Definitions of Economic Terms in the Dictionary of Sociology.pdf[0m
[34mAccessed: 06-06-2016 18:25 UTC[0m
[32m2[0m [31mAJS_1946_51_4_Ethnic Behavior in Industry-Sponsorship and Rejection in a New England Factory.pdf[0m
[34mAccessed: 06-06-2016 18:19 UTC[0m
[32m3[0m [31mAJS_1946_51_4_Intermarriages Between German Pioneers and Other Nationalities in Minnesota in 1860 and 1870.pdf[0m
[34mAccessed: 06-06-2016 18:20 UTC[0m
[32m4[0m [31mAJS_1946_51_4_Methods of Measurement of Aesthetic Folkways.pdf[0m
[34mAccessed: 06-06-2016 18:15 UTC[0m
[32m5[0m [31mAJS_1946_51_4_Patients and Prejudice-Lay Attitudes Toward Women Physicians.pdf[0m
[34mAccessed: 06-06-2016 18:16 UTC[0m
[32m6[0m [31mAJS_1946_51_4_Social Class and Friendship Among School Children.pdf[0m
[34mAccessed: 06-06-2016 18:21 UTC[0m
[32m7[0m [31mAJS_1946_51_4_Socio

### Split string by re.compile(pattern, string) return val

In [169]:
access_ts = r'(Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC)'
JSTOR_msg = r'\n \nREFERENCES \nLinked references are available on JSTOR for this article:\nhttp://www.jstor.org/stable/2774936?seq=1&cid=pdf-reference#references_tab_contents \nYou may need to log in to JSTOR to access the linked references.\n \nYour use of the JSTOR archive indicates your acceptance of the Terms & Conditions of Use, available at\n\nhttp://about.jstor.org/terms\n\n \n\nJSTOR is a not-for-profit service that helps scholars, researchers, and students discover, use, and build upon a wide range of content in a trusted\n\ndigital archive. We use information technology and tools to increase productivity and facilitate new forms of scholarship. For more information about\n\nJSTOR, please contact support@jstor.org.\n\nThe University of Chicago Press is collaborating with JSTOR to digitize, preserve and extend access to\nAmerican Journal of Sociology\n\n'
JSTOR_msg += r'This content downloaded from 170.140.26.180 on [A-Za-z]*, [0-9][0-9] [A-za-z]* \d\d\d\d [0-9][0-9]:[0-9][0-9]:[0-9][0-9] UTC\n\nAll use subject to http://about.jstor.org/terms\n\n\x0c'
str_split = re.split(access_ts, string)
str_split # 0 --> Header w/out access date; 1 --> access_ts; 2 --> Body

[' \nThe Altruism Scale: A Measure of Co-operative, Individualistic, and Competitive\nInterpersonal Orientation\nAuthor(s): Jack Sawyer\nSource: American Journal of Sociology, Vol. 71, No. 4 (Jan., 1966), pp. 407-416\nPublished by: The University of Chicago Press\nStable URL: http://www.jstor.org/stable/2774936\n',
 'Accessed: 06-06-2016 19:10 UTC',
 '\n \nREFERENCES \nLinked references are available on JSTOR for this article:\nhttp://www.jstor.org/stable/2774936?seq=1&cid=pdf-reference#references_tab_contents \nYou may need to log in to JSTOR to access the linked references.\n \nYour use of the JSTOR archive indicates your acceptance of the Terms & Conditions of Use, available at\n\nhttp://about.jstor.org/terms\n\n \n\nJSTOR is a not-for-profit service that helps scholars, researchers, and students discover, use, and build upon a wide range of content in a trusted\n\ndigital archive. We use information technology and tools to increase productivity and facilitate new forms of scholarsh

In [177]:
tmp = 'Mon, 06 Jun 2016 19:10:40 UTC'
JSTOR_msg = r'[A-Za-z]*, [0-9][0-9] [A-Za-z]* \d\d\d\d [][]:10:40 UTC'#\n\nAll use subject to http://about.jstor.org/terms\n\n\x0c The Altruism Scale: A Me'
re.match(JSTOR_msg, tmp)

In [178]:
re.match(r'[A-Za-z]*, [0-9][0-9] [A-Za-z]* \d\d\d\d [][]:10:40 UTC', 'Mon, 06 Jun 2016 19:10:40 UTC')

In [168]:
JSTOR_msg in str_split[2]

False

In [148]:
str_split[2]

'\n \nREFERENCES \nLinked references are available on JSTOR for this article:\nhttp://www.jstor.org/stable/2774936?seq=1&cid=pdf-reference#references_tab_contents \nYou may need to log in to JSTOR to access the linked references.\n \nYour use of the JSTOR archive indicates your acceptance of the Terms & Conditions of Use, available at\n\nhttp://about.jstor.org/terms\n\n \n\nJSTOR is a not-for-profit service that helps scholars, researchers, and students discover, use, and build upon a wide range of content in a trusted\n\ndigital archive. We use information technology and tools to increase productivity and facilitate new forms of scholarship. For more information about\n\nJSTOR, please contact support@jstor.org.\n\nThe University of Chicago Press is collaborating with JSTOR to digitize, preserve and extend access to\nAmerican Journal of Sociology\n\nThis content downloaded from 170.140.26.180 on Mon, 06 Jun 2016 19:10:40 UTC\n\nAll use subject to http://about.jstor.org/terms\n\n\x0c Th